Merge remote-tracking branch 'upstream/master' into revise_negative

This commit is contained in:
pszmel 2021-06-09 13:37:11 +02:00
commit 3c213af7cd
221 changed files with 7172 additions and 1925 deletions

View File

@ -113,8 +113,8 @@ def buildDockerImage(Map configuration, String workdir) {
--build-arg BUILD_TYPE=${configuration.build_type} \
--build-arg PROTOBUF_LITE=${configuration.protobuf_lite} \
--file=.ci/openvino-onnx/Dockerfile \
--build-arg http_proxy=http://proxy-chain.intel.com:911/ \
--build-arg https_proxy=http://proxy-chain.intel.com:912/ .
--build-arg http_proxy=http://proxy-ir.intel.com:911/ \
--build-arg https_proxy=http://proxy-ir.intel.com:911/ .
"""
}

View File

@ -139,14 +139,15 @@ def update_labels(gh_api, pull, non_org_intel_pr_users, non_org_pr_users):
def get_wrong_commits(pull):
"""Returns commits with incorrect user and email"""
print("GitHub PR user email:", pull.user.email)
pr_author_email = pull.user.email.lower()
print("GitHub PR author email:", pr_author_email)
print("Check commits:")
wrong_commits = set()
for commit in pull.get_commits():
# import pprint; pprint.pprint(commit.raw_data)
print("Commit SHA:", commit.sha)
# Use raw data because commit author can be non GitHub user
commit_email = commit.raw_data["commit"]["author"]["email"]
commit_email = commit.raw_data["commit"]["author"]["email"].lower()
print(" Commit email:", commit_email)
if not github_api.is_valid_user(commit.author):
print(
@ -159,9 +160,8 @@ def get_wrong_commits(pull):
" WARNING: The commit is not verified. Reason:",
commit.raw_data["commit"]["verification"]["reason"],
)
if pull.user.email != commit_email:
print(" ERROR: Commit email and GitHub user public email are differnt")
wrong_commits.add(commit.sha)
if pr_author_email != commit_email:
print(" WARNING: Commit email and GitHub PR author public email are differnt")
return wrong_commits
@ -229,7 +229,7 @@ def main():
if wrong_pulls:
for pull_number, wrong_commits in wrong_pulls.items():
print(
f"\nERROR: Remove or replace wrong commits in the PR {pull_number}:\n ",
f"\nERROR: Remove or replace wrong commits in the PR {pull_number}:\n ",
"\n ".join(wrong_commits),
)
print(

View File

@ -15,14 +15,17 @@ jobs:
- name: Install dependencies
run: |
sudo apt --assume-yes install libusb-1.0-0-dev
python3 -m pip install --upgrade pip
python3 -m pip install -r ./inference-engine/ie_bridges/python/requirements.txt
# Add for -DENABLE_PYTHON=ON, no cython
python3 -m pip install -r ./inference-engine/ie_bridges/python/src/requirements-dev.txt
# Run cmake with -DENABLE_PROFILING_ITT=ON -DSELECTIVE_BUILD=COLLECT in order to enable codestyle check for ITT collector
- name: CMake
run: |
mkdir build
cd build
cmake -DENABLE_PROFILING_ITT=ON -DSELECTIVE_BUILD=COLLECT ..
cmake -DENABLE_PYTHON=ON -DENABLE_PROFILING_ITT=ON -DSELECTIVE_BUILD=COLLECT ..
- name: Check code style
run: cmake --build build --target clang_format_check_all

View File

@ -6,6 +6,8 @@ ie_dependent_option (ENABLE_MKL_DNN "MKL-DNN plugin for inference engine" ON "X8
ie_option (ENABLE_TESTS "unit, behavior and functional tests" OFF)
ie_option (ENABLE_STRICT_DEPENDENCIES "Skip configuring \"convinient\" dependencies for efficient parallel builds" OFF)
ie_dependent_option (ENABLE_CLDNN "clDnn based plugin for inference engine" ON "X86_64;NOT APPLE;NOT MINGW;NOT WINDOWS_STORE;NOT WINDOWS_PHONE" OFF)
ie_option (ENABLE_PROFILING_ITT "Build with ITT tracing. Optionally configure pre-built ittnotify library though INTEL_VTUNE_DIR variable." OFF)
@ -18,8 +20,6 @@ Supported values:\
ie_option (ENABLE_PROFILING_FIRST_INFERENCE "Build with ITT tracing of first inference time." ON)
ie_option (ENABLE_DOCS "Build docs using Doxygen" OFF)
ie_option(ENABLE_TEMPLATE_PLUGIN "Register template plugin into plugins.xml" OFF)
ie_option_enum(SELECTIVE_BUILD "Enable OpenVINO conditional compilation or statistics collection. \
@ -33,6 +33,9 @@ ie_option(ENABLE_ERROR_HIGHLIGHT "Highlight errors and warnings during compile t
find_package(PythonLibs 3 QUIET)
ie_dependent_option (ENABLE_PYTHON "enables ie python bridge build" OFF "PYTHONLIBS_FOUND" OFF)
find_package(PythonInterp 3 QUIET)
ie_dependent_option (ENABLE_DOCS "Build docs using Doxygen" OFF "PYTHONINTERP_FOUND" OFF)
#
# enable or disable output from NGRAPH_DEBUG statements
#

View File

@ -14,6 +14,15 @@ The sections below contain detailed list of changes made to the Inference Engine
* InferenceEngine::Parameter(std::shared_ptr<ngraph::Variant>& var)
* std::shared_ptr<ngraph::Variant> InferenceEngine::Parameter::asVariant() const
* InferenceEngine::Parameter::operator std::shared_ptr<ngraph::Variant>() const
* KEY_CLDNN_NV12_TWO_INPUTS GPU plugin option. Use KEY_GPU_NV12_TWO_INPUTS instead
* KEY_CLDNN_PLUGIN_PRIORITY GPU plugin option. Use KEY_GPU_PLUGIN_PRIORITY instead
* KEY_CLDNN_PLUGIN_THROTTLE GPU plugin option. Use KEY_GPU_PLUGIN_THROTTLE instead
* KEY_CLDNN_MEM_POOL GPU plugin option
* KEY_CLDNN_GRAPH_DUMPS_DIR GPU plugin option
* KEY_CLDNN_SOURCES_DUMPS_DIR GPU plugin option
* KEY_DUMP_KERNELS GPU plugin option
* KEY_TUNING_MODE GPU plugin option
* KEY_TUNING_FILE GPU plugin option
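A minimal C++ sketch of migrating from the deprecated `KEY_CLDNN_NV12_TWO_INPUTS` option to its `KEY_GPU_NV12_TWO_INPUTS` replacement; it assumes the `gpu/gpu_config.hpp` header that defines `GPUConfigParams` and uses a placeholder model path:
```cpp
#include <ie_core.hpp>
#include <gpu/gpu_config.hpp>

int main() {
    using namespace InferenceEngine;
    Core core;
    CNNNetwork network = core.ReadNetwork("<path_to_model>/model.xml");  // placeholder path
    // Previously: CLDNNConfigParams::KEY_CLDNN_NV12_TWO_INPUTS from <cldnn/cldnn_config.hpp>.
    // The GPU-prefixed key is the non-deprecated spelling of the same option.
    ExecutableNetwork exec = core.LoadNetwork(network, "GPU",
        { { GPUConfigParams::KEY_GPU_NV12_TWO_INPUTS, PluginConfigParams::YES } });
    return 0;
}
```
The other renamed keys (`KEY_GPU_PLUGIN_PRIORITY`, `KEY_GPU_PLUGIN_THROTTLE`) are set the same way.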
## 2021.3

View File

@ -219,22 +219,6 @@ __kernel void example_relu_kernel(
## Debugging Tips<a name="debugging-tips"></a>
* **Dumping the Resulting Kernels**.
It is recommended to get a dump of the kernel with all of
the values set by the Inference Engine, such as tensor sizes,
floating-point, and integer kernel parameters. To get the dump, add the
following line to your code that configures the GPU plugin to output the
custom kernels:
@snippet snippets/GPU_Kernel.cpp part1
When the Inference Engine compiles the kernels for the specific network,
it also outputs the resulting code for the custom kernels. In the
directory of your executable, find files like
`clDNN_program0.cl`, `clDNN_program1.cl`. There are as many files as
distinct sets of parameters for your custom kernel: different input
tensor sizes and kernel parameters.
* **Using `printf` in the OpenCL™ Kernels**.
To debug the specific values, you can use `printf` in your kernels.
However, be careful: for instance, do not output excessively

View File

@ -1,39 +0,0 @@
Using GPU Kernels Tuning {#openvino_docs_IE_DG_GPU_Kernels_Tuning}
======================
GPU Kernels Tuning allows you to tune models, so the heavy computational layers are configured to fit better into
hardware, which the tuning was done on. It is required to achieve best performance on GPU.
> **NOTE** Currently only convolution and fully connected layers undergo tuning process. It means that the performance boost depends on the amount of that layers in the model.
OpenVINO™ releases include the `<INSTALL_DIR>/inference_engine/bin/intel64/Release/cache.json` file with pretuned data for current state of the art models. It is highly recommended to do the
tuning for new kind of models, hardwares or drivers.
## Tuned data
GPU tuning data is saved in JSON format. The file is composed of 2 types of attributes and 1 type of value:
* Execution units number (attribute): splits the content into different EU sections
* Hash (attribute): hashed tuned kernel data
* Key (value): Array with kernel name and kernel's mode index
## Usage
---
You can activate Kernels Tuning process by setting `KEY_TUNING_MODE` flag to `TUNING_CREATE` and `KEY_TUNING_FILE` to `<"filename">` in a configuration map that is
passed to the plugin while loading a network.
This configuration modifies the behavior of the `ExecutableNetwork` object. Instead of standard network compilation, it will run the tuning process.
Please keep in mind that the tuning can be very time consuming. The bigger the network, the longer it will take.
File with tuned data is the result of this step.
> **NOTE** If a filename passed to `KEY_TUNING_FILE` points to existing tuned data and you are tuning a new model, then this file will be extended by new data. This allows you to extend existing `cache.json` provided in the OpenVINO™ release package.
The example below shows how to set and use the key files:
@snippet snippets/GPU_Kernels_Tuning.cpp part0
---
You can activate the inference with tuned data by setting `KEY_TUNING_MODE` flag to `TUNING_USE_EXISTING` and
`KEY_TUNING_FILE` flag to `<"filename">`.
GPU backend will process the content of the file during network compilation to configure the OpenCL kernels for the best performance.
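For reference, a minimal sketch of the configuration described above, mirroring the C++ snippet referenced by this page (note that the tuning keys are marked as deprecated in the GPU plugin documentation):
```cpp
#include <ie_core.hpp>

int main() {
    using namespace InferenceEngine;
    Core ie;
    // Create tuning data during subsequent LoadNetwork calls (can be very time consuming).
    ie.SetConfig({{ CONFIG_KEY(TUNING_MODE), CONFIG_VALUE(TUNING_CREATE) }}, "GPU");
    ie.SetConfig({{ CONFIG_KEY(TUNING_FILE), "/path/to/tuning/file.json" }}, "GPU");
    // To run inference with existing tuned data instead, set TUNING_USE_EXISTING:
    // ie.SetConfig({{ CONFIG_KEY(TUNING_MODE), CONFIG_VALUE(TUNING_USE_EXISTING) }}, "GPU");
    return 0;
}
```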

View File

@ -1,24 +1,28 @@
# Introduction to the Performance Topics {#openvino_docs_IE_DG_Intro_to_Performance}
This section is a shorter version of the
[Optimization Guide](supported_plugins/MULTI.md) for the Intel Deep Learning Deployment Toolkit.
[Optimization Guide](../optimization_guide/dldt_optimization_guide.md) for the Intel® Distribution of OpenVINO™ Toolkit.
## Precision
Inference precision directly affects the performance.
Model Optimizer can produce an IR with different precision. For example, float16 IR initially targets VPU and GPU devices, while, for example, the CPU can also execute regular float32.
Also, further device-specific inference precision settings are available, for example, [8-bit integer](Int8Inference.md) or [bfloat16](Bfloat16Inference.md) inference on the CPU.
Note that for [MULTI device](supported_plugins/MULTI.md) that supports automatic inference on multiple devices in parallel, you can use the FP16 IR.
Model Optimizer can produce an IR with different precision. For example, an FP16 IR initially targets VPU and GPU devices, while for the CPU an FP16 IR is typically up-scaled to regular FP32 automatically upon loading. Notice that further device-specific inference precision settings are available,
such as [8-bit integer](Int8Inference.md) or [bfloat16](Bfloat16Inference.md) inference, which are specific to the CPU (see below).
Note that for the [MULTI device](supported_plugins/MULTI.md) plugin that supports automatic inference on multiple devices in parallel, you can use an FP16 IR (no need for FP32).
You can find more information, including preferred data types for specific devices, in the
[Supported Devices](supported_plugins/Supported_Devices.md) section.
[Supported Devices](supported_plugins/Supported_Devices.md) document.
## Lowering Inference Precision
Default optimization is used for CPU and implies that inference is made with lower precision if it is possible on a given platform to reach better performance with acceptable range of accuracy.
This approach can be used for CPU devices where the platform supports the AVX512_BF16 instruction. In this case, a regular float32 model is converted to [bfloat16](Bfloat16Inference.md) internal representation and inference is provided with bfloat16 layers usage.
Below is the example command line to disable this feature on the CPU device with the AVX512_BF16 instruction and execute regular float32.
## Automatic Lowering of the Inference Precision
By default, plugins enable the optimizations that allow lower precision if the acceptable range of accuracy is preserved.
For example, for the CPU that supports the AVX512_BF16 instructions, an FP16/FP32 model is converted to a [bfloat16](Bfloat16Inference.md) IR to accelerate inference.
To compare the associated speedup, run the example command below to disable this feature on the CPU device with the AVX512_BF16 support and get regular FP32 execution:
```
$ benchmark_app -m <model.xml> -enforcebf16=false
```
Notice that for quantized (e.g., INT8) models, the bfloat16 calculations (of the layers that remain in FP32) are disabled by default.
Refer to the [CPU Plugin documentation](supported_plugins/CPU.md) for more details.
Similarly, the GPU device has a dedicated config key to enable FP16 execution of the layers that remain in FP32 in quantized models (as the quantization is typically performed on FP32 models); refer to the ENABLE_FP16_FOR_QUANTIZED_MODELS key in the [GPU Plugin documentation](supported_plugins/GPU.md).
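As an illustration (not from the guide itself), both precision-related switches can also be set programmatically through `Core::SetConfig`; the raw string key spellings used here (`ENFORCE_BF16`, `CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS`) are assumptions based on the documented `KEY_*` names:
```cpp
#include <ie_core.hpp>

int main() {
    InferenceEngine::Core core;
    // Disable automatic bfloat16 lowering on a CPU with AVX512_BF16 support
    // (roughly what `benchmark_app -enforcebf16=false` does).
    core.SetConfig({{ "ENFORCE_BF16", "NO" }}, "CPU");
    // Disable FP16 execution of the non-quantized parts of a quantized model on the GPU.
    core.SetConfig({{ "CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS", "NO" }}, "GPU");
    return 0;
}
```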
## Latency vs. Throughput
One way to increase computational efficiency is batching, which combines many (potentially tens) of
@ -44,17 +48,17 @@ Below is the example command line that limits the execution to the single socket
limited to the single socket).
$ numactl -m 0 --physcpubind 0-27 benchmark_app -m <model.xml> -api sync -nthreads 28
```
Note that if you have more than one input, running as many inference requests as you have NUMA nodes (or sockets)
Note that if you have more than one input, running as many inference streams as you have NUMA nodes (or sockets)
usually gives the same best latency as a single request on the single socket, but much higher throughput. Assuming a machine with two NUMA nodes:
```
$ benchmark_app -m <model.xml> -nstreams 2
```
The number of NUMA nodes on the machine can be queried via `lscpu`.
Please see more on the NUMA support in the [Optimization Guide](supported_plugins/MULTI.md).
Please see more on the NUMA support in the [Optimization Guide](../optimization_guide/dldt_optimization_guide.md).
## Throughput Mode for CPU
Unlike most accelerators, CPU is perceived as an inherently latency-oriented device.
Since 2018 R5 release, the Inference Engine introduced the "throughput" mode, which allows the Inference Engine to efficiently run multiple inference requests on the CPU simultaneously, greatly improving the throughput.
OpenVINO™ toolkit provides a "throughput" mode that allows running multiple inference requests on the CPU simultaneously, which greatly improves the throughput.
Internally, the execution resources are split/pinned into execution "streams".
Using this feature yields much better performance for networks that do not scale well with the number of threads (for example, lightweight topologies). This is especially pronounced for many-core server machines.
@ -62,8 +66,6 @@ Using this feature gains much better performance for the networks that originall
Run the [Benchmark App](../../inference-engine/samples/benchmark_app/README.md) and play with number of infer requests running in parallel, next section.
Try different values of the `-nstreams` argument from `1` to a number of CPU cores and find one that provides the best performance.
In addition to the number of streams, it is also possible to play with the batch size to find the throughput sweet-spot.
The throughput mode relaxes the requirement to saturate the CPU by using a large batch: running multiple independent inference requests in parallel often gives much better performance than using a batch only.
This allows you to simplify the app-logic, as you don't need to combine multiple inputs into a batch to achieve good CPU performance.
Instead, it is possible to keep a separate infer request per camera or another source of input and process the requests in parallel using Async API.
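To make the streams-plus-async approach concrete, below is a minimal C++ sketch (an illustration, not part of the original guide) that requests the automatic number of CPU streams and runs several independent infer requests in parallel; the model path and the input-filling step are placeholders:
```cpp
#include <vector>
#include <ie_core.hpp>

int main() {
    using namespace InferenceEngine;
    Core core;
    CNNNetwork network = core.ReadNetwork("<path_to_model>/model.xml");  // placeholder path
    // CPU_THROUGHPUT_AUTO lets the plugin pick a reasonable number of streams.
    ExecutableNetwork exec = core.LoadNetwork(network, "CPU",
        { { CONFIG_KEY(CPU_THROUGHPUT_STREAMS), CONFIG_VALUE(CPU_THROUGHPUT_AUTO) } });

    // One infer request per input source (for example, per camera); no batching required.
    std::vector<InferRequest> requests;
    for (int i = 0; i < 4; ++i)
        requests.push_back(exec.CreateInferRequest());

    for (auto& request : requests)
        request.StartAsync();      // in a real application, fill the input blobs first
    for (auto& request : requests)
        request.Wait(InferRequest::WaitMode::RESULT_READY);
    return 0;
}
```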
@ -87,13 +89,3 @@ Try different values of the `-nstreams` argument from `1` to a number of CPU cor
Finally, notice that when you don't specify the number of streams with `-nstreams`, the "AUTO" value for the streams is used, e.g. for the CPU this is [CPU_THROUGHPUT_AUTO](supported_plugins/CPU.md). You can spot the actual value behind "AUTO" for your machine in the application output.
Notice that the "AUTO" number is not necessarily the most optimal, so it is generally recommended to play either with the benchmark_app's "-nstreams" as described above, or via the [new Workbench tool](@ref workbench_docs_Workbench_DG_Introduction). This allows you to simplify the app-logic, as you don't need to combine multiple inputs into a batch to achieve good CPU performance.
Instead, it is possible to keep a separate infer request per camera or another source of input and process the requests in parallel using Async API.
## Kernels Tuning for GPU
GPU backend comes with a feature, that allows models tuning, so the workload is configured to fit better into hardware.
Tuning is time consuming process, which internally execute every layer several (or even hundreds) times to find most performant configuration.
This configuration is saved into json-formatted file, whose name can be passed as plugin param to network. GPU backend will process this data to configure kernels for the best performance.
For more details about Kernels Tuning and How-To please refer to [GPU Kernels Tuning](GPU_Kernels_Tuning.md).

View File

@ -1,4 +1,4 @@
GPU Plugin {#openvino_docs_IE_DG_supported_plugins_CL_DNN}
GPU Plugin {#openvino_docs_IE_DG_supported_plugins_GPU}
=======
The GPU plugin uses the Intel® Compute Library for Deep Neural Networks (clDNN) to infer deep neural networks.
@ -89,13 +89,10 @@ Some layers are executed during the load time, not during the inference. One of
The following layers are not accelerated on the GPU and executed on the host CPU instead:
* Proposal
* SimplerNMS
* NonMaxSuppression
* PriorBox
* DetectionOutput
## Known Layers Limitations
* ROIPooling is supported for 'max' value of 'method' attribute.
## Supported Configuration Parameters
The plugin supports the configuration parameters listed below.
@ -107,31 +104,21 @@ When specifying key values as raw strings (that is, when using Python API), omit
| `KEY_CACHE_DIR` | `"<cache_dir>"` | `""` | Specifies a directory where compiled OCL binaries can be cached. First model loading generates the cache, and all subsequent LoadNetwork calls use precompiled kernels which significantly improves load time. If empty - caching is disabled |
| `KEY_PERF_COUNT` | `YES` / `NO` | `NO` | Collect performance counters during inference |
| `KEY_CONFIG_FILE` | `"<file1> [<file2> ...]"` | `""` | Load custom layer configuration files |
| `KEY_DUMP_KERNELS` | `YES` / `NO` | `NO` | Dump the final kernels used for custom layers |
| `KEY_TUNING_MODE` | `TUNING_DISABLED` <br /> `TUNING_CREATE` <br /> `TUNING_USE_EXISTING` | `TUNING_DISABLED` | Disable inference kernel tuning <br /> Create tuning file (expect much longer runtime) <br /> Use an existing tuning file |
| `KEY_TUNING_FILE` | `"<filename>"` | `""` | Tuning file to create / use |
| `KEY_CLDNN_PLUGIN_PRIORITY` | `<0-3>` | `0` | OpenCL queue priority (before usage, make sure your OpenCL driver supports appropriate extension)<br> Higher value means higher priority for clDNN OpenCL queue. 0 disables the setting. |
| `KEY_CLDNN_PLUGIN_THROTTLE` | `<0-3>` | `0` | OpenCL queue throttling (before usage, make sure your OpenCL driver supports appropriate extension)<br> Lower value means lower driver thread priority and longer sleep time for it. 0 disables the setting. |
| `KEY_CLDNN_GRAPH_DUMPS_DIR` | `"<dump_dir>"` | `""` | clDNN graph optimizer stages dump output directory (in GraphViz format) |
| `KEY_CLDNN_SOURCES_DUMPS_DIR` | `"<dump_dir>"` | `""` | Final optimized clDNN OpenCL sources dump output directory |
| `KEY_GPU_THROUGHPUT_STREAMS` | `KEY_GPU_THROUGHPUT_AUTO`, or positive integer| 1 | Specifies a number of GPU "execution" streams for the throughput mode (upper bound for a number of inference requests that can be executed simultaneously).<br>This option is can be used to decrease GPU stall time by providing more effective load from several streams. Increasing the number of streams usually is more effective for smaller topologies or smaller input sizes. Note that your application should provide enough parallel slack (e.g. running many inference requests) to leverage full GPU bandwidth. Additional streams consume several times more GPU memory, so make sure the system has enough memory available to suit parallel stream execution. Multiple streams might also put additional load on CPU. If CPU load increases, it can be regulated by setting an appropriate `KEY_CLDNN_PLUGIN_THROTTLE` option value (see above). If your target system has relatively weak CPU, keep throttling low. <br>The default value is 1, which implies latency-oriented behavior.<br>`KEY_GPU_THROUGHPUT_AUTO` creates bare minimum of streams to improve the performance; this is the most portable option if you are not sure how many resources your target machine has (and what would be the optimal number of streams). <br> A positive integer value creates the requested number of streams. |
| `KEY_GPU_PLUGIN_PRIORITY` | `<0-3>` | `0` | OpenCL queue priority (before usage, make sure your OpenCL driver supports appropriate extension)<br> Higher value means higher priority for OpenCL queue. 0 disables the setting. |
| `KEY_GPU_PLUGIN_THROTTLE` | `<0-3>` | `0` | OpenCL queue throttling (before usage, make sure your OpenCL driver supports appropriate extension)<br> Lower value means lower driver thread priority and longer sleep time for it. 0 disables the setting. |
| `KEY_CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS` | `YES` / `NO` | `YES` | Allows using FP16+INT8 mixed precision mode, so non-quantized parts of a model will be executed in FP16 precision for FP16 IR. Does not affect quantized FP32 IRs |
| `KEY_GPU_NV12_TWO_INPUTS` | `YES` / `NO` | `NO` | Controls preprocessing logic for NV12 input. If it's set to YES, then the device graph will expect the user to set a biplanar NV12 blob as input, which will be directly passed to the device execution graph. Otherwise, preprocessing via GAPI is used to convert NV12->BGR, thus the GPU graph has to expect a single input |
| `KEY_GPU_THROUGHPUT_STREAMS` | `KEY_GPU_THROUGHPUT_AUTO`, or positive integer| 1 | Specifies a number of GPU "execution" streams for the throughput mode (upper bound for a number of inference requests that can be executed simultaneously).<br>This option can be used to decrease GPU stall time by providing more effective load from several streams. Increasing the number of streams is usually more effective for smaller topologies or smaller input sizes. Note that your application should provide enough parallel slack (e.g. running many inference requests) to leverage full GPU bandwidth. Additional streams consume several times more GPU memory, so make sure the system has enough memory available to suit parallel stream execution. Multiple streams might also put additional load on CPU. If CPU load increases, it can be regulated by setting an appropriate `KEY_GPU_PLUGIN_THROTTLE` option value (see above). If your target system has a relatively weak CPU, keep throttling low. <br>The default value is 1, which implies latency-oriented behavior.<br>`KEY_GPU_THROUGHPUT_AUTO` creates the bare minimum of streams to improve the performance; this is the most portable option if you are not sure how many resources your target machine has (and what would be the optimal number of streams). <br> A positive integer value creates the requested number of streams. |
| `KEY_EXCLUSIVE_ASYNC_REQUESTS` | `YES` / `NO` | `NO` | Forces async requests (also from different executable networks) to execute serially.|
| `KEY_CLDNN_MAX_NUM_THREADS` | `integer value` | `maximum # of HW threads available in host environment` | Specifies the number of CPU threads that can be used for clDNN engine, e.g, JIT compilation of clDNN kernels or clDNN cpu kernel processing. The default value is set as the number of maximum available threads in host environment to minimize the time for LoadNetwork, where the clDNN kernel build time occupies a large portion. Note that if the specified value is larger than the maximum available # of threads or less than zero, it is set as maximum available # of threads. It can be specified with a smaller number than the available HW threads according to the usage scenario, e.g., when the user wants to assign more CPU threads while clDNN plugin is running. Note that setting this value with lower number will affect not only the network loading time but also the cpu layers of clDNN networks that are optimized with multi-threading. |
| `KEY_CLDNN_ENABLE_LOOP_UNROLLING` | `YES` / `NO` | `YES` | Enables recurrent layers such as TensorIterator or Loop with fixed iteration count to be unrolled. It is turned on by default. Turning this key on will achieve better inference performance for loops with not too many iteration counts (less than 16, as a rule of thumb). Turning this key off will achieve better performance for both graph loading time and inference time with many iteration counts (greater than 16). Note that turning this key on will increase the graph loading time in proportion to the iteration counts. Thus, this key should be turned off if graph loading time is considered to be most important target to optimize. |
## Note on Debug Capabilities of the GPU Plugin
Inference Engine GPU plugin provides possibility to dump the user custom OpenCL&trade; kernels to a file to allow you to properly debug compilation issues in your custom kernels.
The application can use the <code>SetConfig()</code> function with the key <code>PluginConfigParams::KEY_DUMP_KERNELS</code> and value: <code>PluginConfigParams::YES</code>. Then during network loading, all custom layers will print their OpenCL kernels with the JIT instrumentation added by the plugin.
The kernels will be stored in the working directory under files named the following way: <code>clDNN_program0.cl</code>, <code>clDNN_program1.cl</code>.
This option is disabled by default. Additionally, the application can call the <code>SetConfig()</code> function with the key <code>PluginConfigParams::KEY_DUMP_KERNELS</code> and value: <code>PluginConfigParams::NO</code> before network loading.
How to verify that this option is disabled:
1. Delete all <code>clDNN_program*.cl</code> files from the current directory
2. Run your application to load a network
3. Examine the working directory for the presence of any kernel file (for example, <code>clDNN_program0.cl</code>)
| `KEY_GPU_MAX_NUM_THREADS` | `integer value` | `maximum # of HW threads available in host environment` | Specifies the number of CPU threads that can be used for GPU engine, e.g, JIT compilation of GPU kernels or cpu kernel processing within GPU plugin. The default value is set as the number of maximum available threads in host environment to minimize the time for LoadNetwork, where the GPU kernel build time occupies a large portion. Note that if the specified value is larger than the maximum available # of threads or less than zero, it is set as maximum available # of threads. It can be specified with a smaller number than the available HW threads according to the usage scenario, e.g., when the user wants to assign more CPU threads while GPU plugin is running. Note that setting this value with lower number will affect not only the network loading time but also the cpu layers of GPU networks that are optimized with multi-threading. |
| `KEY_GPU_ENABLE_LOOP_UNROLLING` | `YES` / `NO` | `YES` | Enables recurrent layers such as TensorIterator or Loop with fixed iteration count to be unrolled. It is turned on by default. Turning this key on will achieve better inference performance for loops with not too many iteration counts (less than 16, as a rule of thumb). Turning this key off will achieve better performance for both graph loading time and inference time with many iteration counts (greater than 16). Note that turning this key on will increase the graph loading time in proportion to the iteration counts. Thus, this key should be turned off if graph loading time is considered to be most important target to optimize. |
| `KEY_CLDNN_PLUGIN_PRIORITY` | `<0-3>` | `0` | OpenCL queue priority (before usage, make sure your OpenCL driver supports appropriate extension)<br> Higher value means higher priority for OpenCL queue. 0 disables the setting. **Deprecated**. Please use KEY_GPU_PLUGIN_PRIORITY |
| `KEY_CLDNN_PLUGIN_THROTTLE` | `<0-3>` | `0` | OpenCL queue throttling (before usage, make sure your OpenCL driver supports appropriate extension)<br> Lower value means lower driver thread priority and longer sleep time for it. 0 disables the setting. **Deprecated**. Please use KEY_GPU_PLUGIN_THROTTLE |
| `KEY_CLDNN_GRAPH_DUMPS_DIR` | `"<dump_dir>"` | `""` | clDNN graph optimizer stages dump output directory (in GraphViz format) **Deprecated**. Will be removed in the next release |
| `KEY_CLDNN_SOURCES_DUMPS_DIR` | `"<dump_dir>"` | `""` | Final optimized clDNN OpenCL sources dump output directory. **Deprecated**. Will be removed in the next release |
| `KEY_DUMP_KERNELS` | `YES` / `NO` | `NO` | Dump the final kernels used for custom layers. **Deprecated**. Will be removed in the next release |
| `KEY_TUNING_MODE` | `TUNING_DISABLED` <br /> `TUNING_CREATE` <br /> `TUNING_USE_EXISTING` | `TUNING_DISABLED` | Disable inference kernel tuning <br /> Create tuning file (expect much longer runtime) <br /> Use an existing tuning file. **Deprecated**. Will be removed in the next release |
| `KEY_TUNING_FILE` | `"<filename>"` | `""` | Tuning file to create / use. **Deprecated**. Will be removed in the next release |
## GPU Context and Video Memory Sharing RemoteBlob API

View File

@ -9,7 +9,7 @@ The Inference Engine provides unique capabilities to infer deep learning models
| Plugin | Device types |
|------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------|
|[GPU plugin](CL_DNN.md) |Intel&reg; Processor Graphics, including Intel&reg; HD Graphics and Intel&reg; Iris&reg; Graphics |
|[GPU plugin](GPU.md) |Intel&reg; Processor Graphics, including Intel&reg; HD Graphics and Intel&reg; Iris&reg; Graphics |
|[CPU plugin](CPU.md) |Intel&reg; Xeon&reg; with Intel® Advanced Vector Extensions 2 (Intel® AVX2), Intel® Advanced Vector Extensions 512 (Intel® AVX-512), and AVX512_BF16, Intel&reg; Core&trade; Processors with Intel&reg; AVX2, Intel&reg; Atom&reg; Processors with Intel® Streaming SIMD Extensions (Intel® SSE) |
|[VPU plugins](VPU.md) (available in the Intel® Distribution of OpenVINO™ toolkit) |Intel® Neural Compute Stick 2 powered by the Intel® Movidius™ Myriad™ X, Intel® Vision Accelerator Design with Intel® Movidius™ VPUs |
|[GNA plugin](GNA.md) (available in the Intel® Distribution of OpenVINO™ toolkit) |Intel&reg; Speech Enabling Developer Kit, Amazon Alexa* Premium Far-Field Developer Kit, Intel&reg; Pentium&reg; Silver J5005 Processor, Intel&reg; Pentium&reg; Silver N5000 Processor, Intel&reg; Celeron&reg; J4005 Processor, Intel&reg; Celeron&reg; J4105 Processor, Intel&reg; Celeron&reg; Processor N4100, Intel&reg; Celeron&reg; Processor N4000, Intel&reg; Core&trade; i3-8121U Processor, Intel&reg; Core&trade; i7-1065G7 Processor, Intel&reg; Core&trade; i7-1060G7 Processor, Intel&reg; Core&trade; i5-1035G4 Processor, Intel&reg; Core&trade; i5-1035G7 Processor, Intel&reg; Core&trade; i5-1035G1 Processor, Intel&reg; Core&trade; i5-1030G7 Processor, Intel&reg; Core&trade; i5-1030G4 Processor, Intel&reg; Core&trade; i3-1005G1 Processor, Intel&reg; Core&trade; i3-1000G1 Processor, Intel&reg; Core&trade; i3-1000G4 Processor|

View File

@ -49,20 +49,15 @@ The function accepts a const shared pointer to `ngraph::Function` object and per
This constructor creates a backend specific graph by importing from a stream object:
> **NOTE**: The export of backend specific graph is done in the `ExportImpl` method, and data formats must be the same for both import and export.
> **NOTE**: The export of backend specific graph is done in the `Export` method, and data formats must be the same for both import and export.
@snippet src/template_executable_network.cpp executable_network:ctor_import_stream
### `ExportImpl()`
**Implementation details:**
Base InferenceEngine::ExecutableNetworkThreadSafeDefault class implements the public InferenceEngine::ExecutableNetworkThreadSafeDefault::Export method as following:
- Writes `_plugin->GetName()` to the `model` stream.
- Calls the `ExportImpl` method defined in a derived class to dump a backend specific graph.
### `Export()`
The implementation of the method should write all data to the `model` stream, which is required to import a backend specific graph later in the `Plugin::Import` method:
@snippet src/template_executable_network.cpp executable_network:export_impl
@snippet src/template_executable_network.cpp executable_network:export
### `CreateInferRequest()`

View File

@ -159,21 +159,13 @@ The snippet below provides an example of the implementation for `GetMetric`:
> **NOTE**: If an unsupported metric key is passed to the function, it must throw an exception.
### `ImportNetworkImpl()`
### `ImportNetwork()`
The importing network mechanism allows importing a previously exported backend specific graph and wrapping it
using an [ExecutableNetwork](@ref executable_network) object. This functionality is useful if
backend specific graph compilation takes significant time and/or cannot be done on a target host
device due to other reasons.
**Implementation details:** The base plugin class InferenceEngine::IInferencePlugin implements InferenceEngine::IInferencePlugin::ImportNetwork
as follows: exports a device type (InferenceEngine::IInferencePlugin::_pluginName) and then calls `ImportNetworkImpl`,
which is implemented in a derived class.
If a plugin cannot use the base implementation InferenceEngine::IInferencePlugin::ImportNetwork, it can override base
implementation and define an output blob structure up to its needs. This
can be useful if a plugin exports a blob in a special format for integration with other frameworks
where a common Inference Engine header from a base class implementation is not appropriate.
During export of backend specific graph using `ExecutableNetwork::Export`, a plugin may export any
type of information it needs to import a compiled graph properly and check its correctness.
For example, the export information may include:

View File

@ -628,3 +628,15 @@ It means that you trying to convert the topology which contains '_contrib_box_nm
</script>
\endhtmlonly
#### 103. What does the message "ModelOptimizer is not able to parse *.caffemodel" mean? <a name="question-103"></a>
If a '*.caffemodel' file exists and is correct, the error possibly occurred due to the use of the Python protobuf implementation. In some cases, it shows an error message during model parsing, for example: "'utf-8' codec can't decode byte 0xe0 in position 4: invalid continuation byte in field: mo_caffe.SpatialTransformerParameter.transform_type". You can either use Python 3.6/3.7 or build the 'cpp' implementation of protobuf yourself for your version of Python. For complete instructions about building `protobuf` from sources, see the appropriate section in [Converting a Model to Intermediate Representation](Config_Model_Optimizer.md).
#### 104. What does the message "SyntaxError: 'yield' inside list comprehension" during MxNet\* model conversion mean? <a name="question-104"></a>
The issue "SyntaxError: 'yield' inside list comprehension" might occur during converting MXNet\* models (mobilefacedet-v1-mxnet, brain-tumor-segmentation-0001) on Windows* platform with Python* 3.8 environment. This issue is caused by API changes for `yield expression` in Python 3.8.
The following workarounds are suggested to resolve this issue:
1. Use Python 3.6/3.7 to convert MXNet\* models on Windows
2. Update MXNet: `pip install mxnet==1.7.0.post2`
Note that you might have conflicts between previously installed PyPI dependencies.

View File

@ -293,7 +293,6 @@ limitations under the License.
<tab type="user" title="[DEPRECATED] Import an ONNX model" url="@ref openvino_docs_IE_DG_OnnxImporterTutorial"/>
<tab type="user" title="Using Dynamic Batching Feature" url="@ref openvino_docs_IE_DG_DynamicBatching"/>
<tab type="user" title="Using Static Shape Infer Feature" url="@ref openvino_docs_IE_DG_ShapeInference"/>
<tab type="user" title="Using GPU kernels tuning" url="@ref openvino_docs_IE_DG_GPU_Kernels_Tuning"/>
<tab type="usergroup" title="Using Bfloat16 Inference" url="@ref openvino_docs_IE_DG_Bfloat16Inference">
</tab>
<tab type="usergroup" title="Using Low-Precision 8-bit Integer Inference" url="@ref openvino_docs_IE_DG_Int8Inference">
@ -303,7 +302,7 @@ limitations under the License.
</tab>
<tab type="user" title="Introduction to OpenVINO state API" url="@ref openvino_docs_IE_DG_network_state_intro"/>
<tab type="usergroup" title="Supported Devices" url="@ref openvino_docs_IE_DG_supported_plugins_Supported_Devices">
<tab type="usergroup" title="GPU Plugin" url="@ref openvino_docs_IE_DG_supported_plugins_CL_DNN">
<tab type="usergroup" title="GPU Plugin" url="@ref openvino_docs_IE_DG_supported_plugins_GPU">
<tab type="user" title="RemoteBlob API of GPU Plugin" url="@ref openvino_docs_IE_DG_supported_plugins_GPU_RemoteBlob_API"/>
</tab>
<tab type="user" title="CPU Plugin" url="@ref openvino_docs_IE_DG_supported_plugins_CPU"/>

View File

@ -2,7 +2,7 @@
This guide provides installation steps for Intel® Distribution of OpenVINO™ toolkit for Linux* distributed through the APT repository.
> **IMPORTANT**: By downloading and using this container and the included software, you agree to the terms and conditions of the [software license agreements](https://software.intel.com/en-us/license/eula-for-intel-software-development-products). Please, review the content inside the `<openvino_install_root>/licensing` folder for more details.
> **IMPORTANT**: By downloading and using this container and the included software, you agree to the terms and conditions of the [software license agreements](https://software.intel.com/content/dam/develop/external/us/en/documents/intel-openvino-license-agreements.pdf). Please, review the content inside the `<openvino_install_root>/licensing` folder for more details.
> **NOTE**: Intel® Graphics Compute Runtime for OpenCL™ is not a part of OpenVINO™ APT distribution. You can install it from the [Intel® Graphics Compute Runtime for OpenCL™ GitHub repo](https://github.com/intel/compute-runtime).

View File

@ -248,8 +248,8 @@ Or proceed to the <a href="#get-started">Get Started</a> to get started with run
> **NOTE**: These steps are required only if you want to use an Intel® integrated GPU.
If your applications offload computation to **Intel® Integrated Graphics**, you must have the latest version of Intel Graphics Driver for Windows installed for your hardware.
[Download and install a higher version](http://downloadcenter.intel.com/product/80939/Graphics-Drivers).
If your applications offload computation to **Intel® Integrated Graphics**, you must have the Intel Graphics Driver for Windows installed for your hardware.
[Download and install the recommended version](https://downloadcenter.intel.com/download/30079/Intel-Graphics-Windows-10-DCH-Drivers).
To check if you have this driver installed:
@ -265,8 +265,6 @@ To check if you have this driver installed:
![](../img/DeviceDriverVersion.PNG)
> **NOTE**: To use the **Intel® Iris® Xe MAX Graphics**, see the [Drivers & Software](https://downloadcenter.intel.com/download/29993/Intel-Iris-Xe-MAX-Dedicated-Graphics-Drivers?product=80939) page for driver downloads and installation instructions.
You are done updating your device driver and are ready to use your GPU. Proceed to the <a href="#get-started">Get Started</a> to get started with running code samples and demo applications.
### <a name="hddl-myriad"></a> Optional: Additional Installation Steps for the Intel® Vision Accelerator Design with Intel® Movidius™ VPUs

View File

@ -2,7 +2,7 @@
This guide provides installation steps for the Intel® Distribution of OpenVINO™ toolkit for Linux* distributed through the YUM repository.
> **IMPORTANT**: By downloading and using this container and the included software, you agree to the terms and conditions of the [software license agreements](https://software.intel.com/en-us/license/eula-for-intel-software-development-products). Please, review the content inside the `<openvino_install_root>/licensing` folder for more details.
> **IMPORTANT**: By downloading and using this container and the included software, you agree to the terms and conditions of the [software license agreements](https://software.intel.com/content/dam/develop/external/us/en/documents/intel-openvino-license-agreements.pdf). Please, review the content inside the `<openvino_install_root>/licensing` folder for more details.
> **NOTE**: Intel® Graphics Compute Runtime for OpenCL™ is not a part of OpenVINO™ YUM distribution. You can install it from the [Intel® Graphics Compute Runtime for OpenCL™ GitHub repo](https://github.com/intel/compute-runtime).

View File

@ -18,8 +18,8 @@ Review the [Architecture Concept](https://github.com/openvinotoolkit/model_serve
A few key features:
- Support for multiple frameworks. Serve models trained in popular formats such as Caffe\*, TensorFlow\*, MXNet\*, and ONNX*.
- Deploy new [model versions](https://github.com/openvinotoolkit/model_server/blob/main/docs/docker_container.md#model-version-policy) without changing client code.
- Support for AI accelerators including [Intel Movidius Myriad VPUs](../IE_DG/supported_plugins/VPU),
[GPU](../IE_DG/supported_plugins/CL_DNN), and [HDDL](../IE_DG/supported_plugins/HDDL).
- Support for AI accelerators including [Intel Movidius Myriad VPUs](../IE_DG/supported_plugins/VPU.md),
[GPU](../IE_DG/supported_plugins/GPU.md), and [HDDL](../IE_DG/supported_plugins/HDDL.md).
- The server can be enabled both on [Bare Metal Hosts](https://github.com/openvinotoolkit/model_server/blob/main/docs/host.md) or in
[Docker* containers](https://github.com/openvinotoolkit/model_server/blob/main/docs/docker_container.md).
- [Kubernetes deployments](https://github.com/openvinotoolkit/model_server/blob/main/deploy). The server can be deployed in a Kubernetes cluster allowing the inference service to scale horizontally and ensure high availability.

View File

@ -6,7 +6,7 @@
**Short description**: *RegionYolo* computes the coordinates of regions with probability for each class.
**Detailed description**: This operation is directly mapped to the original YOLO layer. [Reference](https://arxiv.org/pdf/1612.08242.pdf)
**Detailed description**: This operation is directly mapped to the [YOLO9000: Better, Faster, Stronger](https://arxiv.org/pdf/1612.08242.pdf) paper.
**Attributes**:
@ -78,14 +78,17 @@
**Inputs**:
* **1**: `data` - 4D input tensor with floating point elements and shape `[N, C, H, W]`. Required.
* **1**: `data` - 4D tensor of type `T` and shape `[N, C, H, W]`. **Required.**
**Outputs**:
* **1**: output tensor of rank 4 or less that codes detected regions. Refer to the original YOLO paper to decode the output as boxes. `anchors` should be used to decode real box coordinates. If `do_softmax` is set to 0, then the output shape is `[N, (classes + coords + 1)*len(mask), H, W]`. If `do_softmax` is set to 1, then output shape is partially flattened and defined in the following way:
* **1**: tensor of type `T` and rank 4 or less that codes detected regions. Refer to the [YOLO9000: Better, Faster, Stronger](https://arxiv.org/pdf/1612.08242.pdf) paper to decode the output as boxes. `anchors` should be used to decode real box coordinates. If `do_softmax` is set to `0`, then the output shape is `[N, (classes + coords + 1) * len(mask), H, W]`. If `do_softmax` is set to `1`, then output shape is partially flattened and defined in the following way:
flat_dim = data.shape[axis] * data.shape[axis+1] * ... * data.shape[end_axis]
output.shape = [data.shape[0], ..., data.shape[axis-1], flat_dim, data.shape[end_axis + 1], ...]
`flat_dim = data.shape[axis] * data.shape[axis+1] * ... * data.shape[end_axis]`
`output.shape = [data.shape[0], ..., data.shape[axis-1], flat_dim, data.shape[end_axis + 1], ...]`
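As a worked example: for a YOLOv2-like head with `classes = 20`, `coords = 4`, `num = 5` (so the input is `[N, 125, 13, 13]`, since `5 * (20 + 4 + 1) = 125`), `axis = 1` and `end_axis = 3`, setting `do_softmax` to `1` gives `flat_dim = 125 * 13 * 13 = 21125` and an output shape of `[N, 21125]`, while `do_softmax` set to `0` with `len(mask) = 5` keeps the spatial layout `[N, 125, 13, 13]`.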
**Types**
* *T*: any supported floating point type.
**Example**

View File

@ -187,7 +187,7 @@ Inference Engine relies on the [Compute Library for Deep Neural Networks (clDNN)
- In the GPU-only scenario, a GPU driver might occupy a CPU core with spin-looped polling for completion. If the _CPU_ utilization is a concern, consider the `KEY_CLDNN_PLUGIN_THROTTLE` configuration option.
> **NOTE**: See the [Benchmark App Sample](../../inference-engine/samples/benchmark_app/README.md) code for a usage example.
Notice that while disabling the polling, this option might reduce the GPU performance, so usually this option is used with multiple [GPU streams](../IE_DG/supported_plugins/CL_DNN.md).
Notice that while disabling the polling, this option might reduce the GPU performance, so usually this option is used with multiple [GPU streams](../IE_DG/supported_plugins/GPU.md).
### Intel&reg; Movidius&trade; Myriad&trade; X Visual Processing Unit and Intel&reg; Vision Accelerator Design with Intel&reg; Movidius&trade; VPUs <a name="myriad"></a>

View File

@ -1,5 +1,4 @@
#include <ie_core.hpp>
#include "cldnn/cldnn_config.hpp"
int main() {
using namespace InferenceEngine;
@ -9,9 +8,5 @@ InferenceEngine::Core core;
core.SetConfig({ { InferenceEngine::PluginConfigParams::KEY_CONFIG_FILE, "<path_to_the_xml_file>" } }, "GPU");
//! [part0]
//! [part1]
core.SetConfig({ { PluginConfigParams::KEY_DUMP_KERNELS, PluginConfigParams::YES } }, "GPU");
//! [part1]
return 0;
}

View File

@ -1,14 +0,0 @@
#include <ie_core.hpp>
#include "cldnn/cldnn_config.hpp"
int main() {
using namespace InferenceEngine;
//! [part0]
Core ie;
ie.SetConfig({{ CONFIG_KEY(TUNING_MODE), CONFIG_VALUE(TUNING_CREATE) }}, "GPU");
ie.SetConfig({{ CONFIG_KEY(TUNING_FILE), "/path/to/tuning/file.json" }}, "GPU");
// Further LoadNetwork calls will use the specified tuning parameters
//! [part0]
return 0;
}

View File

@ -1,6 +1,6 @@
#include <ie_core.hpp>
#include <gpu/gpu_context_api_va.hpp>
#include <cldnn/cldnn_config.hpp>
#include <gpu/gpu_config.hpp>
int main() {
@ -28,7 +28,7 @@ auto shared_va_context = gpu::make_shared_context(ie, "GPU", disp);
// compile network within a shared context
ExecutableNetwork executable_network = ie.LoadNetwork(network,
shared_va_context,
{ { CLDNNConfigParams::KEY_CLDNN_NV12_TWO_INPUTS,
{ { GPUConfigParams::KEY_GPU_NV12_TWO_INPUTS,
PluginConfigParams::YES } });

View File

@ -64,7 +64,13 @@ int main(int argc, char *argv[]) {
inferRequest.Infer();
// check states
auto states = inferRequest.QueryState();
if (states.empty()) {
throw std::runtime_error("Queried states are empty");
}
auto mstate = as<MemoryBlob>(states[0].GetState());
if (mstate == nullptr) {
throw std::runtime_error("Can't cast state to MemoryBlob");
}
auto state_buf = mstate->rmap();
float* state = state_buf.as<float*>();
std::cout << state[0] << "\n";

View File

@ -175,9 +175,9 @@ InferenceEngine::Parameter TemplatePlugin::ExecutableNetwork::GetMetric(const st
}
// ! [executable_network:get_metric]
// ! [executable_network:export_impl]
void TemplatePlugin::ExecutableNetwork::ExportImpl(std::ostream& modelStream) {
OV_ITT_SCOPED_TASK(itt::domains::TemplatePlugin, "ExecutableNetwork::ExportImpl");
// ! [executable_network:export]
void TemplatePlugin::ExecutableNetwork::Export(std::ostream& modelStream) {
OV_ITT_SCOPED_TASK(itt::domains::TemplatePlugin, "ExecutableNetwork::Export");
// Note: custom ngraph extensions are not supported
std::map<std::string, ngraph::OpSet> custom_opsets;
@ -198,4 +198,4 @@ void TemplatePlugin::ExecutableNetwork::ExportImpl(std::ostream& modelStream) {
// TODO: implement network precision, layout, preprocessing info serialization
}
// ! [executable_network:export_impl]
// ! [executable_network:export]

View File

@ -30,7 +30,7 @@ public:
// Methods from a base class ExecutableNetworkThreadSafeDefault
void ExportImpl(std::ostream& model) override;
void Export(std::ostream& model) override;
InferenceEngine::IInferRequestInternal::Ptr CreateInferRequestImpl(InferenceEngine::InputsDataMap networkInputs,
InferenceEngine::OutputsDataMap networkOutputs) override;
InferenceEngine::IInferRequestInternal::Ptr CreateInferRequest() override;

View File

@ -95,14 +95,14 @@ InferenceEngine::IExecutableNetworkInternal::Ptr Plugin::LoadExeNetworkImpl(cons
}
// ! [plugin:load_exe_network_impl]
// ! [plugin:import_network_impl]
InferenceEngine::IExecutableNetworkInternal::Ptr Plugin::ImportNetworkImpl(std::istream& modelStream, const std::map<std::string, std::string>& config) {
OV_ITT_SCOPED_TASK(itt::domains::TemplatePlugin, "Plugin::ImportNetworkImpl");
// ! [plugin:import_network]
InferenceEngine::IExecutableNetworkInternal::Ptr Plugin::ImportNetwork(std::istream& modelStream, const std::map<std::string, std::string>& config) {
OV_ITT_SCOPED_TASK(itt::domains::TemplatePlugin, "Plugin::ImportNetwork");
auto fullConfig = Configuration {config, _cfg};
return std::make_shared<ExecutableNetwork>(modelStream, fullConfig, std::static_pointer_cast<Plugin>(shared_from_this()));
}
// ! [plugin:import_network_impl]
// ! [plugin:import_network]
// ! [plugin:query_network]
InferenceEngine::QueryNetworkResult Plugin::QueryNetwork(const InferenceEngine::CNNNetwork& network, const ConfigMap& config) const {

View File

@ -28,7 +28,7 @@ public:
void AddExtension(const std::shared_ptr<InferenceEngine::IExtension>& extension) override;
InferenceEngine::Parameter GetConfig(const std::string& name, const std::map<std::string, InferenceEngine::Parameter>& options) const override;
InferenceEngine::Parameter GetMetric(const std::string& name, const std::map<std::string, InferenceEngine::Parameter>& options) const override;
InferenceEngine::IExecutableNetworkInternal::Ptr ImportNetworkImpl(std::istream& model, const std::map<std::string, std::string>& config) override;
InferenceEngine::IExecutableNetworkInternal::Ptr ImportNetwork(std::istream& model, const std::map<std::string, std::string>& config) override;
private:
friend class ExecutableNetwork;

View File

@ -295,25 +295,25 @@ if (ENABLE_SPEECH_DEMO)
if(DEFINED IE_PATH_TO_DEPS)
if (WIN32 AND X86_64)
RESOLVE_DEPENDENCY(SPEECH_LIBS_AND_DEMOS
ARCHIVE_WIN "speech_demo_1.0.0.755_windows.zip"
ARCHIVE_WIN "speech_demo_1.0.0.774_windows.zip"
VERSION_REGEX ".*_([0-9]+.[0-9]+.[0-9]+.[0-9]+).*"
TARGET_PATH "${TEMP}/speech_demo_1.0.0.755"
SHA256 "58adef14b8a749f70fa83888614cee34b941956e6e958e445e3f48885b3c20a0")
TARGET_PATH "${TEMP}/speech_demo_1.0.0.774"
SHA256 "67b25170be5e89a4f0e90e8b39623b60c9a15b965c30329385e295fcd2edc856")
debug_message(STATUS "speech_libs_and_demos=" ${SPEECH_LIBS_AND_DEMOS})
elseif (LINUX AND X86_64)
if (LINUX_OS_NAME STREQUAL "CentOS 7" OR CMAKE_CXX_COMPILER_VERSION VERSION_LESS "4.9")
RESOLVE_DEPENDENCY(SPEECH_LIBS_AND_DEMOS
ARCHIVE_LIN "speech_demo_1.0.0.755_centos.tgz"
ARCHIVE_LIN "speech_demo_1.0.0.774_centos.tgz"
VERSION_REGEX ".*_([0-9]+.[0-9]+.[0-9]+.[0-9]+).*"
TARGET_PATH "${TEMP}/speech_demo_1.0.0.755"
SHA256 "716201e377714ac50f3909c445d36d47a089de50a557d8ef65232de040671188")
TARGET_PATH "${TEMP}/speech_demo_1.0.0.774"
SHA256 "5ec3b7be9ae05376aefae5bd5fd4a39b12c274e82817fd3218120b8e8fc8ff5a")
debug_message(STATUS "speech_libs_and_demos=" ${SPEECH_LIBS_AND_DEMOS})
else()
RESOLVE_DEPENDENCY(SPEECH_LIBS_AND_DEMOS
ARCHIVE_LIN "speech_demo_1.0.0.755_linux.tgz"
ARCHIVE_LIN "speech_demo_1.0.0.774_linux.tgz"
VERSION_REGEX ".*_([0-9]+.[0-9]+.[0-9]+.[0-9]+).*"
TARGET_PATH "${TEMP}/speech_demo_1.0.0.755"
SHA256 "7714b8776ec0183ed73eed6d3d965ee6d5c15d2dc49ee5ae118cc368c89c7a9d")
TARGET_PATH "${TEMP}/speech_demo_1.0.0.774"
SHA256 "f0bbd0a6218b0365e7cfb1f860b34e4ace7e0d47dd60b369cdea8a480329810f")
debug_message(STATUS "speech_libs_and_demos=" ${SPEECH_LIBS_AND_DEMOS})
endif()
else()

View File

@ -1,7 +1,8 @@
# nGraph Function Creation Python* Sample {#openvino_inference_engine_ie_bridges_python_sample_ngraph_function_creation_sample_README}
This sample demonstrates how to execute an inference using [nGraph function feature](../../../../../docs/nGraph_DG/build_function.md) to create a network that uses weights from LeNet classification network. So you don't need an XML file, the model will be created from the source code on the fly.
In addition to regular images, the sample also supports single-channel ubyte images as an input.
This sample demonstrates how to execute an inference using the [nGraph function feature](../../../../../docs/nGraph_DG/build_function.md) to create a network that uses weights from the LeNet classification network, which is known to work well on digit classification tasks. You don't need an XML file: the model is created from the source code on the fly.
In addition to regular grayscale images with a digit, the sample also supports single-channel `ubyte` images as an input.
The following Inference Engine Python API is used in the application:
@ -14,6 +15,9 @@ Basic Inference Engine API is covered by [Hello Classification Python* Sample](.
| Options | Values |
| :------------------------- | :---------------------------------------------------------------------- |
| Validated Models | LeNet (image classification network) |
| Model Format | Network weights file (\*.bin) |
| Validated images | The sample uses OpenCV\* to [read input grayscale image](https://docs.opencv.org/master/d4/da8/group__imgcodecs.html#ga288b8b3da0892bd651fce07b3bbd3a56) (\*.bmp, \*.png) or single-channel `ubyte` image |
| Supported devices | [All](../../../../../docs/IE_DG/supported_plugins/Supported_Devices.md) |
| Other language realization | [C++](../../../../samples/ngraph_function_creation_sample) |
@ -72,7 +76,7 @@ To run the sample, you need specify a model weights and image:
You can do inference of an image using a pre-trained model on a GPU using the following command:
```sh
python ngraph_function_creation_sample.py -m <path_to_model>/lenet.bin -i <path_to_image>/3.bmp -d GPU
python ngraph_function_creation_sample.py -m <path_to_model>/lenet.bin -i <path_to_image>/3.png -d GPU
```
## Sample Output
@ -84,10 +88,10 @@ The sample application logs each step in a standard output stream and outputs to
[ INFO ] Loading the network using ngraph function with weights from <path_to_model>/lenet.bin
[ INFO ] Configuring input and output blobs
[ INFO ] Loading the model to the plugin
[ WARNING ] <path_to_image>/3.bmp is inverted to white over black
[ WARNING ] <path_to_image>/3.bmp is resized from (100, 100) to (28, 28)
[ WARNING ] <path_to_image>/3.png is inverted to white over black
[ WARNING ] <path_to_image>/3.png is resized from (351, 353) to (28, 28)
[ INFO ] Starting inference in synchronous mode
[ INFO ] Image path: <path_to_image>/3.bmp
[ INFO ] Image path: <path_to_image>/3.png
[ INFO ] Top 10 results:
[ INFO ] classid probability
[ INFO ] -------------------

View File

@ -77,4 +77,5 @@ install(PROGRAMS __init__.py
DESTINATION ${PYTHON_BRIDGE_CPACK_PATH}/${PYTHON_VERSION}/openvino/inference_engine
COMPONENT ${PYTHON_VERSION})
add_clang_format_target(${TARGET_NAME}_clang FOR_TARGETS ${TARGET_NAME})
add_clang_format_target(${TARGET_NAME}_clang FOR_TARGETS ${TARGET_NAME}
EXCLUDE_PATTERNS ".*\\.cxx;.*\\.pxd;.*\\.pyx")

View File

@ -2,7 +2,7 @@
# SPDX-License-Identifier: Apache-2.0
from .cimport ie_api_impl_defs as C
from .ie_api_impl_defs cimport CBlob, CTensorDesc, InputInfo, CPreProcessChannel, CPreProcessInfo, CExecutableNetwork
from .ie_api_impl_defs cimport CBlob, CTensorDesc, InputInfo, CPreProcessChannel, CPreProcessInfo, CExecutableNetwork, CVariableState
import os

View File

@ -42,7 +42,8 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
target_compile_options(${TARGET_NAME} PRIVATE "-Wno-error=register")
endif()
add_clang_format_target(${TARGET_NAME}_clang FOR_TARGETS ${TARGET_NAME})
add_clang_format_target(${TARGET_NAME}_clang FOR_TARGETS ${TARGET_NAME}
EXCLUDE_PATTERNS ".*\\.cxx;.*\\.pxd;.*\\.pyx")
# perform copy
add_custom_command(TARGET ${TARGET_NAME}

View File

@ -17,8 +17,8 @@ def ApplyPOTTransformations(IENetwork network, string device):
C.ApplyPOTTransformations(network.impl, device)
def ApplyLowLatencyTransformation(IENetwork network, int64_t num_iterations=1):
C.ApplyLowLatencyTransformation(network.impl, num_iterations)
def ApplyLowLatencyTransformation(IENetwork network, bool use_const_initializer = True):
C.ApplyLowLatencyTransformation(network.impl, use_const_initializer)
def ApplyPruningTransformation(IENetwork network):

View File

@ -26,16 +26,9 @@ void InferenceEnginePython::ApplyPOTTransformations(InferenceEnginePython::IENet
manager.run_passes(network.actual->getFunction());
}
void InferenceEnginePython::ApplyLowLatencyTransformation(InferenceEnginePython::IENetwork network, int64_t num_iterations) {
void InferenceEnginePython::ApplyLowLatencyTransformation(InferenceEnginePython::IENetwork network, bool use_const_initializer) {
ngraph::pass::Manager manager;
// TODO: pass num_iterations to LowLatency
manager.register_pass<ngraph::pass::LowLatency>();
manager.register_pass<ngraph::pass::UnrollTensorIterator>();
auto pass_config = manager.get_pass_config();
pass_config->set_callback<ngraph::pass::UnrollTensorIterator>([](const std::shared_ptr<const ngraph::Node>& node) -> bool {
return node->get_rt_info().count("UNROLL_TI") == 0;
});
manager.register_pass<ngraph::pass::LowLatency2>(use_const_initializer);
manager.run_passes(network.actual->getFunction());
}

View File

@ -15,7 +15,7 @@ void ApplyMOCTransformations(InferenceEnginePython::IENetwork network, bool cf);
void ApplyPOTTransformations(InferenceEnginePython::IENetwork network, std::string device);
void ApplyLowLatencyTransformation(InferenceEnginePython::IENetwork network, int64_t num_iterations);
void ApplyLowLatencyTransformation(InferenceEnginePython::IENetwork network, bool use_const_initializer = true);
void ApplyPruningTransformation(InferenceEnginePython::IENetwork network);

View File

@ -3,7 +3,6 @@
from libcpp cimport bool
from libcpp.string cimport string
from libc.stdint cimport int64_t
from ..inference_engine.ie_api_impl_defs cimport IENetwork
@ -12,7 +11,7 @@ cdef extern from "offline_transformations_api_impl.hpp" namespace "InferenceEngi
cdef void ApplyPOTTransformations(IENetwork network, string device)
cdef void ApplyLowLatencyTransformation(IENetwork network, int64_t num_iterations)
cdef void ApplyLowLatencyTransformation(IENetwork network, bool use_const_initializer)
cdef void ApplyPruningTransformation(IENetwork network)

View File

@ -48,4 +48,5 @@ add_custom_command(TARGET ${TARGET_NAME}
COMMAND ${CMAKE_COMMAND} -E copy ${PYTHON_BRIDGE_SRC_ROOT}/src/openvino/test_utils/__init__.py ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/__init__.py
)
add_clang_format_target(${TARGET_NAME}_clang FOR_TARGETS ${TARGET_NAME})
add_clang_format_target(${TARGET_NAME}_clang FOR_TARGETS ${TARGET_NAME}
EXCLUDE_PATTERNS ".*\\.cxx;.*\\.pxd;.*\\.pyx")

View File

@ -16,6 +16,20 @@ test_net_xml, test_net_bin = model_path(is_myriad)
path_to_img = image_path()
def create_function_with_memory(input_shape, data_type):
import ngraph as ng
from ngraph.impl import Function, Type
input_data = ng.parameter(input_shape, name="input_data", dtype=data_type)
rv = ng.read_value(input_data, "var_id_667")
add = ng.add(rv, input_data, name="MemoryAdd")
node = ng.assign(add, "var_id_667")
res = ng.result(add, "res")
func = Function(results=[res], sinks=[node], parameters=[input_data], name="name")
caps = Function.to_capsule(func)
return caps
def read_image():
import cv2
n, c, h, w = (1, 3, 32, 32)
@ -525,28 +539,56 @@ def test_resize_algorithm_work(device):
assert np.allclose(res_1, res_2, atol=1e-2, rtol=1e-2)
# issue 56653
@pytest.mark.skip(reason="Test will enable when nGraph Python API allows to create network with memory")
def test_query_state(device):
import ngraph as ng
from ngraph.impl import Function
input_data = ng.parameter([5, 7], name="input_data", dtype=np.float32)
rv = ng.read_value(input_data, "var_id_667")
#a = ng.add(rv, input_data)
node = ng.assign(rv, "var_id_667")
res = ng.result(rv, "res")
func = Function([res], sinks=[node], parameters=[input_data], name='test')
caps = Function.to_capsule(func)
@pytest.mark.parametrize("mode", ["set_init_memory_state", "reset_memory_state", "normal"])
@pytest.mark.parametrize("data_type", ["FP32", "FP16", "I32"])
@pytest.mark.parametrize("input_shape", [[10], [10, 10], [10, 10, 10], [2, 10, 10, 10]])
@pytest.mark.skipif(os.environ.get("TEST_DEVICE", "CPU") != "CPU",
reason=f"Can't run test on device {os.environ.get('TEST_DEVICE', 'CPU')}, "
"Memory layers fully supported only on CPU")
def test_query_state_write_buffer(device, input_shape, data_type, mode):
ie_core = ie.IECore()
if device == "CPU":
if ie_core.get_metric(device, "FULL_DEVICE_NAME") == "arm_compute::NEON":
pytest.skip("Can't run on ARM plugin")
net = ie.IENetwork(caps)
layout = ["C", "HW", "CHW", "NCHW"]
np_data_type = {"FP32": np.float32, "FP16": np.float16, "I32": np.int32}
from openvino.inference_engine import TensorDesc, Blob
net = ie.IENetwork(create_function_with_memory(input_shape, np_data_type[data_type]))
ie_core = ie.IECore()
exec_net = ie_core.load_network(network=net, device_name=device, num_requests=1)
request = exec_net.requests[0]
mem_states = request.query_state()
mem_state = mem_states[0]
with pytest.raises(ValueError) as e:
ones_arr = np.ones(shape=(1, 800), dtype=np.float32)
mem_state.state.buffer[:] = ones_arr
assert "assignment destination is read-only" in str(e.value)
assert mem_state.name == 'id_1'
assert mem_state.state.tensor_desc.precision == 'FP32'
assert mem_state.name == 'var_id_667'
# todo: Uncomment after fix 45611,
# CPU plugin returns outputs and memory state in FP32 in case of FP16 original precision
#assert mem_state.state.tensor_desc.precision == data_type
for i in range(1, 10):
if mode == "set_init_memory_state":
# create initial value
const_init = 5
init_array = np.full(input_shape, const_init, dtype=np_data_type[mem_state.state.tensor_desc.precision])
tensor_desc = TensorDesc(mem_state.state.tensor_desc.precision, input_shape, layout[len(input_shape) - 1])
blob = Blob(tensor_desc, init_array)
mem_state.state = blob
res = exec_net.infer({"input_data": np.full(input_shape, 1, dtype=np_data_type[data_type])})
expected_res = np.full(input_shape, 1 + const_init, dtype=np_data_type[data_type])
elif mode == "reset_memory_state":
# reset initial state of ReadValue to zero
mem_state.reset()
res = exec_net.infer({"input_data": np.full(input_shape, 1, dtype=np_data_type[data_type])})
# always ones
expected_res = np.full(input_shape, 1, dtype=np_data_type[data_type])
else:
res = exec_net.infer({"input_data": np.full(input_shape, 1, dtype=np_data_type[data_type])})
expected_res = np.full(input_shape, i, dtype=np_data_type[data_type])
assert np.allclose(res['MemoryAdd'], expected_res, atol=1e-6), \
"Expected values: {} \n Actual values: {} \n".format(expected_res, res)

View File

@ -11,47 +11,11 @@
#pragma once
#include "ie_plugin_config.hpp"
#include "ie_api.h"
#include "gpu/gpu_config.hpp"
namespace InferenceEngine {
namespace Metrics {
/**
* @def GPU_METRIC_KEY(name)
* @brief shortcut for defining GPU plugin metrics
*/
#define GPU_METRIC_KEY(name) METRIC_KEY(GPU_##name)
#define DECLARE_GPU_METRIC_KEY(name, ...) DECLARE_METRIC_KEY(GPU_##name, __VA_ARGS__)
/**
* @def DECLARE_GPU_METRIC_VALUE(name)
* @brief shortcut for defining gpu metric values
*/
#define DECLARE_GPU_METRIC_VALUE(name) DECLARE_METRIC_VALUE(GPU_##name)
/**
* @brief Metric which defines size of memory in bytes available for the device. For iGPU it returns host memory size, for dGPU - dedicated gpu memory size
*/
DECLARE_GPU_METRIC_KEY(DEVICE_TOTAL_MEM_SIZE, uint64_t);
/**
* @brief Metric to get microarchitecture identifier in major.minor.revision format
*/
DECLARE_GPU_METRIC_KEY(UARCH_VERSION, std::string);
/**
* @brief Metric to get count of execution units for current GPU
*/
DECLARE_GPU_METRIC_KEY(EXECUTION_UNITS_COUNT, int);
/**
* @brief Possible return value for OPTIMIZATION_CAPABILITIES metric
* - "HW_MATMUL" - Defines if device has hardware block for matrix multiplication
*/
DECLARE_GPU_METRIC_VALUE(HW_MATMUL);
} // namespace Metrics
/**
* @brief GPU plugin configuration
*/
@ -70,6 +34,7 @@ namespace CLDNNConfigParams {
* this option should be used with an unsigned integer value (1 is lowest priority)
* 0 means no priority hint is set and default queue is created.
*/
INFERENCE_ENGINE_DEPRECATED("Use InferenceEngine::GPUConfigParams::GPU_PLUGIN_PRIORITY instead")
DECLARE_CLDNN_CONFIG_KEY(PLUGIN_PRIORITY);
/**
@ -78,22 +43,26 @@ DECLARE_CLDNN_CONFIG_KEY(PLUGIN_PRIORITY);
* chapter 9.19. This option should be used with an unsigned integer value (1 is lowest energy consumption)
* 0 means no throttle hint is set and default queue created.
*/
INFERENCE_ENGINE_DEPRECATED("Use InferenceEngine::GPUConfigParams::GPU_PLUGIN_THROTTLE instead")
DECLARE_CLDNN_CONFIG_KEY(PLUGIN_THROTTLE);
/**
* @brief This key controls clDNN memory pool optimization.
* Turned off by default.
*/
INFERENCE_ENGINE_DEPRECATED("The config key will be removed")
DECLARE_CLDNN_CONFIG_KEY(MEM_POOL);
/**
* @brief This key defines the directory name to which clDNN graph visualization will be dumped.
*/
INFERENCE_ENGINE_DEPRECATED("The config key will be removed")
DECLARE_CLDNN_CONFIG_KEY(GRAPH_DUMPS_DIR);
/**
* @brief This key defines the directory name to which full program sources will be dumped.
*/
INFERENCE_ENGINE_DEPRECATED("The config key will be removed")
DECLARE_CLDNN_CONFIG_KEY(SOURCES_DUMPS_DIR);
/**
@ -108,43 +77,19 @@ DECLARE_CLDNN_CONFIG_KEY(ENABLE_FP16_FOR_QUANTIZED_MODELS);
* @brief This key should be set to correctly handle NV12 input without pre-processing.
* Turned off by default.
*/
INFERENCE_ENGINE_DEPRECATED("Use InferenceEngine::GPUConfigParams::GPU_NV12_TWO_INPUTS instead")
DECLARE_CLDNN_CONFIG_KEY(NV12_TWO_INPUTS);
/**
* @brief This key sets the max number of host threads that can be used by GPU plugin on model loading.
* Default value is maximum number of threads available in the environment.
*/
DECLARE_CLDNN_CONFIG_KEY(MAX_NUM_THREADS);
/**
* @brief Turning on this key enables to unroll recurrent layers such as TensorIterator or Loop with fixed iteration count.
* This key is turned on by default. Turning this key on will achieve better inference performance for loops with not too many iteration counts (less than 16, as a rule of thumb).
* Turning this key off will achieve better performance for both graph loading time and inference time with many iteration counts (greater than 16).
* Note that turning this key on will increase the graph loading time in proportion to the iteration counts.
* Thus, this key should be turned off if graph loading time is considered to be most important target to optimize.*/
DECLARE_CLDNN_CONFIG_KEY(ENABLE_LOOP_UNROLLING);
} // namespace CLDNNConfigParams
namespace PluginConfigParams {
/**
* @brief Optimize GPU plugin execution to maximize throughput.
*
* It is passed to Core::SetConfig(), this option should be used with values:
* - KEY_GPU_THROUGHPUT_AUTO creates bare minimum of streams that might improve performance in some cases,
* this option allows to enable throttle hint for opencl queue thus reduce CPU load without significant performance
* drop
* - a positive integer value creates the requested number of streams
*/
DECLARE_CONFIG_VALUE(GPU_THROUGHPUT_AUTO);
DECLARE_CONFIG_KEY(GPU_THROUGHPUT_STREAMS);
/**
* @brief This key enables dumping of the kernels used by the plugin for custom layers.
*
* This option should be used with values: PluginConfigParams::YES or PluginConfigParams::NO (default)
*/
INFERENCE_ENGINE_DEPRECATED("The config key will be removed")
DECLARE_CONFIG_KEY(DUMP_KERNELS);
/**
@ -159,17 +104,24 @@ DECLARE_CONFIG_KEY(DUMP_KERNELS);
*
* For values TUNING_CREATE and TUNING_RETUNE the file will be created if it does not exist.
*/
INFERENCE_ENGINE_DEPRECATED("The config key will be removed")
DECLARE_CONFIG_KEY(TUNING_MODE);
INFERENCE_ENGINE_DEPRECATED("The config value will be removed")
DECLARE_CONFIG_VALUE(TUNING_CREATE);
INFERENCE_ENGINE_DEPRECATED("The config value will be removed")
DECLARE_CONFIG_VALUE(TUNING_USE_EXISTING);
INFERENCE_ENGINE_DEPRECATED("The config value will be removed")
DECLARE_CONFIG_VALUE(TUNING_DISABLED);
INFERENCE_ENGINE_DEPRECATED("The config value will be removed")
DECLARE_CONFIG_VALUE(TUNING_UPDATE);
INFERENCE_ENGINE_DEPRECATED("The config value will be removed")
DECLARE_CONFIG_VALUE(TUNING_RETUNE);
/**
* @brief This key defines the tuning data filename to be created/used
*/
INFERENCE_ENGINE_DEPRECATED("The config key will be removed")
DECLARE_CONFIG_KEY(TUNING_FILE);
} // namespace PluginConfigParams

View File

@ -0,0 +1,120 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
/**
* @brief A header for advanced hardware related properties for GPU plugin
* To use in SetConfig() method of plugins
*
* @file gpu_config.hpp
*/
#pragma once
#include "ie_plugin_config.hpp"
namespace InferenceEngine {
namespace Metrics {
/**
* @def GPU_METRIC_KEY(name)
* @brief shortcut for defining GPU plugin metrics
*/
#define GPU_METRIC_KEY(name) METRIC_KEY(GPU_##name)
#define DECLARE_GPU_METRIC_KEY(name, ...) DECLARE_METRIC_KEY(GPU_##name, __VA_ARGS__)
/**
* @def DECLARE_GPU_METRIC_VALUE(name)
* @brief shortcut for defining gpu metric values
*/
#define DECLARE_GPU_METRIC_VALUE(name) DECLARE_METRIC_VALUE(GPU_##name)
/**
* @brief Metric which defines the size of memory, in bytes, available for the device. For iGPU it returns the host memory size; for dGPU, the dedicated GPU memory size
*/
DECLARE_GPU_METRIC_KEY(DEVICE_TOTAL_MEM_SIZE, uint64_t);
/**
* @brief Metric to get microarchitecture identifier in major.minor.revision format
*/
DECLARE_GPU_METRIC_KEY(UARCH_VERSION, std::string);
/**
* @brief Metric to get count of execution units for current GPU
*/
DECLARE_GPU_METRIC_KEY(EXECUTION_UNITS_COUNT, int);
/**
* @brief Possible return value for OPTIMIZATION_CAPABILITIES metric
* - "HW_MATMUL" - Defines if device has hardware block for matrix multiplication
*/
DECLARE_GPU_METRIC_VALUE(HW_MATMUL);
} // namespace Metrics
/**
* @brief GPU plugin configuration
*/
namespace GPUConfigParams {
/**
* @brief shortcut for defining configuration keys
*/
#define GPU_CONFIG_KEY(name) InferenceEngine::GPUConfigParams::_CONFIG_KEY(GPU_##name)
#define DECLARE_GPU_CONFIG_KEY(name) DECLARE_CONFIG_KEY(GPU_##name)
#define DECLARE_GPU_CONFIG_VALUE(name) DECLARE_CONFIG_VALUE(GPU_##name)
/**
* @brief This key instructs the GPU plugin to use the OpenCL queue priority hint
* as defined in https://www.khronos.org/registry/OpenCL/specs/opencl-2.1-extensions.pdf
* this option should be used with an unsigned integer value (1 is lowest priority)
* 0 means no priority hint is set and default queue is created.
*/
DECLARE_GPU_CONFIG_KEY(PLUGIN_PRIORITY);
/**
* @brief This key instructs the GPU plugin to use throttle hints the OpenCL queue throttle hint
* as defined in https://www.khronos.org/registry/OpenCL/specs/opencl-2.1-extensions.pdf,
* chapter 9.19. This option should be used with an unsigned integer value (1 is lowest energy consumption)
* 0 means no throttle hint is set and default queue created.
*/
DECLARE_GPU_CONFIG_KEY(PLUGIN_THROTTLE);
/**
* @brief This key should be set to correctly handle NV12 input without pre-processing.
* Turned off by default.
*/
DECLARE_GPU_CONFIG_KEY(NV12_TWO_INPUTS);
/**
* @brief This key sets the maximum number of host threads that can be used by the GPU plugin during model loading.
* The default value is the maximum number of threads available in the environment.
*/
DECLARE_GPU_CONFIG_KEY(MAX_NUM_THREADS);
/**
* @brief Turning on this key enables unrolling of recurrent layers such as TensorIterator or Loop with a fixed iteration count.
* This key is turned on by default. Turning this key on will achieve better inference performance for loops with a small iteration count (fewer than 16, as a rule of thumb).
* Turning this key off will achieve better performance for both graph loading time and inference time with a large iteration count (greater than 16).
* Note that turning this key on will increase the graph loading time in proportion to the iteration count.
* Thus, this key should be turned off if graph loading time is considered the most important target to optimize. */
DECLARE_GPU_CONFIG_KEY(ENABLE_LOOP_UNROLLING);
} // namespace GPUConfigParams
namespace PluginConfigParams {
/**
* @brief Optimize GPU plugin execution to maximize throughput.
*
* It is passed to Core::SetConfig(), this option should be used with values:
* - KEY_GPU_THROUGHPUT_AUTO creates bare minimum of streams that might improve performance in some cases,
* this option allows to enable throttle hint for opencl queue thus reduce CPU load without significant performance
* drop
* - a positive integer value creates the requested number of streams
*/
DECLARE_CONFIG_VALUE(GPU_THROUGHPUT_AUTO);
DECLARE_CONFIG_KEY(GPU_THROUGHPUT_STREAMS);
} // namespace PluginConfigParams
} // namespace InferenceEngine
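
As a usage illustration only (not part of the diff; the device name and values are placeholders), the new GPU_* keys are passed through Core::SetConfig exactly like the CLDNN_* keys they replace:

#include <gpu/gpu_config.hpp>
#include <inference_engine.hpp>
#include <map>
#include <string>

int main() {
    InferenceEngine::Core core;
    std::map<std::string, std::string> cfg = {
        {GPU_CONFIG_KEY(PLUGIN_THROTTLE), "1"},                                           // was CLDNN_CONFIG_KEY(PLUGIN_THROTTLE)
        {GPU_CONFIG_KEY(MAX_NUM_THREADS), "4"},                                           // was CLDNN_CONFIG_KEY(MAX_NUM_THREADS)
        {GPU_CONFIG_KEY(ENABLE_LOOP_UNROLLING), InferenceEngine::PluginConfigParams::NO}  // was CLDNN_CONFIG_KEY(ENABLE_LOOP_UNROLLING)
    };
    core.SetConfig(cfg, "GPU");
    return 0;
}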

View File

@ -174,9 +174,18 @@ public:
* operation*
* @return An executable network reference
*/
ExecutableNetwork ImportNetwork(std::istream& networkModel, const std::string& deviceName = {},
ExecutableNetwork ImportNetwork(std::istream& networkModel, const std::string& deviceName,
const std::map<std::string, std::string>& config = {});
/**
* @deprecated Use Core::ImportNetwork with explicit device name
* @brief Creates an executable network from a previously exported network
* @param networkModel network model stream
* @return An executable network reference
*/
INFERENCE_ENGINE_DEPRECATED("Use Core::ImportNetwork with explicit device name")
ExecutableNetwork ImportNetwork(std::istream& networkModel);
/**
* @brief Creates an executable network from a previously exported network within a specified
* remote context.
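
A hedged usage sketch of the two overloads above (file and device names are placeholders; not part of the diff): the stream-based ImportNetwork now requires an explicit device name, and the name-less form remains only as the deprecated overload.

#include <inference_engine.hpp>
#include <fstream>

int main() {
    InferenceEngine::Core core;
    std::ifstream blob("model.blob", std::ios::binary);  // hypothetical previously exported blob
    auto exeNet = core.ImportNetwork(blob, "MYRIAD");     // preferred: device name given explicitly
    // Deprecated: relies on a device-name header inside the stream.
    // auto exeNet2 = core.ImportNetwork(blob);
    return 0;
}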

View File

@ -52,5 +52,41 @@ namespace InferenceEngine {
* @param network A network to apply LowLatency transformation
* *
*/
INFERENCE_ENGINE_DEPRECATED("This transformation will be removed in 2023.1. "
"Use InferenceEngine::lowLatency2 instead.")
INFERENCE_ENGINE_API_CPP(void) LowLatency(InferenceEngine::CNNNetwork& network);
/**
* @brief The transformation finds all TensorIterator/Loop layers in the network,
* processes all back edges that describe a connection between Result and Parameter
* of the TensorIterator/Loop bodies,and inserts ReadValue and Assign layers at the
* input and output corresponding to this back edge.
* Supported platforms: CPU, GNA.
*
* The example below describes the changes made by the transformation
* [] - TensorIterator body
* () - new layer
* BE - back-edge
*
* before applying the transformation:
* -> input1[BE_1 -> Parameter -> Layers ... -> Result -> BE_1 ]output1->
*
* after applying the transformation:
* ->(ReadValue)-> input1[BE_1 ->Parameter->Layers ...->Result->BE_1]output1 ->(Assign)
* \
* ->...
* After applying the transformation, the resulting network can be inferred
* step by step, and the states will be stored between inferences.
* @param network A network to apply LowLatency transformation
* @param use_const_initializer Changes the type of the initializing subgraph for ReadValue operations.
If "true", then the transformation inserts Constant before ReadValue operation.
If "false, then the transformation leaves existed initializing subgraph for ReadValue operation.
* Loop operation by a given number. Does not affect TensorIterators.
* *
*/
INFERENCE_ENGINE_API_CPP(void) lowLatency2(InferenceEngine::CNNNetwork& network,
bool use_const_initializer = true);
} // namespace InferenceEngine
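
A minimal usage sketch (editorial, not part of the change), assuming a model at a placeholder path that contains TensorIterator/Loop operations and a header that declares lowLatency2 as shown above:

#include <ie_transformations.hpp>  // declares InferenceEngine::lowLatency2 (shown above)
#include <inference_engine.hpp>

int main() {
    InferenceEngine::Core core;
    auto network = core.ReadNetwork("model.xml", "model.bin");  // placeholder model with TensorIterator/Loop
    InferenceEngine::lowLatency2(network, true);                // insert ReadValue/Assign, keep Constant initializers
    auto exeNet = core.LoadNetwork(network, "CPU");
    return 0;
}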

View File

@ -39,6 +39,7 @@ std::vector<std::string> filterFilesByExtensions(const std::vector<std::string>&
return filtered;
}
template <typename T>
void fillBlobImage(Blob::Ptr& inputBlob, const std::vector<std::string>& filePaths, const size_t& batchSize, const benchmark_app::InputInfo& app_info,
const size_t& requestId, const size_t& inputId, const size_t& inputSize) {
MemoryBlob::Ptr minput = as<MemoryBlob>(inputBlob);
@ -50,7 +51,7 @@ void fillBlobImage(Blob::Ptr& inputBlob, const std::vector<std::string>& filePat
// locked memory holder should be alive all time while access to its buffer
// happens
auto minputHolder = minput->wmap();
auto inputBlobData = minputHolder.as<uint8_t*>();
auto inputBlobData = minputHolder.as<T*>();
/** Collect images data ptrs **/
std::vector<std::shared_ptr<uint8_t>> vreader;
@ -90,7 +91,7 @@ void fillBlobImage(Blob::Ptr& inputBlob, const std::vector<std::string>& filePat
size_t offset = imageId * numChannels * width * height + (((app_info.layout == "NCHW") || (app_info.layout == "CHW"))
? (ch * width * height + h * width + w)
: (h * width * numChannels + w * numChannels + ch));
inputBlobData[offset] = vreader.at(imageId).get()[h * width * numChannels + w * numChannels + ch];
inputBlobData[offset] = static_cast<T>(vreader.at(imageId).get()[h * width * numChannels + w * numChannels + ch]);
}
}
}
@ -142,7 +143,7 @@ using uniformDistribution =
typename std::conditional<std::is_integral<T>::value, std::uniform_int_distribution<T>, void>::type>::type;
template <typename T, typename T2>
void fillBlobRandom(Blob::Ptr& inputBlob, T rand_min = std::numeric_limits<T>::min(), T rand_max = std::numeric_limits<T>::max()) {
void fillBlobRandom(Blob::Ptr& inputBlob, T rand_min = std::numeric_limits<uint8_t>::min(), T rand_max = std::numeric_limits<uint8_t>::max()) {
MemoryBlob::Ptr minput = as<MemoryBlob>(inputBlob);
if (!minput) {
IE_THROW() << "We expect inputBlob to be inherited from MemoryBlob in "
@ -270,7 +271,19 @@ void fillBlobs(const std::vector<std::string>& inputFiles, const size_t& batchSi
if (app_info.isImage()) {
if (!imageFiles.empty()) {
// Fill with Images
fillBlobImage(inputBlob, imageFiles, batchSize, app_info, requestId, imageInputId++, imageInputCount);
if (precision == InferenceEngine::Precision::FP32) {
fillBlobImage<float>(inputBlob, imageFiles, batchSize, app_info, requestId, imageInputId++, imageInputCount);
} else if (precision == InferenceEngine::Precision::FP16) {
fillBlobImage<short>(inputBlob, imageFiles, batchSize, app_info, requestId, imageInputId++, imageInputCount);
} else if (precision == InferenceEngine::Precision::I32) {
fillBlobImage<int32_t>(inputBlob, imageFiles, batchSize, app_info, requestId, imageInputId++, imageInputCount);
} else if (precision == InferenceEngine::Precision::I64) {
fillBlobImage<int64_t>(inputBlob, imageFiles, batchSize, app_info, requestId, imageInputId++, imageInputCount);
} else if (precision == InferenceEngine::Precision::U8) {
fillBlobImage<uint8_t>(inputBlob, imageFiles, batchSize, app_info, requestId, imageInputId++, imageInputCount);
} else {
IE_THROW() << "Input precision is not supported for " << item.first;
}
continue;
}
} else {

View File

@ -4,8 +4,8 @@
#include <algorithm>
#include <chrono>
#include <cldnn/cldnn_config.hpp>
#include <gna/gna_config.hpp>
#include <gpu/gpu_config.hpp>
#include <inference_engine.hpp>
#include <map>
#include <memory>
@ -282,7 +282,7 @@ int main(int argc, char* argv[]) {
<< "which releases another CPU thread (that is otherwise "
"used by the GPU driver for active polling)"
<< slog::endl;
device_config[CLDNN_CONFIG_KEY(PLUGIN_THROTTLE)] = "1";
device_config[GPU_CONFIG_KEY(PLUGIN_THROTTLE)] = "1";
}
} else if (device == "MYRIAD") {
device_config[CONFIG_KEY(LOG_LEVEL)] = CONFIG_VALUE(LOG_WARNING);

View File

@ -63,20 +63,20 @@ Available devices:
SUPPORTED_METRICS : [ AVAILABLE_DEVICES SUPPORTED_METRICS FULL_DEVICE_NAME OPTIMIZATION_CAPABILITIES SUPPORTED_CONFIG_KEYS RANGE_FOR_ASYNC_INFER_REQUESTS RANGE_FOR_STREAMS ]
FULL_DEVICE_NAME : Intel(R) UHD Graphics 620 (iGPU)
OPTIMIZATION_CAPABILITIES : [ FP32 BIN FP16 ]
SUPPORTED_CONFIG_KEYS : [ CACHE_DIR CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS CLDNN_GRAPH_DUMPS_DIR CLDNN_MAX_NUM_THREADS CLDNN_MEM_POOL CLDNN_NV12_TWO_INPUTS CLDNN_PLUGIN_PRIORITY CLDNN_PLUGIN_THROTTLE CLDNN_SOURCES_DUMPS_DIR CLDNN_ENABLE_LOOP_UNROLLING CONFIG_FILE DEVICE_ID DUMP_KERNELS DYN_BATCH_ENABLED EXCLUSIVE_ASYNC_REQUESTS GPU_THROUGHPUT_STREAMS PERF_COUNT TUNING_FILE TUNING_MODE ]
SUPPORTED_CONFIG_KEYS : [ CACHE_DIR CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS CLDNN_GRAPH_DUMPS_DIR GPU_MAX_NUM_THREADS CLDNN_MEM_POOL CLDNN_NV12_TWO_INPUTS CLDNN_PLUGIN_PRIORITY CLDNN_PLUGIN_THROTTLE CLDNN_SOURCES_DUMPS_DIR GPU_ENABLE_LOOP_UNROLLING CONFIG_FILE DEVICE_ID DUMP_KERNELS DYN_BATCH_ENABLED EXCLUSIVE_ASYNC_REQUESTS GPU_THROUGHPUT_STREAMS PERF_COUNT TUNING_FILE TUNING_MODE ]
RANGE_FOR_ASYNC_INFER_REQUESTS : { 1, 2, 1 }
RANGE_FOR_STREAMS : { 1, 2 }
Default values for device configuration keys:
CACHE_DIR : ""
CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS : YES
CLDNN_GRAPH_DUMPS_DIR : ""
CLDNN_MAX_NUM_THREADS : 8
CLDNN_MEM_POOL : YES
CLDNN_NV12_TWO_INPUTS : NO
CLDNN_PLUGIN_PRIORITY : 0
CLDNN_PLUGIN_THROTTLE : 0
CLDNN_SOURCES_DUMPS_DIR : ""
CLDNN_ENABLE_LOOP_UNROLLING : YES
GPU_MAX_NUM_THREADS : 8
GPU_ENABLE_LOOP_UNROLLING : YES
CONFIG_FILE : ""
DEVICE_ID : ""
DUMP_KERNELS : NO

View File

@ -1,6 +1,6 @@
# nGraph Function Creation C++ Sample {#openvino_inference_engine_samples_ngraph_function_creation_sample_README}
This sample demonstrates how to execute an synchronous inference using [nGraph function feature](../../../docs/nGraph_DG/build_function.md) to create a network, which uses weights from LeNet classification network.
This sample demonstrates how to execute a synchronous inference using the [nGraph function feature](../../../docs/nGraph_DG/build_function.md) to create a network that uses weights from the LeNet classification network, which is known to work well on digit classification tasks.
The sample supports only single-channel `ubyte` images as an input.

View File

@ -108,15 +108,18 @@ void NumpyFile::GetFileInfo(const char* fileName, uint32_t numArrayToFindSize, u
cnpy::npz_t my_npz1 = cnpy::npz_load(fileName);
auto it = my_npz1.begin();
std::advance(it, numArrayToFindSize);
if (it != my_npz1.end()) {
numArrays = my_npz1.size();
cnpy::NpyArray my_npy = it->second;
numMemoryBytes = my_npy.data_holder->size();
numArrays = my_npz1.size();
cnpy::NpyArray my_npy = it->second;
numMemoryBytes = my_npy.data_holder->size();
if (ptrNumArrays != NULL)
*ptrNumArrays = numArrays;
if (ptrNumMemoryBytes != NULL)
*ptrNumMemoryBytes = numMemoryBytes;
if (ptrNumArrays != NULL)
*ptrNumArrays = numArrays;
if (ptrNumMemoryBytes != NULL)
*ptrNumMemoryBytes = numMemoryBytes;
} else {
throw std::runtime_error(std::string("Failed to get info %s GetFileInfo()!\n") + fileName);
}
}
void NumpyFile::LoadFile(const char* fileName, uint32_t arrayIndex, std::string& ptrName, std::vector<uint8_t>& memory, uint32_t* ptrNumRows,
@ -124,16 +127,20 @@ void NumpyFile::LoadFile(const char* fileName, uint32_t arrayIndex, std::string&
cnpy::npz_t my_npz1 = cnpy::npz_load(fileName);
auto it = my_npz1.begin();
std::advance(it, arrayIndex);
ptrName = it->first;
cnpy::NpyArray my_npy = it->second;
*ptrNumRows = my_npy.shape[0];
*ptrNumColumns = my_npy.shape[1];
if (it != my_npz1.end()) {
ptrName = it->first;
cnpy::NpyArray my_npy = it->second;
*ptrNumRows = my_npy.shape[0];
*ptrNumColumns = my_npy.shape[1];
for (size_t i = 0; i < my_npy.data_holder->size(); i++) {
memory.at(i) = my_npy.data_holder->at(i);
for (size_t i = 0; i < my_npy.data_holder->size(); i++) {
memory.at(i) = my_npy.data_holder->at(i);
}
*ptrNumBytesPerElement = sizeof(float);
} else {
throw std::runtime_error(std::string("Failed to open %s for reading in LoadFile()!\n") + fileName);
}
*ptrNumBytesPerElement = sizeof(float);
}
void NumpyFile::SaveFile(const char* fileName, bool shouldAppend, std::string name, void* ptrMemory, uint32_t numRows, uint32_t numColumns) {

View File

@ -3,10 +3,8 @@
//
#include <string>
#include <vector>
#include <memory>
#include <map>
#include <unordered_map>
#include "ie_metric_helpers.hpp"
#include "auto_exec_network.hpp"
@ -15,8 +13,8 @@
namespace AutoPlugin {
using namespace InferenceEngine;
AutoExecutableNetwork::AutoExecutableNetwork(const SoExecutableNetworkInternal& network) :
_network(network) {
AutoExecutableNetwork::AutoExecutableNetwork(const SoExecutableNetworkInternal& network, bool enablePerfCount) :
_network(network), _enablePerfCount(enablePerfCount) {
}
AutoExecutableNetwork::~AutoExecutableNetwork() = default;
@ -24,7 +22,7 @@ AutoExecutableNetwork::~AutoExecutableNetwork() = default;
InferenceEngine::IInferRequestInternal::Ptr AutoExecutableNetwork::CreateInferRequestImpl(InputsDataMap networkInputs,
OutputsDataMap networkOutputs) {
SoIInferRequestInternal inferRequest = {_network, _network->CreateInferRequest()};
return std::make_shared<AutoInferRequest>(_networkInputs, _networkOutputs, inferRequest);
return std::make_shared<AutoInferRequest>(_networkInputs, _networkOutputs, inferRequest, _enablePerfCount);
}
void AutoExecutableNetwork::Export(std::ostream& networkModel) {

View File

@ -19,16 +19,11 @@ namespace AutoPlugin {
using DeviceName = std::string;
struct DeviceInformation {
DeviceName deviceName;
std::map<std::string, std::string> config;
};
class AutoExecutableNetwork : public InferenceEngine::IExecutableNetworkInternal {
public:
using Ptr = std::shared_ptr<AutoExecutableNetwork>;
explicit AutoExecutableNetwork(const InferenceEngine::SoExecutableNetworkInternal& network);
explicit AutoExecutableNetwork(const InferenceEngine::SoExecutableNetworkInternal& network, bool enablePerfCount);
void Export(std::ostream& networkModel) override;
InferenceEngine::RemoteContext::Ptr GetContext() const override;
@ -43,6 +38,7 @@ public:
private:
InferenceEngine::SoExecutableNetworkInternal _network;
bool _enablePerfCount;
};
} // namespace AutoPlugin

View File

@ -11,13 +11,23 @@ namespace AutoPlugin {
AutoInferRequest::AutoInferRequest(const InputsDataMap& networkInputs,
const OutputsDataMap& networkOutputs,
const SoIInferRequestInternal& inferRequest)
const SoIInferRequestInternal& inferRequest,
bool enablePerfCount)
: IInferRequestInternal(networkInputs, networkOutputs)
, _inferRequest(inferRequest) {
, _inferRequest(inferRequest)
, _enablePerfCount(enablePerfCount) {
}
std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> AutoInferRequest::GetPerformanceCounts() const {
return _inferRequest->GetPerformanceCounts();
if (_enablePerfCount) {
try {
return _inferRequest->GetPerformanceCounts();
} catch (...) {
return {};
}
} else {
return {};
}
}
void AutoInferRequest::InferImpl() {

View File

@ -24,7 +24,8 @@ public:
using Ptr = std::shared_ptr<AutoInferRequest>;
explicit AutoInferRequest(const InferenceEngine::InputsDataMap& networkInputs,
const InferenceEngine::OutputsDataMap& networkOutputs,
const InferenceEngine::SoIInferRequestInternal& inferRequest);
const InferenceEngine::SoIInferRequestInternal& inferRequest,
bool enablePerfCount);
std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> GetPerformanceCounts() const override;
void InferImpl() override;
void SetBlob(const std::string& name, const InferenceEngine::Blob::Ptr& data) override;
@ -37,6 +38,7 @@ public:
private:
InferenceEngine::SoIInferRequestInternal _inferRequest;
bool _enablePerfCount;
};
} // namespace AutoPlugin

View File

@ -75,11 +75,11 @@ IE::QueryNetworkResult AutoInferencePlugin::QueryNetwork(const IE::CNNNetwork& n
}
auto fullConfig = mergeConfigs(_config, config);
auto metaDevices = GetDeviceChoice(fullConfig);
auto metaDevices = GetDeviceList(fullConfig);
std::unordered_set<std::string> supportedLayers;
for (auto&& value : metaDevices) {
try {
auto deviceQr = GetCore()->QueryNetwork(network, value.deviceName, value.config);
auto deviceQr = GetCore()->QueryNetwork(network, value, {});
std::unordered_set<std::string> deviceSupportedLayers;
for (auto &&layerQr : deviceQr.supportedLayersMap) {
deviceSupportedLayers.emplace(layerQr.first);
@ -111,7 +111,19 @@ IE::Parameter AutoInferencePlugin::GetConfig(const std::string& name,
void AutoInferencePlugin::SetConfig(const ConfigType& config) {
for (auto && kvp : config) {
_config[kvp.first] = kvp.second;
if (kvp.first.find("AUTO_") == 0) {
_config[kvp.first] = kvp.second;
} else if (kvp.first == IE::PluginConfigParams::KEY_PERF_COUNT) {
if (kvp.second == IE::PluginConfigParams::YES ||
kvp.second == IE::PluginConfigParams::NO) {
_config[kvp.first] = kvp.second;
} else {
IE_THROW() << "Unsupported config value: " << kvp.second
<< " for key: " << kvp.first;
}
} else {
IE_THROW() << "Unsupported config key: " << kvp.first;
}
}
}
@ -128,7 +140,10 @@ IE::Parameter AutoInferencePlugin::GetMetric(const std::string& name,
std::string device_name = {"Inference Engine AUTO device"};
IE_SET_METRIC_RETURN(FULL_DEVICE_NAME, device_name);
} else if (name == METRIC_KEY(SUPPORTED_CONFIG_KEYS)) {
std::vector<std::string> configKeys;
std::vector<std::string> configKeys = {
IE::KEY_AUTO_DEVICE_LIST,
IE::PluginConfigParams::KEY_PERF_COUNT
};
IE_SET_METRIC_RETURN(SUPPORTED_CONFIG_KEYS, configKeys);
} else if (name == METRIC_KEY(OPTIMIZATION_CAPABILITIES)) {
std::vector<std::string> capabilities = GetOptimizationCapabilities(options);
@ -139,42 +154,21 @@ IE::Parameter AutoInferencePlugin::GetMetric(const std::string& name,
}
//////////////////////////////////// private & protected functions ///////////////////
std::vector<AutoPlugin::DeviceInformation> AutoInferencePlugin::GetDeviceChoice(const ConfigType& config) const {
std::vector<DeviceInformation> metaDevices;
std::vector<std::string> availableDevices;
std::vector<DeviceName> AutoInferencePlugin::GetDeviceList(const ConfigType& config) const {
std::vector<DeviceName> deviceList;
auto deviceListConfig = config.find(IE::KEY_AUTO_DEVICE_LIST);
if (deviceListConfig == config.end()) {
availableDevices = GetCore()->GetAvailableDevices();
deviceList = GetCore()->GetAvailableDevices();
} else {
availableDevices = IE::DeviceIDParser::getHeteroDevices(deviceListConfig->second);
deviceList = IE::DeviceIDParser::getHeteroDevices(deviceListConfig->second);
}
auto getDeviceConfig = [&] (const DeviceName & deviceWithID) {
IE::DeviceIDParser deviceParser(deviceWithID);
std::string deviceName = deviceParser.getDeviceName();
ConfigType tconfig = config;
// set device ID if any
std::string deviceIDLocal = deviceParser.getDeviceID();
if (!deviceIDLocal.empty()) {
tconfig[IE::PluginConfigParams::KEY_DEVICE_ID] = deviceIDLocal;
}
return GetSupportedConfig(tconfig, deviceName);
};
for (auto && d : availableDevices) {
if (d != _pluginName) {
metaDevices.push_back({ d, getDeviceConfig(d)});
}
}
if (metaDevices.empty()) {
if (deviceList.empty()) {
IE_THROW() << "Please, check environment due to no supported devices can be used";
}
return metaDevices;
return deviceList;
}
std::vector<std::string> AutoInferencePlugin::GetOptimizationCapabilities(const std::map<std::string, IE::Parameter> & options) const {
@ -215,7 +209,21 @@ ConfigType AutoInferencePlugin::GetSupportedConfig(const ConfigType& config,
return supportedConfig;
}
DeviceInformation AutoInferencePlugin::SelectDevice(const std::vector<DeviceInformation>& metaDevices, const std::string& networkPrecision) {
void AutoInferencePlugin::CheckConfig(const ConfigType& config) {
std::vector<std::string> supportedConfigKeys = GetMetric(METRIC_KEY(SUPPORTED_CONFIG_KEYS), {});
for (auto&& c : config) {
auto itKey = std::find(supportedConfigKeys.begin(), supportedConfigKeys.end(), c.first);
if (supportedConfigKeys.end() == itKey) {
// CVS-57233
if (c.first.find("AUTO_") == 0) {
continue;
}
IE_THROW() << "AUTO plugin doesn't support config key " << c.first;
}
}
}
DeviceName AutoInferencePlugin::SelectDevice(const std::vector<DeviceName>& metaDevices, const std::string& networkPrecision) {
if (metaDevices.empty()) {
IE_THROW(NotFound) << "No available device to select in AUTO plugin";
}
@ -223,15 +231,15 @@ DeviceInformation AutoInferencePlugin::SelectDevice(const std::vector<DeviceInfo
return metaDevices.at(0);
}
std::vector<DeviceInformation> CPU;
std::vector<DeviceInformation> GPU;
std::vector<DeviceName> CPU;
std::vector<DeviceName> GPU;
for (auto& item : metaDevices) {
if (item.deviceName.find("CPU") == 0) {
if (item.find("CPU") == 0) {
CPU.push_back(item);
continue;
}
if (item.deviceName.find("GPU") == 0) {
if (item.find("GPU") == 0) {
GPU.push_back(item);
continue;
}
@ -242,10 +250,10 @@ DeviceInformation AutoInferencePlugin::SelectDevice(const std::vector<DeviceInfo
}
// Sort GPU by name: GPU.2 > GPU.1 > GPU.0 > GPU, so we always choose the GPU[0] as best device
std::sort(GPU.begin(), GPU.end(), [](const DeviceInformation& a, const DeviceInformation& b)->bool{return b.deviceName < a.deviceName;});
std::sort(GPU.begin(), GPU.end(), [](const DeviceName& a, const DeviceName& b)->bool{return b < a;});
for (auto&& item : GPU) {
std::vector<std::string> capability = GetCore()->GetMetric(item.deviceName, METRIC_KEY(OPTIMIZATION_CAPABILITIES));
std::vector<std::string> capability = GetCore()->GetMetric(item, METRIC_KEY(OPTIMIZATION_CAPABILITIES));
auto res = std::find(capability.begin(), capability.end(), networkPrecision);
if (res != capability.end()) {
return item;
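
As an illustrative sketch only (the literal string "AUTO_DEVICE_LIST" is assumed to be the value of IE::KEY_AUTO_DEVICE_LIST, and the model path is a placeholder), the reworked AUTO plugin now accepts exactly the keys it reports through SUPPORTED_CONFIG_KEYS:

#include <inference_engine.hpp>
#include <map>
#include <string>

int main() {
    InferenceEngine::Core core;
    auto network = core.ReadNetwork("model.xml");  // placeholder model
    std::map<std::string, std::string> cfg = {
        {"AUTO_DEVICE_LIST", "CPU,GPU"},                                     // assumed string form of IE::KEY_AUTO_DEVICE_LIST
        {CONFIG_KEY(PERF_COUNT), InferenceEngine::PluginConfigParams::YES}   // forwards GetPerformanceCounts from the selected device
    };
    auto exeNet = core.LoadNetwork(network, "AUTO", cfg);
    return 0;
}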

View File

@ -30,10 +30,11 @@ public:
void SetConfig(const ConfigType& config) override;
private:
std::vector<AutoPlugin::DeviceInformation> GetDeviceChoice(const ConfigType& config) const;
std::vector<DeviceName> GetDeviceList(const ConfigType& config) const;
std::vector<std::string> GetOptimizationCapabilities(const std::map<std::string, IE::Parameter>& options) const;
DeviceInformation SelectDevice(const std::vector<DeviceInformation>& metaDevices, const std::string& networkPrecision = METRIC_VALUE(FP32));
ConfigType GetSupportedConfig(const ConfigType& config, const AutoPlugin::DeviceName & deviceName) const;
DeviceName SelectDevice(const std::vector<DeviceName>& metaDevices, const std::string& networkPrecision = METRIC_VALUE(FP32));
ConfigType GetSupportedConfig(const ConfigType& config, const DeviceName & deviceName) const;
void CheckConfig(const ConfigType& config);
static ConfigType mergeConfigs(ConfigType config, const ConfigType& local);
template <typename T>
@ -41,18 +42,21 @@ private:
if (GetCore() == nullptr) {
IE_THROW() << "Please, work with AUTO device via InferencEngine::Core object";
}
CheckConfig(config);
auto fullConfig = mergeConfigs(_config, config);
auto metaDevices = GetDeviceChoice(fullConfig);
DeviceInformation selectedDevice;
auto metaDevices = GetDeviceList(fullConfig);
DeviceName selectedDevice;
IE::SoExecutableNetworkInternal executableNetwork;
while (!metaDevices.empty()) {
selectedDevice = SelectDevice(metaDevices, networkPrecision);
try {
executableNetwork = GetCore()->LoadNetwork(param, selectedDevice.deviceName, selectedDevice.config);
executableNetwork = GetCore()->LoadNetwork(param, selectedDevice, {});
break;
} catch (...) {
auto eraseDevice = std::find_if(metaDevices.begin(), metaDevices.end(),
[=](const DeviceInformation& d)->bool{return d.deviceName == selectedDevice.deviceName;});
[=](const DeviceName& d)->bool{return d == selectedDevice;});
if (eraseDevice == metaDevices.end()) {
IE_THROW() << "Didn't find the selected device name";
}
@ -63,7 +67,10 @@ private:
if (!executableNetwork) {
IE_THROW() << "Failed to load network by AUTO plugin";
}
auto impl = std::make_shared<AutoExecutableNetwork>(executableNetwork);
bool enablePerfCount = fullConfig.find(IE::PluginConfigParams::KEY_PERF_COUNT) != fullConfig.end();
auto impl = std::make_shared<AutoExecutableNetwork>(executableNetwork, enablePerfCount);
if (std::is_same<std::string, T>::value) {
SetExeNetworkInfo(impl, executableNetwork->GetInputsInfo(),

View File

@ -5,6 +5,7 @@
#include <sys/stat.h>
#include <cldnn/cldnn_config.hpp>
#include <gpu/gpu_config.hpp>
#include "cldnn_config.h"
#include "cpp_interfaces/interface/ie_internal_plugin_config.hpp"
#include "ie_api.h"
@ -39,6 +40,7 @@ static void createDirectory(std::string _path) {
}
}
IE_SUPPRESS_DEPRECATED_START
void Config::UpdateFromMap(const std::map<std::string, std::string>& configMap) {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "Config::UpdateFromMap");
for (auto& kvp : configMap) {
@ -69,7 +71,8 @@ void Config::UpdateFromMap(const std::map<std::string, std::string>& configMap)
} else {
IE_THROW(NotFound) << "Unsupported property value by plugin: " << val;
}
} else if (key.compare(CLDNNConfigParams::KEY_CLDNN_PLUGIN_PRIORITY) == 0) {
} else if (key.compare(GPUConfigParams::KEY_GPU_PLUGIN_PRIORITY) == 0 ||
key.compare(CLDNNConfigParams::KEY_CLDNN_PLUGIN_PRIORITY) == 0) {
std::stringstream ss(val);
uint32_t uVal(0);
ss >> uVal;
@ -93,7 +96,8 @@ void Config::UpdateFromMap(const std::map<std::string, std::string>& configMap)
IE_THROW(ParameterMismatch) << "Unsupported queue priority value: " << uVal;
}
} else if (key.compare(CLDNNConfigParams::KEY_CLDNN_PLUGIN_THROTTLE) == 0) {
} else if (key.compare(GPUConfigParams::KEY_GPU_PLUGIN_THROTTLE) == 0 ||
key.compare(CLDNNConfigParams::KEY_CLDNN_PLUGIN_THROTTLE) == 0) {
std::stringstream ss(val);
uint32_t uVal(0);
ss >> uVal;
@ -205,7 +209,8 @@ void Config::UpdateFromMap(const std::map<std::string, std::string>& configMap)
} else {
IE_THROW(NotFound) << "Unsupported property value by plugin: " << val;
}
} else if (key.compare(CLDNNConfigParams::KEY_CLDNN_NV12_TWO_INPUTS) == 0) {
} else if (key.compare(GPUConfigParams::KEY_GPU_NV12_TWO_INPUTS) == 0 ||
key.compare(CLDNNConfigParams::KEY_CLDNN_NV12_TWO_INPUTS) == 0) {
if (val.compare(PluginConfigParams::YES) == 0) {
nv12_two_inputs = true;
} else if (val.compare(PluginConfigParams::NO) == 0) {
@ -221,7 +226,7 @@ void Config::UpdateFromMap(const std::map<std::string, std::string>& configMap)
} else {
IE_THROW(NotFound) << "Unsupported KEY_CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS flag value: " << val;
}
} else if (key.compare(CLDNNConfigParams::KEY_CLDNN_MAX_NUM_THREADS) == 0) {
} else if (key.compare(GPUConfigParams::KEY_GPU_MAX_NUM_THREADS) == 0) {
int max_threads = std::max(1, static_cast<int>(std::thread::hardware_concurrency()));
try {
int val_i = std::stoi(val);
@ -231,17 +236,17 @@ void Config::UpdateFromMap(const std::map<std::string, std::string>& configMap)
n_threads = val_i;
}
} catch (const std::exception&) {
IE_THROW() << "Wrong value for property key " << CLDNNConfigParams::KEY_CLDNN_MAX_NUM_THREADS << ": " << val
IE_THROW() << "Wrong value for property key " << GPUConfigParams::KEY_GPU_MAX_NUM_THREADS << ": " << val
<< "\nSpecify the number of threads use for build as an integer."
<< "\nOut of range value will be set as a default value, maximum concurrent threads.";
}
} else if (key.compare(CLDNNConfigParams::KEY_CLDNN_ENABLE_LOOP_UNROLLING) == 0) {
} else if (key.compare(GPUConfigParams::KEY_GPU_ENABLE_LOOP_UNROLLING) == 0) {
if (val.compare(PluginConfigParams::YES) == 0) {
enable_loop_unrolling = true;
} else if (val.compare(PluginConfigParams::NO) == 0) {
enable_loop_unrolling = false;
} else {
IE_THROW(ParameterMismatch) << "Unsupported KEY_CLDNN_ENABLE_LOOP_UNROLLING flag value: " << val;
IE_THROW(ParameterMismatch) << "Unsupported KEY_GPU_ENABLE_LOOP_UNROLLING flag value: " << val;
}
} else {
IE_THROW(NotFound) << "Unsupported property key by plugin: " << key;
@ -297,6 +302,7 @@ void Config::adjustKeyMapValues() {
default: break;
}
key_config_map[CLDNNConfigParams::KEY_CLDNN_PLUGIN_PRIORITY] = qp;
key_config_map[GPUConfigParams::KEY_GPU_PLUGIN_PRIORITY] = qp;
}
{
std::string qt = "0";
@ -307,6 +313,7 @@ void Config::adjustKeyMapValues() {
default: break;
}
key_config_map[CLDNNConfigParams::KEY_CLDNN_PLUGIN_THROTTLE] = qt;
key_config_map[GPUConfigParams::KEY_GPU_PLUGIN_THROTTLE] = qt;
}
{
std::string tm = PluginConfigParams::TUNING_DISABLED;
@ -328,11 +335,13 @@ void Config::adjustKeyMapValues() {
key_config_map[PluginConfigParams::KEY_GPU_THROUGHPUT_STREAMS] = std::to_string(throughput_streams);
key_config_map[PluginConfigParams::KEY_DEVICE_ID] = device_id;
key_config_map[PluginConfigParams::KEY_CONFIG_FILE] = "";
key_config_map[CLDNNConfigParams::KEY_CLDNN_MAX_NUM_THREADS] = std::to_string(n_threads);
key_config_map[GPUConfigParams::KEY_GPU_MAX_NUM_THREADS] = std::to_string(n_threads);
if (enable_loop_unrolling)
key_config_map[CLDNNConfigParams::KEY_CLDNN_ENABLE_LOOP_UNROLLING] = PluginConfigParams::YES;
key_config_map[GPUConfigParams::KEY_GPU_ENABLE_LOOP_UNROLLING] = PluginConfigParams::YES;
else
key_config_map[CLDNNConfigParams::KEY_CLDNN_ENABLE_LOOP_UNROLLING] = PluginConfigParams::NO;
key_config_map[GPUConfigParams::KEY_GPU_ENABLE_LOOP_UNROLLING] = PluginConfigParams::NO;
}
IE_SUPPRESS_DEPRECATED_END
} // namespace CLDNNPlugin

View File

@ -79,7 +79,7 @@
#include "cldnn_executable_network.h"
#include "cldnn_custom_layer.h"
#include "cldnn_itt.h"
#include "cldnn/cldnn_config.hpp"
#include "gpu/gpu_config.hpp"
#ifdef __linux__
# include <dlfcn.h>

View File

@ -16,7 +16,6 @@
#include "cldnn_itt.h"
#include <description_buffer.hpp>
#include <cldnn/cldnn_config.hpp>
#include "cldnn_infer_request.h"
#include <threading/ie_executor_manager.hpp>
#include "cldnn_async_infer_request.h"

View File

@ -16,7 +16,6 @@
#include "cldnn_graph.h"
#include "simple_math.h"
#include <description_buffer.hpp>
#include <cldnn/cldnn_config.hpp>
#include "cldnn_infer_request.h"
#include <threading/ie_executor_manager.hpp>
#include <fstream>

View File

@ -18,6 +18,7 @@
#include "api/reduce.hpp"
#include "api/reorder.hpp"
#include "api/reshape.hpp"
namespace CLDNNPlugin {
@ -78,6 +79,28 @@ void CreateReduceOp(Program& p, const std::shared_ptr<ngraph::Node>& op, cldnn::
p.AddPrimitive(reducePrim);
auto resultLayerName = layerName;
auto out_dims = op->get_output_shape(0).size();
if (out_dims == 3 && !keep_dims && rank >= 4) {
resultLayerName = layerName + "_reshape";
auto out_shape = op->get_output_shape(0);
cldnn::tensor outTensor;
switch (rank) {
case 6:
outTensor = cldnn::tensor(TensorValue(out_shape[0]), TensorValue(out_shape[1]),
1, TensorValue(out_shape[2]), 1, 1);
break;
case 5:
outTensor = cldnn::tensor(TensorValue(out_shape[0]), TensorValue(out_shape[1]),
1, TensorValue(out_shape[2]), 1);
break;
case 4:
outTensor = cldnn::tensor(TensorValue(out_shape[0]), TensorValue(out_shape[1]),
1, TensorValue(out_shape[2]));
break;
}
auto reshape_prim = cldnn::reshape(resultLayerName, layerName, outTensor);
p.AddPrimitive(reshape_prim);
p.AddPrimitiveToProfiler(op, resultLayerName);
}
auto reorderLayerName = layerName + "_reorder";
cldnn::format out_format = cldnn::format::any;
auto out_dt = DataTypeFromPrecision(op->get_output_element_type(0));
@ -89,7 +112,7 @@ void CreateReduceOp(Program& p, const std::shared_ptr<ngraph::Node>& op, cldnn::
else if (rank - rawAxes.size() <= 4)
out_format = cldnn::format::bfyx;
auto reorder_prim = cldnn::reorder(reorderLayerName, layerName, out_format, out_dt);
auto reorder_prim = cldnn::reorder(reorderLayerName, resultLayerName, out_format, out_dt);
p.AddPrimitive(reorder_prim);
p.AddPrimitiveToProfiler(op, reorderLayerName);
} else {

View File

@ -550,7 +550,7 @@ HeteroExecutableNetwork::HeteroExecutableNetwork(std::istream&
this->SetPointerToPlugin(_heteroPlugin->shared_from_this());
}
void HeteroExecutableNetwork::ExportImpl(std::ostream& heteroModel) {
void HeteroExecutableNetwork::Export(std::ostream& heteroModel) {
pugi::xml_document doc;
auto heteroNode = doc.append_child("hetero");
heteroNode.append_attribute("name").set_value(_name.c_str());

View File

@ -56,7 +56,7 @@ public:
InferenceEngine::Parameter GetMetric(const std::string &name) const override;
void ExportImpl(std::ostream& modelFile) override;
void Export(std::ostream& modelFile) override;
private:
void InitCNNImpl(const InferenceEngine::CNNNetwork& network);

View File

@ -57,13 +57,8 @@ InferenceEngine::IExecutableNetworkInternal::Ptr Engine::LoadExeNetworkImpl(cons
return std::make_shared<HeteroExecutableNetwork>(network, mergeConfigs(_config, config), this);
}
InferenceEngine::IExecutableNetworkInternal::Ptr Engine::ImportNetworkImpl(std::istream& heteroModel, const Configs& config) {
if (GetCore() == nullptr) {
IE_THROW() << "Please, work with HETERO device via InferencEngine::Core object";
}
return std::make_shared<HeteroExecutableNetwork>(heteroModel,
mergeConfigs(_config, config), this);
InferenceEngine::IExecutableNetworkInternal::Ptr Engine::ImportNetwork(std::istream& heteroModel, const std::map<std::string, std::string>& config) {
return std::make_shared<HeteroExecutableNetwork>(heteroModel, mergeConfigs(_config, config), this);
}
Engine::Configs Engine::GetSupportedConfig(const Engine::Configs& config, const std::string & deviceName) const {

View File

@ -37,10 +37,11 @@ public:
InferenceEngine::Parameter GetConfig(const std::string& name, const std::map<std::string,
InferenceEngine::Parameter> & options) const override;
InferenceEngine::IExecutableNetworkInternal::Ptr ImportNetworkImpl(std::istream& heteroModel, const Configs& config) override;
InferenceEngine::IExecutableNetworkInternal::Ptr
ImportNetwork(std::istream& heteroModel, const std::map<std::string, std::string>& config) override;
DeviceMetaInformationMap GetDevicePlugins(const std::string& targetFallback,
const Configs & localConfig) const;
const Configs & localConfig) const;
private:
Configs GetSupportedConfig(const Configs& config, const std::string & deviceName) const;

View File

@ -49,19 +49,17 @@ std::shared_ptr<IInferRequestInternal> IExecutableNetworkInternal::CreateInferRe
}
void IExecutableNetworkInternal::Export(const std::string& modelFileName) {
// we need to write to stringstream first
// because in case of exception in ExportImpl the file is not created
std::stringstream strm;
ExportImpl(strm);
std::ofstream(modelFileName.c_str()) << strm.rdbuf();
std::ofstream modelFile(modelFileName, std::ios::out | std::ios::binary);
if (modelFile.is_open()) {
Export(modelFile);
} else {
IE_THROW() << "The " << modelFileName << " file can not be opened for Export";
}
}
void IExecutableNetworkInternal::Export(std::ostream& networkModel) {
std::stringstream strm;
strm.write(exportMagic.data(), exportMagic.size());
strm << _plugin->GetName() << std::endl;
ExportImpl(strm);
networkModel << strm.rdbuf();
IE_THROW(NotImplemented);
}
CNNNetwork IExecutableNetworkInternal::GetExecGraphInfo() {
@ -97,7 +95,4 @@ std::shared_ptr<IInferRequestInternal> IExecutableNetworkInternal::CreateInferRe
IE_THROW(NotImplemented);
}
void IExecutableNetworkInternal::ExportImpl(std::ostream&) {
IE_THROW(NotImplemented);
}
} // namespace InferenceEngine
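
For context, a hedged sketch of the public API under the new contract (model path, blob name, and device are placeholders, and the device must actually support export): Export(modelFileName) now opens the file in binary mode and forwards to Export(std::ostream&), which plugins override directly, so no magic/device-name prefix is written implicitly any more.

#include <inference_engine.hpp>
#include <fstream>

int main() {
    InferenceEngine::Core core;
    auto exeNet = core.LoadNetwork(core.ReadNetwork("model.xml"), "MYRIAD");  // placeholder model/device
    exeNet.Export("model.blob");                     // file variant: binary ofstream + Export(std::ostream&)
    std::ofstream out("model_copy.blob", std::ios::binary);
    exeNet.Export(out);                              // stream variant: plugin payload only, no header
    return 0;
}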

View File

@ -16,24 +16,12 @@
#include <blob_factory.hpp>
#include <istream>
#include <fstream>
#include <map>
#include <memory>
#include <string>
namespace InferenceEngine {
namespace {
void parsePluginName(std::istream& networkModel) {
ExportMagic magic = {};
auto currentPos = networkModel.tellg();
networkModel.read(magic.data(), magic.size());
auto exportedWithName = (exportMagic == magic);
if (exportedWithName) {
networkModel.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
} else {
networkModel.seekg(currentPos, networkModel.beg);
}
}
} // namespace
PreProcessInfo copyPreProcess(const PreProcessInfo& from) {
PreProcessInfo to = from;
@ -170,22 +158,26 @@ RemoteContext::Ptr IInferencePlugin::GetDefaultContext(const ParamMap&) {
IE_THROW(NotImplemented);
}
std::shared_ptr<IExecutableNetworkInternal> IInferencePlugin::ImportNetwork(const std::string&,
const std::map<std::string, std::string>&) {
IE_THROW(NotImplemented);
std::shared_ptr<IExecutableNetworkInternal> IInferencePlugin::ImportNetwork(const std::string& modelFileName,
const std::map<std::string, std::string>& config) {
std::ifstream blobFile(modelFileName, std::ios::binary);
if (!blobFile.is_open()) {
IE_THROW(NetworkNotRead);
}
return ImportNetwork(blobFile, config);
}
std::shared_ptr<IExecutableNetworkInternal> IInferencePlugin::ImportNetwork(std::istream& networkModel,
const std::map<std::string, std::string>& config) {
parsePluginName(networkModel);
return ImportNetworkImpl(networkModel, config);
IE_THROW(NotImplemented);
}
std::shared_ptr<IExecutableNetworkInternal> IInferencePlugin::ImportNetwork(std::istream& networkModel,
const std::shared_ptr<RemoteContext>& context,
const std::map<std::string, std::string>& config) {
parsePluginName(networkModel);
return ImportNetworkImpl(networkModel, context, config);
IE_THROW(NotImplemented);
}
void IInferencePlugin::SetCore(ICore* core) {
@ -213,17 +205,6 @@ std::shared_ptr<IExecutableNetworkInternal> IInferencePlugin::LoadExeNetworkImpl
IE_THROW(NotImplemented);
}
std::shared_ptr<IExecutableNetworkInternal> IInferencePlugin::ImportNetworkImpl(std::istream&,
const std::map<std::string, std::string>&) {
IE_THROW(NotImplemented);
}
std::shared_ptr<IExecutableNetworkInternal> IInferencePlugin::ImportNetworkImpl(std::istream&,
const std::shared_ptr<RemoteContext>&,
const std::map<std::string, std::string>&) {
IE_THROW(NotImplemented);
}
void IInferencePlugin::SetExeNetworkInfo(const std::shared_ptr<IExecutableNetworkInternal>& exeNetwork,
const ConstInputsDataMap& inputs,
const ConstOutputsDataMap& outputs) {

View File

@ -395,6 +395,7 @@ public:
opsetNames.insert("opset4");
opsetNames.insert("opset5");
opsetNames.insert("opset6");
opsetNames.insert("opset7");
}
~Impl() override = default;
@ -566,18 +567,6 @@ public:
SoExecutableNetworkInternal ImportNetwork(std::istream& networkModel, const std::string& deviceName,
const std::map<std::string, std::string>& config) override {
auto parsed = parseDeviceNameIntoConfig(deviceName, config);
if (parsed._deviceName.empty()) {
ExportMagic magic = {};
auto currentPos = networkModel.tellg();
networkModel.read(magic.data(), magic.size());
auto exportedWithName = (exportMagic == magic);
if (exportedWithName) {
std::getline(networkModel, parsed._deviceName);
}
networkModel.seekg(currentPos, networkModel.beg);
}
return GetCPPPluginByName(parsed._deviceName).ImportNetwork(networkModel, parsed._config);
}
@ -1022,18 +1011,6 @@ void Core::AddExtension(const IExtensionPtr& extension) {
ExecutableNetwork Core::ImportNetwork(const std::string& modelFileName, const std::string& deviceName,
const std::map<std::string, std::string>& config) {
OV_ITT_SCOPED_TASK(itt::domains::IE, "Core::ImportNetwork");
// TODO: remove once NotImplemented exception is deprecated and not used
if (deviceName.find("HETERO") == 0) {
IE_THROW() << "HETERO device does not support ImportNetwork";
}
if (deviceName.find("MULTI") == 0) {
IE_THROW() << "MULTI device does not support ImportNetwork";
}
if (deviceName.find("AUTO") == 0) {
IE_THROW() << "AUTO device does not support ImportNetwork";
}
auto parsed = parseDeviceNameIntoConfig(deviceName, config);
auto exec = _impl->GetCPPPluginByName(parsed._deviceName).ImportNetwork(modelFileName, parsed._config);
return { exec, exec };
@ -1041,10 +1018,33 @@ ExecutableNetwork Core::ImportNetwork(const std::string& modelFileName, const st
ExecutableNetwork Core::ImportNetwork(std::istream& networkModel, const std::string& deviceName,
const std::map<std::string, std::string>& config) {
OV_ITT_SCOPED_TASK(itt::domains::IE, "Core::ImportNetwork");
auto exec = _impl->ImportNetwork(networkModel, deviceName, config);
return { exec, exec };
}
ExecutableNetwork Core::ImportNetwork(std::istream& networkModel) {
OV_ITT_SCOPED_TASK(itt::domains::IE, "Core::ImportNetwork");
using ExportMagic = std::array<char, 4>;
constexpr static const ExportMagic exportMagic = {{0x1, 0xE, 0xE, 0x1}};
std::string deviceName;
ExportMagic magic = {};
auto currentPos = networkModel.tellg();
networkModel.read(magic.data(), magic.size());
if (exportMagic == magic) {
std::getline(networkModel, deviceName);
} else {
IE_THROW() << "Passed compiled stream does not contain device name. "
"Please, provide device name manually";
}
networkModel.seekg(currentPos, networkModel.beg);
auto exec = _impl->GetCPPPluginByName(deviceName).ImportNetwork(networkModel, {});
return { exec, exec };
}
ExecutableNetwork Core::ImportNetwork(std::istream& networkModel,
const RemoteContext::Ptr& context,
const std::map<std::string, std::string>& config) {
@ -1124,8 +1124,8 @@ Parameter Core::GetConfig(const std::string& deviceName, const std::string& name
IE_THROW()
<< "You can only GetConfig of the AUTO itself (without devices). "
"GetConfig is also possible for the individual devices before creating the AUTO on top.";
}
}
}
}
auto parsed = parseDeviceNameIntoConfig(deviceName);

View File

@ -11,6 +11,16 @@ using namespace InferenceEngine;
void InferenceEngine::LowLatency(InferenceEngine::CNNNetwork &network) {
auto function = network.getFunction();
ngraph::pass::Manager manager;
NGRAPH_SUPPRESS_DEPRECATED_START
manager.register_pass<ngraph::pass::LowLatency>();
NGRAPH_SUPPRESS_DEPRECATED_END
manager.run_passes(function);
}
void InferenceEngine::lowLatency2(InferenceEngine::CNNNetwork &network,
bool use_const_initializer) {
auto function = network.getFunction();
ngraph::pass::Manager manager;
manager.register_pass<ngraph::pass::LowLatency2>(use_const_initializer);
manager.run_passes(function);
}

View File

@ -244,6 +244,9 @@ CNNLayer::Ptr createSubGraphLayer(const std::shared_ptr<ngraph::Node>& layer) {
LayerParams params = {layer->get_friendly_name(), "TensorIterator",
details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::TensorIterator>(params);
if (res == nullptr) {
IE_THROW() << "Can't create TensorIterator";
}
res->body = body;
// Port map: outputs

View File

@ -9,7 +9,6 @@
#include <vector>
#include <ngraph/opsets/opset1.hpp>
#include <legacy/ngraph_ops/crop_ie.hpp>
#include <ngraph/rt_info.hpp>
@ -137,7 +136,6 @@ ngraph::pass::ConvertStridedSliceToCropMatcher::ConvertStridedSliceToCropMatcher
lb = std::min(static_cast<int64_t>(input_shape[input_shape_idx]), lb);
ub = std::min(static_cast<int64_t>(input_shape[input_shape_idx]), ub);
offset.emplace_back(lb);
// set default value for stride or use given value
int64_t stride = 1;
@ -153,6 +151,7 @@ ngraph::pass::ConvertStridedSliceToCropMatcher::ConvertStridedSliceToCropMatcher
ub = -1;
lb = std::min(lb, static_cast<int64_t>(input_shape[input_shape_idx]) - 1);
offset.emplace_back(lb);
lb -= 1; // we always get 1st element, so we need decrease range
if (ub <= lb)
dimension = (ub - lb) / stride + 1;
@ -160,12 +159,16 @@ ngraph::pass::ConvertStridedSliceToCropMatcher::ConvertStridedSliceToCropMatcher
// apply masks
if (begin_mask.count(axis))
lb = 0;
if (end_mask.count(axis))
offset.emplace_back(lb);
if (end_mask.count(axis)) {
ub = static_cast<int64_t>(input_shape[input_shape_idx]);
}
lb += 1; // we always get 1st element, so we need decrease range
if (ub >= lb)
if (ub >= lb) {
dimension = (ub - lb) / stride + 1;
}
}
dim.emplace_back(dimension);

View File

@ -43,19 +43,21 @@ bool ConcatTransformation::transform(TransformationContext& context, ngraph::pat
return false;
}
// precisions can be different
// The Concat operation precision is defined by:
// 1. the consumers after Concat
// 2. the FakeQuantize precisions without zero point
ngraph::Node& quantizationLayer = *subgraph.quantizationLayers[0];
std::shared_ptr<ngraph::opset1::FakeQuantize> fq = ngraph::as_type_ptr<ngraph::opset1::FakeQuantize>(quantizationLayer.shared_from_this());
if (!NetworkHelper::isQuantizeSupported(fq)) {
return false;
}
std::vector<element::Type> concatParentsChildrensPrecisions = precisionsOnActivations;
fillAvailablePrecisions(subgraph.quantizationLayers[0], concatParentsChildrensPrecisions);
if (concatParentsChildrensPrecisions.empty()) {
DataPrecision dataPrecision = getDataPrecision(fq, QuantizationDetails::getDetails(fq), false);
if (dataPrecision.precision == ngraph::element::undefined) {
return false;
}
std::vector<element::Type> concatChildrenPrecisions = precisionsOnActivations;
for (size_t i = 0; i < subgraph.quantizationLayers.size(); ++i) {
fq = ngraph::as_type_ptr<ngraph::opset1::FakeQuantize>(subgraph.quantizationLayers[i]);
if (fq == nullptr) {
@ -72,20 +74,28 @@ bool ConcatTransformation::transform(TransformationContext& context, ngraph::pat
if (quantizationDetails.inputHighValues.size() != 1ul) {
return false;
}
std::vector<element::Type> fqChildrensPrecisions = precisionsOnActivations;
fillAvailablePrecisions(subgraph.quantizationLayers[i], fqChildrensPrecisions);
concatParentsChildrensPrecisions = NetworkHelper::precisionIntersection(concatParentsChildrensPrecisions, fqChildrensPrecisions);
if (concatParentsChildrensPrecisions.empty()) {
// define concatenation operation consumers precisions
std::vector<element::Type> fqChildrenPrecisions = precisionsOnActivations;
fillAvailablePrecisions(subgraph.quantizationLayers[i], fqChildrenPrecisions);
concatChildrenPrecisions = NetworkHelper::precisionIntersection(concatChildrenPrecisions, fqChildrenPrecisions);
if (concatChildrenPrecisions.empty()) {
return false;
}
// define FakeQuantize precisions without zero point
const DataPrecision dataPrecision2 = getDataPrecision(subgraph.quantizationLayers[i]->shared_from_this(), quantizationDetails, false);
if (dataPrecision2.precision == ngraph::element::undefined) {
return false;
}
if (dataPrecision.precision != dataPrecision2.precision) {
dataPrecision = dataPrecision.precision.is_signed() ? dataPrecision : dataPrecision2;
}
}
DataPrecision dataPrecision;
if (std::find(concatParentsChildrensPrecisions.begin(), concatParentsChildrensPrecisions.end(), element::i8) != concatParentsChildrensPrecisions.end()) {
dataPrecision = DataPrecision(element::i8);
} else {
dataPrecision = DataPrecision(concatParentsChildrensPrecisions[0]);
if (std::find(concatChildrenPrecisions.begin(), concatChildrenPrecisions.end(), dataPrecision.precision) == concatChildrenPrecisions.end()) {
dataPrecision = DataPrecision(concatChildrenPrecisions[0]);
}
std::vector<QuantizationDetails> quantizationLayersDetails;

View File

@ -64,14 +64,23 @@ bool ConcatMultiChannelsTransformation::transform(TransformationContext& context
DataPrecision dataPrecision;
{
std::vector<element::Type> concatChildrenPrecisions = precisionsOnActivations;
for (auto quantizationLayer : subgraph.quantizationLayers) {
std::shared_ptr<ngraph::opset1::FakeQuantize> fq = ngraph::as_type_ptr<ngraph::opset1::FakeQuantize>(quantizationLayer->shared_from_this());
if (!NetworkHelper::isQuantizeSupported(fq)) {
return false;
}
const DataPrecision tmp = getDataPrecision(fq, QuantizationDetails::getDetails(fq), false);
// define concatenation operation consumers precisions
std::vector<element::Type> fqChildrenPrecisions = precisionsOnActivations;
fillAvailablePrecisions(quantizationLayer, fqChildrenPrecisions);
concatChildrenPrecisions = NetworkHelper::precisionIntersection(concatChildrenPrecisions, fqChildrenPrecisions);
if (concatChildrenPrecisions.empty()) {
return false;
}
// define FakeQuantize precisions without zero point
const DataPrecision tmp = getDataPrecision(fq, QuantizationDetails::getDetails(fq), false);
if (dataPrecision.precision == ngraph::element::undefined) {
dataPrecision = tmp;
continue;
@ -81,6 +90,10 @@ bool ConcatMultiChannelsTransformation::transform(TransformationContext& context
dataPrecision = tmp;
}
}
if (std::find(concatChildrenPrecisions.begin(), concatChildrenPrecisions.end(), dataPrecision.precision) == concatChildrenPrecisions.end()) {
dataPrecision = DataPrecision(concatChildrenPrecisions[0]);
}
}
for (size_t i = 0; i < subgraph.quantizationLayers.size(); ++i) {

View File

@ -4,9 +4,11 @@
#pragma once
#include <threading/ie_istreams_executor.hpp>
#include "utils/debug_capabilities.h"
#include <string>
#include <map>
#include <threading/ie_istreams_executor.hpp>
namespace MKLDNNPlugin {
@ -35,6 +37,10 @@ struct Config {
bool manualEnforceBF16 = false;
#endif
#ifdef CPU_DEBUG_CAPS
DebugCaps::Config debugCaps;
#endif
void readProperties(const std::map<std::string, std::string> &config);
void updateProperties();
std::map<std::string, std::string> _config;

View File

@ -32,7 +32,7 @@ bool MKLDNNEdge::isUseExternalMemory() const {
return externalMemoryPtr;
}
bool MKLDNNEdge::isDropped() {
bool MKLDNNEdge::isDropped() const {
bool not_in_parent = true;
bool not_in_child = true;
@ -124,6 +124,10 @@ void MKLDNNEdge::reuse(MKLDNNMemoryPtr ptr) {
status = Status::Allocated;
}
const InferenceEngine::TensorDesc& MKLDNNEdge::getInputDescRO() const {
return inputDesc;
}
InferenceEngine::TensorDesc MKLDNNEdge::getInputDesc() {
if (inputDesc.getLayout() == InferenceEngine::Layout::ANY) {
inputDesc = getSpecifiedInputDesc({});
@ -131,6 +135,10 @@ InferenceEngine::TensorDesc MKLDNNEdge::getInputDesc() {
return inputDesc;
}
const InferenceEngine::TensorDesc& MKLDNNEdge::getOutputDescRO() const {
return outputDesc;
}
InferenceEngine::TensorDesc MKLDNNEdge::getOutputDesc() {
if (outputDesc.getLayout() == InferenceEngine::Layout::ANY) {
outputDesc = getSpecifiedOutputDesc({});
@ -145,11 +153,11 @@ InferenceEngine::TensorDesc MKLDNNEdge::getDesc() {
return getInputDesc();
}
int MKLDNNEdge::getInputNum() {
int MKLDNNEdge::getInputNum() const {
return parent_port;
}
int MKLDNNEdge::getOutputNum() {
int MKLDNNEdge::getOutputNum() const {
return child_port;
}

View File

@ -61,11 +61,11 @@ public:
MKLDNNMemoryPtr& getMemoryPtr();
bool needReorder();
bool isDropped();
bool isDropped() const;
bool isUseExternalMemory() const;
int getInputNum();
int getOutputNum();
int getInputNum() const;
int getOutputNum() const;
void setChildPort(const size_t port) { child_port = port; }
@ -73,10 +73,12 @@ public:
MKLDNNEdgePtr getSharedEdge() const;
MKLDNNEdgePtr getSharedEdge(std::nothrow_t) const;
const InferenceEngine::TensorDesc& getInputDescRO() const;
const InferenceEngine::TensorDesc& getOutputDescRO() const;
private:
std::string name();
private:
std::weak_ptr<MKLDNNNode> parent;
std::weak_ptr<MKLDNNNode> child;
int parent_port;

View File

@ -78,7 +78,10 @@ void MKLDNNGraph::CreateGraph(NET &net, const MKLDNNExtensionManager::Ptr& extMg
Replicate(net, extMgr);
InitGraph();
status = Ready;
ENABLE_CPU_DEBUG_CAP(serialize(*this));
}
template void MKLDNNGraph::CreateGraph(const std::shared_ptr<const ngraph::Function>&,
@ -344,10 +347,6 @@ void MKLDNNGraph::InitGraph() {
graphNode->cleanup();
}
#endif
#if !defined(NDEBUG) && defined(PRINT_GRAPH_INFO)
printGraphInfo();
#endif
ExecuteConstantNodesOnly();
}
@ -809,7 +808,7 @@ void MKLDNNGraph::Infer(MKLDNNInferRequest* request, int batch) {
mkldnn::stream stream(eng);
ENABLE_CPU_DEBUG_CAP(NodeDumper nd(infer_count));
ENABLE_CPU_DEBUG_CAP(NodeDumper nd(config.debugCaps, infer_count));
for (int i = 0; i < graphNodes.size(); i++) {
if (request != nullptr) {
@ -954,6 +953,10 @@ void MKLDNNGraph::setConfig(const Config &cfg) {
config = cfg;
}
const Config& MKLDNNGraph::getConfig() const {
return config;
}
void MKLDNNGraph::setProperty(const std::map<std::string, std::string>& properties) {
config.readProperties(properties);
}
@ -1217,21 +1220,3 @@ void MKLDNNGraph::EnforceBF16() {
InferenceEngine::CNNNetwork MKLDNNGraph::dump() const {
return dump_graph_as_ie_ngraph_net(*this);
}
void MKLDNNGraph::printGraphInfo() const {
for (auto &graphNode : graphNodes) {
std::cout << "name: " << graphNode->getName() << " [ ";
if (graphNode->parentEdges.size() > 0) {
auto prnt_out_desc = graphNode->parentEdges[0].lock()->getOutputDesc();
std::cout << "in: " << prnt_out_desc.getPrecision().name()
<< "/l=" << prnt_out_desc.getLayout()
<< "; ";
}
if (graphNode->childEdges.size() > 0) {
auto chld_in_desc = graphNode->childEdges[0].lock()->getInputDesc();
std::cout << "out: " << chld_in_desc.getPrecision().name()
<< "/l=" << chld_in_desc.getLayout();
}
std::cout << " ]" << std::endl;
}
}

View File

@ -39,6 +39,8 @@ public:
}
void setConfig(const Config &cfg);
const Config& getConfig() const;
void setProperty(const std::map<std::string, std::string> &properties);
Config getProperty() const;
@ -59,6 +61,10 @@ public:
void Infer(MKLDNNInferRequest* request = nullptr, int batch = -1);
const std::vector<MKLDNNNodePtr>& GetNodes() const {
return graphNodes;
}
std::vector<MKLDNNNodePtr>& GetNodes() {
return graphNodes;
}
@ -219,7 +225,6 @@ protected:
private:
void EnforceBF16();
void printGraphInfo() const;
};
} // namespace MKLDNNPlugin

View File

@ -5,9 +5,11 @@
#include "mkldnn_graph_dumper.h"
#include <ie_ngraph_utils.hpp>
#include "exec_graph_info.hpp"
#include "ie_common.h"
#include "mkldnn_debug.h"
#include <ngraph/variant.hpp>
#include "ngraph/ngraph.hpp"
#include "utils/debug_capabilities.h"
#include <vector>
#include <string>
@ -18,6 +20,9 @@ using namespace InferenceEngine;
namespace MKLDNNPlugin {
void serializeToCout(const MKLDNNGraph &graph);
void serializeToXML(const MKLDNNGraph &graph, const std::string& path);
namespace {
std::map<std::string, std::string> extract_node_metadata(const MKLDNNNodePtr &node) {
@ -207,4 +212,46 @@ InferenceEngine::CNNNetwork dump_graph_as_ie_ngraph_net(const MKLDNNGraph &graph
return net;
}
#ifdef CPU_DEBUG_CAPS
void serialize(const MKLDNNGraph &graph) {
const std::string& path = graph.getConfig().debugCaps.execGraphPath;
if (path.empty())
return;
if (path == "cout")
serializeToCout(graph);
else if (!path.compare(path.size() - 4, 4, ".xml"))
serializeToXML(graph, path);
else
IE_THROW() << "Unknown serialize format. Should be either 'cout' or '*.xml'. Got " << path;
}
void serializeToXML(const MKLDNNGraph &graph, const std::string& path) {
if (path.empty())
return;
graph.dump().serialize(path);
}
void serializeToCout(const MKLDNNGraph &graph) {
for (const auto& node : graph.GetNodes()) {
std::cout << "name: " << node->getName() << " [ ";
if (!node->getParentEdges().empty()) {
const auto& parentEdge = *(node->getParentEdges()[0].lock());
const auto& prnt_out_desc = parentEdge.getOutputDescRO();
std::cout << "in: " << prnt_out_desc.getPrecision().name()
<< "/l=" << prnt_out_desc.getLayout()
<< "; ";
}
if (!node->getChildEdges().empty()) {
const auto& childEdge = *(node->getChildEdges()[0].lock());
const auto& chld_in_desc = childEdge.getInputDescRO();
std::cout << "out: " << chld_in_desc.getPrecision().name()
<< "/l=" << chld_in_desc.getLayout();
}
std::cout << " ]" << std::endl;
}
}
#endif
} // namespace MKLDNNPlugin

View File

@ -6,11 +6,14 @@
#include "cpp/ie_cnn_network.h"
#include "mkldnn_graph.h"
#include "utils/debug_capabilities.h"
#include <memory>
namespace MKLDNNPlugin {
InferenceEngine::CNNNetwork dump_graph_as_ie_ngraph_net(const MKLDNNGraph &graph);
#ifdef CPU_DEBUG_CAPS
void serialize(const MKLDNNGraph &graph);
#endif // CPU_DEBUG_CAPS
} // namespace MKLDNNPlugin

View File

@ -22,7 +22,11 @@ MKLDNNPlugin::ReshapeFullyConnectedFusion::ReshapeFullyConnectedFusion() {
ngraph::matcher_pass_callback callback = [this](ngraph::pattern::Matcher &m) {
auto fc = std::dynamic_pointer_cast<MKLDNNPlugin::FullyConnectedNode>(m.get_match_root());
if (!fc)
return false;
auto reshape = std::dynamic_pointer_cast<ngraph::opset1::Reshape>(fc->get_input_node_shared_ptr(0));
if (!reshape)
return false;
// Check that Reshape reshapes 4D tensor to 2D or input shape = output shape
auto shape_in = reshape->input_value(0).get_shape();
@ -67,6 +71,8 @@ MKLDNNPlugin::ReshapeFullyConnectedFusion::ReshapeFullyConnectedFusion() {
fc->input_value(2),
outShape,
fc->output(0).get_element_type());
} else {
return false;
}
new_ops.push_back(new_fc);
new_fc->set_friendly_name(fc->get_friendly_name());

View File

@ -60,6 +60,8 @@ MKLDNNPlugin::ReshapeFullyConnected::ReshapeFullyConnected() {
fc->input_value(2),
output_shape_new,
fc->get_output_type());
} else {
return false;
}
new_ops.push_back(fc_new);

View File

@ -20,8 +20,16 @@ MKLDNNPlugin::ReshapePRelu::ReshapePRelu() {
if (!prelu || ngraph::shape_size(prelu->get_input_shape(1)) == 1 || prelu->get_input_shape(1).size() != 1) {
return false;
}
ngraph::Shape new_shape(prelu->input_value(0).get_shape().size(), 1);
new_shape[new_shape.size() > 1 ? 1 : 0] = prelu->input_value(1).get_shape()[0];
const auto prelu_shape = prelu->input_value(0).get_shape();
const auto slope_shape = prelu->input_value(1).get_shape();
ngraph::Shape new_shape(prelu_shape.size(), 1);
const auto slope_dim = slope_shape[0];
const auto channel_dim_idx = prelu_shape.size() > 1 ? 1 : 0;
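// The 1-D slope can only be broadcast over the channel dimension when its length matches that dimension.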
if (slope_dim != prelu_shape[channel_dim_idx]) {
return false;
}
new_shape[channel_dim_idx] = slope_dim;
auto slope = ngraph::op::util::reshapeTo(prelu->input_value(1), new_shape);
auto new_prelu = std::make_shared<ngraph::opset1::PRelu>(prelu->input(0).get_source_output(), slope);
new_prelu->set_friendly_name(prelu->get_friendly_name());

View File

@ -42,6 +42,8 @@ MKLDNNConvertNode::MKLDNNConvertNode(const InferenceEngine::SizeVector &dims, co
addOriginalInputPrecision(inPrc);
outDims.emplace_back(dims);
addOriginalOutputPrecision(outPrc);
errorPrefix = "Convert node with name '" + getName() + "'";
}
void MKLDNNConvertNode::getSupportedDescriptors() {

View File

@ -58,6 +58,8 @@ MKLDNNDepthToSpaceNode::MKLDNNDepthToSpaceNode(const std::shared_ptr<ngraph::Nod
if (blockSize == 0)
THROW_ERROR << "has incorrect block_size parameter is zero!";
size_t nSpatialDims = inDims[0].ndims() - 2;
blockStep = static_cast<size_t>(std::pow(blockSize, nSpatialDims));
} else {
IE_THROW(NotImplemented) << errorMessage;
}
@ -74,14 +76,13 @@ void MKLDNNDepthToSpaceNode::getSupportedDescriptors() {
if (srcDims.size() != dstDims.size())
THROW_ERROR << "has incorrect number of input/output dimensions";
size_t nSpatialDims = srcDims.size() - 2;
blockStep = static_cast<size_t>(std::pow(blockSize, nSpatialDims));
if (srcDims[1] % blockStep)
THROW_ERROR << "has block_size parameter which is incompatible with input tensor channels dimension size";
if (srcDims[1] / blockStep != dstDims[1])
THROW_ERROR << "has incompatible input/output channels";
size_t nSpatialDims = srcDims.size() - 2;
for (size_t i = 0; i < nSpatialDims; ++i) {
if (srcDims[i + 2] * blockSize != dstDims[i + 2])
THROW_ERROR << "has incompatible spatial dims";

View File

@ -49,7 +49,7 @@ MKLDNNDFTNode::MKLDNNDFTNode(const std::shared_ptr<ngraph::Node>& op, const mkld
/* Data */
inputShape = inDims[DATA_INDEX].ToSizeVector();
if (inputShape.size() < 1) {
if (inputShape.size() < 2) {
IE_THROW() << layerErrorPrefix << " has invalid 'data' input tensor with rank: " << inputShape.size();
}

View File

@ -32,7 +32,7 @@ private:
size_t dataTypeSize_;
int strideAxDst_;
int dstAxDim_;
int strideAx1Diff_;
int strideAx1Diff_ = 0;
std::string errorPrefix_;
template <typename dataType>

View File

@ -86,7 +86,7 @@ MKLDNNStridedSliceNode::MKLDNNStridedSliceNode(const std::shared_ptr<ngraph::Nod
void MKLDNNStridedSliceNode::getSupportedDescriptors() {
auto isConstantNode = [](const MKLDNNNodePtr &node) {
return node->isConstant() && node->getType() == Input;
return node->getType() == Input && node->isConstant();
};
params.parametersAreConstant = isConstantNode(getParentEdgesAtPort(BEGIN_ID)[0]->getParent()) &&
@ -138,7 +138,11 @@ void MKLDNNStridedSliceNode::getSupportedDescriptors() {
if (params.parametersAreConstant) {
auto fillingInParameters = [&](std::vector<int> &parameter, const size_t type, const size_t size, const int value) {
auto blob = std::dynamic_pointer_cast<MKLDNNInputNode>(getParentEdgesAtPort(type)[0]->getParent())->getMemoryPtr();
const auto constNode = std::dynamic_pointer_cast<MKLDNNInputNode>(getParentEdgesAtPort(type)[0]->getParent());
if (!constNode) {
THROW_ERROR << "can't cast node on " << type << " port to MKLDNNInputNode";
}
auto blob = constNode->getMemoryPtr();
if (blob->GetDataType() != mkldnn::memory::data_type::s32)
THROW_ERROR << "supports only parameters input with precision I32";
const int *ptr = static_cast<const int*>(blob->GetPtr());

View File

@ -136,6 +136,9 @@ public:
void execute(mkldnn::stream strm, int n_iter) override {
auto mem = mem_holder_dst;
auto data_ptr = static_cast<uint32_t*>(mem.get_data_handle());
if (data_ptr == nullptr) {
IE_THROW() << "TensorIterator node has not allocated memory for IterCountPortHelper";
}
*data_ptr = n_iter;
}
};
@ -150,6 +153,9 @@ public:
int getStatus() override {
auto data_ptr = static_cast<uint8_t*>(mem_holder.get_data_handle());
if (data_ptr == nullptr) {
IE_THROW() << "TensorIterator node has not allocated memory for asBoolCheck";
}
return *data_ptr == static_cast<uint8_t>(0) ? 0 : 1;
}
};
@ -164,6 +170,9 @@ public:
int getStatus() override {
auto data_ptr = static_cast<uint32_t*>(mem_holder.get_data_handle());
if (data_ptr == nullptr) {
IE_THROW() << "TensorIterator node has not allocated memory for asIntCheck";
}
return *data_ptr;
}
};
@ -283,6 +292,9 @@ MKLDNNTensorIteratorNode::MKLDNNTensorIteratorNode(const std::shared_ptr<ngraph:
void MKLDNNTensorIteratorNode::getSupportedDescriptors() {
auto tiOp = std::dynamic_pointer_cast<ngraph::op::util::SubGraphOp>(ngraphOp);
if (tiOp == nullptr) {
IE_THROW() << "Can't cast TensorIterator node with name: " << getName() << " to ngraph::op::util::SubGraphOp";
}
const std::shared_ptr<const ngraph::Function> body = tiOp->get_function();
sub_graph.CreateGraph(body, ext_mng, weightCache);

View File

@ -71,3 +71,22 @@ Example:
```sh
OV_CPU_BLOB_DUMP_NODE_NAME=".+" binary ...
```
## Graph serialization
The functionality allows serializing the execution graph via an environment variable:
```sh
OV_CPU_EXEC_GRAPH_PATH=<path> binary ...
```
Possible serialization options:
* cout
  Serializes the graph to console output
* \<path\>.xml
  Serializes the graph into .xml and .bin files. Can be opened using, for example, the *netron* app
* \<path\>.dot
  TBD. Serializes the graph into a .dot file. Can be inspected using, for example, *graphviz* tools.
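For example, a minimal usage sketch (the `benchmark_app` binary and the model/output paths are illustrative placeholders, not prescribed by this feature):
```sh
OV_CPU_EXEC_GRAPH_PATH=exec_graph.xml ./benchmark_app -m model.xml -d CPU
```
This would produce `exec_graph.xml` together with the accompanying `.bin` file, ready to be inspected in a graph viewer such as *netron*.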

View File

@ -4,7 +4,44 @@
#pragma once
#ifdef CPU_DEBUG_CAPS
# define ENABLE_CPU_DEBUG_CAP(_x) _x;
#else
# define ENABLE_CPU_DEBUG_CAP(_x)
#endif
#include <map>
#include <string>
#include <vector>
#define ENABLE_CPU_DEBUG_CAP(_x) _x;
namespace MKLDNNPlugin {
namespace DebugCaps {
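// Collects CPU-plugin debug settings from the OV_CPU_* environment variables once at construction time.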
class Config {
public:
Config() {
readParam(blobDumpDir, "OV_CPU_BLOB_DUMP_DIR");
readParam(blobDumpFormat, "OV_CPU_BLOB_DUMP_FORMAT");
readParam(blobDumpNodeExecId, "OV_CPU_BLOB_DUMP_NODE_EXEC_ID");
readParam(blobDumpNodeType, "OV_CPU_BLOB_DUMP_NODE_TYPE");
readParam(blobDumpNodeName, "OV_CPU_BLOB_DUMP_NODE_NAME");
readParam(execGraphPath, "OV_CPU_EXEC_GRAPH_PATH");
}
std::string blobDumpDir;
std::string blobDumpFormat;
std::string blobDumpNodeExecId;
std::string blobDumpNodeType;
std::string blobDumpNodeName;
std::string execGraphPath;
private:
void readParam(std::string& param, const char* envVar) {
if (const char* envValue = std::getenv(envVar))
param = envValue;
}
};
} // namespace DebugCaps
} // namespace MKLDNNPlugin
#else // !CPU_DEBUG_CAPS
#define ENABLE_CPU_DEBUG_CAP(_x)
#endif // CPU_DEBUG_CAPS

View File

@ -6,9 +6,10 @@
#include "node_dumper.h"
#include "mkldnn_node.h"
#include "utils/blob_dump.h"
#include "ie_common.h"
#include "utils/blob_dump.h"
#include "utils/debug_capabilities.h"
#include <array>
#include <regex>
#include <sstream>
@ -18,27 +19,24 @@ using namespace InferenceEngine;
namespace MKLDNNPlugin {
NodeDumper::NodeDumper(int _count):
count(_count), dumpFormat(DUMP_FORMAT::BIN) {
const char* dumpDirEnv = std::getenv("OV_CPU_BLOB_DUMP_DIR");
if (dumpDirEnv)
dumpDirName = dumpDirEnv;
NodeDumper::NodeDumper(const DebugCaps::Config& config, const int _count)
: dumpFormat(DUMP_FORMAT::BIN)
, dumpDirName("mkldnn_dump")
, count(_count) {
if (!config.blobDumpDir.empty())
dumpDirName = config.blobDumpDir;
const char* dumpFormatEnv = std::getenv("OV_CPU_BLOB_DUMP_FORMAT");
if (dumpFormatEnv)
dumpFormat = parseDumpFormat(dumpFormatEnv);
if (!config.blobDumpFormat.empty())
dumpFormat = parseDumpFormat(config.blobDumpFormat);
const char* filter = std::getenv("OV_CPU_BLOB_DUMP_NODE_EXEC_ID");
if (filter)
dumpFilters[FILTER::BY_EXEC_ID] = filter;
if (!config.blobDumpNodeExecId.empty())
dumpFilters[FILTER::BY_EXEC_ID] = config.blobDumpNodeExecId;
filter = std::getenv("OV_CPU_BLOB_DUMP_NODE_TYPE");
if (filter)
dumpFilters[FILTER::BY_TYPE] = filter;
if (!config.blobDumpNodeType.empty())
dumpFilters[FILTER::BY_TYPE] = config.blobDumpNodeType;
filter = std::getenv("OV_CPU_BLOB_DUMP_NODE_NAME");
if (filter)
dumpFilters[FILTER::BY_NAME] = filter;
if (!config.blobDumpNodeName.empty())
dumpFilters[FILTER::BY_NAME] = config.blobDumpNodeName;
}
void NodeDumper::dumpInputBlobs(const MKLDNNNodePtr& node) const {

View File

@ -6,6 +6,7 @@
#include "mkldnn_node.h"
#include "utils/blob_dump.h"
#include "utils/debug_capabilities.h"
#include <unordered_map>
#include <string>
@ -22,7 +23,7 @@ namespace MKLDNNPlugin {
*/
class NodeDumper {
public:
NodeDumper(int _count);
NodeDumper(const DebugCaps::Config& config, const int _count);
void dumpInputBlobs(const MKLDNNNodePtr &node) const;
void dumpOutputBlobs(const MKLDNNNodePtr &node) const;
@ -41,11 +42,9 @@ private:
void formatNodeName(std::string& name) const;
DUMP_FORMAT dumpFormat;
std::string dumpDirName;
int count;
std::string dumpDirName = "mkldnn_dump";
enum FILTER {
BY_EXEC_ID,
BY_TYPE,

View File

@ -54,10 +54,90 @@ public:
});
}
std::vector<size_t> get_not_empty_dims() {
std::vector<size_t> not_empty_dims;
for (size_t i = 0; i < this->size(); i++) {
if (!this->at(i).empty())
not_empty_dims.push_back(i);
}
return not_empty_dims;
}
bool is_shape_like() const { return m_is_shape_like; }
void set_shape_like(bool flag) { m_is_shape_like = flag; }
void copy_value_from_mask(Mask *const mask) {
auto cur_mask_iter = begin();
auto mask_iter = mask->begin();
while (cur_mask_iter != end() && mask_iter != mask->end()) {
*cur_mask_iter = *mask_iter;
cur_mask_iter++;
mask_iter++;
}
}
void copy_value_from_mask_reversed(Mask *const mask) {
auto cur_mask_iter = rbegin();
auto mask_iter = mask->rbegin();
while (cur_mask_iter != rend() && mask_iter != mask->rend()) {
*cur_mask_iter = *mask_iter;
cur_mask_iter++;
mask_iter++;
}
}
Mask::Ptr intersect_masks_reversed(Mask *const mask) {
auto result_mask = std::make_shared<Mask>(std::max(size(), mask->size()));
auto result_iter = result_mask->rbegin();
auto mask_1_iter = rbegin();
auto mask_2_iter = mask->rbegin();
while (mask_1_iter != rend() &&
mask_2_iter != mask->rend()) {
// Merge mask dimension values for both masks
// Example: (MaskValue[1,2,3,4], MaskValue[2,3]) -> MaskValue[2,3]
for (const auto & value : *mask_1_iter) {
if (mask_2_iter->count(value)) {
result_iter->insert(value);
}
}
result_iter++;
mask_1_iter++;
mask_2_iter++;
}
return result_mask;
}
Mask::Ptr union_masks_reversed(Mask *const mask) {
auto result_mask = std::make_shared<Mask>(std::max(size(), mask->size()));
auto result_iter = result_mask->rbegin();
auto mask_1_iter = rbegin();
auto mask_2_iter = mask->rbegin();
while (mask_1_iter != rend() &&
mask_2_iter != mask->rend()) {
// Union mask dimension values for both masks
// Example: (MaskValue[1,2,3,4], MaskValue[2, 5]) -> MaskValue[1, 2, 3, 4, 5]
for (const auto & value : *mask_1_iter) {
result_iter->insert(value);
}
for (const auto & value : *mask_2_iter) {
if (!result_iter->count(value)) {
result_iter->insert(value);
}
}
result_iter++;
mask_1_iter++;
mask_2_iter++;
}
return result_mask;
}
void add_callback(const std::function<bool(Mask::Ptr)> & receive_callback, Mask::Ptr mask) {
m_callbacks[mask.get()] = receive_callback;
m_dependencies.push_back(mask.get());

View File

@ -14,6 +14,7 @@ namespace ngraph {
namespace pass {
class InitConstMask;
class InitMasks;
class PropagateMasks;
class ShrinkWeights;
@ -22,6 +23,16 @@ class Pruning;
} // namespace pass
} // namespace ngraph
/**
* @ingroup ie_transformation_common_api
* @brief Initialising masks for pruned operations
*/
class ngraph::pass::InitMasks : public ngraph::pass::GraphRewrite {
public:
NGRAPH_RTTI_DECLARATION;
InitMasks();
};
/**
* @ingroup ie_transformation_common_api
* @brief Check Constant operation values by given dimensions and set

View File

@ -17,7 +17,7 @@ NGRAPH_RTTI_DEFINITION(ngraph::pass::InitConstMask, "InitConstMask", 0);
ngraph::pass::InitConstMask::InitConstMask(const ngraph::AxisSet & dims,
const std::function<bool(const double & value)> & condition) {
auto constant = pattern::wrap_type<opset6::Constant>(
pattern::type_matches_any({element::f16, element::f32, element::f64}));
pattern::type_matches_any({element::i8, element::u8, element::f16, element::f32, element::f64}));
matcher_pass_callback callback = [=](pattern::Matcher& m) {
auto const_node = std::dynamic_pointer_cast<opset6::Constant>(m.get_match_root());

View File

@ -0,0 +1,64 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "pruning.hpp"
#include "mask_attribute.hpp"
#include <ngraph/pattern/op/wrap_type.hpp>
#include <ngraph/opsets/opset6.hpp>
#include <ngraph/log.hpp>
NGRAPH_RTTI_DEFINITION(ngraph::pass::InitMasks, "InitMasks", 0);
namespace ngraph {
namespace pass {
namespace init_masks {
class InitConvMask;
} // namespace init_masks
} // namespace pass
} // namespace ngraph
class ngraph::pass::init_masks::InitConvMask : public MatcherPass {
public:
InitConvMask() {
auto input = pattern::any_input();
auto weights = pattern::any_input();
auto conv = pattern::wrap_type<opset6::Convolution, opset6::GroupConvolution>({input, weights});
ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher& m) {
const auto & pattern_map = m.get_pattern_value_map();
const auto & m_output = pattern_map.at(conv);
// Initializing weights mask:
// 1. Looking for Const node with weights
NodeVector weights_calculation_nodes;
auto cur_node = m_output.get_node()->get_input_node_shared_ptr(1);
while (!ngraph::is_type<opset6::Constant>(cur_node) && cur_node->inputs().size()) {
weights_calculation_nodes.push_back(cur_node);
cur_node = cur_node->get_input_node_shared_ptr(0);
}
if (!ngraph::is_type<opset6::Constant>(cur_node)) {
NGRAPH_DEBUG << "Can't find Constant weights for Convolution: " <<
m_output.get_node()->get_friendly_name() << std::endl;
return false;
}
// 2. Init mask for Const node
InitConstMask({0}/* check only output channels dim */).apply(cur_node);
return true;
};
auto m = std::make_shared<ngraph::pattern::Matcher>(conv, "ConvolutionInitMask");
register_matcher(m, callback);
}
};
ngraph::pass::InitMasks::InitMasks() {
add_matcher<init_masks::InitConvMask>();
}

View File

@ -7,7 +7,9 @@
#include <ngraph/pattern/op/wrap_type.hpp>
#include <ngraph/opsets/opset6.hpp>
#include <ngraph/opsets/opset5.hpp>
#include <ngraph/log.hpp>
#include <ngraph/rt_info.hpp>
NGRAPH_RTTI_DEFINITION(ngraph::pass::PropagateMasks, "PropagateMasks", 0);
@ -20,11 +22,23 @@ class GroupConvolution;
class Elementwise;
class PassThrough;
class StopPropagation;
class FakeQuantize;
class Concat;
class Reshape;
} // namespace mask_propagation
} // namespace pass
} // namespace ngraph
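// Prepends 1s to the shape until it reaches dst_rank,
// e.g. broadcast_shape_to_rank({3, 1, 1}, 4) -> {1, 3, 1, 1}.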
ngraph::Shape broadcast_shape_to_rank(ngraph::Shape shape_to_broadcast, int64_t dst_rank) {
auto initial_rank = static_cast<int64_t>(shape_to_broadcast.size());
auto num_of_broadcased_dims = dst_rank - initial_rank;
std::vector<size_t> dims(num_of_broadcased_dims, 1);
dims.insert(dims.end(), shape_to_broadcast.begin(), shape_to_broadcast.end());
auto new_shape = ngraph::Shape(dims);
return new_shape;
}
class ngraph::pass::mask_propagation::Convolution : public MatcherPass {
public:
Convolution() {
@ -38,12 +52,15 @@ public:
const auto & m_output = pattern_map.at(conv);
const auto & m_input = pattern_map.at(input);
// In case if weights are Constant we initialize Mask
InitConstMask({0}/* check only output channel */).apply(m_weights.get_node_shared_ptr());
auto weights_mask = getMask(m_weights);
// If weights are not a Constant and we didn't set Mask value before we will get nullptr
if (!weights_mask) return false;
// A nullptr weights mask means that the mask for this node wasn't initialized earlier.
// The weights mask for a convolution should be initialized in the InitMasks pass (and propagated after it).
// If the mask isn't initialized, these weights (and hence the whole convolution) can't be pruned for some reason.
if (!weights_mask) {
NGRAPH_DEBUG << "No weights mask for " << m_output.get_node()->get_friendly_name() << "\n";
return false;
}
auto weights_mask_row = weights_mask.get();
if (auto input_mask = getMask(m_input)) {
@ -119,9 +136,15 @@ public:
auto weights_mask = getMask(m_weights);
if (!weights_mask) {
// TODO: only if weights are constant
weights_mask = std::make_shared<Mask>(weights_shape.size());
setMask(m_weights, weights_mask);
// Setting mask only if weights are constant
if (ngraph::is_type<opset6::Constant>(m_output.get_node_shared_ptr())) {
weights_mask = std::make_shared<Mask>(weights_shape.size());
setMask(m_weights, weights_mask);
} else {
NGRAPH_DEBUG << "GroupConvolution: No weights mask and weights aren't constant for " <<
*m_output.get_node() << "\n";
return false;
}
}
auto weights_mask_row = weights_mask.get();
@ -169,13 +192,85 @@ public:
}
};
class ngraph::pass::mask_propagation::Reshape : public MatcherPass {
public:
Reshape() {
auto input = pattern::any_input(pattern::has_static_shape());
auto shape = pattern::any_input();
// Working only for Reshapes on Group Convolution weights
auto reshape = pattern::wrap_type<opset6::Reshape>({input, shape}, pattern::consumers_count(1));
auto gconv = pattern::wrap_type<opset6::GroupConvolution>({pattern::any_input(), reshape},
pattern::has_static_shape());
ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher& m) {
const auto & pattern_map = m.get_pattern_value_map();
const auto & m_shape = pattern_map.at(shape);
const auto & m_output = pattern_map.at(reshape);
const auto & m_input = pattern_map.at(input);
auto shape_val = m_shape.get_node_shared_ptr();
// In depthwise convolutions the Reshape on weights just adds an additional dimension for the kernel's
// output channels count (1 in the depthwise case).
// Example: Reshape from [G, 1 (I), X, Y, Z] -> [G, 1 (O), 1 (I), X, Y, Z], where G is the number of groups,
// X, Y, Z are spatial dimensions (can be only X or X, Y), and I, O are the kernel's input/output channel counts.
// Check that the matched Reshape meets these conditions (adds a 1-sized dim at position 1 of the shape constant).
auto inp_shape = m_input.get_shape();
auto out_shape = m_output.get_shape();
inp_shape.insert(inp_shape.begin() + 1, 1);
if (inp_shape != out_shape) {
return false;
}
auto input_mask = getMask(m_input);
if (!input_mask) {
return false;
}
auto input_mask_row = input_mask.get();
auto output_mask = std::make_shared<Mask>(m_output.get_partial_shape().rank().get_length());
auto output_mask_row = output_mask.get();
// A depthwise convolution is pruned only by input channels (== groups) ->
// propagate the mask from the Group (0) dim of the Reshape input to the Group (0) dim of the Reshape output and back
input_mask->add_callback([output_mask_row](Mask::Ptr cur_mask) -> bool {
cur_mask->at(0) = output_mask_row->at(0);
return true;
}, output_mask);
output_mask->add_callback([input_mask_row](Mask::Ptr cur_mask) -> bool {
cur_mask->at(0) = input_mask_row->at(0);
return true;
}, input_mask);
input_mask->apply_callback(output_mask);
// To allow pruning of the weights (i.e. to let the Group (0) dim of the Reshape input change), replace the Reshape shape
// constant [G, 1, 1, X, Y, Z] with [-1, 1, 1, X, Y, Z].
auto old_shape_const = std::dynamic_pointer_cast<opset6::Constant>(m_shape.get_node_shared_ptr());
auto shape_value = old_shape_const.get()->cast_vector<int64_t>();
shape_value[0] = -1;
auto new_const = opset6::Constant::create(old_shape_const->get_element_type(),
old_shape_const->get_shape(), shape_value);
new_const->set_friendly_name(old_shape_const->get_friendly_name());
ngraph::copy_runtime_info(old_shape_const, new_const);
ngraph::replace_node(old_shape_const, new_const);
setMask(m_output, output_mask);
return true;
};
auto m = std::make_shared<ngraph::pattern::Matcher>(reshape, "ReshapeMaskPropagation");
register_matcher(m, callback);
}
};
class ngraph::pass::mask_propagation::Elementwise : public MatcherPass {
public:
Elementwise() {
auto input = pattern::any_input();
auto weights = pattern::any_input();
auto eltwise = pattern::wrap_type<op::util::BinaryElementwiseArithmetic>({input, weights},
pattern::has_static_rank());
auto eltwise = pattern::wrap_type<opset6::Add, opset6::Subtract, opset6::Maximum, opset6::Minimum,
opset6::Multiply>({input, weights}, pattern::has_static_rank());
// TODO: add Div, Power support
ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher& m) {
const auto & pattern_map = m.get_pattern_value_map();
@ -183,82 +278,275 @@ public:
const auto & m_output = pattern_map.at(eltwise);
const auto & m_input = pattern_map.at(input);
// TODO: implement check that compares input shape ranks
// Case when input masks should be united instead of intersection
bool union_eltwise_type = ngraph::is_type<opset6::Multiply>(m_output.get_node_shared_ptr());
const auto & input_rank = m_input.get_partial_shape().rank().get_length();
const auto & weights_rank = m_weights.get_partial_shape().rank().get_length();
// Here assuming that masks can be propagated only through 3/4 dimensional tensors
// (since channel dim is necessary)
if (weights_rank < 3 || input_rank < 3) return false;
// In case if one of the inputs is constant
// TODO: need to find channel dimension instead of hardcoded zero
const size_t & channel_dim = (input_rank == weights_rank ? 1 : 0);
InitConstMask({channel_dim}).apply(m_input.get_node_shared_ptr());
InitConstMask({channel_dim}).apply(m_weights.get_node_shared_ptr());
// In case if first of the inputs is constant
InitConstMask({0, 1/* potential output channel dim */}).apply(m_input.get_node_shared_ptr());
auto input_mask = getMask(m_input);
if (!input_mask) {
NGRAPH_DEBUG << "No input mask for: " << m_output.get_node()->get_friendly_name() << std::endl;
return false;
}
InitConstMask({0, 1}).apply(m_weights.get_node_shared_ptr());
auto weights_mask = getMask(m_weights);
auto input_mask = getMask(m_input);
if (!weights_mask || !input_mask) {
NGRAPH_DEBUG << "No mask for: " << m_output.get_node()->get_friendly_name() << std::endl;
if (!weights_mask) {
NGRAPH_DEBUG << "No weights mask for: " << m_output.get_node()->get_friendly_name() << std::endl;
return false;
}
auto input_mask_row = input_mask.get();
auto weights_mask_row = weights_mask.get();
// Merge masks from two inputs
// Merging masks from two inputs
auto output_mask = std::make_shared<Mask>(m_output.get_partial_shape().rank().get_length());
auto output_mask_row = output_mask.get();
auto out_mask_callback = [input_mask_row, weights_mask_row](Mask::Ptr cur_mask) -> bool {
auto omask_iter = cur_mask->rbegin();
auto imask_iter = input_mask_row->rbegin();
auto wmask_iter = weights_mask_row->rbegin();
for (auto & item : *cur_mask) {
item.clear();
}
while (imask_iter != input_mask_row->rend() &&
wmask_iter != weights_mask_row->rend()) {
// Merge mask dimension values for both masks
// Example: (MaskValue[1,2,3,4], MaskValue[2,3]) -> MaskValue[2,3]
for (const auto & value : *imask_iter) {
if (wmask_iter->count(value)) {
omask_iter->insert(value);
}
}
omask_iter++;
imask_iter++;
wmask_iter++;
auto out_mask_callback = [input_mask_row, weights_mask_row, union_eltwise_type](Mask::Ptr cur_mask) -> bool {
Mask::Ptr result_mask;
if (union_eltwise_type) {
result_mask = input_mask_row->union_masks_reversed(weights_mask_row);
} else {
result_mask = input_mask_row->intersect_masks_reversed(weights_mask_row);
}
cur_mask->copy_value_from_mask_reversed(result_mask.get());
return true;
};
output_mask->add_callback(out_mask_callback, input_mask);
output_mask->add_callback(out_mask_callback, weights_mask);
auto callback = [output_mask_row](Mask::Ptr cur_mask) -> bool {
auto omask_iter = output_mask_row->rbegin();
auto cmask_iter = cur_mask->rbegin();
while (omask_iter != output_mask_row->rend() &&
cmask_iter != cur_mask->rend()) {
// TODO: check
*cmask_iter = *omask_iter;
omask_iter++;
cmask_iter++;
}
input_mask->add_callback([weights_mask_row](Mask::Ptr cur_mask) -> bool {
cur_mask->copy_value_from_mask_reversed(weights_mask_row);
return true;
};
input_mask->add_callback(callback, output_mask);
weights_mask->add_callback(callback, output_mask);
}, weights_mask);
input_mask->add_callback([output_mask_row](Mask::Ptr cur_mask) -> bool {
cur_mask->copy_value_from_mask_reversed(output_mask_row);
return true;
}, output_mask);
weights_mask->add_callback([input_mask_row](Mask::Ptr cur_mask) -> bool {
cur_mask->copy_value_from_mask_reversed(input_mask_row);
return true;
}, input_mask);
// Init output mask
output_mask->apply_callback(input_mask);
weights_mask->apply_callback(input_mask);
setMask(m_output, output_mask);
return true;
};
auto m = std::make_shared<ngraph::pattern::Matcher>(eltwise, "EltwiseMaskPropagation");
auto m = std::make_shared<ngraph::pattern::Matcher>(eltwise, "ElementwiseMaskPropagation");
register_matcher(m, callback);
}
};
class ngraph::pass::mask_propagation::FakeQuantize : public MatcherPass{
public:
FakeQuantize(){
auto input = pattern::any_input(pattern::has_static_shape());
auto input_low = pattern::any_input(pattern::has_static_shape());
auto input_high = pattern::any_input(pattern::has_static_shape());
auto output_low = pattern::any_input(pattern::has_static_shape());
auto output_high = pattern::any_input(pattern::has_static_shape());
auto fake_quantize = pattern::wrap_type<opset6::FakeQuantize>({input, input_low, input_high, output_low,
output_high});
ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher& m) {
const auto & pattern_map = m.get_pattern_value_map();
const auto & m_input = pattern_map.at(input);
const auto & m_input_low = pattern_map.at(input_low);
const auto & m_input_high = pattern_map.at(input_high);
const auto & m_output_low = pattern_map.at(output_low);
const auto & m_output_high = pattern_map.at(output_high);
const auto & m_output = pattern_map.at(fake_quantize);
auto input_mask = getMask(m_input);
// Input mask is the only source of pruning in FQ
if (!input_mask) {
NGRAPH_DEBUG << "FakeQuantize: No input mask for " << *m_output.get_node() << "\n";
return false;
}
auto input_mask_row = input_mask.get();
// Propagate input mask to output mask and in the opposite direction
auto output_mask = std::make_shared<Mask>(m_output.get_partial_shape().rank().get_length());
auto output_mask_row = output_mask.get();
// Output mask is equal to input mask
auto output_mask_callback = [input_mask_row](Mask::Ptr cur_mask) -> bool {
cur_mask->copy_value_from_mask(input_mask_row);
return true;
};
auto input_mask_callback = [output_mask_row](Mask::Ptr cur_mask) -> bool {
cur_mask->copy_value_from_mask(output_mask_row);
return true;
};
output_mask->add_callback(output_mask_callback, input_mask);
input_mask->add_callback(input_mask_callback, output_mask);
// Calculate output mask
output_mask->apply_callback(input_mask);
setMask(m_output, output_mask);
auto input_low_size = shape_size(m_input_low.get_shape());
auto input_high_size = shape_size(m_input_high.get_shape());
auto output_low_size = shape_size(m_output_low.get_shape());
auto output_high_size = shape_size(m_output_high.get_shape());
// In the per-tensor case FQ params shouldn't be pruned
if (input_low_size == 1 && output_low_size == 1 && input_high_size == 1 && output_high_size == 1) {
return true;
}
// If the input/output ranges in FQ have to be broadcasted to the input shape, broadcast these constant values
// for the convenience of working with the masks
NodeVector fq_params_nodes{m_input_low.get_node_shared_ptr(),
m_input_high.get_node_shared_ptr(),
m_output_low.get_node_shared_ptr(),
m_output_high.get_node_shared_ptr()};
auto fq_node = std::dynamic_pointer_cast<op::FakeQuantize>(m_output.get_node_shared_ptr());
size_t idx = 0;
if (fq_node->get_auto_broadcast() != ngraph::op::AutoBroadcastType::NONE) {
for (auto const_node : fq_params_nodes) {
auto new_shape = broadcast_shape_to_rank(const_node->get_shape(),
m_input.get_partial_shape().rank().get_length());
auto const_copy = const_node->clone_with_new_inputs(const_node->input_values());
auto new_const = std::dynamic_pointer_cast<op::Constant>(const_copy);
new_const->set_data_shape(new_shape);
new_const->validate_and_infer_types();
new_const->set_friendly_name(const_node->get_friendly_name());
ngraph::copy_runtime_info(const_node, new_const);
ngraph::replace_node(const_node, new_const);
fq_params_nodes[idx++] = new_const;
}
}
auto fq_params_mask_callback = [input_mask_row](Mask::Ptr cur_mask) -> bool {
cur_mask->at(1/* fq params have same shapes as input */) = input_mask_row->at(1 /* channel dim in data */);
return true;
};
for (auto fq_param : fq_params_nodes) {
auto mask = std::make_shared<Mask>(fq_param->get_shape().size());
mask->add_callback(fq_params_mask_callback, input_mask);
input_mask->add_callback([mask](Mask::Ptr cur_mask) -> bool {
return true;
}, mask);
mask->apply_callback(input_mask);
setMask(fq_param->output(0), mask);
}
return true;
};
auto m = std::make_shared<ngraph::pattern::Matcher>(fake_quantize, "FakeQuantizeMaskPropagation");
register_matcher(m, callback);
}
};
class ngraph::pass::mask_propagation::Concat : public MatcherPass{
public:
Concat() {
auto concat = pattern::wrap_type<opset6::Concat>(pattern::has_static_shape());
ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher &m) {
const auto & pattern_map = m.get_pattern_value_map();
const auto & m_output = pattern_map.at(concat);
auto concat_ptr = std::dynamic_pointer_cast<opset6::Concat>(m_output.get_node_shared_ptr());
auto axis = concat_ptr->get_concatenation_axis();
auto inputs = concat_ptr->inputs();
std::map<int64_t , Mask::Ptr> input_masks;
std::map<int64_t , Mask *> input_masks_row;
std::vector<int64_t> input_sizes;
size_t first_input_idx = 0;
Mask::Ptr first_input_mask;
bool first_initialized = false;
for (size_t i=0; i < inputs.size(); i++) {
auto input = inputs[i];
auto input_mask = getMask(input.get_source_output());
if (input_mask) {
input_masks[i] = input_mask;
input_masks_row[i] = input_mask.get();
if (!first_initialized) {
first_input_idx = i;
first_input_mask = input_mask;
first_initialized = true;
}
}
input_sizes.push_back(input.get_shape().at(axis));
}
if (!first_initialized) {
return false;
}
auto output_mask = std::make_shared<Mask>(m_output.get_partial_shape().rank().get_length());
auto output_mask_row = output_mask.get();
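// Along the concatenation axis the output mask is the union of the available input masks,
// with every input's indices shifted by that input's offset along the axis.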
auto out_mask_callback = [input_masks_row, input_sizes, axis](Mask::Ptr cur_mask) -> bool {
int64_t cur_size = 0;
cur_mask->at(axis).clear();
for (size_t i=0; i < input_sizes.size(); ++i) {
if (input_masks_row.count(i)) {
for (auto idx : input_masks_row.at(i)->at(axis)) {
cur_mask->at(axis).insert(idx + cur_size);
}
}
cur_size += input_sizes[i];
}
return true;
};
auto create_input_mask_callback_for_idx = [output_mask_row, input_sizes, axis](size_t input_idx){
auto input_mask_callback = [output_mask_row, input_sizes, axis, input_idx](Mask::Ptr cur_mask) -> bool {
cur_mask->clean_dim_values();
uint64_t min_val = 0;
for (size_t i = 0; i < input_idx; i++) {
min_val += input_sizes[i];
}
uint64_t max_val = min_val + input_sizes[input_idx];
for (auto idx : output_mask_row->at(axis)) {
if (idx < max_val && idx >= min_val) {
cur_mask->at(axis).insert(idx - min_val);
}
}
return true;
};
return input_mask_callback;
};
output_mask->add_callback(out_mask_callback, first_input_mask);
for (size_t i=0; i < inputs.size(); ++i) {
if (input_masks.count(i) && i != first_input_idx) {
auto input_mask = input_masks.at(i);
input_mask->add_callback(create_input_mask_callback_for_idx(i),
first_input_mask);
first_input_mask->add_callback([](Mask::Ptr cur_mask) -> bool {
return true;
}, input_mask);
}
}
first_input_mask->add_callback(create_input_mask_callback_for_idx(first_input_idx),
output_mask);
output_mask->apply_callback(first_input_mask);
setMask(m_output, output_mask);
return true;
};
auto m = std::make_shared<ngraph::pattern::Matcher>(concat, "ConcatMaskPropagation");
register_matcher(m, callback);
}
};
@ -266,7 +554,9 @@ public:
class ngraph::pass::mask_propagation::PassThrough : public MatcherPass {
public:
PassThrough() {
auto unary_op = pattern::wrap_type<op::util::UnaryElementwiseArithmetic, opset6::Clamp>();
auto unary_op = pattern::wrap_type<op::util::UnaryElementwiseArithmetic, opset6::Clamp,
opset6::Convert, opset6::ConvertLike, opset6::AvgPool, opset6::MaxPool,
opset6::ROIPooling, opset6::PSROIPooling, opset6::Pad>();
ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher& m) {
const auto & pattern_map = m.get_pattern_value_map();
@ -312,5 +602,8 @@ ngraph::pass::PropagateMasks::PropagateMasks() {
add_matcher<mask_propagation::GroupConvolution>();
add_matcher<mask_propagation::Elementwise>();
add_matcher<mask_propagation::PassThrough>();
add_matcher<mask_propagation::FakeQuantize>();
add_matcher<mask_propagation::Concat>();
add_matcher<mask_propagation::Reshape>();
add_matcher<mask_propagation::StopPropagation>();
}

View File

@ -15,8 +15,13 @@ NGRAPH_RTTI_DEFINITION(ngraph::pass::Pruning, "Pruning", 0);
bool ngraph::pass::Pruning::run_on_function(std::shared_ptr<Function> f) {
Manager manager(get_pass_config());
// Initialize masks only for Convolution/GroupConvolution weights (needed to init the mask in the source Constant of the
// weights-calculating subgraph). For other node types masks are initialized in the PropagateMasks pass.
manager.register_pass<InitMasks>();
manager.register_pass<PropagateMasks>();
#ifdef NGRAPH_DEBUG_ENABLE
// VisualizeTree modifier helps to print Masks and mark nodes with masks
/*

View File

@ -54,6 +54,8 @@ bool ngraph::pass::ShrinkWeights::run_on_function(std::shared_ptr<ngraph::Functi
for (size_t dim = 0; dim < mask->size(); ++dim) {
const auto &dim_size = mask->at(dim).size();
if (dim_size == 0) continue;
// A broadcastable 1-sized dimension shouldn't be shrunk with the mask
if (const_node->get_shape().at(dim) == 1 && dim_size > 1) continue;
// Convert dims that we want remove to dims that we need to keep
std::vector<int64_t> dims_to_keep;

View File

@ -140,14 +140,6 @@ protected:
virtual std::shared_ptr<IInferRequestInternal> CreateInferRequestImpl(InputsDataMap networkInputs,
OutputsDataMap networkOutputs);
/**
* @brief Exports an internal hardware-dependent model to a stream.
* @note The function is called from IExecutableNetworkInternal::Export(std::ostream&),
* which performs common export first and calls this plugin-dependent implementation after.
* @param networkModel A stream to export network to.
*/
virtual void ExportImpl(std::ostream& networkModel);
InferenceEngine::InputsDataMap _networkInputs; //!< Holds information about network inputs info
InferenceEngine::OutputsDataMap _networkOutputs; //!< Holds information about network outputs data

View File

@ -286,29 +286,12 @@ protected:
const std::map<std::string, std::string>& config);
/**
* @brief Creates an executable network from a previously exported network
* @note The function is called from
* IInferencePlugin::ImportNetwork(std::istream&, const RemoteContext::Ptr&, const std::map<std::string, std::string>&)
* performs common steps first and calls this plugin-dependent implementation after.
* @param networkModel Reference to network model output stream
* @param config A string -> string map of parameters
* @return An Executable network
* @brief Sets input and output information for an executable network. This method is used to
* set additional information on the InferenceEngine::IExecutableNetworkInternal created by a device plugin.
* @param exeNetwork An executable network object to set information to
* @param inputs An input information to set
* @param outputs An output information to set
*/
virtual std::shared_ptr<IExecutableNetworkInternal> ImportNetworkImpl(std::istream& networkModel,
const std::map<std::string, std::string>& config);
/**
* @brief Imports a network with RemoteContext
* @param networkModel Reference to network model output stream
* @param context - a pointer to plugin context derived from RemoteContext class used to
* execute the network
* @param config A string -> string map of parameters
* @return An Executable network
*/
virtual std::shared_ptr<IExecutableNetworkInternal> ImportNetworkImpl(std::istream& networkModel,
const std::shared_ptr<RemoteContext>& context,
const std::map<std::string, std::string>& config);
void SetExeNetworkInfo(const std::shared_ptr<IExecutableNetworkInternal>& exeNetwork,
const ConstInputsDataMap& inputs,
const ConstOutputsDataMap& outputs);

Some files were not shown because too many files have changed in this diff