Merge remote-tracking branch 'upstream/master'

Author: Steve Yoo, 2021-06-11 13:30:59 +09:00
Commit: 5a2cb750ec
236 changed files with 7127 additions and 1832 deletions

View File

@ -112,6 +112,7 @@ jobs:
-DNGRAPH_ONNX_IMPORT_ENABLE=ON
-DNGRAPH_ONNX_EDITOR_ENABLE=ON
-DENABLE_FASTER_BUILD=ON
-DENABLE_STRICT_DEPENDENCIES=OFF
-DIE_EXTRA_MODULES=$(OPENVINO_CONTRIB_REPO_DIR)/modules
$(REPO_DIR)
workingDirectory: $(BUILD_DIR)

View File

@ -90,7 +90,7 @@ jobs:
# Disable errors with Ninja
export CXXFLAGS="-Wno-error=unused-command-line-argument"
export CFLAGS="-Wno-error=unused-command-line-argument"
cmake -GNinja -DVERBOSE_BUILD=ON -DCMAKE_BUILD_TYPE=$(BUILD_TYPE) -DENABLE_PYTHON=ON -DENABLE_TESTS=ON -DIE_EXTRA_MODULES=$(OPENVINO_CONTRIB_REPO_DIR)/modules $(REPO_DIR)
cmake -GNinja -DVERBOSE_BUILD=ON -DCMAKE_BUILD_TYPE=$(BUILD_TYPE) -DENABLE_PYTHON=ON -DENABLE_TESTS=ON -DENABLE_STRICT_DEPENDENCIES=OFF -DIE_EXTRA_MODULES=$(OPENVINO_CONTRIB_REPO_DIR)/modules $(REPO_DIR)
workingDirectory: $(BUILD_DIR)
displayName: 'CMake'

View File

@ -92,7 +92,7 @@ jobs:
- script: |
set PATH=$(WORK_DIR)\ninja-win;%PATH%
call "$(MSVS_VARS_PATH)" && cmake -GNinja -DENABLE_FASTER_BUILD=ON -DENABLE_TEMPLATE_PLUGIN=ON -DCMAKE_BUILD_TYPE=$(BUILD_TYPE) -DENABLE_TESTS=ON -DIE_EXTRA_MODULES=$(OPENVINO_CONTRIB_REPO_DIR)\modules -DCMAKE_C_COMPILER:PATH="$(MSVC_COMPILER_PATH)" -DCMAKE_CXX_COMPILER:PATH="$(MSVC_COMPILER_PATH)" $(REPO_DIR)
call "$(MSVS_VARS_PATH)" && cmake -GNinja -DENABLE_FASTER_BUILD=ON -DENABLE_TEMPLATE_PLUGIN=ON -DCMAKE_BUILD_TYPE=$(BUILD_TYPE) -DENABLE_TESTS=ON -DENABLE_STRICT_DEPENDENCIES=OFF -DIE_EXTRA_MODULES=$(OPENVINO_CONTRIB_REPO_DIR)\modules -DCMAKE_C_COMPILER:PATH="$(MSVC_COMPILER_PATH)" -DCMAKE_CXX_COMPILER:PATH="$(MSVC_COMPILER_PATH)" $(REPO_DIR)
workingDirectory: $(BUILD_DIR)
displayName: 'CMake'

View File

@ -113,8 +113,8 @@ def buildDockerImage(Map configuration, String workdir) {
--build-arg BUILD_TYPE=${configuration.build_type} \
--build-arg PROTOBUF_LITE=${configuration.protobuf_lite} \
--file=.ci/openvino-onnx/Dockerfile \
--build-arg http_proxy=http://proxy-chain.intel.com:911/ \
--build-arg https_proxy=http://proxy-chain.intel.com:912/ .
--build-arg http_proxy=http://proxy-ir.intel.com:911/ \
--build-arg https_proxy=http://proxy-ir.intel.com:911/ .
"""
}

View File

@ -139,7 +139,7 @@ def update_labels(gh_api, pull, non_org_intel_pr_users, non_org_pr_users):
def get_wrong_commits(pull):
"""Returns commits with incorrect user and email"""
pr_author_email = pull.user.email.lower()
pr_author_email = (pull.user.email or "").lower()
print("GitHub PR author email:", pr_author_email)
print("Check commits:")
wrong_commits = set()
@ -147,7 +147,7 @@ def get_wrong_commits(pull):
# import pprint; pprint.pprint(commit.raw_data)
print("Commit SHA:", commit.sha)
# Use raw data because commit author can be non GitHub user
commit_email = commit.raw_data["commit"]["author"]["email"].lower()
commit_email = (commit.raw_data["commit"]["author"]["email"] or "").lower()
print(" Commit email:", commit_email)
if not github_api.is_valid_user(commit.author):
print(

View File

@ -15,14 +15,17 @@ jobs:
- name: Install dependencies
run: |
sudo apt --assume-yes install libusb-1.0-0-dev
python3 -m pip install --upgrade pip
python3 -m pip install -r ./inference-engine/ie_bridges/python/requirements.txt
# Added for -DENABLE_PYTHON=ON (cython is not required)
python3 -m pip install -r ./inference-engine/ie_bridges/python/src/requirements-dev.txt
# Run cmake with -DENABLE_PROFILING_ITT=ON -DSELECTIVE_BUILD=COLLECT in order to enable codestyle check for ITT collector
- name: CMake
run: |
mkdir build
cd build
cmake -DENABLE_PROFILING_ITT=ON -DSELECTIVE_BUILD=COLLECT ..
cmake -DENABLE_PYTHON=ON -DENABLE_PROFILING_ITT=ON -DSELECTIVE_BUILD=COLLECT ..
- name: Check code style
run: cmake --build build --target clang_format_check_all

View File

@ -6,6 +6,8 @@ ie_dependent_option (ENABLE_MKL_DNN "MKL-DNN plugin for inference engine" ON "X8
ie_option (ENABLE_TESTS "unit, behavior and functional tests" OFF)
ie_option (ENABLE_STRICT_DEPENDENCIES "Skip configuring \"convenient\" dependencies for efficient parallel builds" ON)
ie_dependent_option (ENABLE_CLDNN "clDnn based plugin for inference engine" ON "X86_64;NOT APPLE;NOT MINGW;NOT WINDOWS_STORE;NOT WINDOWS_PHONE" OFF)
ie_option (ENABLE_PROFILING_ITT "Build with ITT tracing. Optionally configure pre-built ittnotify library through INTEL_VTUNE_DIR variable." OFF)
@ -18,8 +20,6 @@ Supported values:\
ie_option (ENABLE_PROFILING_FIRST_INFERENCE "Build with ITT tracing of first inference time." ON)
ie_option (ENABLE_DOCS "Build docs using Doxygen" OFF)
ie_option(ENABLE_TEMPLATE_PLUGIN "Register template plugin into plugins.xml" OFF)
ie_option_enum(SELECTIVE_BUILD "Enable OpenVINO conditional compilation or statistics collection. \
@ -33,6 +33,9 @@ ie_option(ENABLE_ERROR_HIGHLIGHT "Highlight errors and warnings during compile t
find_package(PythonLibs 3 QUIET)
ie_dependent_option (ENABLE_PYTHON "enables ie python bridge build" OFF "PYTHONLIBS_FOUND" OFF)
find_package(PythonInterp 3 QUIET)
ie_dependent_option (ENABLE_DOCS "Build docs using Doxygen" OFF "PYTHONINTERP_FOUND" OFF)
#
# enable or disable output from NGRAPH_DEBUG statements
#

View File

@ -14,6 +14,15 @@ The sections below contain detailed list of changes made to the Inference Engine
* InferenceEngine::Parameter(std::shared_ptr<ngraph::Variant>& var)
* std::shared_ptr<ngraph::Variant> InferenceEngine::Parameter::asVariant() const
* InferenceEngine::Parameter::operator std::shared_ptr<ngraph::Variant>() const
* KEY_CLDNN_NV12_TWO_INPUTS GPU plugin option. Use KEY_GPU_NV12_TWO_INPUTS instead (see the sketch after this list)
* KEY_CLDNN_PLUGIN_PRIORITY GPU plugin option. Use KEY_GPU_PLUGIN_PRIORITY instead
* KEY_CLDNN_PLUGIN_THROTTLE GPU plugin option. Use KEY_GPU_PLUGIN_THROTTLE instead
* KEY_CLDNN_MEM_POOL GPU plugin option
* KEY_CLDNN_GRAPH_DUMPS_DIR GPU plugin option
* KEY_CLDNN_SOURCES_DUMPS_DIR GPU plugin option
* KEY_DUMP_KERNELS GPU plugin option
* KEY_TUNING_MODE GPU plugin option
* KEY_TUNING_FILE GPU plugin option
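The renamed keys map one-to-one onto their `GPU_`-prefixed replacements. Below is a minimal, hedged sketch of switching to `KEY_GPU_NV12_TWO_INPUTS` via the `gpu/gpu_config.hpp` header; the model path is a placeholder and error handling is omitted.
```cpp
#include <ie_core.hpp>
#include <gpu/gpu_config.hpp>  // GPU-prefixed keys that replace the CLDNN-prefixed ones

int main() {
    using namespace InferenceEngine;
    Core ie;
    CNNNetwork network = ie.ReadNetwork("model.xml");  // placeholder path
    // Previously: CLDNNConfigParams::KEY_CLDNN_NV12_TWO_INPUTS
    ExecutableNetwork exec = ie.LoadNetwork(network, "GPU",
        { { GPUConfigParams::KEY_GPU_NV12_TWO_INPUTS, PluginConfigParams::YES } });
    return 0;
}
```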
## 2021.3

View File

@ -219,22 +219,6 @@ __kernel void example_relu_kernel(
## Debugging Tips<a name="debugging-tips"></a>
* **Dumping the Resulting Kernels**.
It is recommended to get a dump of the kernel with all of
the values set by the Inference Engine, such as tensor sizes,
floating-point, and integer kernel parameters. To get the dump, add the
following line to your code that configures the GPU plugin to output the
custom kernels:
@snippet snippets/GPU_Kernel.cpp part1
When the Inference Engine compiles the kernels for the specific network,
it also outputs the resulting code for the custom kernels. In the
directory of your executable, find files like
`clDNN_program0.cl`, `clDNN_program1.cl`. There are as many files as
distinct sets of parameters for your custom kernel: different input
tensor sizes and kernel parameters.
* **Using `printf` in the OpenCL™ Kernels**.
To debug the specific values, you can use `printf` in your kernels.
However, be careful: for instance, do not output excessively

View File

@ -1,39 +0,0 @@
Using GPU Kernels Tuning {#openvino_docs_IE_DG_GPU_Kernels_Tuning}
======================
GPU Kernels Tuning allows you to tune models so that the heavy computational layers are configured to better fit
the hardware on which the tuning was done. It is required to achieve the best performance on GPU.
> **NOTE** Currently, only convolution and fully connected layers undergo the tuning process. This means that the performance boost depends on the number of such layers in the model.
OpenVINO™ releases include the `<INSTALL_DIR>/inference_engine/bin/intel64/Release/cache.json` file with pre-tuned data for current state-of-the-art models. It is highly recommended to do the
tuning for new kinds of models, hardware, or drivers.
## Tuned data
GPU tuning data is saved in JSON format. The file is composed of 2 types of attributes and 1 type of value:
* Execution units number (attribute): splits the content into different EU sections
* Hash (attribute): hashed tuned kernel data
* Key (value): Array with kernel name and kernel's mode index
## Usage
---
You can activate the Kernels Tuning process by setting the `KEY_TUNING_MODE` flag to `TUNING_CREATE` and `KEY_TUNING_FILE` to `<"filename">` in a configuration map that is
passed to the plugin while loading a network.
This configuration modifies the behavior of the `ExecutableNetwork` object. Instead of standard network compilation, it will run the tuning process.
Please keep in mind that the tuning can be very time consuming. The bigger the network, the longer it will take.
The file with tuned data is the result of this step.
> **NOTE** If a filename passed to `KEY_TUNING_FILE` points to existing tuned data and you are tuning a new model, then this file will be extended by new data. This allows you to extend existing `cache.json` provided in the OpenVINO™ release package.
The example below shows how to set and use the key files:
@snippet snippets/GPU_Kernels_Tuning.cpp part0
---
You can activate the inference with tuned data by setting `KEY_TUNING_MODE` flag to `TUNING_USE_EXISTING` and
`KEY_TUNING_FILE` flag to `<"filename">`.
GPU backend will process the content of the file during network compilation to configure the OpenCL kernels for the best performance.
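A minimal sketch of both tuning modes (creating the file and reusing it) is shown below; the tuning file path is a placeholder and the `LoadNetwork` calls are elided:
```cpp
#include <ie_core.hpp>
#include "cldnn/cldnn_config.hpp"

int main() {
    using namespace InferenceEngine;
    Core ie;
    // Create (or extend) the tuning file while compiling networks; this can be very slow
    ie.SetConfig({{ CONFIG_KEY(TUNING_MODE), CONFIG_VALUE(TUNING_CREATE) }}, "GPU");
    ie.SetConfig({{ CONFIG_KEY(TUNING_FILE), "/path/to/tuning/file.json" }}, "GPU");
    // ... LoadNetwork calls here run the tuning process ...

    // Later, reuse the tuned data during regular network compilation
    ie.SetConfig({{ CONFIG_KEY(TUNING_MODE), CONFIG_VALUE(TUNING_USE_EXISTING) }}, "GPU");
    return 0;
}
```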

View File

@ -1,24 +1,29 @@
# Introduction to the Performance Topics {#openvino_docs_IE_DG_Intro_to_Performance}
This section is a shorter version of the
[Optimization Guide](supported_plugins/MULTI.md) for the Intel Deep Learning Deployment Toolkit.
[Optimization Guide](../optimization_guide/dldt_optimization_guide.md) for the Intel® Distribution of OpenVINO™ Toolkit.
## Precision
Inference precision directly affects the performance.
Model Optimizer can produce an IR with different precision. For example, float16 IR initially targets VPU and GPU devices, while, for example, the CPU can also execute regular float32.
Also, further device-specific inference precision settings are available, for example, [8-bit integer](Int8Inference.md) or [bfloat16](Bfloat16Inference.md) inference on the CPU.
Note that for [MULTI device](supported_plugins/MULTI.md) that supports automatic inference on multiple devices in parallel, you can use the FP16 IR.
Model Optimizer can produce an IR with different precision. For example, an FP16 IR initially targets VPU and GPU devices, while for the CPU an FP16 IR is typically up-scaled to the regular FP32 automatically upon loading. Notice that further device-specific inference precision settings are available,
for example, [8-bit integer](Int8Inference.md) or [bfloat16](Bfloat16Inference.md) inference, which are specific to the CPU and described below.
Note that for the [MULTI device](supported_plugins/MULTI.md) plugin that supports automatic inference on multiple devices in parallel, you can use an FP16 IR (no need for FP32).
You can find more information, including preferred data types for specific devices, in the
[Supported Devices](supported_plugins/Supported_Devices.md) section.
[Supported Devices](supported_plugins/Supported_Devices.md) document.
## Lowering Inference Precision
Default optimization is used for CPU and implies that inference is made with lower precision if it is possible on a given platform to reach better performance with acceptable range of accuracy.
This approach can be used for CPU devices where the platform supports the AVX512_BF16 instruction. In this case, a regular float32 model is converted to [bfloat16](Bfloat16Inference.md) internal representation and inference is provided with bfloat16 layers usage.
Below is the example command line to disable this feature on the CPU device with the AVX512_BF16 instruction and execute regular float32.
## Automatic Lowering of the Inference Precision
By default, plugins enable the optimizations that allow lower precision if the acceptable range of accuracy is preserved.
For example, for the CPU that supports the AVX512_BF16 instructions, an FP16/FP32 model is converted to a [bfloat16](Bfloat16Inference.md) IR to accelerate inference.
To compare the associated speedup, run the example command below to disable this feature on the CPU device with the AVX512_BF16 support and get regular FP32 execution:
```
$ benchmark_app -m <model.xml> -enforcebf16=false
```
Notice that for quantized (e.g. INT8) models, the bfloat16 calculations (of the layers that remain in FP32) are disabled by default.
Refer to the [CPU Plugin documentation](supported_plugins/CPU.md) for more details.
Similarly, the GPU device automatically executes FP16 for the layers that remain in FP16 in the quantized models (assuming that the FP16 model was quantized).
Refer to the ENABLE_FP16_FOR_QUANTIZED_MODELS key in the [GPU Plugin documentation](supported_plugins/GPU.md).
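A hedged sketch of toggling this behavior follows; it assumes the `KEY_CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS` key listed in the GPU plugin configuration table, and the model path is a placeholder:
```cpp
#include <ie_core.hpp>
#include "cldnn/cldnn_config.hpp"

int main() {
    using namespace InferenceEngine;
    Core ie;
    CNNNetwork network = ie.ReadNetwork("quantized_model.xml");  // placeholder path
    // Keep the non-quantized layers of an INT8 model in FP32 instead of FP16
    ExecutableNetwork exec = ie.LoadNetwork(network, "GPU",
        { { CLDNNConfigParams::KEY_CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS,
            PluginConfigParams::NO } });
    return 0;
}
```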
## Latency vs. Throughput
One way to increase computational efficiency is batching, which combines many (potentially tens) of
@ -44,17 +49,17 @@ Below is the example command line that limits the execution to the single socket
limited to the single socket).
$ numactl -m 0 --physcpubind 0-27 benchmark_app -m <model.xml> -api sync -nthreads 28
```
Note that if you have more than one input, running as many inference requests as you have NUMA nodes (or sockets)
Note that if you have more than one input, running as many inference streams as you have NUMA nodes (or sockets)
usually gives the same best latency as a single request on the single socket, but much higher throughput. Assuming a machine with two NUMA nodes:
```
$ benchmark_app -m <model.xml> -nstreams 2
```
The number of NUMA nodes on the machine can be queried via 'lscpu'.
Please see more on the NUMA support in the [Optimization Guide](supported_plugins/MULTI.md).
Please see more on the NUMA support in the [Optimization Guide](../optimization_guide/dldt_optimization_guide.md).
## Throughput Mode for CPU
Unlike most accelerators, CPU is perceived as an inherently latency-oriented device.
Since 2018 R5 release, the Inference Engine introduced the "throughput" mode, which allows the Inference Engine to efficiently run multiple inference requests on the CPU simultaneously, greatly improving the throughput.
OpenVINO™ toolkit provides a "throughput" mode that allows running multiple inference requests on the CPU simultaneously, which greatly improves the throughput.
Internally, the execution resources are split/pinned into execution "streams".
Using this feature gains much better performance for the networks that originally do not scale well with the number of threads (for example, lightweight topologies). This is especially pronounced for many-core server machines.
@ -62,8 +67,6 @@ Using this feature gains much better performance for the networks that originall
Run the [Benchmark App](../../inference-engine/samples/benchmark_app/README.md) and play with the number of infer requests running in parallel, as described in the next section.
Try different values of the `-nstreams` argument from `1` to the number of CPU cores and find one that provides the best performance.
In addition to the number of streams, it is also possible to play with the batch size to find the throughput sweet-spot.
The throughput mode relaxes the requirement to saturate the CPU by using a large batch: running multiple independent inference requests in parallel often gives much better performance than using a batch only.
This allows you to simplify the app-logic, as you don't need to combine multiple inputs into a batch to achieve good CPU performance.
Instead, it is possible to keep a separate infer request per camera or another source of input and process the requests in parallel using Async API.
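For reference, a minimal sketch of enabling CPU streams programmatically is shown below (an illustrative example, not taken from this guide); the model path is a placeholder:
```cpp
#include <ie_core.hpp>

int main() {
    using namespace InferenceEngine;
    Core ie;
    CNNNetwork network = ie.ReadNetwork("model.xml");  // placeholder path
    // "AUTO" lets the plugin pick the number of streams; an explicit value such as "4" pins it
    ie.SetConfig({ { PluginConfigParams::KEY_CPU_THROUGHPUT_STREAMS,
                     PluginConfigParams::CPU_THROUGHPUT_AUTO } }, "CPU");
    ExecutableNetwork exec = ie.LoadNetwork(network, "CPU");
    return 0;
}
```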
@ -87,13 +90,3 @@ Try different values of the `-nstreams` argument from `1` to a number of CPU cor
Finally, notice that when you don't specify the number of streams with `-nstreams`, the "AUTO" value for the streams is used, e.g. for the CPU this is [CPU_THROUGHPUT_AUTO](supported_plugins/CPU.md). You can spot the actual value behind "AUTO" for your machine in the application output.
Notice that the "AUTO" number is not necessarily the most optimal, so it is generally recommended to play either with the benchmark_app's "-nstreams" as described above, or via the [new Workbench tool](@ref workbench_docs_Workbench_DG_Introduction). This allows you to simplify the app-logic, as you don't need to combine multiple inputs into a batch to achieve good CPU performance.
Instead, it is possible to keep a separate infer request per camera or another source of input and process the requests in parallel using Async API.
## Kernels Tuning for GPU
The GPU backend comes with a feature that allows model tuning, so the workload is configured to better fit the hardware.
Tuning is a time-consuming process which internally executes every layer several (or even hundreds of) times to find the most performant configuration.
This configuration is saved into a JSON-formatted file, whose name can be passed as a plugin parameter to the network. The GPU backend will process this data to configure kernels for the best performance.
For more details about Kernels Tuning and how to use it, please refer to [GPU Kernels Tuning](GPU_Kernels_Tuning.md).

View File

@ -1,4 +1,4 @@
GPU Plugin {#openvino_docs_IE_DG_supported_plugins_CL_DNN}
GPU Plugin {#openvino_docs_IE_DG_supported_plugins_GPU}
=======
The GPU plugin uses the Intel® Compute Library for Deep Neural Networks (clDNN) to infer deep neural networks.
@ -89,13 +89,10 @@ Some layers are executed during the load time, not during the inference. One of
The following layers are not accelerated on the GPU and executed on the host CPU instead:
* Proposal
* SimplerNMS
* NonMaxSuppression
* PriorBox
* DetectionOutput
## Known Layers Limitations
* ROIPooling is supported for the 'max' value of the 'method' attribute.
## Supported Configuration Parameters
The plugin supports the configuration parameters listed below.
@ -107,31 +104,21 @@ When specifying key values as raw strings (that is, when using Python API), omit
| `KEY_CACHE_DIR` | `"<cache_dir>"` | `""` | Specifies a directory where compiled OCL binaries can be cached. First model loading generates the cache, and all subsequent LoadNetwork calls use precompiled kernels which significantly improves load time. If empty - caching is disabled |
| `KEY_PERF_COUNT` | `YES` / `NO` | `NO` | Collect performance counters during inference |
| `KEY_CONFIG_FILE` | `"<file1> [<file2> ...]"` | `""` | Load custom layer configuration files |
| `KEY_DUMP_KERNELS` | `YES` / `NO` | `NO` | Dump the final kernels used for custom layers |
| `KEY_TUNING_MODE` | `TUNING_DISABLED` <br /> `TUNING_CREATE` <br /> `TUNING_USE_EXISTING` | `TUNING_DISABLED` | Disable inference kernel tuning <br /> Create tuning file (expect much longer runtime) <br /> Use an existing tuning file |
| `KEY_TUNING_FILE` | `"<filename>"` | `""` | Tuning file to create / use |
| `KEY_CLDNN_PLUGIN_PRIORITY` | `<0-3>` | `0` | OpenCL queue priority (before usage, make sure your OpenCL driver supports appropriate extension)<br> Higher value means higher priority for clDNN OpenCL queue. 0 disables the setting. |
| `KEY_CLDNN_PLUGIN_THROTTLE` | `<0-3>` | `0` | OpenCL queue throttling (before usage, make sure your OpenCL driver supports appropriate extension)<br> Lower value means lower driver thread priority and longer sleep time for it. 0 disables the setting. |
| `KEY_CLDNN_GRAPH_DUMPS_DIR` | `"<dump_dir>"` | `""` | clDNN graph optimizer stages dump output directory (in GraphViz format) |
| `KEY_CLDNN_SOURCES_DUMPS_DIR` | `"<dump_dir>"` | `""` | Final optimized clDNN OpenCL sources dump output directory |
| `KEY_GPU_THROUGHPUT_STREAMS` | `KEY_GPU_THROUGHPUT_AUTO`, or positive integer| 1 | Specifies a number of GPU "execution" streams for the throughput mode (upper bound for a number of inference requests that can be executed simultaneously).<br>This option can be used to decrease GPU stall time by providing more effective load from several streams. Increasing the number of streams usually is more effective for smaller topologies or smaller input sizes. Note that your application should provide enough parallel slack (e.g. running many inference requests) to leverage full GPU bandwidth. Additional streams consume several times more GPU memory, so make sure the system has enough memory available to suit parallel stream execution. Multiple streams might also put additional load on CPU. If CPU load increases, it can be regulated by setting an appropriate `KEY_CLDNN_PLUGIN_THROTTLE` option value (see above). If your target system has relatively weak CPU, keep throttling low. <br>The default value is 1, which implies latency-oriented behavior.<br>`KEY_GPU_THROUGHPUT_AUTO` creates bare minimum of streams to improve the performance; this is the most portable option if you are not sure how many resources your target machine has (and what would be the optimal number of streams). <br> A positive integer value creates the requested number of streams. |
| `KEY_GPU_PLUGIN_PRIORITY` | `<0-3>` | `0` | OpenCL queue priority (before usage, make sure your OpenCL driver supports appropriate extension)<br> Higher value means higher priority for OpenCL queue. 0 disables the setting. |
| `KEY_GPU_PLUGIN_THROTTLE` | `<0-3>` | `0` | OpenCL queue throttling (before usage, make sure your OpenCL driver supports appropriate extension)<br> Lower value means lower driver thread priority and longer sleep time for it. 0 disables the setting. |
| `KEY_CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS` | `YES` / `NO` | `YES` | Allows using FP16+INT8 mixed precision mode, so non-quantized parts of a model will be executed in FP16 precision for FP16 IR. Does not affect quantized FP32 IRs |
| `KEY_GPU_NV12_TWO_INPUTS` | `YES` / `NO` | `NO` | Controls preprocessing logic for NV12 input. If set to YES, the device graph will expect that the user sets a biplanar NV12 blob as input, which will be directly passed to the device execution graph. Otherwise, preprocessing via G-API is used to convert NV12->BGR, so the GPU graph has to expect a single input |
| `KEY_GPU_THROUGHPUT_STREAMS` | `KEY_GPU_THROUGHPUT_AUTO`, or positive integer| 1 | Specifies a number of GPU "execution" streams for the throughput mode (upper bound for a number of inference requests that can be executed simultaneously).<br>This option can be used to decrease GPU stall time by providing more effective load from several streams. Increasing the number of streams usually is more effective for smaller topologies or smaller input sizes. Note that your application should provide enough parallel slack (e.g. running many inference requests) to leverage full GPU bandwidth. Additional streams consume several times more GPU memory, so make sure the system has enough memory available to suit parallel stream execution. Multiple streams might also put additional load on CPU. If CPU load increases, it can be regulated by setting an appropriate `KEY_GPU_PLUGIN_THROTTLE` option value (see above). If your target system has relatively weak CPU, keep throttling low. <br>The default value is 1, which implies latency-oriented behavior.<br>`KEY_GPU_THROUGHPUT_AUTO` creates bare minimum of streams to improve the performance; this is the most portable option if you are not sure how many resources your target machine has (and what would be the optimal number of streams). <br> A positive integer value creates the requested number of streams. |
| `KEY_EXCLUSIVE_ASYNC_REQUESTS` | `YES` / `NO` | `NO` | Forces async requests (also from different executable networks) to execute serially.|
| `KEY_CLDNN_MAX_NUM_THREADS` | `integer value` | `maximum # of HW threads available in host environment` | Specifies the number of CPU threads that can be used for clDNN engine, e.g, JIT compilation of clDNN kernels or clDNN cpu kernel processing. The default value is set as the number of maximum available threads in host environment to minimize the time for LoadNetwork, where the clDNN kernel build time occupies a large portion. Note that if the specified value is larger than the maximum available # of threads or less than zero, it is set as maximum available # of threads. It can be specified with a smaller number than the available HW threads according to the usage scenario, e.g., when the user wants to assign more CPU threads while clDNN plugin is running. Note that setting this value with lower number will affect not only the network loading time but also the cpu layers of clDNN networks that are optimized with multi-threading. |
| `KEY_CLDNN_ENABLE_LOOP_UNROLLING` | `YES` / `NO` | `YES` | Enables recurrent layers such as TensorIterator or Loop with fixed iteration count to be unrolled. It is turned on by default. Turning this key on will achieve better inference performance for loops with not too many iteration counts (less than 16, as a rule of thumb). Turning this key off will achieve better performance for both graph loading time and inference time with many iteration counts (greater than 16). Note that turning this key on will increase the graph loading time in proportion to the iteration counts. Thus, this key should be turned off if graph loading time is considered to be most important target to optimize. |
## Note on Debug Capabilities of the GPU Plugin
The Inference Engine GPU plugin provides the possibility to dump user custom OpenCL&trade; kernels to a file, which allows you to properly debug compilation issues in your custom kernels.
The application can use the <code>SetConfig()</code> function with the key <code>PluginConfigParams::KEY_DUMP_KERNELS</code> and value: <code>PluginConfigParams::YES</code>. Then during network loading, all custom layers will print their OpenCL kernels with the JIT instrumentation added by the plugin.
The kernels will be stored in the working directory under files named the following way: <code>clDNN_program0.cl</code>, <code>clDNN_program1.cl</code>.
This option is disabled by default. Additionally, the application can call the <code>SetConfig()</code> function with the key <code>PluginConfigParams::KEY_DUMP_KERNELS</code> and value: <code>PluginConfigParams::NO</code> before network loading.
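A minimal sketch of the call described above (the configuration must be applied before `LoadNetwork`):
```cpp
#include <ie_core.hpp>

int main() {
    using namespace InferenceEngine;
    Core core;
    // Ask the GPU plugin to dump custom-layer kernels (clDNN_program*.cl) during network loading
    core.SetConfig({ { PluginConfigParams::KEY_DUMP_KERNELS, PluginConfigParams::YES } }, "GPU");
    // ... core.LoadNetwork(...) afterwards; the dump happens while the network is loaded ...
    return 0;
}
```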
How to verify that this option is disabled:
1. Delete all <code>clDNN_program*.cl</code> files from the current directory
2. Run your application to load a network
3. Examine the working directory for the presence of any kernel file (for example, <code>clDNN_program0.cl</code>)
| `KEY_GPU_MAX_NUM_THREADS` | `integer value` | `maximum # of HW threads available in host environment` | Specifies the number of CPU threads that can be used for GPU engine, e.g, JIT compilation of GPU kernels or cpu kernel processing within GPU plugin. The default value is set as the number of maximum available threads in host environment to minimize the time for LoadNetwork, where the GPU kernel build time occupies a large portion. Note that if the specified value is larger than the maximum available # of threads or less than zero, it is set as maximum available # of threads. It can be specified with a smaller number than the available HW threads according to the usage scenario, e.g., when the user wants to assign more CPU threads while GPU plugin is running. Note that setting this value with lower number will affect not only the network loading time but also the cpu layers of GPU networks that are optimized with multi-threading. |
| `KEY_GPU_ENABLE_LOOP_UNROLLING` | `YES` / `NO` | `YES` | Enables recurrent layers such as TensorIterator or Loop with fixed iteration count to be unrolled. It is turned on by default. Turning this key on will achieve better inference performance for loops with not too many iteration counts (less than 16, as a rule of thumb). Turning this key off will achieve better performance for both graph loading time and inference time with many iteration counts (greater than 16). Note that turning this key on will increase the graph loading time in proportion to the iteration counts. Thus, this key should be turned off if graph loading time is considered to be most important target to optimize. |
| `KEY_CLDNN_PLUGIN_PRIORITY` | `<0-3>` | `0` | OpenCL queue priority (before usage, make sure your OpenCL driver supports appropriate extension)<br> Higher value means higher priority for OpenCL queue. 0 disables the setting. **Deprecated**. Please use KEY_GPU_PLUGIN_PRIORITY |
| `KEY_CLDNN_PLUGIN_THROTTLE` | `<0-3>` | `0` | OpenCL queue throttling (before usage, make sure your OpenCL driver supports appropriate extension)<br> Lower value means lower driver thread priority and longer sleep time for it. 0 disables the setting. **Deprecated**. Please use KEY_GPU_PLUGIN_THROTTLE |
| `KEY_CLDNN_GRAPH_DUMPS_DIR` | `"<dump_dir>"` | `""` | clDNN graph optimizer stages dump output directory (in GraphViz format) **Deprecated**. Will be removed in the next release |
| `KEY_CLDNN_SOURCES_DUMPS_DIR` | `"<dump_dir>"` | `""` | Final optimized clDNN OpenCL sources dump output directory. **Deprecated**. Will be removed in the next release |
| `KEY_DUMP_KERNELS` | `YES` / `NO` | `NO` | Dump the final kernels used for custom layers. **Deprecated**. Will be removed in the next release |
| `KEY_TUNING_MODE` | `TUNING_DISABLED` <br /> `TUNING_CREATE` <br /> `TUNING_USE_EXISTING` | `TUNING_DISABLED` | Disable inference kernel tuning <br /> Create tuning file (expect much longer runtime) <br /> Use an existing tuning file. **Deprecated**. Will be removed in the next release |
| `KEY_TUNING_FILE` | `"<filename>"` | `""` | Tuning file to create / use. **Deprecated**. Will be removed in the next release |
## GPU Context and Video Memory Sharing RemoteBlob API

View File

@ -9,7 +9,7 @@ The Inference Engine provides unique capabilities to infer deep learning models
| Plugin | Device types |
|------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------|
|[GPU plugin](CL_DNN.md) |Intel&reg; Processor Graphics, including Intel&reg; HD Graphics and Intel&reg; Iris&reg; Graphics |
|[GPU plugin](GPU.md) |Intel&reg; Processor Graphics, including Intel&reg; HD Graphics and Intel&reg; Iris&reg; Graphics |
|[CPU plugin](CPU.md) |Intel&reg; Xeon&reg; with Intel® Advanced Vector Extensions 2 (Intel® AVX2), Intel® Advanced Vector Extensions 512 (Intel® AVX-512), and AVX512_BF16, Intel&reg; Core&trade; Processors with Intel&reg; AVX2, Intel&reg; Atom&reg; Processors with Intel® Streaming SIMD Extensions (Intel® SSE) |
|[VPU plugins](VPU.md) (available in the Intel® Distribution of OpenVINO™ toolkit) |Intel® Neural Compute Stick 2 powered by the Intel® Movidius™ Myriad™ X, Intel® Vision Accelerator Design with Intel® Movidius™ VPUs |
|[GNA plugin](GNA.md) (available in the Intel® Distribution of OpenVINO™ toolkit) |Intel&reg; Speech Enabling Developer Kit, Amazon Alexa* Premium Far-Field Developer Kit, Intel&reg; Pentium&reg; Silver J5005 Processor, Intel&reg; Pentium&reg; Silver N5000 Processor, Intel&reg; Celeron&reg; J4005 Processor, Intel&reg; Celeron&reg; J4105 Processor, Intel&reg; Celeron&reg; Processor N4100, Intel&reg; Celeron&reg; Processor N4000, Intel&reg; Core&trade; i3-8121U Processor, Intel&reg; Core&trade; i7-1065G7 Processor, Intel&reg; Core&trade; i7-1060G7 Processor, Intel&reg; Core&trade; i5-1035G4 Processor, Intel&reg; Core&trade; i5-1035G7 Processor, Intel&reg; Core&trade; i5-1035G1 Processor, Intel&reg; Core&trade; i5-1030G7 Processor, Intel&reg; Core&trade; i5-1030G4 Processor, Intel&reg; Core&trade; i3-1005G1 Processor, Intel&reg; Core&trade; i3-1000G1 Processor, Intel&reg; Core&trade; i3-1000G4 Processor|

View File

@ -49,20 +49,15 @@ The function accepts a const shared pointer to `ngraph::Function` object and per
This constructor creates a backend specific graph by importing from a stream object:
> **NOTE**: The export of backend specific graph is done in the `ExportImpl` method, and data formats must be the same for both import and export.
> **NOTE**: The export of backend specific graph is done in the `Export` method, and data formats must be the same for both import and export.
@snippet src/template_executable_network.cpp executable_network:ctor_import_stream
### `ExportImpl()`
**Implementation details:**
Base InferenceEngine::ExecutableNetworkThreadSafeDefault class implements the public InferenceEngine::ExecutableNetworkThreadSafeDefault::Export method as follows:
- Writes `_plugin->GetName()` to the `model` stream.
- Calls the `ExportImpl` method defined in a derived class to dump a backend specific graph.
### `Export()`
The implementation of the method should write all data to the `model` stream, which is required to import a backend specific graph later in the `Plugin::Import` method:
@snippet src/template_executable_network.cpp executable_network:export_impl
@snippet src/template_executable_network.cpp executable_network:export
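A minimal, hypothetical sketch of such an `Export` body follows; the size-prefixed byte blob and the `CompiledBlob` type are assumptions for illustration, since the only real contract is that the matching import code can parse whatever is written here:
```cpp
#include <cstdint>
#include <ostream>
#include <vector>

// Hypothetical container for a compiled backend-specific graph
struct CompiledBlob {
    std::vector<char> bytes;
};

// Write a size header followed by the raw blob so the importer knows how much to read back
void ExportCompiledGraph(const CompiledBlob& blob, std::ostream& modelStream) {
    const std::uint64_t size = blob.bytes.size();
    modelStream.write(reinterpret_cast<const char*>(&size), sizeof(size));
    modelStream.write(blob.bytes.data(), static_cast<std::streamsize>(size));
}
```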
### `CreateInferRequest()`

View File

@ -159,21 +159,13 @@ The snippet below provides an example of the implementation for `GetMetric`:
> **NOTE**: If an unsupported metric key is passed to the function, it must throw an exception.
### `ImportNetworkImpl()`
### `ImportNetwork()`
The importing network mechanism allows you to import a previously exported backend specific graph and wrap it
using an [ExecutableNetwork](@ref executable_network) object. This functionality is useful if
backend specific graph compilation takes significant time and/or cannot be done on a target host
device due to other reasons.
**Implementation details:** The base plugin class InferenceEngine::IInferencePlugin implements InferenceEngine::IInferencePlugin::ImportNetwork
as follows: exports a device type (InferenceEngine::IInferencePlugin::_pluginName) and then calls `ImportNetworkImpl`,
which is implemented in a derived class.
If a plugin cannot use the base implementation InferenceEngine::IInferencePlugin::ImportNetwork, it can override base
implementation and define an output blob structure up to its needs. This
can be useful if a plugin exports a blob in a special format for integration with other frameworks
where a common Inference Engine header from a base class implementation is not appropriate.
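A hypothetical counterpart to the export sketch shown earlier: it reads back a size-prefixed blob from the stream; wrapping the result into an `ExecutableNetwork` is left out because it is backend specific.
```cpp
#include <cstddef>
#include <cstdint>
#include <istream>
#include <vector>

// Read the size header written by the matching export code, then the raw blob bytes
std::vector<char> ImportCompiledBlob(std::istream& modelStream) {
    std::uint64_t size = 0;
    modelStream.read(reinterpret_cast<char*>(&size), sizeof(size));
    std::vector<char> bytes(static_cast<std::size_t>(size));
    modelStream.read(bytes.data(), static_cast<std::streamsize>(size));
    return bytes;
}
```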
During export of backend specific graph using `ExecutableNetwork::Export`, a plugin may export any
type of information it needs to import a compiled graph properly and check its correctness.
For example, the export information may include:

View File

@ -628,3 +628,15 @@ It means that you trying to convert the topology which contains '_contrib_box_nm
</script>
\endhtmlonly
#### 103. What does the message "ModelOptimizer is not able to parse *.caffemodel" mean? <a name="question-103"></a>
If a '*.caffemodel' file exists and is correct, the error possibly occurred due to the use of the Python protobuf implementation. In some cases, it shows an error message during model parsing, for example: "'utf-8' codec can't decode byte 0xe0 in position 4: invalid continuation byte in field: mo_caffe.SpatialTransformerParameter.transform_type". You can either use Python 3.6/3.7 or build the 'cpp' implementation of protobuf yourself for your version of Python. For the complete instructions about building `protobuf` from sources, see the appropriate section in [Converting a Model to Intermediate Representation](Config_Model_Optimizer.md).
#### 104. What does the message "SyntaxError: 'yield' inside list comprehension" during MxNet\* model conversion mean? <a name="question-104"></a>
The issue "SyntaxError: 'yield' inside list comprehension" might occur during converting MXNet\* models (mobilefacedet-v1-mxnet, brain-tumor-segmentation-0001) on Windows* platform with Python* 3.8 environment. This issue is caused by API changes for `yield expression` in Python 3.8.
The following workarounds are suggested to resolve this issue:
1. Use Python 3.6/3.7 to convert MXNet\* models on Windows
2. Update MXNet: pip install mxnet==1.7.0.post2
Note that you might have conflicts between previously installed PyPI dependencies.

View File

@ -88,6 +88,7 @@ limitations under the License.
<!-- Intermediate Representation and Operations Sets -->
<tab id="intermediate_representaton_and_operations_sets" type="usergroup" title="Intermediate Representation and Operations Sets" url="@ref openvino_docs_MO_DG_IR_and_opsets">
<tab type="usergroup" title="Available Operations Sets" url="@ref openvino_docs_ops_opset">
<tab type="user" title="opset8 Specification" url="@ref openvino_docs_ops_opset8"/>
<tab type="user" title="opset7 Specification" url="@ref openvino_docs_ops_opset7"/>
<tab type="user" title="opset6 Specification" url="@ref openvino_docs_ops_opset6"/>
<tab type="user" title="opset5 Specification" url="@ref openvino_docs_ops_opset5"/>
@ -100,6 +101,8 @@ limitations under the License.
<tab type="user" title="Abs-1" url="@ref openvino_docs_ops_arithmetic_Abs_1"/>
<tab type="user" title="Acos-1" url="@ref openvino_docs_ops_arithmetic_Acos_1"/>
<tab type="user" title="Acosh-3" url="@ref openvino_docs_ops_arithmetic_Acosh_3"/>
<tab type="user" title="AdaptiveAvgPool-8" url="@ref openvino_docs_ops_pooling_AdaptiveAvgPool_8"/>
<tab type="user" title="AdaptiveMaxPool-8" url="@ref openvino_docs_ops_pooling_AdaptiveMaxPool_8"/>
<tab type="user" title="Add-1" url="@ref openvino_docs_ops_arithmetic_Add_1"/>
<tab type="user" title="Asin-1" url="@ref openvino_docs_ops_arithmetic_Asin_1"/>
<tab type="user" title="Asinh-3" url="@ref openvino_docs_ops_arithmetic_Asinh_3"/>
@ -293,7 +296,6 @@ limitations under the License.
<tab type="user" title="[DEPRECATED] Import an ONNX model" url="@ref openvino_docs_IE_DG_OnnxImporterTutorial"/>
<tab type="user" title="Using Dynamic Batching Feature" url="@ref openvino_docs_IE_DG_DynamicBatching"/>
<tab type="user" title="Using Static Shape Infer Feature" url="@ref openvino_docs_IE_DG_ShapeInference"/>
<tab type="user" title="Using GPU kernels tuning" url="@ref openvino_docs_IE_DG_GPU_Kernels_Tuning"/>
<tab type="usergroup" title="Using Bfloat16 Inference" url="@ref openvino_docs_IE_DG_Bfloat16Inference">
</tab>
<tab type="usergroup" title="Using Low-Precision 8-bit Integer Inference" url="@ref openvino_docs_IE_DG_Int8Inference">
@ -303,7 +305,7 @@ limitations under the License.
</tab>
<tab type="user" title="Introduction to OpenVINO state API" url="@ref openvino_docs_IE_DG_network_state_intro"/>
<tab type="usergroup" title="Supported Devices" url="@ref openvino_docs_IE_DG_supported_plugins_Supported_Devices">
<tab type="usergroup" title="GPU Plugin" url="@ref openvino_docs_IE_DG_supported_plugins_CL_DNN">
<tab type="usergroup" title="GPU Plugin" url="@ref openvino_docs_IE_DG_supported_plugins_GPU">
<tab type="user" title="RemoteBlob API of GPU Plugin" url="@ref openvino_docs_IE_DG_supported_plugins_GPU_RemoteBlob_API"/>
</tab>
<tab type="user" title="CPU Plugin" url="@ref openvino_docs_IE_DG_supported_plugins_CPU"/>

View File

@ -2,7 +2,7 @@
This guide provides installation steps for Intel® Distribution of OpenVINO™ toolkit for Linux* distributed through the APT repository.
> **IMPORTANT**: By downloading and using this container and the included software, you agree to the terms and conditions of the [software license agreements](https://software.intel.com/en-us/license/eula-for-intel-software-development-products). Please, review the content inside the `<openvino_install_root>/licensing` folder for more details.
> **IMPORTANT**: By downloading and using this container and the included software, you agree to the terms and conditions of the [software license agreements](https://software.intel.com/content/dam/develop/external/us/en/documents/intel-openvino-license-agreements.pdf). Please, review the content inside the `<openvino_install_root>/licensing` folder for more details.
> **NOTE**: Intel® Graphics Compute Runtime for OpenCL™ is not a part of OpenVINO™ APT distribution. You can install it from the [Intel® Graphics Compute Runtime for OpenCL™ GitHub repo](https://github.com/intel/compute-runtime).

View File

@ -248,8 +248,8 @@ Or proceed to the <a href="#get-started">Get Started</a> to get started with run
> **NOTE**: These steps are required only if you want to use an Intel® integrated GPU.
If your applications offload computation to **Intel® Integrated Graphics**, you must have the latest version of Intel Graphics Driver for Windows installed for your hardware.
[Download and install a higher version](http://downloadcenter.intel.com/product/80939/Graphics-Drivers).
If your applications offload computation to **Intel® Integrated Graphics**, you must have the Intel Graphics Driver for Windows installed for your hardware.
[Download and install the recommended version](https://downloadcenter.intel.com/download/30079/Intel-Graphics-Windows-10-DCH-Drivers).
To check if you have this driver installed:
@ -265,8 +265,6 @@ To check if you have this driver installed:
![](../img/DeviceDriverVersion.PNG)
> **NOTE**: To use the **Intel® Iris® Xe MAX Graphics**, see the [Drivers & Software](https://downloadcenter.intel.com/download/29993/Intel-Iris-Xe-MAX-Dedicated-Graphics-Drivers?product=80939) page for driver downloads and installation instructions.
You are done updating your device driver and are ready to use your GPU. Proceed to the <a href="#get-started">Get Started</a> to get started with running code samples and demo applications.
### <a name="hddl-myriad"></a> Optional: Additional Installation Steps for the Intel® Vision Accelerator Design with Intel® Movidius™ VPUs

View File

@ -2,7 +2,7 @@
This guide provides installation steps for the Intel® Distribution of OpenVINO™ toolkit for Linux* distributed through the YUM repository.
> **IMPORTANT**: By downloading and using this container and the included software, you agree to the terms and conditions of the [software license agreements](https://software.intel.com/en-us/license/eula-for-intel-software-development-products). Please, review the content inside the `<openvino_install_root>/licensing` folder for more details.
> **IMPORTANT**: By downloading and using this container and the included software, you agree to the terms and conditions of the [software license agreements](https://software.intel.com/content/dam/develop/external/us/en/documents/intel-openvino-license-agreements.pdf). Please, review the content inside the `<openvino_install_root>/licensing` folder for more details.
> **NOTE**: Intel® Graphics Compute Runtime for OpenCL™ is not a part of OpenVINO™ YUM distribution. You can install it from the [Intel® Graphics Compute Runtime for OpenCL™ GitHub repo](https://github.com/intel/compute-runtime).

View File

@ -18,8 +18,8 @@ Review the [Architecture Concept](https://github.com/openvinotoolkit/model_serve
A few key features:
- Support for multiple frameworks. Serve models trained in popular formats such as Caffe\*, TensorFlow\*, MXNet\*, and ONNX*.
- Deploy new [model versions](https://github.com/openvinotoolkit/model_server/blob/main/docs/docker_container.md#model-version-policy) without changing client code.
- Support for AI accelerators including [Intel Movidius Myriad VPUs](../IE_DG/supported_plugins/VPU),
[GPU](../IE_DG/supported_plugins/CL_DNN), and [HDDL](../IE_DG/supported_plugins/HDDL).
- Support for AI accelerators including [Intel Movidius Myriad VPUs](../IE_DG/supported_plugins/VPU.md),
[GPU](../IE_DG/supported_plugins/GPU.md), and [HDDL](../IE_DG/supported_plugins/HDDL.md).
- The server can be enabled both on [Bare Metal Hosts](https://github.com/openvinotoolkit/model_server/blob/main/docs/host.md) or in
[Docker* containers](https://github.com/openvinotoolkit/model_server/blob/main/docs/docker_container.md).
- [Kubernetes deployments](https://github.com/openvinotoolkit/model_server/blob/main/deploy). The server can be deployed in a Kubernetes cluster allowing the inference service to scale horizontally and ensure high availability.

View File

@ -6,28 +6,28 @@
**Short description**: *Log* performs an element-wise natural logarithm operation on a given tensor.
**Detailed description**: *Log* does the following with the input tensor *a*:
\f[
a_{i} = log(a_{i})
\f]
**Attributes**:
No attributes available.
**Inputs**
* **1**: A tensor of type T. **Required.**
* **1**: A tensor of type T and arbitrary shape. **Required.**
**Outputs**
* **1**: The result of element-wise log operation. A tensor of type T.
* **1**: The result of element-wise log operation. A tensor of type T and the same shape as input.
**Types**
* *T*: any numeric type.
*Log* does the following with the input tensor *a*:
\f[
a_{i} = log(a_{i})
\f]
**Examples**
*Example 1*

docs/ops/opset8.md (new file, 169 lines)
View File

@ -0,0 +1,169 @@
# Operation Set `opset8` Specification {#openvino_docs_ops_opset8}
This specification document describes the `opset8` operation set supported in OpenVINO™.
Support for each particular operation from the list below depends on the capabilities of an inference plugin
and may vary among different hardware platforms and devices. Examples of operation instances are provided as IR V10 xml
snippets. Such IR is generated by the Model Optimizer. The semantics match corresponding nGraph operation classes
declared in `namespace opset8`.
## Table of Contents <a name="toc"></a>
* [Abs](arithmetic/Abs_1.md)
* [Acos](arithmetic/Acos_1.md)
* [Acosh](arithmetic/Acosh_3.md)
* [AdaptiveAvgPool](pooling/AdaptiveAvgPool_8.md)
* [AdaptiveMaxPool](pooling/AdaptiveMaxPool_8.md)
* [Add](arithmetic/Add_1.md)
* [Asin](arithmetic/Asin_1.md)
* [Asinh](arithmetic/Asinh_3.md)
* [Assign](infrastructure/Assign_3.md)
* [Atan](arithmetic/Atan_1.md)
* [Atanh](arithmetic/Atanh_3.md)
* [AvgPool](pooling/AvgPool_1.md)
* [BatchNormInference](normalization/BatchNormInference_5.md)
* [BatchToSpace](movement/BatchToSpace_2.md)
* [BinaryConvolution](convolution/BinaryConvolution_1.md)
* [Broadcast](movement/Broadcast_3.md)
* [Bucketize](condition/Bucketize_3.md)
* [CTCGreedyDecoder](sequence/CTCGreedyDecoder_1.md)
* [CTCGreedyDecoderSeqLen](sequence/CTCGreedyDecoderSeqLen_6.md)
* [CTCLoss](sequence/CTCLoss_4.md)
* [Ceiling](arithmetic/Ceiling_1.md)
* [Clamp](activation/Clamp_1.md)
* [Concat](movement/Concat_1.md)
* [Constant](infrastructure/Constant_1.md)
* [Convert](type/Convert_1.md)
* [ConvertLike](type/ConvertLike_1.md)
* [Convolution](convolution/Convolution_1.md)
* [ConvolutionBackpropData](convolution/ConvolutionBackpropData_1.md)
* [Cos](arithmetic/Cos_1.md)
* [Cosh](arithmetic/Cosh_1.md)
* [CumSum](arithmetic/CumSum_3.md)
* [DeformableConvolution](convolution/DeformableConvolution_1.md)
* [DeformablePSROIPooling](detection/DeformablePSROIPooling_1.md)
* [DepthToSpace](movement/DepthToSpace_1.md)
* [DetectionOutput](detection/DetectionOutput_1.md)
* [DFT](signals/DFT_7.md)
* [Divide](arithmetic/Divide_1.md)
* [Einsum](matrix/Einsum_7.md)
* [Elu](activation/Elu_1.md)
* [EmbeddingBagOffsetsSum](sparse/EmbeddingBagOffsetsSum_3.md)
* [EmbeddingBagPackedSum](sparse/EmbeddingBagPackedSum_3.md)
* [EmbeddingSegmentsSum](sparse/EmbeddingSegmentsSum_3.md)
* [Equal](comparison/Equal_1.md)
* [Erf](arithmetic/Erf_1.md)
* [Exp](activation/Exp_1.md)
* [ExperimentalDetectronDetectionOutput_6](detection/ExperimentalDetectronDetectionOutput_6.md)
* [ExperimentalDetectronGenerateProposalsSingleImage_6](detection/ExperimentalDetectronGenerateProposalsSingleImage_6.md)
* [ExperimentalDetectronPriorGridGenerator_6](detection/ExperimentalDetectronPriorGridGenerator_6.md)
* [ExperimentalDetectronROIFeatureExtractor_6](detection/ExperimentalDetectronROIFeatureExtractor_6.md)
* [ExperimentalDetectronTopKROIs_6](sort/ExperimentalDetectronTopKROIs_6.md)
* [ExtractImagePatches](movement/ExtractImagePatches_3.md)
* [FakeQuantize](quantization/FakeQuantize_1.md)
* [Floor](arithmetic/Floor_1.md)
* [FloorMod](arithmetic/FloorMod_1.md)
* [Gather](movement/Gather_7.md)
* [GatherElements](movement/GatherElements_6.md)
* [GatherND_5](movement/GatherND_5.md)
* [GatherTree](movement/GatherTree_1.md)
* [Gelu](activation/GELU_7.md)
* [Greater](comparison/Greater_1.md)
* [GreaterEqual](comparison/GreaterEqual_1.md)
* [GRN](normalization/GRN_1.md)
* [GroupConvolution](convolution/GroupConvolution_1.md)
* [GroupConvolutionBackpropData](convolution/GroupConvolutionBackpropData_1.md)
* [GRUCell](sequence/GRUCell_3.md)
* [GRUSequence](sequence/GRUSequence_5.md)
* [HardSigmoid](activation/HardSigmoid_1.md)
* [HSigmoid](activation/HSigmoid_5.md)
* [HSwish](activation/HSwish_4.md)
* [IDFT](signals/IDFT_7.md)
* [Interpolate](image/Interpolate_4.md)
* [Less](comparison/Less_1.md)
* [LessEqual](comparison/LessEqual_1.md)
* [Log](arithmetic/Log_1.md)
* [LogicalAnd](logical/LogicalAnd_1.md)
* [LogicalNot](logical/LogicalNot_1.md)
* [LogicalOr](logical/LogicalOr_1.md)
* [LogicalXor](logical/LogicalXor_1.md)
* [LogSoftmax](activation/LogSoftmax_5.md)
* [Loop](infrastructure/Loop_5.md)
* [LRN](normalization/LRN_1.md)
* [LSTMCell](sequence/LSTMCell_1.md)
* [LSTMSequence](sequence/LSTMSequence_1.md)
* [MatMul](matrix/MatMul_1.md)
* [MaxPool](pooling/MaxPool_1.md)
* [Maximum](arithmetic/Maximum_1.md)
* [Minimum](arithmetic/Minimum_1.md)
* [Mish](activation/Mish_4.md)
* [Mod](arithmetic/Mod_1.md)
* [MVN](normalization/MVN_6.md)
* [Multiply](arithmetic/Multiply_1.md)
* [Negative](arithmetic/Negative_1.md)
* [NonMaxSuppression](sort/NonMaxSuppression_5.md)
* [NonZero](condition/NonZero_3.md)
* [NormalizeL2](normalization/NormalizeL2_1.md)
* [NotEqual](comparison/NotEqual_1.md)
* [OneHot](sequence/OneHot_1.md)
* [Pad](movement/Pad_1.md)
* [Parameter](infrastructure/Parameter_1.md)
* [Power](arithmetic/Power_1.md)
* [PReLU](activation/PReLU_1.md)
* [PriorBoxClustered](detection/PriorBoxClustered_1.md)
* [PriorBox](detection/PriorBox_1.md)
* [Proposal](detection/Proposal_4.md)
* [PSROIPooling](detection/PSROIPooling_1.md)
* [Range](generation/Range_4.md)
* [ReLU](activation/ReLU_1.md)
* [ReadValue](infrastructure/ReadValue_3.md)
* [ReduceL1](reduction/ReduceL1_4.md)
* [ReduceL2](reduction/ReduceL2_4.md)
* [ReduceLogicalAnd](reduction/ReduceLogicalAnd_1.md)
* [ReduceLogicalOr](reduction/ReduceLogicalOr_1.md)
* [ReduceMax](reduction/ReduceMax_1.md)
* [ReduceMean](reduction/ReduceMean_1.md)
* [ReduceMin](reduction/ReduceMin_1.md)
* [ReduceProd](reduction/ReduceProd_1.md)
* [ReduceSum](reduction/ReduceSum_1.md)
* [RegionYolo](detection/RegionYolo_1.md)
* [ReorgYolo](detection/ReorgYolo_1.md)
* [Reshape](shape/Reshape_1.md)
* [Result](infrastructure/Result_1.md)
* [ReverseSequence](movement/ReverseSequence_1.md)
* [RNNCell](sequence/RNNCell_3.md)
* [RNNSequence](sequence/RNNSequence_5.md)
* [ROIAlign](detection/ROIAlign_3.md)
* [ROIPooling](detection/ROIPooling_1.md)
* [Roll](movement/Roll_7.md)
* [Round](arithmetic/Round_5.md)
* [ScatterElementsUpdate](movement/ScatterElementsUpdate_3.md)
* [ScatterNDUpdate](movement/ScatterNDUpdate_3.md)
* [ScatterUpdate](movement/ScatterUpdate_3.md)
* [Select](condition/Select_1.md)
* [Selu](activation/Selu_1.md)
* [ShapeOf](shape/ShapeOf_3.md)
* [ShuffleChannels](movement/ShuffleChannels_1.md)
* [Sigmoid](activation/Sigmoid_1.md)
* [Sign](arithmetic/Sign_1.md)
* [Sin](arithmetic/Sin_1.md)
* [Sinh](arithmetic/Sinh_1.md)
* [SoftMax](activation/SoftMax_1.md)
* [SoftPlus](activation/SoftPlus_4.md)
* [SpaceToBatch](movement/SpaceToBatch_2.md)
* [SpaceToDepth](movement/SpaceToDepth_1.md)
* [Split](movement/Split_1.md)
* [Sqrt](arithmetic/Sqrt_1.md)
* [SquaredDifference](arithmetic/SquaredDifference_1.md)
* [Squeeze](shape/Squeeze_1.md)
* [StridedSlice](movement/StridedSlice_1.md)
* [Subtract](arithmetic/Subtract_1.md)
* [Swish](activation/Swish_4.md)
* [Tan](arithmetic/Tan_1.md)
* [Tanh](arithmetic/Tanh_1.md)
* [TensorIterator](infrastructure/TensorIterator_1.md)
* [Tile](movement/Tile_1.md)
* [TopK](sort/TopK_3.md)
* [Transpose](movement/Transpose_1.md)
* [Unsqueeze](shape/Unsqueeze_1.md)
* [VariadicSplit](movement/VariadicSplit_1.md)

View File

@ -0,0 +1,70 @@
## AdaptiveAvgPool<a name="AdaptiveAvgPool"></a> {#openvino_docs_ops_pooling_AdaptiveAvgPool_8}
**Versioned name**: *AdaptiveAvgPool-8*
**Category**: *Pooling*
**Short description**: Applies average pooling with adaptive kernel size over the input.
**Detailed description**: This operation calculates the output based on the first input and `output_size` determined by the second input.
The kernel dimensions are calculated using the following formulae for the `NCDHW` input case:
\f[
\begin{array}{lcl}
d_{start} &=& floor(i*D_{in}/D_{out})\\
d_{end} &=& ceil((i+1)*D_{in}/D_{out})\\
h_{start} &=& floor(j*H_{in}/H_{out})\\
h_{end} &=& ceil((j+1)*H_{in}/H_{out})\\
w_{start} &=& floor(k*W_{in}/W_{out})\\
w_{end} &=& ceil((k+1)*W_{in}/W_{out})
\end{array}
\f]
The output is calculated with the following formula:
\f[
Output(i,j,k) = \frac{\sum Input[d_{start}:d_{end}, h_{start}:h_{end}, w_{start}:w_{end}]}{(d_{end}-d_{start})*(h_{end}-h_{start})*(w_{end}-w_{start})}
\f]
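To make the window arithmetic concrete, here is an illustrative (non-normative) 1-D sketch of the same start/end and averaging formulas; the helper name is ours, and extending it to D, H, and W is a matter of applying the same index math per axis:
```cpp
#include <cstddef>
#include <vector>

// Illustrative 1-D adaptive average pooling, following the formulas above:
// for output index i, the window is [floor(i * In / Out), ceil((i + 1) * In / Out))
// and the window values are averaged.
std::vector<float> adaptive_avg_pool_1d(const std::vector<float>& input, std::size_t out_size) {
    const std::size_t in_size = input.size();
    std::vector<float> output(out_size, 0.0f);
    for (std::size_t i = 0; i < out_size; ++i) {
        const std::size_t start = (i * in_size) / out_size;                     // floor
        const std::size_t end = ((i + 1) * in_size + out_size - 1) / out_size;  // ceil
        float sum = 0.0f;
        for (std::size_t k = start; k < end; ++k) {
            sum += input[k];
        }
        output[i] = sum / static_cast<float>(end - start);
    }
    return output;
}
```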
**Inputs**:
* **1**: 3D, 4D, or 5D input tensor of shape `[N, C, H]`, `[N, C, H, W]` or `[N, C, D, H, W]` and type *T*. Required.
* **2**: 1D tensor describing output shape for spatial dimensions. Can be `[H_out]` for 3D input, `[H_out, W_out]` for 4D input, `[D_out, H_out, W_out]` for 5D input and of type *T_SHAPE*. Required.
**Outputs**:
* **1**: Output of type *T* and shape `[N, C, H_out]`, `[N, C, H_out, W_out]` or `[N, C, D_out, H_out, W_out]`.
**Types**
* *T*: floating-point type.
* *T_SHAPE*: `int32` or `int64`.
**Examples**
```xml
<layer ... type="AdaptiveAvgPool" ... >
<input>
<port id="0">
<dim>1</dim>
<dim>3</dim>
<dim>32</dim>
<dim>32</dim>
</port>
</input>
<input>
<port id="1">
<dim>2</dim>
</port>
</input>
<output>
<port id="2">
<dim>1</dim>
<dim>3</dim>
<dim>16</dim>
<dim>16</dim>
</port>
</output>
</layer>
```

View File

@ -0,0 +1,87 @@
## AdaptiveMaxPool<a name="AdaptiveMaxPool"></a> {#openvino_docs_ops_pooling_AdaptiveMaxPool_8}
**Versioned name**: *AdaptiveMaxPool-8*
**Category**: *Pooling*
**Short description**: Applies max pooling with adaptive kernel size over the input.
**Detailed description**: This operation calculates the output based on the first input and `output_size` determined by the second input.
The kernel dimensions are calculated using the following formulae for the `NCDHW` input case:
\f[
\begin{array}{lcl}
d_{start} &=& floor(i*D_{in}/D_{out})\\
d_{end} &=& ceil((i+1)*D_{in}/D_{out})\\
h_{start} &=& floor(j*H_{in}/H_{out})\\
h_{end} &=& ceil((j+1)*H_{in}/H_{out})\\
w_{start} &=& floor(k*W_{in}/W_{out})\\
w_{end} &=& ceil((k+1)*W_{in}/W_{out})
\end{array}
\f]
The output is calculated following this formula:
\f[
Output(i,j,k) = max(Input[d_{start}:d_{end}, h_{start}:h_{end}, w_{start}:w_{end}])
\f]
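As with AdaptiveAvgPool, an illustrative (non-normative) 1-D sketch of the window maximum and its index, corresponding to the two outputs described below; the helper name is ours:
```cpp
#include <cstddef>
#include <cstdint>
#include <utility>
#include <vector>

// Illustrative 1-D adaptive max pooling: same windows as AdaptiveAvgPool,
// but each output keeps the window maximum and the index of that element.
std::pair<std::vector<float>, std::vector<std::int64_t>>
adaptive_max_pool_1d(const std::vector<float>& input, std::size_t out_size) {
    const std::size_t in_size = input.size();
    std::vector<float> values(out_size);
    std::vector<std::int64_t> indices(out_size);
    for (std::size_t i = 0; i < out_size; ++i) {
        const std::size_t start = (i * in_size) / out_size;                     // floor
        const std::size_t end = ((i + 1) * in_size + out_size - 1) / out_size;  // ceil
        std::size_t best = start;
        for (std::size_t k = start + 1; k < end; ++k) {
            if (input[k] > input[best]) best = k;
        }
        values[i] = input[best];
        indices[i] = static_cast<std::int64_t>(best);
    }
    return {values, indices};
}
```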
**Attributes**:
* *index_element_type*
* **Description**: the type of the second output containing indices
* **Range of values**: "i64" or "i32"
* **Type**: string
* **Default value**: "i64"
* **Required**: *No*
**Inputs**:
* **1**: 3D, 4D, or 5D input tensor of shape `[N, C, H]`, `[N, C, H, W]` or `[N, C, D, H, W]` and type *T*. Required.
* **2**: 1D tensor describing output shape for spatial dimensions. Can be `[H_out]` for 3D input, `[H_out, W_out]` for 4D input, `[D_out, H_out, W_out]` for 5D input and of type *T_SHAPE*. Required.
**Outputs**:
* **1**: Output of type *T* and shape `[N, C, H_out]`, `[N, C, H_out, W_out]` or `[N, C, D_out, H_out, W_out]`.
* **2**: Output of type specified by *index_element_type* and the same shape as the first output, containing indices of elements in the first output. The indices are computed as if the input were a flattened 1-D tensor, so the values are in the range `[0, N * C * H * W * D)`.
**Types**
* *T*: floating-point type.
* *T_SHAPE*: `int32` or `int64`.
**Examples**
```xml
<layer ... type="AdaptiveMaxPool" ... >
<data output_type="i64"/>
<input>
<port id="0">
<dim>1</dim>
<dim>3</dim>
<dim>32</dim>
<dim>32</dim>
</port>
</input>
<input>
<port id="1">
<dim>2</dim>
</port>
</input>
<output>
<port id="1">
<dim>1</dim>
<dim>3</dim>
<dim>16</dim>
<dim>16</dim>
</port>
<port id="2">
<dim>1</dim>
<dim>3</dim>
<dim>16</dim>
<dim>16</dim>
</port>
</output>
</layer>
```

View File

@ -187,7 +187,7 @@ Inference Engine relies on the [Compute Library for Deep Neural Networks (clDNN)
- In the GPU-only scenario, a GPU driver might occupy a CPU core with spin-looped polling for completion. If the _CPU_ utilization is a concern, consider the `KEY_CLDNN_PLUGIN_THROTTLE` configuration option.
> **NOTE**: See the [Benchmark App Sample](../../inference-engine/samples/benchmark_app/README.md) code for a usage example.
Notice that while disabling the polling, this option might reduce the GPU performance, so usually this option is used with multiple [GPU streams](../IE_DG/supported_plugins/CL_DNN.md).
Notice that while it disables the polling, this option might reduce the GPU performance, so it is usually used with multiple [GPU streams](../IE_DG/supported_plugins/GPU.md).
### Intel&reg; Movidius&trade; Myriad&trade; X Visual Processing Unit and Intel&reg; Vision Accelerator Design with Intel&reg; Movidius&trade; VPUs <a name="myriad"></a>

View File

@ -1,5 +1,4 @@
#include <ie_core.hpp>
#include "cldnn/cldnn_config.hpp"
int main() {
using namespace InferenceEngine;
@ -9,9 +8,5 @@ InferenceEngine::Core core;
core.SetConfig({ { InferenceEngine::PluginConfigParams::KEY_CONFIG_FILE, "<path_to_the_xml_file>" } }, "GPU");
//! [part0]
//! [part1]
core.SetConfig({ { PluginConfigParams::KEY_DUMP_KERNELS, PluginConfigParams::YES } }, "GPU");
//! [part1]
return 0;
}

View File

@ -1,14 +0,0 @@
#include <ie_core.hpp>
#include "cldnn/cldnn_config.hpp"
int main() {
using namespace InferenceEngine;
//! [part0]
Core ie;
ie.SetConfig({{ CONFIG_KEY(TUNING_MODE), CONFIG_VALUE(TUNING_CREATE) }}, "GPU");
ie.SetConfig({{ CONFIG_KEY(TUNING_FILE), "/path/to/tuning/file.json" }}, "GPU");
// Further LoadNetwork calls will use the specified tuning parameters
//! [part0]
return 0;
}

View File

@ -1,6 +1,6 @@
#include <ie_core.hpp>
#include <gpu/gpu_context_api_va.hpp>
#include <cldnn/cldnn_config.hpp>
#include <gpu/gpu_config.hpp>
int main() {
@ -28,7 +28,7 @@ auto shared_va_context = gpu::make_shared_context(ie, "GPU", disp);
// compile network within a shared context
ExecutableNetwork executable_network = ie.LoadNetwork(network,
shared_va_context,
{ { CLDNNConfigParams::KEY_CLDNN_NV12_TWO_INPUTS,
{ { GPUConfigParams::KEY_GPU_NV12_TWO_INPUTS,
PluginConfigParams::YES } });

View File

@ -64,7 +64,13 @@ int main(int argc, char *argv[]) {
inferRequest.Infer();
// check states
auto states = inferRequest.QueryState();
if (states.empty()) {
throw std::runtime_error("Queried states are empty");
}
auto mstate = as<MemoryBlob>(states[0].GetState());
if (mstate == nullptr) {
throw std::runtime_error("Can't cast state to MemoryBlob");
}
auto state_buf = mstate->rmap();
float* state = state_buf.as<float*>();
std::cout << state[0] << "\n";

View File

@ -175,9 +175,9 @@ InferenceEngine::Parameter TemplatePlugin::ExecutableNetwork::GetMetric(const st
}
// ! [executable_network:get_metric]
// ! [executable_network:export_impl]
void TemplatePlugin::ExecutableNetwork::ExportImpl(std::ostream& modelStream) {
OV_ITT_SCOPED_TASK(itt::domains::TemplatePlugin, "ExecutableNetwork::ExportImpl");
// ! [executable_network:export]
void TemplatePlugin::ExecutableNetwork::Export(std::ostream& modelStream) {
OV_ITT_SCOPED_TASK(itt::domains::TemplatePlugin, "ExecutableNetwork::Export");
// Note: custom ngraph extensions are not supported
std::map<std::string, ngraph::OpSet> custom_opsets;
@ -198,4 +198,4 @@ void TemplatePlugin::ExecutableNetwork::ExportImpl(std::ostream& modelStream) {
// TODO: implement network precision, layout, preprocessing info serialization
}
// ! [executable_network:export_impl]
// ! [executable_network:export]

View File

@ -30,7 +30,7 @@ public:
// Methods from a base class ExecutableNetworkThreadSafeDefault
void ExportImpl(std::ostream& model) override;
void Export(std::ostream& model) override;
InferenceEngine::IInferRequestInternal::Ptr CreateInferRequestImpl(InferenceEngine::InputsDataMap networkInputs,
InferenceEngine::OutputsDataMap networkOutputs) override;
InferenceEngine::IInferRequestInternal::Ptr CreateInferRequest() override;

View File

@ -95,14 +95,14 @@ InferenceEngine::IExecutableNetworkInternal::Ptr Plugin::LoadExeNetworkImpl(cons
}
// ! [plugin:load_exe_network_impl]
// ! [plugin:import_network_impl]
InferenceEngine::IExecutableNetworkInternal::Ptr Plugin::ImportNetworkImpl(std::istream& modelStream, const std::map<std::string, std::string>& config) {
OV_ITT_SCOPED_TASK(itt::domains::TemplatePlugin, "Plugin::ImportNetworkImpl");
// ! [plugin:import_network]
InferenceEngine::IExecutableNetworkInternal::Ptr Plugin::ImportNetwork(std::istream& modelStream, const std::map<std::string, std::string>& config) {
OV_ITT_SCOPED_TASK(itt::domains::TemplatePlugin, "Plugin::ImportNetwork");
auto fullConfig = Configuration {config, _cfg};
return std::make_shared<ExecutableNetwork>(modelStream, fullConfig, std::static_pointer_cast<Plugin>(shared_from_this()));
}
// ! [plugin:import_network_impl]
// ! [plugin:import_network]
// ! [plugin:query_network]
InferenceEngine::QueryNetworkResult Plugin::QueryNetwork(const InferenceEngine::CNNNetwork& network, const ConfigMap& config) const {

View File

@ -28,7 +28,7 @@ public:
void AddExtension(const std::shared_ptr<InferenceEngine::IExtension>& extension) override;
InferenceEngine::Parameter GetConfig(const std::string& name, const std::map<std::string, InferenceEngine::Parameter>& options) const override;
InferenceEngine::Parameter GetMetric(const std::string& name, const std::map<std::string, InferenceEngine::Parameter>& options) const override;
InferenceEngine::IExecutableNetworkInternal::Ptr ImportNetworkImpl(std::istream& model, const std::map<std::string, std::string>& config) override;
InferenceEngine::IExecutableNetworkInternal::Ptr ImportNetwork(std::istream& model, const std::map<std::string, std::string>& config) override;
private:
friend class ExecutableNetwork;

View File

@ -295,25 +295,25 @@ if (ENABLE_SPEECH_DEMO)
if(DEFINED IE_PATH_TO_DEPS)
if (WIN32 AND X86_64)
RESOLVE_DEPENDENCY(SPEECH_LIBS_AND_DEMOS
ARCHIVE_WIN "speech_demo_1.0.0.755_windows.zip"
ARCHIVE_WIN "speech_demo_1.0.0.774_windows.zip"
VERSION_REGEX ".*_([0-9]+.[0-9]+.[0-9]+.[0-9]+).*"
TARGET_PATH "${TEMP}/speech_demo_1.0.0.755"
SHA256 "58adef14b8a749f70fa83888614cee34b941956e6e958e445e3f48885b3c20a0")
TARGET_PATH "${TEMP}/speech_demo_1.0.0.774"
SHA256 "67b25170be5e89a4f0e90e8b39623b60c9a15b965c30329385e295fcd2edc856")
debug_message(STATUS "speech_libs_and_demos=" ${SPEECH_LIBS_AND_DEMOS})
elseif (LINUX AND X86_64)
if (LINUX_OS_NAME STREQUAL "CentOS 7" OR CMAKE_CXX_COMPILER_VERSION VERSION_LESS "4.9")
RESOLVE_DEPENDENCY(SPEECH_LIBS_AND_DEMOS
ARCHIVE_LIN "speech_demo_1.0.0.755_centos.tgz"
ARCHIVE_LIN "speech_demo_1.0.0.774_centos.tgz"
VERSION_REGEX ".*_([0-9]+.[0-9]+.[0-9]+.[0-9]+).*"
TARGET_PATH "${TEMP}/speech_demo_1.0.0.755"
SHA256 "716201e377714ac50f3909c445d36d47a089de50a557d8ef65232de040671188")
TARGET_PATH "${TEMP}/speech_demo_1.0.0.774"
SHA256 "5ec3b7be9ae05376aefae5bd5fd4a39b12c274e82817fd3218120b8e8fc8ff5a")
debug_message(STATUS "speech_libs_and_demos=" ${SPEECH_LIBS_AND_DEMOS})
else()
RESOLVE_DEPENDENCY(SPEECH_LIBS_AND_DEMOS
ARCHIVE_LIN "speech_demo_1.0.0.755_linux.tgz"
ARCHIVE_LIN "speech_demo_1.0.0.774_linux.tgz"
VERSION_REGEX ".*_([0-9]+.[0-9]+.[0-9]+.[0-9]+).*"
TARGET_PATH "${TEMP}/speech_demo_1.0.0.755"
SHA256 "7714b8776ec0183ed73eed6d3d965ee6d5c15d2dc49ee5ae118cc368c89c7a9d")
TARGET_PATH "${TEMP}/speech_demo_1.0.0.774"
SHA256 "f0bbd0a6218b0365e7cfb1f860b34e4ace7e0d47dd60b369cdea8a480329810f")
debug_message(STATUS "speech_libs_and_demos=" ${SPEECH_LIBS_AND_DEMOS})
endif()
else()

View File

@ -1,7 +1,8 @@
# nGraph Function Creation Python* Sample {#openvino_inference_engine_ie_bridges_python_sample_ngraph_function_creation_sample_README}
This sample demonstrates how to execute an inference using the [nGraph function feature](../../../../../docs/nGraph_DG/build_function.md) to create a network that uses weights from the LeNet classification network. So you don't need an XML file; the model will be created from the source code on the fly.
In addition to regular images, the sample also supports single-channel ubyte images as an input.
This sample demonstrates how to execute an inference using the [nGraph function feature](../../../../../docs/nGraph_DG/build_function.md) to create a network that uses weights from the LeNet classification network, which is known to work well on digit classification tasks. So you don't need an XML file; the model will be created from the source code on the fly.
In addition to regular grayscale images with a digit, the sample also supports single-channel `ubyte` images as input.
The following Inference Engine Python API is used in the application:
@ -14,6 +15,9 @@ Basic Inference Engine API is covered by [Hello Classification Python* Sample](.
| Options | Values |
| :------------------------- | :---------------------------------------------------------------------- |
| Validated Models | LeNet (image classification network) |
| Model Format | Network weights file (\*.bin) |
| Validated images | The sample uses OpenCV\* to [read input grayscale image](https://docs.opencv.org/master/d4/da8/group__imgcodecs.html#ga288b8b3da0892bd651fce07b3bbd3a56) (\*.bmp, \*.png) or single-channel `ubyte` image |
| Supported devices | [All](../../../../../docs/IE_DG/supported_plugins/Supported_Devices.md) |
| Other language realization | [C++](../../../../samples/ngraph_function_creation_sample) |
@ -72,7 +76,7 @@ To run the sample, you need to specify model weights and an image:
You can do inference of an image using a pre-trained model on a GPU using the following command:
```sh
python ngraph_function_creation_sample.py -m <path_to_model>/lenet.bin -i <path_to_image>/3.bmp -d GPU
python ngraph_function_creation_sample.py -m <path_to_model>/lenet.bin -i <path_to_image>/3.png -d GPU
```
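If the "network from weights, no XML" idea is unfamiliar, the sketch below shows the underlying pattern with the nGraph Python API, in the same way the tests touched by this change use it. It is not the LeNet topology built by the sample; `weights.bin`, the shapes, and the single `Add` node are placeholders.

```python
import numpy as np
import ngraph as ng
from ngraph.impl import Function
from openvino.inference_engine import IECore, IENetwork

# Raw weights are read straight from a binary file instead of an IR XML.
weights = np.fromfile("weights.bin", dtype=np.float32)   # placeholder path

param = ng.parameter([1, 16], name="input_data", dtype=np.float32)
bias = ng.constant(weights[:16].reshape(1, 16), dtype=np.float32)
res = ng.result(ng.add(param, bias), "res")
func = Function(results=[res], parameters=[param], name="from_weights")

ie = IECore()
net = IENetwork(Function.to_capsule(func))        # wrap the nGraph function
exec_net = ie.load_network(network=net, device_name="CPU")
```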
## Sample Output
@ -84,10 +88,10 @@ The sample application logs each step in a standard output stream and outputs to
[ INFO ] Loading the network using ngraph function with weights from <path_to_model>/lenet.bin
[ INFO ] Configuring input and output blobs
[ INFO ] Loading the model to the plugin
[ WARNING ] <path_to_image>/3.bmp is inverted to white over black
[ WARNING ] <path_to_image>/3.bmp is resized from (100, 100) to (28, 28)
[ WARNING ] <path_to_image>/3.png is inverted to white over black
[ WARNING ] <path_to_image>/3.png is resized from (351, 353) to (28, 28)
[ INFO ] Starting inference in synchronous mode
[ INFO ] Image path: <path_to_image>/3.bmp
[ INFO ] Image path: <path_to_image>/3.png
[ INFO ] Top 10 results:
[ INFO ] classid probability
[ INFO ] -------------------

View File

@ -77,4 +77,5 @@ install(PROGRAMS __init__.py
DESTINATION ${PYTHON_BRIDGE_CPACK_PATH}/${PYTHON_VERSION}/openvino/inference_engine
COMPONENT ${PYTHON_VERSION})
add_clang_format_target(${TARGET_NAME}_clang FOR_TARGETS ${TARGET_NAME})
add_clang_format_target(${TARGET_NAME}_clang FOR_TARGETS ${TARGET_NAME}
EXCLUDE_PATTERNS ".*\\.cxx;.*\\.pxd;.*\\.pyx")

View File

@ -2,7 +2,7 @@
# SPDX-License-Identifier: Apache-2.0
from .cimport ie_api_impl_defs as C
from .ie_api_impl_defs cimport CBlob, CTensorDesc, InputInfo, CPreProcessChannel, CPreProcessInfo, CExecutableNetwork
from .ie_api_impl_defs cimport CBlob, CTensorDesc, InputInfo, CPreProcessChannel, CPreProcessInfo, CExecutableNetwork, CVariableState
import os

View File

@ -42,7 +42,8 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
target_compile_options(${TARGET_NAME} PRIVATE "-Wno-error=register")
endif()
add_clang_format_target(${TARGET_NAME}_clang FOR_TARGETS ${TARGET_NAME})
add_clang_format_target(${TARGET_NAME}_clang FOR_TARGETS ${TARGET_NAME}
EXCLUDE_PATTERNS ".*\\.cxx;.*\\.pxd;.*\\.pyx")
# perform copy
add_custom_command(TARGET ${TARGET_NAME}

View File

@ -48,4 +48,5 @@ add_custom_command(TARGET ${TARGET_NAME}
COMMAND ${CMAKE_COMMAND} -E copy ${PYTHON_BRIDGE_SRC_ROOT}/src/openvino/test_utils/__init__.py ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/__init__.py
)
add_clang_format_target(${TARGET_NAME}_clang FOR_TARGETS ${TARGET_NAME})
add_clang_format_target(${TARGET_NAME}_clang FOR_TARGETS ${TARGET_NAME}
EXCLUDE_PATTERNS ".*\\.cxx;.*\\.pxd;.*\\.pyx")

View File

@ -16,6 +16,20 @@ test_net_xml, test_net_bin = model_path(is_myriad)
path_to_img = image_path()
def create_function_with_memory(input_shape, data_type):
import ngraph as ng
from ngraph.impl import Function, Type
input_data = ng.parameter(input_shape, name="input_data", dtype=data_type)
rv = ng.read_value(input_data, "var_id_667")
add = ng.add(rv, input_data, name="MemoryAdd")
node = ng.assign(add, "var_id_667")
res = ng.result(add, "res")
func = Function(results=[res], sinks=[node], parameters=[input_data], name="name")
caps = Function.to_capsule(func)
return caps
def read_image():
import cv2
n, c, h, w = (1, 3, 32, 32)
@ -525,28 +539,56 @@ def test_resize_algorithm_work(device):
assert np.allclose(res_1, res_2, atol=1e-2, rtol=1e-2)
# issue 56653
@pytest.mark.skip(reason="Test will enable when nGraph Python API allows to create network with memory")
def test_query_state(device):
import ngraph as ng
from ngraph.impl import Function
input_data = ng.parameter([5, 7], name="input_data", dtype=np.float32)
rv = ng.read_value(input_data, "var_id_667")
#a = ng.add(rv, input_data)
node = ng.assign(rv, "var_id_667")
res = ng.result(rv, "res")
func = Function([res], sinks=[node], parameters=[input_data], name='test')
caps = Function.to_capsule(func)
@pytest.mark.parametrize("mode", ["set_init_memory_state", "reset_memory_state", "normal"])
@pytest.mark.parametrize("data_type", ["FP32", "FP16", "I32"])
@pytest.mark.parametrize("input_shape", [[10], [10, 10], [10, 10, 10], [2, 10, 10, 10]])
@pytest.mark.skipif(os.environ.get("TEST_DEVICE", "CPU") != "CPU",
reason=f"Can't run test on device {os.environ.get('TEST_DEVICE', 'CPU')}, "
"Memory layers fully supported only on CPU")
def test_query_state_write_buffer(device, input_shape, data_type, mode):
ie_core = ie.IECore()
if device == "CPU":
if ie_core.get_metric(device, "FULL_DEVICE_NAME") == "arm_compute::NEON":
pytest.skip("Can't run on ARM plugin")
net = ie.IENetwork(caps)
layout = ["C", "HW", "CHW", "NCHW"]
np_data_type = {"FP32": np.float32, "FP16": np.float16, "I32": np.int32}
from openvino.inference_engine import TensorDesc, Blob
net = ie.IENetwork(create_function_with_memory(input_shape, np_data_type[data_type]))
ie_core = ie.IECore()
exec_net = ie_core.load_network(network=net, device_name=device, num_requests=1)
request = exec_net.requests[0]
mem_states = request.query_state()
mem_state = mem_states[0]
with pytest.raises(ValueError) as e:
ones_arr = np.ones(shape=(1, 800), dtype=np.float32)
mem_state.state.buffer[:] = ones_arr
assert "assignment destination is read-only" in str(e.value)
assert mem_state.name == 'id_1'
assert mem_state.state.tensor_desc.precision == 'FP32'
assert mem_state.name == 'var_id_667'
# todo: Uncomment after fix 45611,
# CPU plugin returns outputs and memory state in FP32 in case of FP16 original precision
#assert mem_state.state.tensor_desc.precision == data_type
for i in range(1, 10):
if mode == "set_init_memory_state":
# create initial value
const_init = 5
init_array = np.full(input_shape, const_init, dtype=np_data_type[mem_state.state.tensor_desc.precision])
tensor_desc = TensorDesc(mem_state.state.tensor_desc.precision, input_shape, layout[len(input_shape) - 1])
blob = Blob(tensor_desc, init_array)
mem_state.state = blob
res = exec_net.infer({"input_data": np.full(input_shape, 1, dtype=np_data_type[data_type])})
expected_res = np.full(input_shape, 1 + const_init, dtype=np_data_type[data_type])
elif mode == "reset_memory_state":
# reset initial state of ReadValue to zero
mem_state.reset()
res = exec_net.infer({"input_data": np.full(input_shape, 1, dtype=np_data_type[data_type])})
# always ones
expected_res = np.full(input_shape, 1, dtype=np_data_type[data_type])
else:
res = exec_net.infer({"input_data": np.full(input_shape, 1, dtype=np_data_type[data_type])})
expected_res = np.full(input_shape, i, dtype=np_data_type[data_type])
assert np.allclose(res['MemoryAdd'], expected_res, atol=1e-6), \
"Expected values: {} \n Actual values: {} \n".format(expected_res, res)

View File

@ -11,47 +11,11 @@
#pragma once
#include "ie_plugin_config.hpp"
#include "ie_api.h"
#include "gpu/gpu_config.hpp"
namespace InferenceEngine {
namespace Metrics {
/**
* @def GPU_METRIC_KEY(name)
* @brief shortcut for defining GPU plugin metrics
*/
#define GPU_METRIC_KEY(name) METRIC_KEY(GPU_##name)
#define DECLARE_GPU_METRIC_KEY(name, ...) DECLARE_METRIC_KEY(GPU_##name, __VA_ARGS__)
/**
* @def DECLARE_GPU_METRIC_VALUE(name)
* @brief shortcut for defining gpu metric values
*/
#define DECLARE_GPU_METRIC_VALUE(name) DECLARE_METRIC_VALUE(GPU_##name)
/**
* @brief Metric which defines size of memory in bytes available for the device. For iGPU it returns host memory size, for dGPU - dedicated gpu memory size
*/
DECLARE_GPU_METRIC_KEY(DEVICE_TOTAL_MEM_SIZE, uint64_t);
/**
* @brief Metric to get microarchitecture identifier in major.minor.revision format
*/
DECLARE_GPU_METRIC_KEY(UARCH_VERSION, std::string);
/**
* @brief Metric to get count of execution units for current GPU
*/
DECLARE_GPU_METRIC_KEY(EXECUTION_UNITS_COUNT, int);
/**
* @brief Possible return value for OPTIMIZATION_CAPABILITIES metric
* - "HW_MATMUL" - Defines if device has hardware block for matrix multiplication
*/
DECLARE_GPU_METRIC_VALUE(HW_MATMUL);
} // namespace Metrics
/**
* @brief GPU plugin configuration
*/
@ -70,6 +34,7 @@ namespace CLDNNConfigParams {
* this option should be used with an unsigned integer value (1 is lowest priority)
* 0 means no priority hint is set and default queue is created.
*/
INFERENCE_ENGINE_DEPRECATED("Use InferenceEngine::GPUConfigParams::GPU_PLUGIN_PRIORITY instead")
DECLARE_CLDNN_CONFIG_KEY(PLUGIN_PRIORITY);
/**
@ -78,22 +43,26 @@ DECLARE_CLDNN_CONFIG_KEY(PLUGIN_PRIORITY);
* chapter 9.19. This option should be used with an unsigned integer value (1 is lowest energy consumption)
* 0 means no throttle hint is set and default queue created.
*/
INFERENCE_ENGINE_DEPRECATED("Use InferenceEngine::GPUConfigParams::GPU_PLUGIN_THROTTLE instead")
DECLARE_CLDNN_CONFIG_KEY(PLUGIN_THROTTLE);
/**
* @brief This key controls clDNN memory pool optimization.
* Turned off by default.
*/
INFERENCE_ENGINE_DEPRECATED("The config key will be removed")
DECLARE_CLDNN_CONFIG_KEY(MEM_POOL);
/**
* @brief This key defines the directory name to which clDNN graph visualization will be dumped.
*/
INFERENCE_ENGINE_DEPRECATED("The config key will be removed")
DECLARE_CLDNN_CONFIG_KEY(GRAPH_DUMPS_DIR);
/**
* @brief This key defines the directory name to which full program sources will be dumped.
*/
INFERENCE_ENGINE_DEPRECATED("The config key will be removed")
DECLARE_CLDNN_CONFIG_KEY(SOURCES_DUMPS_DIR);
/**
@ -108,43 +77,19 @@ DECLARE_CLDNN_CONFIG_KEY(ENABLE_FP16_FOR_QUANTIZED_MODELS);
* @brief This key should be set to correctly handle NV12 input without pre-processing.
* Turned off by default.
*/
INFERENCE_ENGINE_DEPRECATED("Use InferenceEngine::GPUConfigParams::GPU_NV12_TWO_INPUTS instead")
DECLARE_CLDNN_CONFIG_KEY(NV12_TWO_INPUTS);
/**
* @brief This key sets the max number of host threads that can be used by GPU plugin on model loading.
* Default value is maximum number of threads available in the environment.
*/
DECLARE_CLDNN_CONFIG_KEY(MAX_NUM_THREADS);
/**
* @brief Turning on this key enables to unroll recurrent layers such as TensorIterator or Loop with fixed iteration count.
* This key is turned on by default. Turning this key on will achieve better inference performance for loops with not too many iteration counts (less than 16, as a rule of thumb).
* Turning this key off will achieve better performance for both graph loading time and inference time with many iteration counts (greater than 16).
* Note that turning this key on will increase the graph loading time in proportion to the iteration counts.
* Thus, this key should be turned off if graph loading time is considered to be most important target to optimize.*/
DECLARE_CLDNN_CONFIG_KEY(ENABLE_LOOP_UNROLLING);
} // namespace CLDNNConfigParams
namespace PluginConfigParams {
/**
* @brief Optimize GPU plugin execution to maximize throughput.
*
* It is passed to Core::SetConfig(), this option should be used with values:
* - KEY_GPU_THROUGHPUT_AUTO creates bare minimum of streams that might improve performance in some cases,
* this option allows to enable throttle hint for opencl queue thus reduce CPU load without significant performance
* drop
* - a positive integer value creates the requested number of streams
*/
DECLARE_CONFIG_VALUE(GPU_THROUGHPUT_AUTO);
DECLARE_CONFIG_KEY(GPU_THROUGHPUT_STREAMS);
/**
* @brief This key enables dumping of the kernels used by the plugin for custom layers.
*
* This option should be used with values: PluginConfigParams::YES or PluginConfigParams::NO (default)
*/
INFERENCE_ENGINE_DEPRECATED("The config key will be removed")
DECLARE_CONFIG_KEY(DUMP_KERNELS);
/**
@ -159,17 +104,24 @@ DECLARE_CONFIG_KEY(DUMP_KERNELS);
*
* For values TUNING_CREATE and TUNING_RETUNE the file will be created if it does not exist.
*/
INFERENCE_ENGINE_DEPRECATED("The config key will be removed")
DECLARE_CONFIG_KEY(TUNING_MODE);
INFERENCE_ENGINE_DEPRECATED("The config value will be removed")
DECLARE_CONFIG_VALUE(TUNING_CREATE);
INFERENCE_ENGINE_DEPRECATED("The config value will be removed")
DECLARE_CONFIG_VALUE(TUNING_USE_EXISTING);
INFERENCE_ENGINE_DEPRECATED("The config value will be removed")
DECLARE_CONFIG_VALUE(TUNING_DISABLED);
INFERENCE_ENGINE_DEPRECATED("The config value will be removed")
DECLARE_CONFIG_VALUE(TUNING_UPDATE);
INFERENCE_ENGINE_DEPRECATED("The config value will be removed")
DECLARE_CONFIG_VALUE(TUNING_RETUNE);
/**
* @brief This key defines the tuning data filename to be created/used
*/
INFERENCE_ENGINE_DEPRECATED("The config key will be removed")
DECLARE_CONFIG_KEY(TUNING_FILE);
} // namespace PluginConfigParams

View File

@ -0,0 +1,120 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
/**
* @brief A header for advanced hardware related properties for GPU plugin
* To use in SetConfig() method of plugins
*
* @file gpu_config.hpp
*/
#pragma once
#include "ie_plugin_config.hpp"
namespace InferenceEngine {
namespace Metrics {
/**
* @def GPU_METRIC_KEY(name)
* @brief shortcut for defining GPU plugin metrics
*/
#define GPU_METRIC_KEY(name) METRIC_KEY(GPU_##name)
#define DECLARE_GPU_METRIC_KEY(name, ...) DECLARE_METRIC_KEY(GPU_##name, __VA_ARGS__)
/**
* @def DECLARE_GPU_METRIC_VALUE(name)
* @brief shortcut for defining gpu metric values
*/
#define DECLARE_GPU_METRIC_VALUE(name) DECLARE_METRIC_VALUE(GPU_##name)
/**
* @brief Metric which defines size of memory in bytes available for the device. For iGPU it returns host memory size, for dGPU - dedicated gpu memory size
*/
DECLARE_GPU_METRIC_KEY(DEVICE_TOTAL_MEM_SIZE, uint64_t);
/**
* @brief Metric to get microarchitecture identifier in major.minor.revision format
*/
DECLARE_GPU_METRIC_KEY(UARCH_VERSION, std::string);
/**
* @brief Metric to get count of execution units for current GPU
*/
DECLARE_GPU_METRIC_KEY(EXECUTION_UNITS_COUNT, int);
/**
* @brief Possible return value for OPTIMIZATION_CAPABILITIES metric
* - "HW_MATMUL" - Defines if device has hardware block for matrix multiplication
*/
DECLARE_GPU_METRIC_VALUE(HW_MATMUL);
} // namespace Metrics
/**
* @brief GPU plugin configuration
*/
namespace GPUConfigParams {
/**
* @brief shortcut for defining configuration keys
*/
#define GPU_CONFIG_KEY(name) InferenceEngine::GPUConfigParams::_CONFIG_KEY(GPU_##name)
#define DECLARE_GPU_CONFIG_KEY(name) DECLARE_CONFIG_KEY(GPU_##name)
#define DECLARE_GPU_CONFIG_VALUE(name) DECLARE_CONFIG_VALUE(GPU_##name)
/**
* @brief This key instructs the GPU plugin to use the OpenCL queue priority hint
* as defined in https://www.khronos.org/registry/OpenCL/specs/opencl-2.1-extensions.pdf
* this option should be used with an unsigned integer value (1 is lowest priority)
* 0 means no priority hint is set and default queue is created.
*/
DECLARE_GPU_CONFIG_KEY(PLUGIN_PRIORITY);
/**
* @brief This key instructs the GPU plugin to use the OpenCL queue throttle hint
* as defined in https://www.khronos.org/registry/OpenCL/specs/opencl-2.1-extensions.pdf,
* chapter 9.19. This option should be used with an unsigned integer value (1 is lowest energy consumption)
* 0 means no throttle hint is set and default queue created.
*/
DECLARE_GPU_CONFIG_KEY(PLUGIN_THROTTLE);
/**
* @brief This key should be set to correctly handle NV12 input without pre-processing.
* Turned off by default.
*/
DECLARE_GPU_CONFIG_KEY(NV12_TWO_INPUTS);
/**
* @brief This key sets the max number of host threads that can be used by GPU plugin on model loading.
* Default value is maximum number of threads available in the environment.
*/
DECLARE_GPU_CONFIG_KEY(MAX_NUM_THREADS);
/**
* @brief Turning on this key enables unrolling of recurrent layers such as TensorIterator or Loop with a fixed iteration count.
* This key is turned on by default. Turning it on achieves better inference performance for loops with a small iteration count (less than 16, as a rule of thumb).
* Turning it off achieves better performance for both graph loading time and inference time when the iteration count is large (greater than 16).
* Note that turning this key on increases the graph loading time in proportion to the iteration count.
* Thus, this key should be turned off if graph loading time is the most important target to optimize.*/
DECLARE_GPU_CONFIG_KEY(ENABLE_LOOP_UNROLLING);
} // namespace GPUConfigParams
namespace PluginConfigParams {
/**
* @brief Optimize GPU plugin execution to maximize throughput.
*
* It is passed to Core::SetConfig(), this option should be used with values:
* - KEY_GPU_THROUGHPUT_AUTO creates a bare minimum of streams that might improve performance in some cases;
* this option also enables the throttle hint for the OpenCL queue, which reduces CPU load without a significant
* performance drop
* - a positive integer value creates the requested number of streams
*/
DECLARE_CONFIG_VALUE(GPU_THROUGHPUT_AUTO);
DECLARE_CONFIG_KEY(GPU_THROUGHPUT_STREAMS);
} // namespace PluginConfigParams
} // namespace InferenceEngine
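For reference, a minimal sketch of driving these keys from the Python API follows; the string values are assumed to follow the `DECLARE_GPU_CONFIG_KEY` pattern above (`KEY_GPU_*` maps to `"GPU_*"`), as also listed by the `hello_query_device` output updated in this change.

```python
from openvino.inference_engine import IECore

ie = IECore()
# Key strings assumed from DECLARE_GPU_CONFIG_KEY(...) above.
ie.set_config({"GPU_NV12_TWO_INPUTS": "YES",        # pass NV12 surfaces without pre-processing
               "GPU_MAX_NUM_THREADS": "4",          # cap host threads used for model loading
               "GPU_ENABLE_LOOP_UNROLLING": "NO"},  # favor load time for long loops
              "GPU")
```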

View File

@ -174,9 +174,18 @@ public:
* operation*
* @return An executable network reference
*/
ExecutableNetwork ImportNetwork(std::istream& networkModel, const std::string& deviceName = {},
ExecutableNetwork ImportNetwork(std::istream& networkModel, const std::string& deviceName,
const std::map<std::string, std::string>& config = {});
/**
* @deprecated Use Core::ImportNetwork with explicit device name
* @brief Creates an executable network from a previously exported network
* @param networkModel network model stream
* @return An executable network reference
*/
INFERENCE_ENGINE_DEPRECATED("Use Core::ImportNetwork with explicit device name")
ExecutableNetwork ImportNetwork(std::istream& networkModel);
/**
* @brief Creates an executable network from a previously exported network within a specified
* remote context.

View File

@ -20,8 +20,8 @@
* @brief Defines Inference Engine patch version
*/
#define IE_VERSION_MAJOR 2021
#define IE_VERSION_MINOR 4
#define IE_VERSION_MAJOR 2022
#define IE_VERSION_MINOR 1
#define IE_VERSION_PATCH 0
#include "ie_api.h"

View File

@ -39,6 +39,7 @@ std::vector<std::string> filterFilesByExtensions(const std::vector<std::string>&
return filtered;
}
template <typename T>
void fillBlobImage(Blob::Ptr& inputBlob, const std::vector<std::string>& filePaths, const size_t& batchSize, const benchmark_app::InputInfo& app_info,
const size_t& requestId, const size_t& inputId, const size_t& inputSize) {
MemoryBlob::Ptr minput = as<MemoryBlob>(inputBlob);
@ -50,7 +51,7 @@ void fillBlobImage(Blob::Ptr& inputBlob, const std::vector<std::string>& filePat
// locked memory holder should be alive all time while access to its buffer
// happens
auto minputHolder = minput->wmap();
auto inputBlobData = minputHolder.as<uint8_t*>();
auto inputBlobData = minputHolder.as<T*>();
/** Collect images data ptrs **/
std::vector<std::shared_ptr<uint8_t>> vreader;
@ -90,7 +91,7 @@ void fillBlobImage(Blob::Ptr& inputBlob, const std::vector<std::string>& filePat
size_t offset = imageId * numChannels * width * height + (((app_info.layout == "NCHW") || (app_info.layout == "CHW"))
? (ch * width * height + h * width + w)
: (h * width * numChannels + w * numChannels + ch));
inputBlobData[offset] = vreader.at(imageId).get()[h * width * numChannels + w * numChannels + ch];
inputBlobData[offset] = static_cast<T>(vreader.at(imageId).get()[h * width * numChannels + w * numChannels + ch]);
}
}
}
@ -142,7 +143,7 @@ using uniformDistribution =
typename std::conditional<std::is_integral<T>::value, std::uniform_int_distribution<T>, void>::type>::type;
template <typename T, typename T2>
void fillBlobRandom(Blob::Ptr& inputBlob, T rand_min = std::numeric_limits<T>::min(), T rand_max = std::numeric_limits<T>::max()) {
void fillBlobRandom(Blob::Ptr& inputBlob, T rand_min = std::numeric_limits<uint8_t>::min(), T rand_max = std::numeric_limits<uint8_t>::max()) {
MemoryBlob::Ptr minput = as<MemoryBlob>(inputBlob);
if (!minput) {
IE_THROW() << "We expect inputBlob to be inherited from MemoryBlob in "
@ -270,7 +271,19 @@ void fillBlobs(const std::vector<std::string>& inputFiles, const size_t& batchSi
if (app_info.isImage()) {
if (!imageFiles.empty()) {
// Fill with Images
fillBlobImage(inputBlob, imageFiles, batchSize, app_info, requestId, imageInputId++, imageInputCount);
if (precision == InferenceEngine::Precision::FP32) {
fillBlobImage<float>(inputBlob, imageFiles, batchSize, app_info, requestId, imageInputId++, imageInputCount);
} else if (precision == InferenceEngine::Precision::FP16) {
fillBlobImage<short>(inputBlob, imageFiles, batchSize, app_info, requestId, imageInputId++, imageInputCount);
} else if (precision == InferenceEngine::Precision::I32) {
fillBlobImage<int32_t>(inputBlob, imageFiles, batchSize, app_info, requestId, imageInputId++, imageInputCount);
} else if (precision == InferenceEngine::Precision::I64) {
fillBlobImage<int64_t>(inputBlob, imageFiles, batchSize, app_info, requestId, imageInputId++, imageInputCount);
} else if (precision == InferenceEngine::Precision::U8) {
fillBlobImage<uint8_t>(inputBlob, imageFiles, batchSize, app_info, requestId, imageInputId++, imageInputCount);
} else {
IE_THROW() << "Input precision is not supported for " << item.first;
}
continue;
}
} else {

View File

@ -4,8 +4,8 @@
#include <algorithm>
#include <chrono>
#include <cldnn/cldnn_config.hpp>
#include <gna/gna_config.hpp>
#include <gpu/gpu_config.hpp>
#include <inference_engine.hpp>
#include <map>
#include <memory>
@ -282,7 +282,7 @@ int main(int argc, char* argv[]) {
<< "which releases another CPU thread (that is otherwise "
"used by the GPU driver for active polling)"
<< slog::endl;
device_config[CLDNN_CONFIG_KEY(PLUGIN_THROTTLE)] = "1";
device_config[GPU_CONFIG_KEY(PLUGIN_THROTTLE)] = "1";
}
} else if (device == "MYRIAD") {
device_config[CONFIG_KEY(LOG_LEVEL)] = CONFIG_VALUE(LOG_WARNING);

View File

@ -63,20 +63,20 @@ Available devices:
SUPPORTED_METRICS : [ AVAILABLE_DEVICES SUPPORTED_METRICS FULL_DEVICE_NAME OPTIMIZATION_CAPABILITIES SUPPORTED_CONFIG_KEYS RANGE_FOR_ASYNC_INFER_REQUESTS RANGE_FOR_STREAMS ]
FULL_DEVICE_NAME : Intel(R) UHD Graphics 620 (iGPU)
OPTIMIZATION_CAPABILITIES : [ FP32 BIN FP16 ]
SUPPORTED_CONFIG_KEYS : [ CACHE_DIR CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS CLDNN_GRAPH_DUMPS_DIR CLDNN_MAX_NUM_THREADS CLDNN_MEM_POOL CLDNN_NV12_TWO_INPUTS CLDNN_PLUGIN_PRIORITY CLDNN_PLUGIN_THROTTLE CLDNN_SOURCES_DUMPS_DIR CLDNN_ENABLE_LOOP_UNROLLING CONFIG_FILE DEVICE_ID DUMP_KERNELS DYN_BATCH_ENABLED EXCLUSIVE_ASYNC_REQUESTS GPU_THROUGHPUT_STREAMS PERF_COUNT TUNING_FILE TUNING_MODE ]
SUPPORTED_CONFIG_KEYS : [ CACHE_DIR CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS CLDNN_GRAPH_DUMPS_DIR GPU_MAX_NUM_THREADS CLDNN_MEM_POOL CLDNN_NV12_TWO_INPUTS CLDNN_PLUGIN_PRIORITY CLDNN_PLUGIN_THROTTLE CLDNN_SOURCES_DUMPS_DIR GPU_ENABLE_LOOP_UNROLLING CONFIG_FILE DEVICE_ID DUMP_KERNELS DYN_BATCH_ENABLED EXCLUSIVE_ASYNC_REQUESTS GPU_THROUGHPUT_STREAMS PERF_COUNT TUNING_FILE TUNING_MODE ]
RANGE_FOR_ASYNC_INFER_REQUESTS : { 1, 2, 1 }
RANGE_FOR_STREAMS : { 1, 2 }
Default values for device configuration keys:
CACHE_DIR : ""
CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS : YES
CLDNN_GRAPH_DUMPS_DIR : ""
CLDNN_MAX_NUM_THREADS : 8
CLDNN_MEM_POOL : YES
CLDNN_NV12_TWO_INPUTS : NO
CLDNN_PLUGIN_PRIORITY : 0
CLDNN_PLUGIN_THROTTLE : 0
CLDNN_SOURCES_DUMPS_DIR : ""
CLDNN_ENABLE_LOOP_UNROLLING : YES
GPU_MAX_NUM_THREADS : 8
GPU_ENABLE_LOOP_UNROLLING : YES
CONFIG_FILE : ""
DEVICE_ID : ""
DUMP_KERNELS : NO

View File

@ -1,6 +1,6 @@
# nGraph Function Creation C++ Sample {#openvino_inference_engine_samples_ngraph_function_creation_sample_README}
This sample demonstrates how to execute a synchronous inference using the [nGraph function feature](../../../docs/nGraph_DG/build_function.md) to create a network that uses weights from the LeNet classification network.
This sample demonstrates how to execute a synchronous inference using the [nGraph function feature](../../../docs/nGraph_DG/build_function.md) to create a network that uses weights from the LeNet classification network, which is known to work well on digit classification tasks.
The sample supports only single-channel `ubyte` images as an input.

View File

@ -108,7 +108,7 @@ void NumpyFile::GetFileInfo(const char* fileName, uint32_t numArrayToFindSize, u
cnpy::npz_t my_npz1 = cnpy::npz_load(fileName);
auto it = my_npz1.begin();
std::advance(it, numArrayToFindSize);
if (it != my_npz1.end()) {
numArrays = my_npz1.size();
cnpy::NpyArray my_npy = it->second;
numMemoryBytes = my_npy.data_holder->size();
@ -117,6 +117,9 @@ void NumpyFile::GetFileInfo(const char* fileName, uint32_t numArrayToFindSize, u
*ptrNumArrays = numArrays;
if (ptrNumMemoryBytes != NULL)
*ptrNumMemoryBytes = numMemoryBytes;
} else {
throw std::runtime_error(std::string("Failed to get info %s GetFileInfo()!\n") + fileName);
}
}
void NumpyFile::LoadFile(const char* fileName, uint32_t arrayIndex, std::string& ptrName, std::vector<uint8_t>& memory, uint32_t* ptrNumRows,
@ -124,6 +127,7 @@ void NumpyFile::LoadFile(const char* fileName, uint32_t arrayIndex, std::string&
cnpy::npz_t my_npz1 = cnpy::npz_load(fileName);
auto it = my_npz1.begin();
std::advance(it, arrayIndex);
if (it != my_npz1.end()) {
ptrName = it->first;
cnpy::NpyArray my_npy = it->second;
*ptrNumRows = my_npy.shape[0];
@ -134,6 +138,9 @@ void NumpyFile::LoadFile(const char* fileName, uint32_t arrayIndex, std::string&
}
*ptrNumBytesPerElement = sizeof(float);
} else {
throw std::runtime_error(std::string("Failed to open %s for reading in LoadFile()!\n") + fileName);
}
}
void NumpyFile::SaveFile(const char* fileName, bool shouldAppend, std::string name, void* ptrMemory, uint32_t numRows, uint32_t numColumns) {

View File

@ -3,10 +3,8 @@
//
#include <string>
#include <vector>
#include <memory>
#include <map>
#include <unordered_map>
#include "ie_metric_helpers.hpp"
#include "auto_exec_network.hpp"
@ -15,8 +13,8 @@
namespace AutoPlugin {
using namespace InferenceEngine;
AutoExecutableNetwork::AutoExecutableNetwork(const SoExecutableNetworkInternal& network) :
_network(network) {
AutoExecutableNetwork::AutoExecutableNetwork(const SoExecutableNetworkInternal& network, bool enablePerfCount) :
_network(network), _enablePerfCount(enablePerfCount) {
}
AutoExecutableNetwork::~AutoExecutableNetwork() = default;
@ -24,7 +22,7 @@ AutoExecutableNetwork::~AutoExecutableNetwork() = default;
InferenceEngine::IInferRequestInternal::Ptr AutoExecutableNetwork::CreateInferRequestImpl(InputsDataMap networkInputs,
OutputsDataMap networkOutputs) {
SoIInferRequestInternal inferRequest = {_network, _network->CreateInferRequest()};
return std::make_shared<AutoInferRequest>(_networkInputs, _networkOutputs, inferRequest);
return std::make_shared<AutoInferRequest>(_networkInputs, _networkOutputs, inferRequest, _enablePerfCount);
}
void AutoExecutableNetwork::Export(std::ostream& networkModel) {

View File

@ -19,16 +19,11 @@ namespace AutoPlugin {
using DeviceName = std::string;
struct DeviceInformation {
DeviceName deviceName;
std::map<std::string, std::string> config;
};
class AutoExecutableNetwork : public InferenceEngine::IExecutableNetworkInternal {
public:
using Ptr = std::shared_ptr<AutoExecutableNetwork>;
explicit AutoExecutableNetwork(const InferenceEngine::SoExecutableNetworkInternal& network);
explicit AutoExecutableNetwork(const InferenceEngine::SoExecutableNetworkInternal& network, bool enablePerfCount);
void Export(std::ostream& networkModel) override;
InferenceEngine::RemoteContext::Ptr GetContext() const override;
@ -43,6 +38,7 @@ public:
private:
InferenceEngine::SoExecutableNetworkInternal _network;
bool _enablePerfCount;
};
} // namespace AutoPlugin

View File

@ -11,13 +11,23 @@ namespace AutoPlugin {
AutoInferRequest::AutoInferRequest(const InputsDataMap& networkInputs,
const OutputsDataMap& networkOutputs,
const SoIInferRequestInternal& inferRequest)
const SoIInferRequestInternal& inferRequest,
bool enablePerfCount)
: IInferRequestInternal(networkInputs, networkOutputs)
, _inferRequest(inferRequest) {
, _inferRequest(inferRequest)
, _enablePerfCount(enablePerfCount) {
}
std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> AutoInferRequest::GetPerformanceCounts() const {
if (_enablePerfCount) {
try {
return _inferRequest->GetPerformanceCounts();
} catch (...) {
return {};
}
} else {
return {};
}
}
void AutoInferRequest::InferImpl() {

View File

@ -24,7 +24,8 @@ public:
using Ptr = std::shared_ptr<AutoInferRequest>;
explicit AutoInferRequest(const InferenceEngine::InputsDataMap& networkInputs,
const InferenceEngine::OutputsDataMap& networkOutputs,
const InferenceEngine::SoIInferRequestInternal& inferRequest);
const InferenceEngine::SoIInferRequestInternal& inferRequest,
bool enablePerfCount);
std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> GetPerformanceCounts() const override;
void InferImpl() override;
void SetBlob(const std::string& name, const InferenceEngine::Blob::Ptr& data) override;
@ -37,6 +38,7 @@ public:
private:
InferenceEngine::SoIInferRequestInternal _inferRequest;
bool _enablePerfCount;
};
} // namespace AutoPlugin

View File

@ -75,11 +75,11 @@ IE::QueryNetworkResult AutoInferencePlugin::QueryNetwork(const IE::CNNNetwork& n
}
auto fullConfig = mergeConfigs(_config, config);
auto metaDevices = GetDeviceChoice(fullConfig);
auto metaDevices = GetDeviceList(fullConfig);
std::unordered_set<std::string> supportedLayers;
for (auto&& value : metaDevices) {
try {
auto deviceQr = GetCore()->QueryNetwork(network, value.deviceName, value.config);
auto deviceQr = GetCore()->QueryNetwork(network, value, {});
std::unordered_set<std::string> deviceSupportedLayers;
for (auto &&layerQr : deviceQr.supportedLayersMap) {
deviceSupportedLayers.emplace(layerQr.first);
@ -111,7 +111,19 @@ IE::Parameter AutoInferencePlugin::GetConfig(const std::string& name,
void AutoInferencePlugin::SetConfig(const ConfigType& config) {
for (auto && kvp : config) {
if (kvp.first.find("AUTO_") == 0) {
_config[kvp.first] = kvp.second;
} else if (kvp.first == IE::PluginConfigParams::KEY_PERF_COUNT) {
if (kvp.second == IE::PluginConfigParams::YES ||
kvp.second == IE::PluginConfigParams::NO) {
_config[kvp.first] = kvp.second;
} else {
IE_THROW() << "Unsupported config value: " << kvp.second
<< " for key: " << kvp.first;
}
} else {
IE_THROW() << "Unsupported config key: " << kvp.first;
}
}
}
@ -128,7 +140,10 @@ IE::Parameter AutoInferencePlugin::GetMetric(const std::string& name,
std::string device_name = {"Inference Engine AUTO device"};
IE_SET_METRIC_RETURN(FULL_DEVICE_NAME, device_name);
} else if (name == METRIC_KEY(SUPPORTED_CONFIG_KEYS)) {
std::vector<std::string> configKeys;
std::vector<std::string> configKeys = {
IE::KEY_AUTO_DEVICE_LIST,
IE::PluginConfigParams::KEY_PERF_COUNT
};
IE_SET_METRIC_RETURN(SUPPORTED_CONFIG_KEYS, configKeys);
} else if (name == METRIC_KEY(OPTIMIZATION_CAPABILITIES)) {
std::vector<std::string> capabilities = GetOptimizationCapabilities(options);
@ -139,42 +154,21 @@ IE::Parameter AutoInferencePlugin::GetMetric(const std::string& name,
}
//////////////////////////////////// private & protected functions ///////////////////
std::vector<AutoPlugin::DeviceInformation> AutoInferencePlugin::GetDeviceChoice(const ConfigType& config) const {
std::vector<DeviceInformation> metaDevices;
std::vector<std::string> availableDevices;
std::vector<DeviceName> AutoInferencePlugin::GetDeviceList(const ConfigType& config) const {
std::vector<DeviceName> deviceList;
auto deviceListConfig = config.find(IE::KEY_AUTO_DEVICE_LIST);
if (deviceListConfig == config.end()) {
availableDevices = GetCore()->GetAvailableDevices();
deviceList = GetCore()->GetAvailableDevices();
} else {
availableDevices = IE::DeviceIDParser::getHeteroDevices(deviceListConfig->second);
deviceList = IE::DeviceIDParser::getHeteroDevices(deviceListConfig->second);
}
auto getDeviceConfig = [&] (const DeviceName & deviceWithID) {
IE::DeviceIDParser deviceParser(deviceWithID);
std::string deviceName = deviceParser.getDeviceName();
ConfigType tconfig = config;
// set device ID if any
std::string deviceIDLocal = deviceParser.getDeviceID();
if (!deviceIDLocal.empty()) {
tconfig[IE::PluginConfigParams::KEY_DEVICE_ID] = deviceIDLocal;
}
return GetSupportedConfig(tconfig, deviceName);
};
for (auto && d : availableDevices) {
if (d != _pluginName) {
metaDevices.push_back({ d, getDeviceConfig(d)});
}
}
if (metaDevices.empty()) {
if (deviceList.empty()) {
IE_THROW() << "Please, check environment due to no supported devices can be used";
}
return metaDevices;
return deviceList;
}
std::vector<std::string> AutoInferencePlugin::GetOptimizationCapabilities(const std::map<std::string, IE::Parameter> & options) const {
@ -215,7 +209,21 @@ ConfigType AutoInferencePlugin::GetSupportedConfig(const ConfigType& config,
return supportedConfig;
}
DeviceInformation AutoInferencePlugin::SelectDevice(const std::vector<DeviceInformation>& metaDevices, const std::string& networkPrecision) {
void AutoInferencePlugin::CheckConfig(const ConfigType& config) {
std::vector<std::string> supportedConfigKeys = GetMetric(METRIC_KEY(SUPPORTED_CONFIG_KEYS), {});
for (auto&& c : config) {
auto itKey = std::find(supportedConfigKeys.begin(), supportedConfigKeys.end(), c.first);
if (supportedConfigKeys.end() == itKey) {
// CVS-57233
if (c.first.find("AUTO_") == 0) {
continue;
}
IE_THROW() << "AUTO plugin doesn't support config key " << c.first;
}
}
}
DeviceName AutoInferencePlugin::SelectDevice(const std::vector<DeviceName>& metaDevices, const std::string& networkPrecision) {
if (metaDevices.empty()) {
IE_THROW(NotFound) << "No available device to select in AUTO plugin";
}
@ -223,15 +231,15 @@ DeviceInformation AutoInferencePlugin::SelectDevice(const std::vector<DeviceInfo
return metaDevices.at(0);
}
std::vector<DeviceInformation> CPU;
std::vector<DeviceInformation> GPU;
std::vector<DeviceName> CPU;
std::vector<DeviceName> GPU;
for (auto& item : metaDevices) {
if (item.deviceName.find("CPU") == 0) {
if (item.find("CPU") == 0) {
CPU.push_back(item);
continue;
}
if (item.deviceName.find("GPU") == 0) {
if (item.find("GPU") == 0) {
GPU.push_back(item);
continue;
}
@ -242,10 +250,10 @@ DeviceInformation AutoInferencePlugin::SelectDevice(const std::vector<DeviceInfo
}
// Sort GPU by name: GPU.2 > GPU.1 > GPU.0 > GPU, so we always choose the GPU[0] as best device
std::sort(GPU.begin(), GPU.end(), [](const DeviceInformation& a, const DeviceInformation& b)->bool{return b.deviceName < a.deviceName;});
std::sort(GPU.begin(), GPU.end(), [](const DeviceName& a, const DeviceName& b)->bool{return b < a;});
for (auto&& item : GPU) {
std::vector<std::string> capability = GetCore()->GetMetric(item.deviceName, METRIC_KEY(OPTIMIZATION_CAPABILITIES));
std::vector<std::string> capability = GetCore()->GetMetric(item, METRIC_KEY(OPTIMIZATION_CAPABILITIES));
auto res = std::find(capability.begin(), capability.end(), networkPrecision);
if (res != capability.end()) {
return item;

View File

@ -30,10 +30,11 @@ public:
void SetConfig(const ConfigType& config) override;
private:
std::vector<AutoPlugin::DeviceInformation> GetDeviceChoice(const ConfigType& config) const;
std::vector<DeviceName> GetDeviceList(const ConfigType& config) const;
std::vector<std::string> GetOptimizationCapabilities(const std::map<std::string, IE::Parameter>& options) const;
DeviceInformation SelectDevice(const std::vector<DeviceInformation>& metaDevices, const std::string& networkPrecision = METRIC_VALUE(FP32));
ConfigType GetSupportedConfig(const ConfigType& config, const AutoPlugin::DeviceName & deviceName) const;
DeviceName SelectDevice(const std::vector<DeviceName>& metaDevices, const std::string& networkPrecision = METRIC_VALUE(FP32));
ConfigType GetSupportedConfig(const ConfigType& config, const DeviceName & deviceName) const;
void CheckConfig(const ConfigType& config);
static ConfigType mergeConfigs(ConfigType config, const ConfigType& local);
template <typename T>
@ -41,18 +42,21 @@ private:
if (GetCore() == nullptr) {
IE_THROW() << "Please, work with AUTO device via InferencEngine::Core object";
}
CheckConfig(config);
auto fullConfig = mergeConfigs(_config, config);
auto metaDevices = GetDeviceChoice(fullConfig);
DeviceInformation selectedDevice;
auto metaDevices = GetDeviceList(fullConfig);
DeviceName selectedDevice;
IE::SoExecutableNetworkInternal executableNetwork;
while (!metaDevices.empty()) {
selectedDevice = SelectDevice(metaDevices, networkPrecision);
try {
executableNetwork = GetCore()->LoadNetwork(param, selectedDevice.deviceName, selectedDevice.config);
executableNetwork = GetCore()->LoadNetwork(param, selectedDevice, {});
break;
} catch (...) {
auto eraseDevice = std::find_if(metaDevices.begin(), metaDevices.end(),
[=](const DeviceInformation& d)->bool{return d.deviceName == selectedDevice.deviceName;});
[=](const DeviceName& d)->bool{return d == selectedDevice;});
if (eraseDevice == metaDevices.end()) {
IE_THROW() << "Didn't find the selected device name";
}
@ -63,7 +67,10 @@ private:
if (!executableNetwork) {
IE_THROW() << "Failed to load network by AUTO plugin";
}
auto impl = std::make_shared<AutoExecutableNetwork>(executableNetwork);
bool enablePerfCount = fullConfig.find(IE::PluginConfigParams::KEY_PERF_COUNT) != fullConfig.end();
auto impl = std::make_shared<AutoExecutableNetwork>(executableNetwork, enablePerfCount);
if (std::is_same<std::string, T>::value) {
SetExeNetworkInfo(impl, executableNetwork->GetInputsInfo(),

View File

@ -5,6 +5,7 @@
#include <sys/stat.h>
#include <cldnn/cldnn_config.hpp>
#include <gpu/gpu_config.hpp>
#include "cldnn_config.h"
#include "cpp_interfaces/interface/ie_internal_plugin_config.hpp"
#include "ie_api.h"
@ -39,6 +40,7 @@ static void createDirectory(std::string _path) {
}
}
IE_SUPPRESS_DEPRECATED_START
void Config::UpdateFromMap(const std::map<std::string, std::string>& configMap) {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "Config::UpdateFromMap");
for (auto& kvp : configMap) {
@ -69,7 +71,8 @@ void Config::UpdateFromMap(const std::map<std::string, std::string>& configMap)
} else {
IE_THROW(NotFound) << "Unsupported property value by plugin: " << val;
}
} else if (key.compare(CLDNNConfigParams::KEY_CLDNN_PLUGIN_PRIORITY) == 0) {
} else if (key.compare(GPUConfigParams::KEY_GPU_PLUGIN_PRIORITY) == 0 ||
key.compare(CLDNNConfigParams::KEY_CLDNN_PLUGIN_PRIORITY) == 0) {
std::stringstream ss(val);
uint32_t uVal(0);
ss >> uVal;
@ -93,7 +96,8 @@ void Config::UpdateFromMap(const std::map<std::string, std::string>& configMap)
IE_THROW(ParameterMismatch) << "Unsupported queue priority value: " << uVal;
}
} else if (key.compare(CLDNNConfigParams::KEY_CLDNN_PLUGIN_THROTTLE) == 0) {
} else if (key.compare(GPUConfigParams::KEY_GPU_PLUGIN_THROTTLE) == 0 ||
key.compare(CLDNNConfigParams::KEY_CLDNN_PLUGIN_THROTTLE) == 0) {
std::stringstream ss(val);
uint32_t uVal(0);
ss >> uVal;
@ -205,7 +209,8 @@ void Config::UpdateFromMap(const std::map<std::string, std::string>& configMap)
} else {
IE_THROW(NotFound) << "Unsupported property value by plugin: " << val;
}
} else if (key.compare(CLDNNConfigParams::KEY_CLDNN_NV12_TWO_INPUTS) == 0) {
} else if (key.compare(GPUConfigParams::KEY_GPU_NV12_TWO_INPUTS) == 0 ||
key.compare(CLDNNConfigParams::KEY_CLDNN_NV12_TWO_INPUTS) == 0) {
if (val.compare(PluginConfigParams::YES) == 0) {
nv12_two_inputs = true;
} else if (val.compare(PluginConfigParams::NO) == 0) {
@ -221,7 +226,7 @@ void Config::UpdateFromMap(const std::map<std::string, std::string>& configMap)
} else {
IE_THROW(NotFound) << "Unsupported KEY_CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS flag value: " << val;
}
} else if (key.compare(CLDNNConfigParams::KEY_CLDNN_MAX_NUM_THREADS) == 0) {
} else if (key.compare(GPUConfigParams::KEY_GPU_MAX_NUM_THREADS) == 0) {
int max_threads = std::max(1, static_cast<int>(std::thread::hardware_concurrency()));
try {
int val_i = std::stoi(val);
@ -231,17 +236,17 @@ void Config::UpdateFromMap(const std::map<std::string, std::string>& configMap)
n_threads = val_i;
}
} catch (const std::exception&) {
IE_THROW() << "Wrong value for property key " << CLDNNConfigParams::KEY_CLDNN_MAX_NUM_THREADS << ": " << val
IE_THROW() << "Wrong value for property key " << GPUConfigParams::KEY_GPU_MAX_NUM_THREADS << ": " << val
<< "\nSpecify the number of threads use for build as an integer."
<< "\nOut of range value will be set as a default value, maximum concurrent threads.";
}
} else if (key.compare(CLDNNConfigParams::KEY_CLDNN_ENABLE_LOOP_UNROLLING) == 0) {
} else if (key.compare(GPUConfigParams::KEY_GPU_ENABLE_LOOP_UNROLLING) == 0) {
if (val.compare(PluginConfigParams::YES) == 0) {
enable_loop_unrolling = true;
} else if (val.compare(PluginConfigParams::NO) == 0) {
enable_loop_unrolling = false;
} else {
IE_THROW(ParameterMismatch) << "Unsupported KEY_CLDNN_ENABLE_LOOP_UNROLLING flag value: " << val;
IE_THROW(ParameterMismatch) << "Unsupported KEY_GPU_ENABLE_LOOP_UNROLLING flag value: " << val;
}
} else {
IE_THROW(NotFound) << "Unsupported property key by plugin: " << key;
@ -297,6 +302,7 @@ void Config::adjustKeyMapValues() {
default: break;
}
key_config_map[CLDNNConfigParams::KEY_CLDNN_PLUGIN_PRIORITY] = qp;
key_config_map[GPUConfigParams::KEY_GPU_PLUGIN_PRIORITY] = qp;
}
{
std::string qt = "0";
@ -307,6 +313,7 @@ void Config::adjustKeyMapValues() {
default: break;
}
key_config_map[CLDNNConfigParams::KEY_CLDNN_PLUGIN_THROTTLE] = qt;
key_config_map[GPUConfigParams::KEY_GPU_PLUGIN_THROTTLE] = qt;
}
{
std::string tm = PluginConfigParams::TUNING_DISABLED;
@ -328,11 +335,13 @@ void Config::adjustKeyMapValues() {
key_config_map[PluginConfigParams::KEY_GPU_THROUGHPUT_STREAMS] = std::to_string(throughput_streams);
key_config_map[PluginConfigParams::KEY_DEVICE_ID] = device_id;
key_config_map[PluginConfigParams::KEY_CONFIG_FILE] = "";
key_config_map[CLDNNConfigParams::KEY_CLDNN_MAX_NUM_THREADS] = std::to_string(n_threads);
key_config_map[GPUConfigParams::KEY_GPU_MAX_NUM_THREADS] = std::to_string(n_threads);
if (enable_loop_unrolling)
key_config_map[CLDNNConfigParams::KEY_CLDNN_ENABLE_LOOP_UNROLLING] = PluginConfigParams::YES;
key_config_map[GPUConfigParams::KEY_GPU_ENABLE_LOOP_UNROLLING] = PluginConfigParams::YES;
else
key_config_map[CLDNNConfigParams::KEY_CLDNN_ENABLE_LOOP_UNROLLING] = PluginConfigParams::NO;
key_config_map[GPUConfigParams::KEY_GPU_ENABLE_LOOP_UNROLLING] = PluginConfigParams::NO;
}
IE_SUPPRESS_DEPRECATED_END
} // namespace CLDNNPlugin

View File

@ -79,7 +79,7 @@
#include "cldnn_executable_network.h"
#include "cldnn_custom_layer.h"
#include "cldnn_itt.h"
#include "cldnn/cldnn_config.hpp"
#include "gpu/gpu_config.hpp"
#ifdef __linux__
# include <dlfcn.h>

View File

@ -16,7 +16,6 @@
#include "cldnn_itt.h"
#include <description_buffer.hpp>
#include <cldnn/cldnn_config.hpp>
#include "cldnn_infer_request.h"
#include <threading/ie_executor_manager.hpp>
#include "cldnn_async_infer_request.h"

View File

@ -16,7 +16,6 @@
#include "cldnn_graph.h"
#include "simple_math.h"
#include <description_buffer.hpp>
#include <cldnn/cldnn_config.hpp>
#include "cldnn_infer_request.h"
#include <threading/ie_executor_manager.hpp>
#include <fstream>

View File

@ -10,13 +10,18 @@
namespace GNAPluginNS {
namespace GNALimitations {
constexpr uint32_t bufferMaxSize = 65528;
constexpr uint32_t convMinFiltersNum = 4;
constexpr uint32_t convMaxFiltersNum = 65532;
constexpr uint32_t convFiltersNumDivider = 4;
constexpr uint32_t convFilterMaxSize = 768;
constexpr uint32_t convEachKernelByteAlignment = 16;
constexpr uint32_t noOfInputsDivisor = 8;
constexpr uint32_t noOfInputsLowPrecDivisor = 16;
constexpr uint32_t affineMaxBatchSize = 8;
namespace Cnn2D {
struct RangeLimit {
uint32_t min;

View File

@ -370,14 +370,8 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
auto minOutValue = quantizedParams->_dst_quant.GetMinValues().front();
auto maxOutValue = quantizedParams->_dst_quant.GetMaxValues().front();
auto absMax = std::max(std::abs(minOutValue), std::abs(maxOutValue));
auto absMin = std::min(std::abs(minOutValue), std::abs(maxOutValue));
result = (quantizedParams->_dst_quant.GetLevels() - 1) / (maxOutValue - minOutValue);
if (0 && fp32eq(absMin, 0.0f) && !fp32eq(absMax, 0.0f)) {
result = (quantizedParams->_dst_quant.GetLevels() - 1) / (2 * absMax);
}
//
//result = MAX_VAL_2B_FEAT / absMax;
if (std::isinf(result) || fp32eq(absMax, 0.0f)) {
result = max_activation_scale_factor;
}
@ -401,6 +395,7 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
(layer.isIdentity() || layer.isFakeQuantize()) && LayerInfo(prevLayer).isWeightableIdentity()) {
auto prevLayerQuant = InferenceEngine::getInjectedData<QuantizedLayerParams>(*prevLayer);
if (!fp32eq(prevLayerQuant->_src_quant.GetScale(), 1.0f) &&
prevLayerQuant->_src_quant.IsStatsSet() &&
(prevLayer2 == nullptr || LayerInfo(prevLayer2).has8BOr16BOutput())) {
result = prevLayerQuant->_src_quant.GetScale();
usePrevScaleFactor = true;

View File

@ -158,7 +158,8 @@ void GNAGraphCompiler::fillSplitConnections(InferenceEngine::CNNLayerPtr layer)
THROW_GNA_LAYER_EXCEPTION(layer) << " outData["<< i << "]" << " connected by " << j <<" connection doesnt connect to functional layer";
}
auto dataOutput = outFunctionalLayer.first->insData[outFunctionalLayer.second].lock();
for (int idx : outFunctionalLayer.second) {
auto dataOutput = outFunctionalLayer.first->insData[idx].lock();
padding = std::max(padding, LayerInfo(outFunctionalLayer.first).paddingSize())
* dataOutput->getPrecision().size();
@ -170,12 +171,13 @@ void GNAGraphCompiler::fillSplitConnections(InferenceEngine::CNNLayerPtr layer)
size_t aligned64_offset = outFunctionalLayer.first->GetParamAsInt("offset");
layerInfoItem.splitOutputLayers.emplace_back(
outFunctionalLayer.first,
outFunctionalLayer.second,
idx,
aligned64_offset * dataOutput->getPrecision().size(),
output_layer_size);
} else {
layerInfoItem.splitOutputLayers.emplace_back(
outFunctionalLayer.first, outFunctionalLayer.second, split_size, output_layer_size);
outFunctionalLayer.first, idx, split_size, output_layer_size);
}
}
}

View File

@ -155,14 +155,14 @@ inline InferenceEngine::CNNLayerPtr CNNNetPrevLayerSkipCertain(Layer layer, int
*/
template <class Layer>
inline std::pair<InferenceEngine::CNNLayerPtr, int> CNNNetCheckNextLayerSkipCertain(Layer layer, int oidx, int iidx, bool bOnlyCheck,
inline std::pair<InferenceEngine::CNNLayerPtr, std::vector<int>> CNNNetCheckNextLayerSkipCertain(Layer layer, int oidx, int iidx, bool bOnlyCheck,
const std::function<bool(CNNLayerPtr)> &shouldSkip) {
if (oidx >= layer->outData.size()) {
if (bOnlyCheck) return {nullptr, 0};
if (bOnlyCheck) return {nullptr, {}};
THROW_GNA_LAYER_EXCEPTION(layer) << " no next output layer for outdata: " << oidx;
}
if (getInputTo(layer->outData[oidx]).empty() || iidx >= getInputTo(layer->outData[oidx]).size()) {
if (bOnlyCheck) return {nullptr, 0};
if (bOnlyCheck) return {nullptr, {}};
THROW_GNA_LAYER_EXCEPTION(layer) << " no next output layer for outdata: " << oidx << " and inputTo index: " << iidx;
}
@ -174,12 +174,12 @@ inline std::pair<InferenceEngine::CNNLayerPtr, int> CNNNetCheckNextLayerSkipCer
while (shouldSkip(outLayer->second)) {
if (outLayer->second->outData.size() <= new_oidx) {
if (bOnlyCheck) return { nullptr, 0 };
if (bOnlyCheck) return { nullptr, {} };
THROW_GNA_LAYER_EXCEPTION(outLayer->second) << " no next output layer for outdata: " << new_oidx;
}
if (getInputTo(outLayer->second->outData[new_oidx]).size() <= new_iidx) {
if (bOnlyCheck) return { nullptr, 0 };
if (bOnlyCheck) return { nullptr, {} };
THROW_GNA_LAYER_EXCEPTION(outLayer->second) << " no next output layer for outdata: " << new_oidx << " and inputTo index: " << new_iidx;
}
@ -188,11 +188,7 @@ inline std::pair<InferenceEngine::CNNLayerPtr, int> CNNNetCheckNextLayerSkipCer
}
auto insDataIdx = CNNLayerFindInsDataIdxes(layer->outData[new_oidx], outLayer->second);
if (insDataIdx.size() != 1) {
if (bOnlyCheck) return { nullptr, 0 };
THROW_GNA_LAYER_EXCEPTION(layer) << " has multiple connection to " << new_oidx << " outData";
}
return { outLayer->second, insDataIdx.front() };
return { outLayer->second, insDataIdx };
}
/**
@ -256,7 +252,7 @@ inline std::pair<InferenceEngine::CNNLayerPtr, int> CNNNetCheckNextLayerSkipCer
/// @brief alias for strict checkNextLayer (false)
template <class Layer>
inline std::pair<InferenceEngine::CNNLayerPtr, int> CNNNetGetNextLayerSkipCertain(Layer layer, int oidx, int iidx,
inline std::pair<InferenceEngine::CNNLayerPtr, std::vector<int>> CNNNetGetNextLayerSkipCertain(Layer layer, int oidx, int iidx,
const std::function<bool(CNNLayerPtr)> &shouldSkip) {
return CNNNetCheckNextLayerSkipCertain(layer, oidx, iidx, false, shouldSkip);
}
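
The changed return type means callers now receive every insData index that connects the output to the next functional layer instead of a single index (as used in the fillSplitConnections hunk above). A minimal caller-side sketch, assuming only the helper shown here plus standard CNNLayer accessors:

    auto next = CNNNetGetNextLayerSkipCertain(layer, 0 /*oidx*/, 0 /*iidx*/, isNonFunctional);
    InferenceEngine::CNNLayerPtr nextLayer = next.first;
    for (int insDataIdx : next.second) {   // one entry per connection into nextLayer
        auto data = nextLayer->insData[insDataIdx].lock();
        // process each connected input of the next layer here
    }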

View File

@ -46,14 +46,10 @@ inline InferenceEngine::DataPtr Get2DReshapedData(InferenceEngine::DataPtr input
* @param layer
*/
inline bool HasTo2DReshapeData(InferenceEngine::CNNLayerPtr layer) {
if (GNAPluginNS::LayerInfo(layer).isPower())
if (GNAPluginNS::LayerInfo(layer).isPower() || GNAPluginNS::LayerInfo(layer).isCopy())
return true;
if (!GNAPluginNS::LayerInfo(layer).isScaleShift())
return false;
// Don't reshape user-defined ScaleShift layers
if (layer->name.rfind("SyntheticScaleShift", 0) == std::string::npos)
if (!GNAPluginNS::LayerInfo(layer).isSyntheticScaleShift())
return false;
// Don't reshape the first dnn layer since it breaks groups recognition
@ -61,8 +57,7 @@ inline bool HasTo2DReshapeData(InferenceEngine::CNNLayerPtr layer) {
return LayerInfo(ptr).isNonValuesChangable();
});
IE_ASSERT(prevLayer != nullptr);
if (LayerInfo(prevLayer).isInput())
return false;
if (LayerInfo(prevLayer).isInput()) return false;
// Don't reshape diagonallayers with bias connection
return !GNAPluginNS::LayerInfo(getCreatorLayer(layer->insData.front().lock()).lock()).has32BOutput();

View File

@ -17,6 +17,7 @@
#include <mm_malloc.h>
#include <serial/headers/2dot2/gna_model_header.hpp>
#include <serial/headers/2dot5/gna_model_header.hpp>
#include <serial/headers/2dot6/gna_model_header.hpp>
#endif
@ -133,10 +134,11 @@ GNAPluginNS::HeaderLatest::ModelHeader GNAModelSerial::ReadHeader(std::istream &
}
case 5:
case 6:
case 7:
readNBytes(&header, sizeof(HeaderLatest::ModelHeader), is);
break;
default:
THROW_GNA_EXCEPTION << "Imported file unsupported. minor version should have values in range 1 to 4 and is: " << header.version.minor;
THROW_GNA_EXCEPTION << "Imported file unsupported. minor version should have values in range 1 to 7 and is: " << header.version.minor;
}
break;
default:
@ -154,6 +156,40 @@ GNAPluginNS::HeaderLatest::ModelHeader GNAModelSerial::ReadHeader(std::istream &
return header;
}
GNAPluginNS::HeaderLatest::RuntimeEndPoint GNAModelSerial::ReadEndPoint(std::istream &is) {
is.exceptions(std::istream::failbit);
HeaderLatest::RuntimeEndPoint endPoint;
switch (modelHeader.version.major) {
case 2:
switch (modelHeader.version.minor) {
case 1:
case 2:
case 3:
case 4:
case 5:
case 6:
{
Header2dot6::RuntimeEndPoint tempEndPoint2dot6;
readBits(tempEndPoint2dot6, is);
endPoint = HeaderLatest::RuntimeEndPoint(tempEndPoint2dot6, modelHeader.nGroup);
break;
}
case 7:
readNBytes(&endPoint, sizeof(HeaderLatest::RuntimeEndPoint), is);
break;
default:
THROW_GNA_EXCEPTION << "Imported file unsupported. minor version should have values in range 1 to 7 and is: " << modelHeader.version.minor;
}
break;
default:
THROW_GNA_EXCEPTION << "Imported file unsupported. Import for files with major version equal to: "
<< modelHeader.version.major << " is not implemented";
}
return endPoint;
}
#define offsetFromBase(field)\
getOffsetFromBase(field, #field)
@ -324,18 +360,6 @@ void GNAModelSerial::Import(void *basePointer,
is.read(reinterpret_cast<char*>(basePointer), gnaGraphSize);
}
uint32_t guessGrouping(Gna2Model const& model) {
if (model.NumberOfOperations == 0 ||
model.Operations == nullptr ||
model.Operations[0].Operands == nullptr ||
model.Operations[0].NumberOfOperands == 0 ||
model.Operations[0].Operands[0]->Shape.NumberOfDimensions < 2) {
THROW_GNA_EXCEPTION << "Can not guess grouping";
}
return (std::min)(model.Operations[0].Operands[0]->Shape.Dimensions[0], model.Operations[0].Operands[0]->Shape.Dimensions[1]);
}
void GNAModelSerial::Export(void * basePointer, size_t gnaGraphSize, std::ostream & os) const {
os.exceptions(std::ostream::failbit);
@ -366,6 +390,9 @@ void GNAModelSerial::Export(void * basePointer, size_t gnaGraphSize, std::ostrea
out.descriptor_offset = offsetFromBase(ep.descriptor_ptr);
out.scaleFactor = ep.scaleFactor;
out.element_size = ep.element_size;
out.shape = ep.shape;
out.layout = ep.layout;
out.precision = ep.precision;
out.orientation = ep.orientation;
return out;
};
@ -381,7 +408,7 @@ void GNAModelSerial::Export(void * basePointer, size_t gnaGraphSize, std::ostrea
header.headerSize = sizeof(HeaderLatest::ModelHeader);
header.gnaMemSize = gnaGraphSize;
header.layersCount = layers.size();
header.nGroup = guessGrouping(*gna2Model);
header.nGroup = 1; // just to support the old models
header.nInputs = inputs.size();
header.nOutputs = outputs.size();
header.nTransposeInputs = transposeInputsInfo.size();
@ -796,13 +823,22 @@ std::vector<HeaderLatest::RuntimeEndPoint> GNAModelSerial::serializeOutputs(cons
std::size_t outputIndex = 0;
for (auto const &output : outputsDataMap) {
auto outputName = output.first;
auto inputDims = output.second->getTensorDesc().getDims();
uint32_t elementsCount = static_cast<uint32_t>(InferenceEngine::details::product(inputDims.begin(), inputDims.end()));
auto outputDims = output.second->getTensorDesc().getDims();
HeaderLatest::RuntimeEndPoint::Shape outputShape;
outputShape.NumberOfDimensions = outputDims.size();
for (size_t i=0; i < outputShape.NumberOfDimensions; ++i) {
outputShape.Dimensions[i] = static_cast<uint32_t>(outputDims[i]);
}
uint32_t elementsCount = static_cast<uint32_t>(InferenceEngine::details::product(outputDims.begin(), outputDims.end()));
InferenceEngine::Layout outputLayout = output.second->getLayout();
InferenceEngine::Precision::ePrecision outputPrecision = InferenceEngine::Precision::FP32;
HeaderLatest::RuntimeEndPoint endPoint(outputsDesc[outputIndex].scale_factor,
outputsDesc[outputIndex].ptrs[0],
outputsDesc[outputIndex].num_bytes_per_element,
elementsCount,
outputShape,
outputLayout,
outputPrecision,
outputsDesc[outputIndex].orientation);
endPoints.push_back(endPoint);
outputIndex++;
@ -818,18 +854,26 @@ std::vector<HeaderLatest::RuntimeEndPoint> GNAModelSerial::serializeInputs(const
for (auto const& input : inputsDataMap) {
auto inputName = input.first;
auto inputDims = input.second->getTensorDesc().getDims();
HeaderLatest::RuntimeEndPoint::Shape inputShape;
inputShape.NumberOfDimensions = inputDims.size();
for (size_t i=0; i < inputShape.NumberOfDimensions; ++i) {
inputShape.Dimensions[i] = static_cast<uint32_t>(inputDims[i]);
}
double scaleFactor = inputDesc->getScaleFactor(inputIndex);
std::vector<void *> descriptor_ptr = inputDesc->getPtrInputsGlobal(inputName);
IE_ASSERT(descriptor_ptr.size() > 0);
uint32_t element_size = 2u;
uint32_t elementsCount = static_cast<uint32_t>(InferenceEngine::details::product(inputDims.begin(), inputDims.end()));
intel_dnn_orientation_t orientation = inputDesc->getOrientation(inputName);
InferenceEngine::Layout inputLayout = input.second->getLayout();
InferenceEngine::Precision::ePrecision inputPrecision = InferenceEngine::Precision::FP32;
HeaderLatest::RuntimeEndPoint endPoint(scaleFactor,
descriptor_ptr[0],
element_size,
elementsCount,
inputShape,
inputLayout,
inputPrecision,
orientation);
endPoints.push_back(endPoint);
inputIndex++;
@ -846,20 +890,24 @@ void GNAModelSerial::ImportInputs(std::istream &is,
for (uint32_t inputIndex = 0; inputIndex < modelHeader.nInputs; inputIndex++) {
const std::string& name = (modelHeader.version.major == 2 && modelHeader.version.minor >= 3)
? inputNames.at(inputIndex) : std::string("input" + std::to_string(inputIndex));
HeaderLatest::RuntimeEndPoint input;
is.read(reinterpret_cast<char *>(&input), sizeof(input));
HeaderLatest::RuntimeEndPoint input = ReadEndPoint(is);
inputsDesc->getPtrInputsGlobal(name).push_back(reinterpret_cast<float*>(reinterpret_cast<uint8_t *> (basePtr) + input.descriptor_offset));
inputsDesc->orientation_in[name] = input.orientation;
inputsDesc->bytes_allocated_for_input[name] = input.element_size * input.elements_count;
auto inputDims = InferenceEngine::SizeVector({modelHeader.nGroup, input.elements_count / modelHeader.nGroup});
auto inputDims = InferenceEngine::SizeVector();
for (auto i = 0; i < input.shape.NumberOfDimensions; ++i) {
inputDims.push_back(input.shape.Dimensions[i]);
}
InferenceEngine::Layout inputLayout = static_cast<InferenceEngine::Layout>(input.layout);
InferenceEngine::Precision inputPresicion = InferenceEngine::Precision(static_cast<InferenceEngine::Precision::ePrecision>(input.precision));
dataMap[name] = std::make_shared<InferenceEngine::InputInfo>();
dataMap[name]->setInputData(std::make_shared<InferenceEngine::Data>(name,
InferenceEngine::TensorDesc(
InferenceEngine::Precision::FP32,
inputPresicion,
inputDims,
InferenceEngine::Layout::NC)));
inputLayout)));
inputsDesc->inputScaleFactors.push_back(input.scaleFactor);
}
}
@ -875,8 +923,8 @@ void GNAModelSerial::ImportOutputs(std::istream &is,
for (uint32_t outputIndex = 0; outputIndex < modelHeader.nOutputs; outputIndex++) {
const std::string& name = (modelHeader.version.major == 2 && modelHeader.version.minor >= 3)
? outputNames.at(outputIndex) : std::string("output" + std::to_string(outputIndex));
HeaderLatest::RuntimeEndPoint output;
is.read(reinterpret_cast<char *>(&output), sizeof(output));
HeaderLatest::RuntimeEndPoint output = ReadEndPoint(is);
OutputDesc description;
description.ptrs.push_back(reinterpret_cast<float*>(reinterpret_cast<uint8_t *> (basePtr) + output.descriptor_offset));
description.orientation = kDnnInterleavedOrientation;
@ -884,12 +932,17 @@ void GNAModelSerial::ImportOutputs(std::istream &is,
description.num_bytes_per_element = output.element_size;
description.scale_factor = output.scaleFactor;
auto outputDims = InferenceEngine::SizeVector({modelHeader.nGroup, output.elements_count / modelHeader.nGroup});
auto outputDims = InferenceEngine::SizeVector();
for (auto i = 0; i < output.shape.NumberOfDimensions; ++i) {
outputDims.push_back(output.shape.Dimensions[i]);
}
InferenceEngine::Layout outputLayout = static_cast<InferenceEngine::Layout>(output.layout);
InferenceEngine::Precision outputPresicion = InferenceEngine::Precision(static_cast<InferenceEngine::Precision::ePrecision>(output.precision));
dataMap[name] = std::make_shared<InferenceEngine::Data>(name,
InferenceEngine::TensorDesc(
InferenceEngine::Precision::FP32,
outputPresicion,
outputDims,
InferenceEngine::Layout::NC));
outputLayout));
desc.at(outputIndex) = description;
}
}

View File

@ -138,6 +138,8 @@ private:
*/
static GNAPluginNS::HeaderLatest::ModelHeader ReadHeader(std::istream &is);
GNAPluginNS::HeaderLatest::RuntimeEndPoint ReadEndPoint(std::istream &is);
/**
* @brief Import model from FS into preallocated buffer,
* buffers for pLayers, and pStructs are allocated here and required manual deallocation using mm_free

View File

@ -54,12 +54,17 @@
#include <transformations/common_optimizations/pull_transpose_through_fq.hpp>
#include <transformations/common_optimizations/relu_fake_quantize_fusion.hpp>
#include <transformations/common_optimizations/add_fake_quantize_fusion.hpp>
#include <transformations/utils/utils.hpp>
#include "transformations/remove_extra_reshapes.hpp"
#include "transformations/insert_transpose_after_convolution_or_pooling.hpp"
#include "transformations/insert_transpose_before_matmul.hpp"
#include "transformations/reorder_activation_and_pooling.hpp"
#include "transformations/swap_input_matmul_gna.hpp"
#include "transformations/convert_matmul_to_pointwise_convolution.hpp"
#include "transformations/split_convolution_with_large_buffer_size.hpp"
#include <ngraph/opsets/opset7.hpp>
#if GNA_LIB_VER == 2
#include <gna2-model-api.h>
@ -667,6 +672,15 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) {
// WA: ConvertPriorBox must be executed before the 1st ConstantFolding pass
manager.register_pass<ngraph::pass::ConvertPriorBox>();
manager.register_pass<ngraph::pass::CommonOptimizations>();
// TODO enable this transformation for networks with convolutions
if (!ngraph::op::util::has_op_with_type<ngraph::opset7::Convolution>(graph)) {
manager.register_pass<ConvertMatmulWithFqToPointWiseConvolution>();
manager.register_pass<ConvertMatmulWithBiasToPointWiseConvolution>();
manager.register_pass<ConvertMatmulToPointWiseConvolution>();
}
manager.register_pass<SplitConvolutionWithFq>();
manager.register_pass<SplitConvolutionWithBias>();
manager.register_pass<SplitConvolution>();
manager.register_pass<InsertTransposeBeforeMatmul>();
manager.register_pass<SwapInputMatMul>();
manager.register_pass<InsertTransposeAfterConvOrPool>();
@ -735,6 +749,7 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) {
passes->registerPass<SubstitutePReluPass>();
passes->registerPass<SubstituteSoftSignPass>();
passes->registerPass<BroadcastConstPass>();
passes->registerPass<ReorderMaxPoolPass>();
passes->registerPass<EltwiseSplitOverChannelsPass>();
passes->registerPass<InsertSplitAligningFilterPass>();
@ -753,7 +768,6 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) {
passes->registerPass<InsertIdentityLayerPass>();
passes->registerPass<BreakFusingOfOutputLayersPass>();
passes->registerPass<BroadcastConstPass>();
passes->registerPass<InsertDiagonalLayerPass>();
passes->registerPass<HandleMultipleActivationsForTheLayerPass>();
#if GNA_LIB_VER == 2
@ -1465,7 +1479,11 @@ static InferenceEngine::Layout GetLayoutForDims(const InferenceEngine::SizeVecto
Blob::Ptr GNAPlugin::GetOutputBlob(const std::string& name, InferenceEngine::Precision precision) {
// need to have intermediate blob for interleave conversion
InferenceEngine::Blob::Ptr outputBlob;
auto outputDims = outputsDataMap[name]->getTensorDesc().getDims();
auto outputDataIt = outputsDataMap.find(name);
if (outputDataIt == std::end(outputsDataMap)) {
THROW_GNA_EXCEPTION << "Output " << name << " isn't found";
}
auto outputDims = outputDataIt->second->getTensorDesc().getDims();
outputBlob = make_blob_with_precision(TensorDesc(precision, outputDims, GetLayoutForDims(outputDims)));
outputBlob->allocate();
return outputBlob;
@ -1475,7 +1493,11 @@ Blob::Ptr GNAPlugin::GetInputBlob(const std::string& name, InferenceEngine::Prec
InferenceEngine::Blob::Ptr inputBlob;
// need to have intermediate blob for interleave conversion
// TODO: NCHW format support is experimental = c++ MO did insert reshape, while TF mo - not
auto inputDims = inputsDataMap[name]->getTensorDesc().getDims();
auto inputDataIt = inputsDataMap.find(name);
if (inputDataIt == std::end(inputsDataMap)) {
THROW_GNA_EXCEPTION << "Input " << name << " isn't found";
}
auto inputDims = inputDataIt->second->getTensorDesc().getDims();
inputBlob = make_blob_with_precision(TensorDesc(precision, inputDims, GetLayoutForDims(inputDims)));
inputBlob->allocate();
return inputBlob;

View File

@ -86,7 +86,7 @@ static void insertDiagonalLayerBetween(InferenceEngine::CNNLayerPtr prevLayer,
});
IE_ASSERT(inputLayer != nullptr);
size_t weightsSize = (LayerInfo(prevLayer).has32BOutput() || LayerInfo(inputLayer).isInput()) ?
weightsSize = nextLayer->outData[0]->getDims().back() :
nextLayer->outData[0]->getDims().back() :
Get2DReshapedData(nextLayer->outData[0], 8)->getDims()[1];
std::vector<float> weightsValues(weightsSize, fillValue);
IE_ASSERT(diagLayer != nullptr);
@ -314,6 +314,7 @@ void HandleMultipleActivationsForTheLayerPass::run() {
LayerInfo info(inputTo.second);
if (info.isActivation()) {
if (odata->getDims().empty()) continue;
if (!activations.empty() && odata->getDims()[0] != 1) {
THROW_GNA_EXCEPTION << "Unsupported batch size " << odata->getDims()[0]
<< " for diagonal layer insertion";
@ -741,12 +742,17 @@ void RemovePermutationsNHWCToNCHWPass::run() {
IE_ASSERT(!input_to.empty());
auto current_layer = input_to.begin()->second;
setNHWCOrder(current_layer->input());
while (current_layer != pattern_end) {
setNHWCOrder(current_layer->outData[0]);
input_to = getInputTo(current_layer->outData[0]);
std::function<void(CNNLayerPtr)> propogateNHWCOrderRecursive =
[pattern_end, &propogateNHWCOrderRecursive, &setNHWCOrder](CNNLayerPtr current_layer) {
if (current_layer == pattern_end) return;
for (size_t i = 0; i < current_layer->outData.size(); ++i) {
setNHWCOrder(current_layer->outData[i]);
auto input_to = getInputTo(current_layer->outData[i]);
IE_ASSERT(!input_to.empty());
current_layer = input_to.begin()->second;
propogateNHWCOrderRecursive(input_to.begin()->second);
}
};
propogateNHWCOrderRecursive(current_layer);
if (LayerInfo(pattern_start).isPermute() && !getInputTo(pattern_start->outData.front()).empty()) {
auto layer_before_permute = CNNNetPrevLayer(pattern_start);
@ -1447,21 +1453,19 @@ void EltwiseSplitOverChannelsPass::run() {
THROW_GNA_LAYER_EXCEPTION(l) << "number of outputs expected to be 1";
}
auto oData = l->outData.front();
auto out_width = GetDataDimSize(oData, DataDimName::W);
auto totalElementsForOutput = details::product(oData->getDims().begin(), oData->getDims().end());
auto maxAffineElements = getPassManager()->getPolicy().GNAAffineDiagonalPolicy.limitedTo;
if (totalElementsForOutput <= maxAffineElements) {
continue;
}
// TODO: for now lets put split of 2 elements as restrictions
auto totalSplits = 1 + totalElementsForOutput / maxAffineElements;
if (totalSplits > 2) {
THROW_GNA_LAYER_EXCEPTION(l) << "split layer over output channels on more than 2 layers unsupported";
}
pass_trace() << "transforming " << LAYER_NAME(l) << " by splitting it to multiple eltwise operations\n";
auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(l);
bool sameInputs = l->insData[0].lock() == l->insData[1].lock();
std::vector<CNNLayerPtr> splitLayers(2);
for (size_t kThEltwiseInput = 0; kThEltwiseInput != 2; kThEltwiseInput++) {
// create split layer
@ -1472,31 +1476,38 @@ void EltwiseSplitOverChannelsPass::run() {
split->insData.push_back(l->insData[kThEltwiseInput]);
auto inputDesc = l->insData[kThEltwiseInput].lock()->getTensorDesc();
// need to split this desc
if (inputDesc.getLayout() != Layout::NC) {
THROW_GNA_LAYER_EXCEPTION(l)
<< "cannot split over channel: input " << std::to_string(kThEltwiseInput)
<< " layout need to be NC";
}
// create split layer outputs
for (size_t i = 0;; i++) {
auto elements_num = std::min(totalElementsForOutput - i * maxAffineElements,
size_t usedElements = 0;
for (size_t i = 0; i < totalSplits; i++) {
SizeVector newDims;
size_t elements_num = std::min(totalElementsForOutput - usedElements,
static_cast<size_t>(maxAffineElements));
if (inputDesc.getDims().size() == 2) {
newDims = SizeVector{1, elements_num};
} else {
elements_num = elements_num - elements_num % out_width;
newDims = SizeVector{1, elements_num / out_width, out_width};
}
SizeVector newDims = {1, elements_num};
auto newDesc = TensorDesc(inputDesc.getPrecision(), newDims, inputDesc.getLayout());
auto data = std::make_shared<Data>(l->name + "/" + std::to_string(kThEltwiseInput) + "/1", newDesc);
getCreatorLayer(data) = split;
split->outData.push_back(data);
if (elements_num != maxAffineElements) {
usedElements += elements_num;
if (usedElements == totalElementsForOutput) {
break;
}
}
// replacing connection X->eltwise to X->split
auto oData = CNNLayerFindOutData(l, kThEltwiseInput);
oData.second->second = split;
if (sameInputs) {
splitLayers[1] = splitLayers[0];
break;
}
}
// create concatlayer
@ -1507,8 +1518,6 @@ void EltwiseSplitOverChannelsPass::run() {
concat->outData.push_back(masterEltwise->outData.front());
getCreatorLayer(masterEltwise->outData.front()) = concat;
// create new eltwise layers - here 2 hardcode
for (size_t k = 0; k != totalSplits; k++) {
auto eltwiseRaw = std::make_shared<EltwiseLayer>(
LayerParams{l->name + "/eltwise/" + std::to_string(k), "Eltwise", Precision::FP32});
@ -1517,7 +1526,6 @@ void EltwiseSplitOverChannelsPass::run() {
eltwiseRaw->coeff = masterEltwise->coeff;
auto eltwise = quantized ? InferenceEngine::injectData<QuantizedLayerParams>(eltwiseRaw) : eltwiseRaw;
eltwise->insData.push_back(splitLayers[0]->outData[k]);
eltwise->insData.push_back(splitLayers[1]->outData[k]);
getInputTo(splitLayers[0]->outData[k])[eltwise->name] = eltwise;
@ -1529,6 +1537,15 @@ void EltwiseSplitOverChannelsPass::run() {
auto data = std::make_shared<Data>(l->name + "/elwise/out/" + std::to_string(k), newDesc);
getCreatorLayer(data) = eltwise;
eltwise->outData.push_back(data);
if (quantized) {
auto eltwiseQuant = InferenceEngine::getInjectedData<QuantizedLayerParams>(eltwise);
if (quantized->_src_quant.IsStatsSet()) {
eltwiseQuant->_src_quant.CopyStats(quantized->_src_quant);
}
if (quantized->_dst_quant.IsStatsSet()) {
eltwiseQuant->_dst_quant.CopyStats(quantized->_dst_quant);
}
}
getInputTo(data)[concat->name] = concat;
concat->insData.push_back(data);
}
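
The split sizing in the hunk above is easier to follow with concrete numbers; the limit below is purely illustrative, not the actual GNAAffineDiagonalPolicy value:

    // Assume maxAffineElements == 65536 (illustrative) and an NC input with
    // totalElementsForOutput == 100000:
    //   totalSplits = 1 + 100000 / 65536 = 2
    //   i = 0: elements_num = min(100000 - 0,     65536) = 65536 -> newDims {1, 65536}
    //   i = 1: elements_num = min(100000 - 65536, 65536) = 34464 -> newDims {1, 34464}
    // For a 3D input each chunk is additionally trimmed to a multiple of out_width,
    // e.g. out_width == 100 gives 65536 - 65536 % 100 = 65500 -> newDims {1, 655, 100}.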
@ -1919,13 +1936,20 @@ void FuseFQIntoWeightsPass::run() {
}
GNAFakeQuantizeLayer gnaFakeQuantizeLayer(fqLayer);
size_t layers_connected_to_fq_count = getInputTo(fqLayer->outData[0]).size();
auto inputTo = getInputTo(fqLayer->outData[0]);
size_t layers_connected_to_fq_count = inputTo.size();
auto layerBeforeWeightable = fqLayer;
while (layers_connected_to_fq_count == 1 && LayerInfo(inputTo.begin()->second).isNonFunctional()) {
layerBeforeWeightable = inputTo.begin()->second;
inputTo = getInputTo(layerBeforeWeightable->outData[0]);
layers_connected_to_fq_count = inputTo.size();
}
for (int index = 0; index < layers_connected_to_fq_count; index++) {
auto weightableLayer = CNNNetGetNextLayerSkipCertain(fqLayer, 0, index, isNonFunctional).first;
auto weightableLayer = CNNNetGetNextLayerSkipCertain(layerBeforeWeightable, 0, index, isNonFunctional).first;
if (!LayerInfo(weightableLayer).isWeightable()) {
continue;
}
if (weightableLayer->insData.size() != 3) {
if (weightableLayer->insData.size() < 2) {
continue;
}
@ -1942,7 +1966,8 @@ void FuseFQIntoWeightsPass::run() {
pass_trace() << "found " << LAYER_NAME(fqLayer) << " that will be converted to weights of "
<< LAYER_NAME(weightableLayer) << "\n";
auto biases = LayerUtils::getParamFromInputAsBlob(weightableLayer, biasesIdx);
auto biases = weightableLayer->insData.size() == 3 ?
LayerUtils::getParamFromInputAsBlob(weightableLayer, biasesIdx) : nullptr;
auto quantizedWeights = gnaFakeQuantizeLayer.getConstInputData();
// 1. broke existing connections - by detaching fq subgraph from rest of graph
@ -2149,8 +2174,11 @@ void MoveFakeQuantizeLayerIntoQuantParamsPass :: run() {
}
GNAFakeQuantizeLayer fqLayer(l);
auto prevLayer = CNNNetPrevLayerSkipCertain(*fqLayer, 0, donotSkip);
if (prevLayer->outData.size() != 1) {
THROW_GNA_LAYER_EXCEPTION(prevLayer) << " fake quantize input that connected to something else not supported";
auto prevDataIt = std::find_if(std::begin(prevLayer->outData), std::end(prevLayer->outData), [l](DataPtr data) {
return getInputTo(data).find(l->name) != std::end(getInputTo(data));
});
if (prevDataIt == std::end(prevLayer->outData)) {
THROW_GNA_LAYER_EXCEPTION(fqLayer) << "Invalid connection between " << prevLayer->name << " and " << l->name;
}
auto inputRange = fqLayer.getInputRange();
@ -2181,8 +2209,18 @@ void MoveFakeQuantizeLayerIntoQuantParamsPass :: run() {
quantParamsPrevLayer->_dst_quant.SetMinValues({ outputRange.first[0] }, false);
quantParamsPrevLayer->_dst_quant.SetMaxValues({ outputRange.second[0] }, false);
// Propagate destination statistics to the multiply layer if it's set for the next sum/sub layer (it is considered as a bias)
if (LayerInfo(prevLayer).isEltwiseSum() || LayerInfo(prevLayer).isEltwiseSub()) {
auto eltwPrevLayer = CNNNetPrevLayerSkipCertain(prevLayer, 0, donotSkip);
auto constLayer = CNNNetPrevLayerSkipCertain(prevLayer, 1, donotSkip);
if (LayerInfo(eltwPrevLayer).isEltwise() && LayerInfo(constLayer).isConst()) {
auto quantParamsEltwLayer = InferenceEngine::getInjectedData<QuantizedLayerParams>(eltwPrevLayer);
quantParamsEltwLayer->_dst_quant.CopyStats(quantParamsPrevLayer->_dst_quant);
}
}
auto fqQauntParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(l);
fqQauntParams->_dst_quant.SetLevels(fqLevels);
fqQauntParams->_dst_quant.SetLevels(UINT16_MAX);
fqQauntParams->_dst_quant.SetMinValues({ inputRange.first[0] }, true);
fqQauntParams->_dst_quant.SetMaxValues({ inputRange.second[0] }, true);
fqQauntParams->_dst_quant.SetMinValues({ outputRange.first[0] }, false);
@ -2198,7 +2236,7 @@ void MoveFakeQuantizeLayerIntoQuantParamsPass :: run() {
// FQ Layer is fused only when previous layer is const, memory or activation layer
// or a next layer is activation layer.
bool isFQFuseAllowed = allowFQFuse(l);
auto prevData = prevLayer->outData.front();
auto prevData = *prevDataIt;
// Find all output layers connected to FQ
auto nextLayers = CNNNetGetAllNextLayersSkipCertain(*fqLayer, -1, donotSkip);
@ -2207,7 +2245,7 @@ void MoveFakeQuantizeLayerIntoQuantParamsPass :: run() {
}
if (isFQFuseAllowed) {
getInputTo(prevLayer->outData.front()).clear();
getInputTo(prevData).clear();
}
// Connect all next layers after FQ to the layer that is before FQ
@ -2222,7 +2260,7 @@ void MoveFakeQuantizeLayerIntoQuantParamsPass :: run() {
for (int insDataIdx : insDatas) {
nextLayers[i]->insData[insDataIdx] = prevData;
}
getInputTo(prevLayer->outData.front())[nextLayers[i]->name] = nextLayers[i];
getInputTo(prevData)[nextLayers[i]->name] = nextLayers[i];
}
propagateStatistics(quantParamsPrevLayer, nextLayers[i]);

View File

@ -0,0 +1,197 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <cstdint>
#include <map>
#include "backend/dnn_types.h"
#include "serial/headers/2dot4/gna_model_header.hpp"
#include "serial/headers/2dot6/gna_model_header.hpp"
#include "serial/headers/latest/gna_model_header.hpp"
#include "gna_data_types.hpp"
#pragma pack(push, 1)
namespace GNAPluginNS {
namespace Header2dot7 {
/**
Maximal number of supported shape dimensions.
*/
#define GNA_SHAPE_MAXIMUM_NUMBER_OF_DIMENSIONS 8
/**
* @brief Header version 2.7
*/
struct ModelHeader {
/**
*@brief MagicNumber GNAM in the ASCII table, equal to hex 0x474e414d
*/
char gnam[4] = {};
/**
* @brief if the header size is not equal to sizeof(ModelHeader), some reserved data is appended at the end of the header;
* usually this indicates the model was produced by a version different from the one the current export function produces
*/
uint32_t headerSize = 0u;
struct Version {
/**
* @details Major version of the format, unsigned int, ex: 0x0001
* every change in the header or in the layer definitions should be reflected in a version change
* for backward compatibility new parsers can read older model versions with certain restrictions
*/
uint16_t major = 2u;
/**
* @details Minor version of the format, unsigned int, corresponding to the build revision for example
* changes in the minor version do not affect the layout of the model
*/
uint32_t minor = 7u;
} version;
/**
* @brief Memory required to be allocated using GNAAlloc()
*/
uint64_t gnaMemSize = 0ull;
/**
* @brief Number of GNA Layers
*/
uint64_t layersCount = 0ull;
/**
* @brief Grouping level
* This is a deprecated field, used for old models only (<= 2.6)
*/
uint32_t nGroup = 0u;
/**
* Convolution-related settings - they affect the input transformation
*/
uint32_t nRotateRows = 0u;
uint32_t nRotateColumns = 0u;
bool doRotateInput = false;
uint32_t nInputs = 0u;
uint32_t nOutputs = 0u;
/**
* Convolution-related settings - they affect the output transformation
*/
uint32_t nRotateOutputRows = 0u;
uint32_t nRotateOutputColumns = 0u;
bool doRotateOutput = false;
uint32_t nTransposeInputs = 0u;
uint32_t nTransposeOutputs = 0u;
/**
* Reserved Data might be here
*/
ModelHeader() = default;
ModelHeader(GNAPluginNS::Header2dot1::ModelHeader const &old) {
gnaMemSize = old.gnaMemSize;
layersCount = old.layersCount;
nGroup = old.nGroup;
nRotateRows = old.nRotateRows;
nRotateColumns = old.nRotateColumns;
nInputs = old.nInputs;
nOutputs = old.nOutputs;
version.minor = old.version.minor;
}
ModelHeader(GNAPluginNS::Header2dot4::ModelHeader const &old) {
gnaMemSize = old.gnaMemSize;
layersCount = old.layersCount;
nGroup = old.nGroup;
nRotateRows = old.nRotateRows;
nRotateColumns = old.nRotateColumns;
nInputs = old.nInputs;
nOutputs = old.nOutputs;
nRotateOutputRows = old.nRotateOutputRows;
nRotateOutputColumns = old.nRotateOutputColumns;
doRotateOutput = old.doRotateOutput;
version.minor = old.version.minor;
}
};
#pragma pack(pop)
/*
* The runtime endpoint is mostly the same as the serialized version, except for the descriptor field
*/
struct RuntimeEndPoint {
/**
* if the scale factor differs from the one passed into infer, the network might need to be requantized
*/
float scaleFactor = 0;
/**
* Pointer descriptor
*/
void* descriptor_ptr = nullptr;
/**
* Endpoint resolution in bytes.
*/
uint32_t element_size = 0;
/**
* Number of elements
*/
uint32_t elements_count = 0;
/**
* Offset in bytes of pointer descriptor
*/
uint64_t descriptor_offset = 0ull;
/**
Shape specifying dimension values.
*/
struct Shape {
/**
Number of dimensions or rank or order.
*/
uint32_t NumberOfDimensions = 0;
/**
array specifying value of each dimension.
Set all zeros for scalars.
*/
uint32_t Dimensions[GNA_SHAPE_MAXIMUM_NUMBER_OF_DIMENSIONS] = {0};
} shape;
/**
* Blob layout
*/
uint8_t layout = InferenceEngine::Layout::NC;
/**
* Blob precision
*/
uint8_t precision = InferenceEngine::Precision::FP32;
intel_dnn_orientation_t orientation = kDnnUnknownOrientation;
RuntimeEndPoint() = default;
RuntimeEndPoint(const GNAPluginNS::Header2dot6::RuntimeEndPoint &old, uint32_t ngroup) {
scaleFactor = old.scaleFactor;
descriptor_ptr = old.descriptor_ptr;
element_size = old.element_size;
elements_count = old.elements_count;
orientation = old.orientation;
layout = InferenceEngine::Layout::NC;
precision = InferenceEngine::Precision::FP32;
descriptor_offset = old.descriptor_offset;
InferenceEngine::SizeVector dims = {ngroup, elements_count / ngroup};
shape.NumberOfDimensions = static_cast<uint32_t>(dims.size());
for (auto i = 0; i < dims.size(); i++) {
shape.Dimensions[i] = dims[i];
}
}
RuntimeEndPoint(double scaleFactor,
void* descriptor_ptr,
uint32_t element_size,
uint32_t elements_count,
Shape shape,
uint8_t layout,
uint8_t precision,
intel_dnn_orientation_t orientation) : scaleFactor(scaleFactor),
descriptor_ptr(descriptor_ptr),
element_size(element_size),
elements_count(elements_count),
shape(shape),
layout(layout),
precision(precision),
orientation(orientation) { }
};
} // namespace Header2dot7
} // namespace GNAPluginNS
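
A minimal sketch of how the two constructors above are intended to be used, once for upgrading an old (<= 2.6) endpoint and once for serializing a new one; all values are illustrative:

    GNAPluginNS::Header2dot6::RuntimeEndPoint oldEp{};                           // e.g. filled by readBits(oldEp, is)
    GNAPluginNS::Header2dot7::RuntimeEndPoint upgraded(oldEp, /*ngroup=*/1);     // shape becomes {ngroup, elements_count / ngroup}

    GNAPluginNS::Header2dot7::RuntimeEndPoint::Shape shape;
    shape.NumberOfDimensions = 2;
    shape.Dimensions[0] = 1;
    shape.Dimensions[1] = 1024;
    GNAPluginNS::Header2dot7::RuntimeEndPoint ep(/*scaleFactor=*/1.0, /*descriptor_ptr=*/nullptr,
                                                 /*element_size=*/2, /*elements_count=*/1024,
                                                 shape, InferenceEngine::Layout::NC,
                                                 InferenceEngine::Precision::FP32,
                                                 kDnnInterleavedOrientation);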

View File

@ -4,11 +4,11 @@
#pragma once
#include "serial/headers/2dot6/gna_model_header.hpp"
#include "serial/headers/2dot7/gna_model_header.hpp"
namespace GNAPluginNS {
namespace HeaderLatest {
using ModelHeader = GNAPluginNS::Header2dot6::ModelHeader;
using RuntimeEndPoint = GNAPluginNS::Header2dot6::RuntimeEndPoint;
using ModelHeader = GNAPluginNS::Header2dot7::ModelHeader;
using RuntimeEndPoint = GNAPluginNS::Header2dot7::RuntimeEndPoint;
}
}

View File

@ -0,0 +1,180 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "transformations/convert_matmul_to_pointwise_convolution.hpp"
#include <ngraph/opsets/opset7.hpp>
#include <ngraph/pattern/op/or.hpp>
#include <ngraph/pattern/op/wrap_type.hpp>
#include "layers/gna_permute.hpp"
#include "backend/gna_limitations.hpp"
using namespace GNAPluginNS;
NGRAPH_RTTI_DEFINITION(ConvertMatmulToPointWiseConvolution, "ConvertMatmulToPointWiseConvolution", 0);
NGRAPH_RTTI_DEFINITION(ConvertMatmulWithBiasToPointWiseConvolution, "ConvertMatmulWithBiasToPointWiseConvolution", 0);
NGRAPH_RTTI_DEFINITION(ConvertMatmulWithFqToPointWiseConvolution, "ConvertMatmulWithFqToPointWiseConvolution", 0);
static std::tuple<bool, uint32_t, uint32_t, uint32_t> VerifyAndGetConvParams(std::shared_ptr<ngraph::Node> matmul_node) {
auto input1_shape = matmul_node->get_input_shape(0);
auto input2_shape = matmul_node->get_input_shape(1);
auto output_shape = matmul_node->get_output_shape(0);
if (input1_shape.size() == 3 && input1_shape.front() == 1) {
input1_shape.erase(std::begin(input1_shape));
}
if (input1_shape.size() != 2 || input2_shape.size() != 2 || output_shape.size() < 2) {
return std::make_tuple(false, 0, 0, 0);
}
// Check if MatMul or corresponding pointwise convolution are supported by GNA
const uint32_t width = input1_shape.front();
const uint32_t in_channels = input2_shape.back();
const uint32_t out_channels = input2_shape.front();
if (input1_shape.front() <= GNALimitations::affineMaxBatchSize ||
out_channels % GNALimitations::convFiltersNumDivider != 0 ||
out_channels > GNALimitations::convMaxFiltersNum ||
in_channels > GNALimitations::convFilterMaxSize) {
return std::make_tuple(false, 0, 0, 0);
}
return std::make_tuple(true, width, in_channels, out_channels);
}
static bool Convert(std::shared_ptr<ngraph::Node> matmul_node,
std::shared_ptr<ngraph::Node> add,
std::shared_ptr<ngraph::Node> bias,
std::shared_ptr<ngraph::Node> fq) {
bool supported;
uint32_t width, in_channels, out_channels;
std::tie(supported, width, in_channels, out_channels) = VerifyAndGetConvParams(matmul_node);
if (!supported) return false;
auto input_node = matmul_node->input_value(0).get_node_shared_ptr();
auto weights_node = matmul_node->input_value(1).get_node_shared_ptr();
auto base_name = matmul_node->get_friendly_name();
auto reshape_const_before = std::make_shared<ngraph::opset7::Constant>(ngraph::element::Type_t::i64,
ngraph::Shape{4},
ngraph::Shape{1, 1, width, in_channels});
auto reshape_before = std::make_shared<ngraph::opset7::Reshape>(input_node, reshape_const_before, false);
reshape_before->set_friendly_name(base_name + "/reshape_in");
auto transpose_before = std::make_shared<ngraph::opset7::Transpose>(reshape_before,
ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape{4},
GetPermuteOrder(InferenceEngine::Layout::NHWC, InferenceEngine::Layout::NCHW)));
transpose_before->set_friendly_name(base_name + "/transpose_in");
auto weights_reshape_const = std::make_shared<ngraph::opset7::Constant>(ngraph::element::Type_t::i64,
ngraph::Shape{4}, ngraph::Shape{out_channels, in_channels, 1, 1});
auto weights_reshaped = std::make_shared<ngraph::opset7::Reshape>(weights_node, weights_reshape_const, false);
std::shared_ptr<ngraph::Node> conv_node = std::make_shared<ngraph::opset7::Convolution>(transpose_before, weights_reshaped,
ngraph::Strides{1, 1}, ngraph::CoordinateDiff{0, 0}, ngraph::CoordinateDiff{0, 0},
ngraph::Strides{1, 1}, ngraph::op::PadType::VALID);
conv_node->set_friendly_name(base_name + "/conv");
std::shared_ptr<ngraph::Node> root_node = matmul_node;
if (bias != nullptr) {
conv_node = std::make_shared<ngraph::opset7::Add>(conv_node, bias);
root_node = add;
}
if (fq != nullptr) {
conv_node = fq->clone_with_new_inputs({conv_node, fq->input_value(1), fq->input_value(2),
fq->input_value(3), fq->input_value(4)});
root_node = fq;
}
auto transpose_after = std::make_shared<ngraph::opset7::Transpose>(conv_node,
ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape{4},
GetPermuteOrder(InferenceEngine::Layout::NCHW, InferenceEngine::Layout::NHWC)));
transpose_after->set_friendly_name(base_name + "/transpose_out");
auto output_shape = matmul_node->get_output_shape(0);
output_shape[output_shape.size() - 1] = out_channels;
output_shape[output_shape.size() - 2] = width;
auto reshape_const_after = std::make_shared<ngraph::opset7::Constant>(ngraph::element::Type_t::i64,
ngraph::Shape{output_shape.size()},
output_shape);
auto reshape_after = std::make_shared<ngraph::opset7::Reshape>(transpose_after, reshape_const_after, false);
reshape_after->set_friendly_name(base_name);
ngraph::replace_node(root_node, reshape_after);
return true;
}
ConvertMatmulToPointWiseConvolution::ConvertMatmulToPointWiseConvolution() {
auto const_input = ngraph::pattern::wrap_type<ngraph::opset7::Constant>();
auto const_fq = ngraph::pattern::wrap_type<ngraph::opset7::FakeQuantize>({const_input,
ngraph::pattern::wrap_type<ngraph::opset7::Constant>(),
ngraph::pattern::wrap_type<ngraph::opset7::Constant>(),
ngraph::pattern::wrap_type<ngraph::opset7::Constant>(),
ngraph::pattern::wrap_type<ngraph::opset7::Constant>()});
auto second_input = std::make_shared<ngraph::pattern::op::Or>(ngraph::OutputVector{const_input, const_fq});
auto matmul = ngraph::pattern::wrap_type<ngraph::opset7::MatMul>({ngraph::pattern::any_input(), second_input});
ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher &m) {
const auto& pattern_map = m.get_pattern_value_map();
return Convert(pattern_map.at(matmul).get_node_shared_ptr(), nullptr, nullptr, nullptr);
};
auto m = std::make_shared<ngraph::pattern::Matcher>(matmul, "ConvertMatmulToPointWiseConvolution");
this->register_matcher(m, callback);
}
ConvertMatmulWithBiasToPointWiseConvolution::ConvertMatmulWithBiasToPointWiseConvolution() {
auto const_input = ngraph::pattern::wrap_type<ngraph::opset7::Constant>();
auto const_fq = ngraph::pattern::wrap_type<ngraph::opset7::FakeQuantize>({const_input,
ngraph::pattern::wrap_type<ngraph::opset7::Constant>(),
ngraph::pattern::wrap_type<ngraph::opset7::Constant>(),
ngraph::pattern::wrap_type<ngraph::opset7::Constant>(),
ngraph::pattern::wrap_type<ngraph::opset7::Constant>()});
auto second_input = std::make_shared<ngraph::pattern::op::Or>(ngraph::OutputVector{const_input, const_fq});
auto matmul = ngraph::pattern::wrap_type<ngraph::opset7::MatMul>({ngraph::pattern::any_input(), second_input});
auto bias = ngraph::pattern::wrap_type<ngraph::opset7::Constant>();
auto add = ngraph::pattern::wrap_type<ngraph::opset7::Add>({matmul, bias});
ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher &m) {
const auto& pattern_map = m.get_pattern_value_map();
return Convert(pattern_map.at(matmul).get_node_shared_ptr(), pattern_map.at(add).get_node_shared_ptr(),
pattern_map.at(bias).get_node_shared_ptr(), nullptr);
};
auto m = std::make_shared<ngraph::pattern::Matcher>(add, "ConvertMatmulWithBiasToPointWiseConvolution");
this->register_matcher(m, callback);
}
ConvertMatmulWithFqToPointWiseConvolution::ConvertMatmulWithFqToPointWiseConvolution() {
auto const_input = ngraph::pattern::wrap_type<ngraph::opset7::Constant>();
auto const_fq = ngraph::pattern::wrap_type<ngraph::opset7::FakeQuantize>({const_input,
ngraph::pattern::wrap_type<ngraph::opset7::Constant>(),
ngraph::pattern::wrap_type<ngraph::opset7::Constant>(),
ngraph::pattern::wrap_type<ngraph::opset7::Constant>(),
ngraph::pattern::wrap_type<ngraph::opset7::Constant>()});
auto second_input = std::make_shared<ngraph::pattern::op::Or>(ngraph::OutputVector{const_input, const_fq});
auto matmul = ngraph::pattern::wrap_type<ngraph::opset7::MatMul>({ngraph::pattern::any_input(), second_input});
auto bias = ngraph::pattern::wrap_type<ngraph::opset7::Constant>();
auto add = ngraph::pattern::wrap_type<ngraph::opset7::Add>({matmul, bias});
auto matmul_out = std::make_shared<ngraph::pattern::op::Or>(ngraph::OutputVector{add, matmul});
auto out_fq = ngraph::pattern::wrap_type<ngraph::opset7::FakeQuantize>({matmul_out,
ngraph::pattern::wrap_type<ngraph::opset7::Constant>(),
ngraph::pattern::wrap_type<ngraph::opset7::Constant>(),
ngraph::pattern::wrap_type<ngraph::opset7::Constant>(),
ngraph::pattern::wrap_type<ngraph::opset7::Constant>()});
ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher &m) {
const auto& pattern_map = m.get_pattern_value_map();
auto add_it = pattern_map.find(add);
auto add_node = (add_it == std::end(pattern_map) ? nullptr : add_it->second.get_node_shared_ptr());
auto bias_it = pattern_map.find(bias);
auto bias_node = (bias_it == std::end(pattern_map) ? nullptr : bias_it->second.get_node_shared_ptr());
return Convert(pattern_map.at(matmul).get_node_shared_ptr(), add_node, bias_node,
pattern_map.at(out_fq).get_node_shared_ptr());
};
auto m = std::make_shared<ngraph::pattern::Matcher>(out_fq, "ConvertMatmulWithFqToPointWiseConvolution");
this->register_matcher(m, callback);
}

View File

@ -0,0 +1,71 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <ngraph/pass/graph_rewrite.hpp>
namespace GNAPluginNS {
/**
* @brief Convert a MatMul with batch size unsupported by GNA to a point-wise convolution with NHWC layout
* with transposes around it:
* Transpose (NHWC -> NCHW)
* |
* Matmul Convolution in NHWC layout
* Input1: [A, B] B > 8 -------> Input: [1, 1, A, B]
* Input2: [B, C] Kernel: [C, B, 1, 1]
* Output: [A, C] Output: [1, 1, A, C]
* |
* Transpose (NCHW -> NHWC)
*/
class ConvertMatmulToPointWiseConvolution : public ngraph::pass::MatcherPass {
public:
NGRAPH_RTTI_DECLARATION;
ConvertMatmulToPointWiseConvolution();
};
/**
* @brief Convert a MatMul with batch size unsupported by GNA to a point-wise convolution with NHWC layout
* with transposes around it, moving the add with bias before the last transpose:
* Transpose (NHWC -> NCHW)
* |
* Matmul Convolution in NHWC layout
* Input1: [A, B] B > 8 -------> Input: [1, 1, A, B]
* Input2: [B, C] Kernel: [C, B, 1, 1]
* Output: [A, C] Output: [1, 1, A, C]
* | |
* Add (const) Add (const)
* |
* Transpose (NCHW -> NHWC)
*/
class ConvertMatmulWithBiasToPointWiseConvolution : public ngraph::pass::MatcherPass {
public:
NGRAPH_RTTI_DECLARATION;
ConvertMatmulWithBiasToPointWiseConvolution();
};
/**
* @brief Convert a MatMul with batch size unsupported by GNA to a point-wise convolution with NHWC layout
* with transposes around it, moving the add with bias and/or fake quantize before the last transpose:
* Transpose (NHWC -> NCHW)
* |
* Matmul Convolution in NHWC layout
* Input1: [A, B] B > 8 -------> Input: [1, 1, A, B]
* Input2: [B, C] Kernel: [C, B, 1, 1]
* Output: [A, C] Output: [1, 1, A, C]
* | |
* Add (const) Add (const)
* | |
* FakeQuantize FakeQuantize
* |
* Transpose (NCHW -> NHWC)
*/
class ConvertMatmulWithFqToPointWiseConvolution : public ngraph::pass::MatcherPass {
public:
NGRAPH_RTTI_DECLARATION;
ConvertMatmulWithFqToPointWiseConvolution();
};
} // namespace GNAPluginNS
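
The matchers above only take effect once they are registered with a pass manager; the sketch below mirrors the GNAPlugin::LoadNetwork hunk earlier in this diff, where graph is the ngraph::Function being compiled. Registering the FakeQuantize and bias variants first presumably lets them claim their MatMuls before the plain variant runs:

    ngraph::pass::Manager manager;
    if (!ngraph::op::util::has_op_with_type<ngraph::opset7::Convolution>(graph)) {
        manager.register_pass<GNAPluginNS::ConvertMatmulWithFqToPointWiseConvolution>();
        manager.register_pass<GNAPluginNS::ConvertMatmulWithBiasToPointWiseConvolution>();
        manager.register_pass<GNAPluginNS::ConvertMatmulToPointWiseConvolution>();
    }
    manager.run_passes(graph);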

View File

@ -0,0 +1,131 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "transformations/split_convolution_with_large_buffer_size.hpp"
#include <numeric>
#include <ngraph/opsets/opset7.hpp>
#include <ngraph/pattern/op/or.hpp>
#include <ngraph/pattern/op/wrap_type.hpp>
#include "backend/gna_limitations.hpp"
using namespace GNAPluginNS;
NGRAPH_RTTI_DEFINITION(SplitConvolution, "SplitConvolution", 0);
NGRAPH_RTTI_DEFINITION(SplitConvolutionWithBias, "SplitConvolutionWithBias", 0);
NGRAPH_RTTI_DEFINITION(SplitConvolutionWithFq, "SplitConvolutionWithFq", 0);
static std::vector<int64_t> GetConvSplitSizes(std::shared_ptr<ngraph::Node> conv) {
uint32_t width = conv->get_input_shape(0).back();
uint32_t in_channels = conv->get_input_shape(0).at(1);
uint32_t usedWidth = 0;
std::vector<int64_t> split_sizes;
uint32_t width_max_size = GNALimitations::bufferMaxSize / in_channels;
width_max_size = width_max_size - width_max_size % 64;
while (usedWidth < width) {
uint32_t width_part = std::min(width - usedWidth, width_max_size);
split_sizes.push_back(width_part);
usedWidth += width_part;
}
IE_ASSERT(usedWidth == width);
return split_sizes;
}
static bool Convert(std::shared_ptr<ngraph::Node> conv,
std::shared_ptr<ngraph::Node> add,
std::shared_ptr<ngraph::Node> bias,
std::shared_ptr<ngraph::Node> fq) {
auto input_size = std::accumulate(std::begin(conv->get_input_shape(0)),
std::end(conv->get_input_shape(0)), 1, std::multiplies<size_t>());
if (input_size <= GNALimitations::bufferMaxSize) {
return false;
}
auto split_sizes = GetConvSplitSizes(conv);
IE_ASSERT(split_sizes.size() > 1);
/* TODO check if it's an NHWC convolution wrapped with transposes or all input dimensions except width are equal to 1,
otherwise this split axis isn't supported */
const int64_t width_axis = conv->get_input_shape(0).size() - 1;
auto split_node = std::make_shared<ngraph::opset7::VariadicSplit>(conv->input_value(0),
ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape({1}), std::vector<int64_t>{width_axis}),
ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape({split_sizes.size()}), split_sizes));
split_node->set_friendly_name(conv->get_friendly_name() + "/split");
ngraph::OutputVector convOutputs;
std::shared_ptr<ngraph::Node> root_node = fq ? fq : (add ? add : conv);
for (int i = 0; i < split_sizes.size(); ++i) {
std::shared_ptr<ngraph::Node> output = conv->clone_with_new_inputs({split_node->output(i), conv->input_value(1)});
output->set_friendly_name(conv->get_friendly_name() + "_" + std::to_string(i));
if (bias) {
output = std::make_shared<ngraph::opset7::Add>(output, bias);
}
if (fq) {
output = fq->clone_with_new_inputs({output, fq->input_value(1), fq->input_value(2),
fq->input_value(3), fq->input_value(4)});
}
convOutputs.push_back(output);
}
auto concat = std::make_shared<ngraph::opset7::Concat>(convOutputs, width_axis);
concat->set_friendly_name(conv->get_friendly_name());
ngraph::replace_node(root_node, concat);
return true;
}
SplitConvolution::SplitConvolution() {
auto conv = ngraph::pattern::wrap_type<ngraph::opset7::Convolution>({ngraph::pattern::any_input(),
ngraph::pattern::any_input()});
ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher &m) {
const auto& pattern_map = m.get_pattern_value_map();
return Convert(pattern_map.at(conv).get_node_shared_ptr(), nullptr, nullptr, nullptr);
};
auto m = std::make_shared<ngraph::pattern::Matcher>(conv, "SplitConvolution");
this->register_matcher(m, callback);
}
SplitConvolutionWithBias::SplitConvolutionWithBias() {
auto conv = ngraph::pattern::wrap_type<ngraph::opset7::Convolution>({ngraph::pattern::any_input(),
ngraph::pattern::any_input()});
auto bias = ngraph::pattern::wrap_type<ngraph::opset7::Constant>();
auto add = ngraph::pattern::wrap_type<ngraph::opset7::Add>({conv, bias});
ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher &m) {
const auto& pattern_map = m.get_pattern_value_map();
return Convert(pattern_map.at(conv).get_node_shared_ptr(), pattern_map.at(add).get_node_shared_ptr(),
pattern_map.at(bias).get_node_shared_ptr(), nullptr);
};
auto m = std::make_shared<ngraph::pattern::Matcher>(add, "SplitConvolutionWithBias");
this->register_matcher(m, callback);
}
SplitConvolutionWithFq::SplitConvolutionWithFq() {
auto conv = ngraph::pattern::wrap_type<ngraph::opset7::Convolution>({ngraph::pattern::any_input(),
ngraph::pattern::any_input()});
auto bias = ngraph::pattern::wrap_type<ngraph::opset7::Constant>();
auto add = ngraph::pattern::wrap_type<ngraph::opset7::Add>({conv, bias});
auto conv_output = std::make_shared<ngraph::pattern::op::Or>(ngraph::OutputVector{conv, add});
auto out_fq = ngraph::pattern::wrap_type<ngraph::opset7::FakeQuantize>({conv_output,
ngraph::pattern::wrap_type<ngraph::opset7::Constant>(),
ngraph::pattern::wrap_type<ngraph::opset7::Constant>(),
ngraph::pattern::wrap_type<ngraph::opset7::Constant>(),
ngraph::pattern::wrap_type<ngraph::opset7::Constant>()});
ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher &m) {
const auto& pattern_map = m.get_pattern_value_map();
auto add_it = pattern_map.find(add);
auto add_node = (add_it == std::end(pattern_map) ? nullptr : add_it->second.get_node_shared_ptr());
auto bias_it = pattern_map.find(bias);
auto bias_node = (bias_it == std::end(pattern_map) ? nullptr : bias_it->second.get_node_shared_ptr());
return Convert(pattern_map.at(conv).get_node_shared_ptr(), add_node, bias_node, pattern_map.at(out_fq).get_node_shared_ptr());
};
auto m = std::make_shared<ngraph::pattern::Matcher>(out_fq, "SplitConvolutionWithFq");
this->register_matcher(m, callback);
}
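
A worked example of GetConvSplitSizes with illustrative numbers (the real limit is GNALimitations::bufferMaxSize; 65528 is assumed here only to make the arithmetic concrete):

    // in_channels = 8, width = 16000, bufferMaxSize = 65528 (assumed)
    // width_max_size = 65528 / 8 = 8191 -> 8191 - 8191 % 64 = 8128
    // split_sizes    = {8128, 7872}      // 8128 + 7872 == 16000
    // Convert() then runs one convolution per chunk and concatenates the results
    // back along the width axis.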

View File

@ -0,0 +1,34 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <ngraph/pass/graph_rewrite.hpp>
namespace GNAPluginNS {
// @brief Splits convolution with large input buffer
class SplitConvolution : public ngraph::pass::MatcherPass {
public:
NGRAPH_RTTI_DECLARATION;
SplitConvolution();
};
// @brief Splits convolution with large input buffer, moving the add with bias to each convolution before the concat
class SplitConvolutionWithBias : public ngraph::pass::MatcherPass {
public:
NGRAPH_RTTI_DECLARATION;
SplitConvolutionWithBias();
};
/* @brief Splits convolution with large input buffer,
* moving the add with bias and/or fake quantize to each convolution before the concat
*/
class SplitConvolutionWithFq : public ngraph::pass::MatcherPass {
public:
NGRAPH_RTTI_DECLARATION;
SplitConvolutionWithFq();
};
} // namespace GNAPluginNS

View File

@ -312,6 +312,7 @@ HeteroExecutableNetwork::HeteroExecutableNetwork(const InferenceEngine::CNNNetwo
struct Subgraph {
ngraph::ResultVector _results;
ngraph::ParameterVector _parameters;
ngraph::SinkVector _sinks;
std::string _affinity;
};
std::unordered_map<int, Subgraph> subgraphs;
@ -325,6 +326,9 @@ HeteroExecutableNetwork::HeteroExecutableNetwork(const InferenceEngine::CNNNetwo
} else if (ngraph::op::is_parameter(node)) {
subgraph._parameters.emplace_back(
std::dynamic_pointer_cast<ngraph::op::v0::Parameter>(node->shared_from_this()));
} else if (ngraph::op::is_sink(node)) {
subgraph._sinks.emplace_back(
std::dynamic_pointer_cast<ngraph::op::Sink>(node->shared_from_this()));
}
auto itAffinity = affinities.find(node);
if (itAffinity != affinities.end()) {
@ -373,7 +377,7 @@ HeteroExecutableNetwork::HeteroExecutableNetwork(const InferenceEngine::CNNNetwo
for (auto&& subgraph : orderedSubgraphs) {
_networks[id]._device = subgraph._affinity;
subFunctions[id] =
std::make_shared<ngraph::Function>(subgraph._results, subgraph._parameters,
std::make_shared<ngraph::Function>(subgraph._results, subgraph._sinks, subgraph._parameters,
_name + '_' + std::to_string(id));
_networks[id]._clonedNetwork = CNNNetwork{subFunctions[id]};
// update of pre-processing info
@ -550,7 +554,7 @@ HeteroExecutableNetwork::HeteroExecutableNetwork(std::istream&
this->SetPointerToPlugin(_heteroPlugin->shared_from_this());
}
void HeteroExecutableNetwork::ExportImpl(std::ostream& heteroModel) {
void HeteroExecutableNetwork::Export(std::ostream& heteroModel) {
pugi::xml_document doc;
auto heteroNode = doc.append_child("hetero");
heteroNode.append_attribute("name").set_value(_name.c_str());
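
The new _sinks handling above matters for stateful subgraphs: sink nodes (for example Assign) have no Result consumers, so they would otherwise be dropped when the per-device sub-functions are built. A condensed sketch of the pattern used in that hunk, with illustrative variable names:

    ngraph::SinkVector sinks;
    for (const auto& node : function->get_ops()) {
        if (ngraph::op::is_sink(node)) {
            sinks.emplace_back(std::dynamic_pointer_cast<ngraph::op::Sink>(node->shared_from_this()));
        }
    }
    auto subFunction = std::make_shared<ngraph::Function>(results, sinks, parameters, "subgraph");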

View File

@ -56,7 +56,7 @@ public:
InferenceEngine::Parameter GetMetric(const std::string &name) const override;
void ExportImpl(std::ostream& modelFile) override;
void Export(std::ostream& modelFile) override;
private:
void InitCNNImpl(const InferenceEngine::CNNNetwork& network);

View File

@ -57,13 +57,8 @@ InferenceEngine::IExecutableNetworkInternal::Ptr Engine::LoadExeNetworkImpl(cons
return std::make_shared<HeteroExecutableNetwork>(network, mergeConfigs(_config, config), this);
}
InferenceEngine::IExecutableNetworkInternal::Ptr Engine::ImportNetworkImpl(std::istream& heteroModel, const Configs& config) {
if (GetCore() == nullptr) {
IE_THROW() << "Please, work with HETERO device via InferencEngine::Core object";
}
return std::make_shared<HeteroExecutableNetwork>(heteroModel,
mergeConfigs(_config, config), this);
InferenceEngine::IExecutableNetworkInternal::Ptr Engine::ImportNetwork(std::istream& heteroModel, const std::map<std::string, std::string>& config) {
return std::make_shared<HeteroExecutableNetwork>(heteroModel, mergeConfigs(_config, config), this);
}
Engine::Configs Engine::GetSupportedConfig(const Engine::Configs& config, const std::string & deviceName) const {

View File

@ -37,7 +37,8 @@ public:
InferenceEngine::Parameter GetConfig(const std::string& name, const std::map<std::string,
InferenceEngine::Parameter> & options) const override;
InferenceEngine::IExecutableNetworkInternal::Ptr ImportNetworkImpl(std::istream& heteroModel, const Configs& config) override;
InferenceEngine::IExecutableNetworkInternal::Ptr
ImportNetwork(std::istream& heteroModel, const std::map<std::string, std::string>& config) override;
DeviceMetaInformationMap GetDevicePlugins(const std::string& targetFallback,
const Configs & localConfig) const;
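
With ImportNetworkImpl gone, plugins override ImportNetwork directly, as the HETERO plugin does above. A minimal sketch for a hypothetical plugin (class and helper names are illustrative):

    InferenceEngine::IExecutableNetworkInternal::Ptr
    MyPlugin::ImportNetwork(std::istream& model, const std::map<std::string, std::string>& config) {
        // the stream is handed to the plugin as-is; no magic/device-name prefix is stripped here
        return std::make_shared<MyExecutableNetwork>(model, mergeConfigs(_config, config), this);
    }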

View File

@ -49,19 +49,17 @@ std::shared_ptr<IInferRequestInternal> IExecutableNetworkInternal::CreateInferRe
}
void IExecutableNetworkInternal::Export(const std::string& modelFileName) {
// we need to write to stringstream first
// because in case of exception in ExportImpl the file is not created
std::stringstream strm;
ExportImpl(strm);
std::ofstream(modelFileName.c_str()) << strm.rdbuf();
std::ofstream modelFile(modelFileName, std::ios::out | std::ios::binary);
if (modelFile.is_open()) {
Export(modelFile);
} else {
IE_THROW() << "The " << modelFileName << " file can not be opened for Export";
}
}
void IExecutableNetworkInternal::Export(std::ostream& networkModel) {
std::stringstream strm;
strm.write(exportMagic.data(), exportMagic.size());
strm << _plugin->GetName() << std::endl;
ExportImpl(strm);
networkModel << strm.rdbuf();
IE_THROW(NotImplemented);
}
CNNNetwork IExecutableNetworkInternal::GetExecGraphInfo() {
@ -97,7 +95,4 @@ std::shared_ptr<IInferRequestInternal> IExecutableNetworkInternal::CreateInferRe
IE_THROW(NotImplemented);
}
void IExecutableNetworkInternal::ExportImpl(std::ostream&) {
IE_THROW(NotImplemented);
}
} // namespace InferenceEngine

View File

@ -16,24 +16,12 @@
#include <blob_factory.hpp>
#include <istream>
#include <fstream>
#include <map>
#include <memory>
#include <string>
namespace InferenceEngine {
namespace {
void parsePluginName(std::istream& networkModel) {
ExportMagic magic = {};
auto currentPos = networkModel.tellg();
networkModel.read(magic.data(), magic.size());
auto exportedWithName = (exportMagic == magic);
if (exportedWithName) {
networkModel.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
} else {
networkModel.seekg(currentPos, networkModel.beg);
}
}
} // namespace
PreProcessInfo copyPreProcess(const PreProcessInfo& from) {
PreProcessInfo to = from;
@ -170,22 +158,26 @@ RemoteContext::Ptr IInferencePlugin::GetDefaultContext(const ParamMap&) {
IE_THROW(NotImplemented);
}
std::shared_ptr<IExecutableNetworkInternal> IInferencePlugin::ImportNetwork(const std::string&,
const std::map<std::string, std::string>&) {
IE_THROW(NotImplemented);
std::shared_ptr<IExecutableNetworkInternal> IInferencePlugin::ImportNetwork(const std::string& modelFileName,
const std::map<std::string, std::string>& config) {
std::ifstream blobFile(modelFileName, std::ios::binary);
if (!blobFile.is_open()) {
IE_THROW(NetworkNotRead);
}
return ImportNetwork(blobFile, config);
}
std::shared_ptr<IExecutableNetworkInternal> IInferencePlugin::ImportNetwork(std::istream& networkModel,
const std::map<std::string, std::string>& config) {
parsePluginName(networkModel);
return ImportNetworkImpl(networkModel, config);
IE_THROW(NotImplemented);
}
std::shared_ptr<IExecutableNetworkInternal> IInferencePlugin::ImportNetwork(std::istream& networkModel,
const std::shared_ptr<RemoteContext>& context,
const std::map<std::string, std::string>& config) {
parsePluginName(networkModel);
return ImportNetworkImpl(networkModel, context, config);
IE_THROW(NotImplemented);
}
void IInferencePlugin::SetCore(ICore* core) {
@ -213,17 +205,6 @@ std::shared_ptr<IExecutableNetworkInternal> IInferencePlugin::LoadExeNetworkImpl
IE_THROW(NotImplemented);
}
std::shared_ptr<IExecutableNetworkInternal> IInferencePlugin::ImportNetworkImpl(std::istream&,
const std::map<std::string, std::string>&) {
IE_THROW(NotImplemented);
}
std::shared_ptr<IExecutableNetworkInternal> IInferencePlugin::ImportNetworkImpl(std::istream&,
const std::shared_ptr<RemoteContext>&,
const std::map<std::string, std::string>&) {
IE_THROW(NotImplemented);
}
void IInferencePlugin::SetExeNetworkInfo(const std::shared_ptr<IExecutableNetworkInternal>& exeNetwork,
const ConstInputsDataMap& inputs,
const ConstOutputsDataMap& outputs) {

View File

@ -395,6 +395,7 @@ public:
opsetNames.insert("opset4");
opsetNames.insert("opset5");
opsetNames.insert("opset6");
opsetNames.insert("opset7");
}
~Impl() override = default;
@ -566,18 +567,6 @@ public:
SoExecutableNetworkInternal ImportNetwork(std::istream& networkModel, const std::string& deviceName,
const std::map<std::string, std::string>& config) override {
auto parsed = parseDeviceNameIntoConfig(deviceName, config);
if (parsed._deviceName.empty()) {
ExportMagic magic = {};
auto currentPos = networkModel.tellg();
networkModel.read(magic.data(), magic.size());
auto exportedWithName = (exportMagic == magic);
if (exportedWithName) {
std::getline(networkModel, parsed._deviceName);
}
networkModel.seekg(currentPos, networkModel.beg);
}
return GetCPPPluginByName(parsed._deviceName).ImportNetwork(networkModel, parsed._config);
}
@ -1022,18 +1011,6 @@ void Core::AddExtension(const IExtensionPtr& extension) {
ExecutableNetwork Core::ImportNetwork(const std::string& modelFileName, const std::string& deviceName,
const std::map<std::string, std::string>& config) {
OV_ITT_SCOPED_TASK(itt::domains::IE, "Core::ImportNetwork");
// TODO: remove once NotImplemented exception is deprecated and not used
if (deviceName.find("HETERO") == 0) {
IE_THROW() << "HETERO device does not support ImportNetwork";
}
if (deviceName.find("MULTI") == 0) {
IE_THROW() << "MULTI device does not support ImportNetwork";
}
if (deviceName.find("AUTO") == 0) {
IE_THROW() << "AUTO device does not support ImportNetwork";
}
auto parsed = parseDeviceNameIntoConfig(deviceName, config);
auto exec = _impl->GetCPPPluginByName(parsed._deviceName).ImportNetwork(modelFileName, parsed._config);
return { exec, exec };
@ -1041,10 +1018,33 @@ ExecutableNetwork Core::ImportNetwork(const std::string& modelFileName, const st
ExecutableNetwork Core::ImportNetwork(std::istream& networkModel, const std::string& deviceName,
const std::map<std::string, std::string>& config) {
OV_ITT_SCOPED_TASK(itt::domains::IE, "Core::ImportNetwork");
auto exec = _impl->ImportNetwork(networkModel, deviceName, config);
return { exec, exec };
}
ExecutableNetwork Core::ImportNetwork(std::istream& networkModel) {
OV_ITT_SCOPED_TASK(itt::domains::IE, "Core::ImportNetwork");
using ExportMagic = std::array<char, 4>;
constexpr static const ExportMagic exportMagic = {{0x1, 0xE, 0xE, 0x1}};
std::string deviceName;
ExportMagic magic = {};
auto currentPos = networkModel.tellg();
networkModel.read(magic.data(), magic.size());
if (exportMagic == magic) {
std::getline(networkModel, deviceName);
} else {
IE_THROW() << "Passed compiled stream does not contain device name. "
"Please, provide device name manually";
}
networkModel.seekg(currentPos, networkModel.beg);
auto exec = _impl->GetCPPPluginByName(deviceName).ImportNetwork(networkModel, {});
return { exec, exec };
}
ExecutableNetwork Core::ImportNetwork(std::istream& networkModel,
const RemoteContext::Ptr& context,
const std::map<std::string, std::string>& config) {

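For reference, the standalone Core::ImportNetwork(std::istream&) overload introduced above only works when the stream starts with the export magic followed by the device name on its own line; it peeks at this header and rewinds the stream before handing it to the plugin. A sketch of that header layout, for illustration only (writeAnnotatedBlobHeader is a hypothetical helper, not part of the API):

#include <array>
#include <ostream>
#include <string>

// Stream layout assumed from the hunk above: 4-byte magic {0x1, 0xE, 0xE, 0x1},
// then "<deviceName>\n", then the exported blob bytes.
void writeAnnotatedBlobHeader(std::ostream& os, const std::string& deviceName) {
    static const std::array<char, 4> exportMagic = {{0x1, 0xE, 0xE, 0x1}};
    os.write(exportMagic.data(), exportMagic.size());
    os << deviceName << '\n';
    // ... exported blob bytes follow the header
}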

@ -244,6 +244,9 @@ CNNLayer::Ptr createSubGraphLayer(const std::shared_ptr<ngraph::Node>& layer) {
LayerParams params = {layer->get_friendly_name(), "TensorIterator",
details::convertPrecision(layer->get_output_element_type(0))};
auto res = std::make_shared<InferenceEngine::TensorIterator>(params);
if (res == nullptr) {
IE_THROW() << "Can't create TensorIterator";
}
res->body = body;
// Port map: outputs


@ -9,7 +9,6 @@
#include <vector>
#include <ngraph/opsets/opset1.hpp>
#include <legacy/ngraph_ops/crop_ie.hpp>
#include <ngraph/rt_info.hpp>
@ -137,7 +136,6 @@ ngraph::pass::ConvertStridedSliceToCropMatcher::ConvertStridedSliceToCropMatcher
lb = std::min(static_cast<int64_t>(input_shape[input_shape_idx]), lb);
ub = std::min(static_cast<int64_t>(input_shape[input_shape_idx]), ub);
offset.emplace_back(lb);
// set default value for stride or use given value
int64_t stride = 1;
@ -153,6 +151,7 @@ ngraph::pass::ConvertStridedSliceToCropMatcher::ConvertStridedSliceToCropMatcher
ub = -1;
lb = std::min(lb, static_cast<int64_t>(input_shape[input_shape_idx]) - 1);
offset.emplace_back(lb);
lb -= 1; // we always get the 1st element, so we need to decrease the range
if (ub <= lb)
dimension = (ub - lb) / stride + 1;
@ -160,13 +159,17 @@ ngraph::pass::ConvertStridedSliceToCropMatcher::ConvertStridedSliceToCropMatcher
// apply masks
if (begin_mask.count(axis))
lb = 0;
if (end_mask.count(axis))
offset.emplace_back(lb);
if (end_mask.count(axis)) {
ub = static_cast<int64_t>(input_shape[input_shape_idx]);
}
lb += 1; // we always get the 1st element, so we need to decrease the range
if (ub >= lb)
if (ub >= lb) {
dimension = (ub - lb) / stride + 1;
}
}
dim.emplace_back(dimension);
input_shape_idx++;


@ -43,19 +43,21 @@ bool ConcatTransformation::transform(TransformationContext& context, ngraph::pat
return false;
}
// precisions can be different
// Concat operation precision is defined by:
// 1. consumers after Concat
// 2. FakeQuantize precisions without zero point
ngraph::Node& quantizationLayer = *subgraph.quantizationLayers[0];
std::shared_ptr<ngraph::opset1::FakeQuantize> fq = ngraph::as_type_ptr<ngraph::opset1::FakeQuantize>(quantizationLayer.shared_from_this());
if (!NetworkHelper::isQuantizeSupported(fq)) {
return false;
}
std::vector<element::Type> concatParentsChildrensPrecisions = precisionsOnActivations;
fillAvailablePrecisions(subgraph.quantizationLayers[0], concatParentsChildrensPrecisions);
if (concatParentsChildrensPrecisions.empty()) {
DataPrecision dataPrecision = getDataPrecision(fq, QuantizationDetails::getDetails(fq), false);
if (dataPrecision.precision == ngraph::element::undefined) {
return false;
}
std::vector<element::Type> concatChildrenPrecisions = precisionsOnActivations;
for (size_t i = 0; i < subgraph.quantizationLayers.size(); ++i) {
fq = ngraph::as_type_ptr<ngraph::opset1::FakeQuantize>(subgraph.quantizationLayers[i]);
if (fq == nullptr) {
@ -72,20 +74,28 @@ bool ConcatTransformation::transform(TransformationContext& context, ngraph::pat
if (quantizationDetails.inputHighValues.size() != 1ul) {
return false;
}
std::vector<element::Type> fqChildrensPrecisions = precisionsOnActivations;
fillAvailablePrecisions(subgraph.quantizationLayers[i], fqChildrensPrecisions);
concatParentsChildrensPrecisions = NetworkHelper::precisionIntersection(concatParentsChildrensPrecisions, fqChildrensPrecisions);
if (concatParentsChildrensPrecisions.empty()) {
// define concatenation operation consumers precisions
std::vector<element::Type> fqChildrenPrecisions = precisionsOnActivations;
fillAvailablePrecisions(subgraph.quantizationLayers[i], fqChildrenPrecisions);
concatChildrenPrecisions = NetworkHelper::precisionIntersection(concatChildrenPrecisions, fqChildrenPrecisions);
if (concatChildrenPrecisions.empty()) {
return false;
}
// define FakeQuantize precisions without zero point
const DataPrecision dataPrecision2 = getDataPrecision(subgraph.quantizationLayers[i]->shared_from_this(), quantizationDetails, false);
if (dataPrecision2.precision == ngraph::element::undefined) {
return false;
}
DataPrecision dataPrecision;
if (std::find(concatParentsChildrensPrecisions.begin(), concatParentsChildrensPrecisions.end(), element::i8) != concatParentsChildrensPrecisions.end()) {
dataPrecision = DataPrecision(element::i8);
} else {
dataPrecision = DataPrecision(concatParentsChildrensPrecisions[0]);
if (dataPrecision.precision != dataPrecision2.precision) {
dataPrecision = dataPrecision.precision.is_signed() ? dataPrecision : dataPrecision2;
}
}
if (std::find(concatChildrenPrecisions.begin(), concatChildrenPrecisions.end(), dataPrecision.precision) == concatChildrenPrecisions.end()) {
dataPrecision = DataPrecision(concatChildrenPrecisions[0]);
}
std::vector<QuantizationDetails> quantizationLayersDetails;

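The loop above narrows concatChildrenPrecisions to the set supported by every FakeQuantize consumer and then falls back to the first remaining precision when the one picked from the FakeQuantize side is not in that set. A minimal sketch of the intersection step, shown generically as a stand-in for NetworkHelper::precisionIntersection:

#include <algorithm>
#include <vector>

// Keeps the elements of `a` that also appear in `b`, preserving the order of `a`.
template <typename T>
std::vector<T> intersection(const std::vector<T>& a, const std::vector<T>& b) {
    std::vector<T> result;
    for (const auto& value : a) {
        if (std::find(b.begin(), b.end(), value) != b.end()) {
            result.push_back(value);
        }
    }
    return result;
}

An empty intersection corresponds to the early `return false` exits in the hunk above.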

@ -64,14 +64,23 @@ bool ConcatMultiChannelsTransformation::transform(TransformationContext& context
DataPrecision dataPrecision;
{
std::vector<element::Type> concatChildrenPrecisions = precisionsOnActivations;
for (auto quantizationLayer : subgraph.quantizationLayers) {
std::shared_ptr<ngraph::opset1::FakeQuantize> fq = ngraph::as_type_ptr<ngraph::opset1::FakeQuantize>(quantizationLayer->shared_from_this());
if (!NetworkHelper::isQuantizeSupported(fq)) {
return false;
}
const DataPrecision tmp = getDataPrecision(fq, QuantizationDetails::getDetails(fq), false);
// define concatenation operation consumers precisions
std::vector<element::Type> fqChildrenPrecisions = precisionsOnActivations;
fillAvailablePrecisions(quantizationLayer, fqChildrenPrecisions);
concatChildrenPrecisions = NetworkHelper::precisionIntersection(concatChildrenPrecisions, fqChildrenPrecisions);
if (concatChildrenPrecisions.empty()) {
return false;
}
// define FakeQuantize precisions without zero point
const DataPrecision tmp = getDataPrecision(fq, QuantizationDetails::getDetails(fq), false);
if (dataPrecision.precision == ngraph::element::undefined) {
dataPrecision = tmp;
continue;
@ -81,6 +90,10 @@ bool ConcatMultiChannelsTransformation::transform(TransformationContext& context
dataPrecision = tmp;
}
}
if (std::find(concatChildrenPrecisions.begin(), concatChildrenPrecisions.end(), dataPrecision.precision) == concatChildrenPrecisions.end()) {
dataPrecision = DataPrecision(concatChildrenPrecisions[0]);
}
}
for (size_t i = 0; i < subgraph.quantizationLayers.size(); ++i) {


@ -4,9 +4,11 @@
#pragma once
#include <threading/ie_istreams_executor.hpp>
#include "utils/debug_capabilities.h"
#include <string>
#include <map>
#include <threading/ie_istreams_executor.hpp>
namespace MKLDNNPlugin {
@ -35,6 +37,10 @@ struct Config {
bool manualEnforceBF16 = false;
#endif
#ifdef CPU_DEBUG_CAPS
DebugCaps::Config debugCaps;
#endif
void readProperties(const std::map<std::string, std::string> &config);
void updateProperties();
std::map<std::string, std::string> _config;


@ -32,7 +32,7 @@ bool MKLDNNEdge::isUseExternalMemory() const {
return externalMemoryPtr;
}
bool MKLDNNEdge::isDropped() {
bool MKLDNNEdge::isDropped() const {
bool not_in_parent = true;
bool not_in_child = true;
@ -124,6 +124,10 @@ void MKLDNNEdge::reuse(MKLDNNMemoryPtr ptr) {
status = Status::Allocated;
}
const InferenceEngine::TensorDesc& MKLDNNEdge::getInputDescRO() const {
return inputDesc;
}
InferenceEngine::TensorDesc MKLDNNEdge::getInputDesc() {
if (inputDesc.getLayout() == InferenceEngine::Layout::ANY) {
inputDesc = getSpecifiedInputDesc({});
@ -131,6 +135,10 @@ InferenceEngine::TensorDesc MKLDNNEdge::getInputDesc() {
return inputDesc;
}
const InferenceEngine::TensorDesc& MKLDNNEdge::getOutputDescRO() const {
return outputDesc;
}
InferenceEngine::TensorDesc MKLDNNEdge::getOutputDesc() {
if (outputDesc.getLayout() == InferenceEngine::Layout::ANY) {
outputDesc = getSpecifiedOutputDesc({});
@ -145,11 +153,11 @@ InferenceEngine::TensorDesc MKLDNNEdge::getDesc() {
return getInputDesc();
}
int MKLDNNEdge::getInputNum() {
int MKLDNNEdge::getInputNum() const {
return parent_port;
}
int MKLDNNEdge::getOutputNum() {
int MKLDNNEdge::getOutputNum() const {
return child_port;
}


@ -61,11 +61,11 @@ public:
MKLDNNMemoryPtr& getMemoryPtr();
bool needReorder();
bool isDropped();
bool isDropped() const;
bool isUseExternalMemory() const;
int getInputNum();
int getOutputNum();
int getInputNum() const;
int getOutputNum() const;
void setChildPort(const size_t port) { child_port = port; }
@ -73,10 +73,12 @@ public:
MKLDNNEdgePtr getSharedEdge() const;
MKLDNNEdgePtr getSharedEdge(std::nothrow_t) const;
const InferenceEngine::TensorDesc& getInputDescRO() const;
const InferenceEngine::TensorDesc& getOutputDescRO() const;
private:
std::string name();
private:
std::weak_ptr<MKLDNNNode> parent;
std::weak_ptr<MKLDNNNode> child;
int parent_port;


@ -78,7 +78,10 @@ void MKLDNNGraph::CreateGraph(NET &net, const MKLDNNExtensionManager::Ptr& extMg
Replicate(net, extMgr);
InitGraph();
status = Ready;
ENABLE_CPU_DEBUG_CAP(serialize(*this));
}
template void MKLDNNGraph::CreateGraph(const std::shared_ptr<const ngraph::Function>&,
@ -344,10 +347,6 @@ void MKLDNNGraph::InitGraph() {
graphNode->cleanup();
}
#endif
#if !defined(NDEBUG) && defined(PRINT_GRAPH_INFO)
printGraphInfo();
#endif
ExecuteConstantNodesOnly();
}
@ -809,7 +808,7 @@ void MKLDNNGraph::Infer(MKLDNNInferRequest* request, int batch) {
mkldnn::stream stream(eng);
ENABLE_CPU_DEBUG_CAP(NodeDumper nd(infer_count));
ENABLE_CPU_DEBUG_CAP(NodeDumper nd(config.debugCaps, infer_count));
for (int i = 0; i < graphNodes.size(); i++) {
if (request != nullptr) {
@ -954,6 +953,10 @@ void MKLDNNGraph::setConfig(const Config &cfg) {
config = cfg;
}
const Config& MKLDNNGraph::getConfig() const {
return config;
}
void MKLDNNGraph::setProperty(const std::map<std::string, std::string>& properties) {
config.readProperties(properties);
}
@ -1217,21 +1220,3 @@ void MKLDNNGraph::EnforceBF16() {
InferenceEngine::CNNNetwork MKLDNNGraph::dump() const {
return dump_graph_as_ie_ngraph_net(*this);
}
void MKLDNNGraph::printGraphInfo() const {
for (auto &graphNode : graphNodes) {
std::cout << "name: " << graphNode->getName() << " [ ";
if (graphNode->parentEdges.size() > 0) {
auto prnt_out_desc = graphNode->parentEdges[0].lock()->getOutputDesc();
std::cout << "in: " << prnt_out_desc.getPrecision().name()
<< "/l=" << prnt_out_desc.getLayout()
<< "; ";
}
if (graphNode->childEdges.size() > 0) {
auto chld_in_desc = graphNode->childEdges[0].lock()->getInputDesc();
std::cout << "out: " << chld_in_desc.getPrecision().name()
<< "/l=" << chld_in_desc.getLayout();
}
std::cout << " ]" << std::endl;
}
}


@ -39,6 +39,8 @@ public:
}
void setConfig(const Config &cfg);
const Config& getConfig() const;
void setProperty(const std::map<std::string, std::string> &properties);
Config getProperty() const;
@ -59,6 +61,10 @@ public:
void Infer(MKLDNNInferRequest* request = nullptr, int batch = -1);
const std::vector<MKLDNNNodePtr>& GetNodes() const {
return graphNodes;
}
std::vector<MKLDNNNodePtr>& GetNodes() {
return graphNodes;
}
@ -219,7 +225,6 @@ protected:
private:
void EnforceBF16();
void printGraphInfo() const;
};
} // namespace MKLDNNPlugin


@ -5,9 +5,11 @@
#include "mkldnn_graph_dumper.h"
#include <ie_ngraph_utils.hpp>
#include "exec_graph_info.hpp"
#include "ie_common.h"
#include "mkldnn_debug.h"
#include <ngraph/variant.hpp>
#include "ngraph/ngraph.hpp"
#include "utils/debug_capabilities.h"
#include <vector>
#include <string>
@ -18,6 +20,9 @@ using namespace InferenceEngine;
namespace MKLDNNPlugin {
void serializeToCout(const MKLDNNGraph &graph);
void serializeToXML(const MKLDNNGraph &graph, const std::string& path);
namespace {
std::map<std::string, std::string> extract_node_metadata(const MKLDNNNodePtr &node) {
@ -207,4 +212,46 @@ InferenceEngine::CNNNetwork dump_graph_as_ie_ngraph_net(const MKLDNNGraph &graph
return net;
}
#ifdef CPU_DEBUG_CAPS
void serialize(const MKLDNNGraph &graph) {
const std::string& path = graph.getConfig().debugCaps.execGraphPath;
if (path.empty())
return;
if (path == "cout")
serializeToCout(graph);
else if (!path.compare(path.size() - 4, 4, ".xml"))
serializeToXML(graph, path);
else
IE_THROW() << "Unknown serialize format. Should be either 'cout' or '*.xml'. Got " << path;
}
void serializeToXML(const MKLDNNGraph &graph, const std::string& path) {
if (path.empty())
return;
graph.dump().serialize(path);
}
void serializeToCout(const MKLDNNGraph &graph) {
for (const auto& node : graph.GetNodes()) {
std::cout << "name: " << node->getName() << " [ ";
if (!node->getParentEdges().empty()) {
const auto& parentEdge = *(node->getParentEdges()[0].lock());
const auto& prnt_out_desc = parentEdge.getOutputDescRO();
std::cout << "in: " << prnt_out_desc.getPrecision().name()
<< "/l=" << prnt_out_desc.getLayout()
<< "; ";
}
if (!node->getChildEdges().empty()) {
const auto& childEdge = *(node->getChildEdges()[0].lock());
const auto& chld_in_desc = childEdge.getInputDescRO();
std::cout << "out: " << chld_in_desc.getPrecision().name()
<< "/l=" << chld_in_desc.getLayout();
}
std::cout << " ]" << std::endl;
}
}
#endif
} // namespace MKLDNNPlugin
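One detail worth noting in serialize() above: the '.xml' check relies on path.compare(path.size() - 4, 4, ".xml"), and path.size() - 4 underflows for a path shorter than four characters, so compare would throw std::out_of_range instead of reaching the IE_THROW branch. A defensive alternative, sketched with a hypothetical helper:

#include <string>

// Returns true when `path` ends with `suffix`; safe for paths shorter than the suffix.
static bool hasSuffix(const std::string& path, const std::string& suffix) {
    return path.size() >= suffix.size() &&
           path.compare(path.size() - suffix.size(), suffix.size(), suffix) == 0;
}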


@ -6,11 +6,14 @@
#include "cpp/ie_cnn_network.h"
#include "mkldnn_graph.h"
#include "utils/debug_capabilities.h"
#include <memory>
namespace MKLDNNPlugin {
InferenceEngine::CNNNetwork dump_graph_as_ie_ngraph_net(const MKLDNNGraph &graph);
#ifdef CPU_DEBUG_CAPS
void serialize(const MKLDNNGraph &graph);
#endif // CPU_DEBUG_CAPS
} // namespace MKLDNNPlugin


@ -22,7 +22,11 @@ MKLDNNPlugin::ReshapeFullyConnectedFusion::ReshapeFullyConnectedFusion() {
ngraph::matcher_pass_callback callback = [this](ngraph::pattern::Matcher &m) {
auto fc = std::dynamic_pointer_cast<MKLDNNPlugin::FullyConnectedNode>(m.get_match_root());
if (!fc)
return false;
auto reshape = std::dynamic_pointer_cast<ngraph::opset1::Reshape>(fc->get_input_node_shared_ptr(0));
if (!reshape)
return false;
// Check that Reshape reshapes 4D tensor to 2D or input shape = output shape
auto shape_in = reshape->input_value(0).get_shape();
@ -67,6 +71,8 @@ MKLDNNPlugin::ReshapeFullyConnectedFusion::ReshapeFullyConnectedFusion() {
fc->input_value(2),
outShape,
fc->output(0).get_element_type());
} else {
return false;
}
new_ops.push_back(new_fc);
new_fc->set_friendly_name(fc->get_friendly_name());


@ -60,6 +60,8 @@ MKLDNNPlugin::ReshapeFullyConnected::ReshapeFullyConnected() {
fc->input_value(2),
output_shape_new,
fc->get_output_type());
} else {
return false;
}
new_ops.push_back(fc_new);


@ -20,8 +20,16 @@ MKLDNNPlugin::ReshapePRelu::ReshapePRelu() {
if (!prelu || ngraph::shape_size(prelu->get_input_shape(1)) == 1 || prelu->get_input_shape(1).size() != 1) {
return false;
}
ngraph::Shape new_shape(prelu->input_value(0).get_shape().size(), 1);
new_shape[new_shape.size() > 1 ? 1 : 0] = prelu->input_value(1).get_shape()[0];
const auto prelu_shape = prelu->input_value(0).get_shape();
const auto slope_shape = prelu->input_value(1).get_shape();
ngraph::Shape new_shape(prelu_shape.size(), 1);
const auto slope_dim = slope_shape[0];
const auto channel_dim_idx = prelu_shape.size() > 1 ? 1 : 0;
if (slope_dim != prelu_shape[channel_dim_idx]) {
return false;
}
new_shape[channel_dim_idx] = slope_dim;
auto slope = ngraph::op::util::reshapeTo(prelu->input_value(1), new_shape);
auto new_prelu = std::make_shared<ngraph::opset1::PRelu>(prelu->input(0).get_source_output(), slope);
new_prelu->set_friendly_name(prelu->get_friendly_name());

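The rewritten check above only reshapes the slope when its length matches the channel dimension. A small worked example of the shape computation (plain std::vector<size_t> stands in for ngraph::Shape; the values are illustrative):

#include <cstddef>
#include <vector>

// Input [1, 8, 16, 16] with a slope of length 8: the channel axis is 1, so the
// slope is reshaped to [1, 8, 1, 1]; any other slope length makes the pass bail out.
std::vector<std::size_t> preluSlopeShape(const std::vector<std::size_t>& inputShape, std::size_t slopeLen) {
    std::vector<std::size_t> newShape(inputShape.size(), 1);
    const std::size_t channelDimIdx = inputShape.size() > 1 ? 1 : 0;
    if (slopeLen != inputShape[channelDimIdx]) {
        return {};  // mirrors `return false` in the matcher callback
    }
    newShape[channelDimIdx] = slopeLen;
    return newShape;
}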

@ -42,6 +42,8 @@ MKLDNNConvertNode::MKLDNNConvertNode(const InferenceEngine::SizeVector &dims, co
addOriginalInputPrecision(inPrc);
outDims.emplace_back(dims);
addOriginalOutputPrecision(outPrc);
errorPrefix = "Convert node with name '" + getName() + "'";
}
void MKLDNNConvertNode::getSupportedDescriptors() {


@ -58,6 +58,8 @@ MKLDNNDepthToSpaceNode::MKLDNNDepthToSpaceNode(const std::shared_ptr<ngraph::Nod
if (blockSize == 0)
THROW_ERROR << "has incorrect block_size parameter is zero!";
size_t nSpatialDims = inDims[0].ndims() - 2;
blockStep = static_cast<size_t>(std::pow(blockSize, nSpatialDims));
} else {
IE_THROW(NotImplemented) << errorMessage;
}
@ -74,14 +76,13 @@ void MKLDNNDepthToSpaceNode::getSupportedDescriptors() {
if (srcDims.size() != dstDims.size())
THROW_ERROR << "has incorrect number of input/output dimensions";
size_t nSpatialDims = srcDims.size() - 2;
blockStep = static_cast<size_t>(std::pow(blockSize, nSpatialDims));
if (srcDims[1] % blockStep)
THROW_ERROR << "has block_size parameter which is incompatible with input tensor channels dimension size";
if (srcDims[1] / blockStep != dstDims[1])
THROW_ERROR << "has incompatible input/output channels";
size_t nSpatialDims = srcDims.size() - 2;
for (size_t i = 0; i < nSpatialDims; ++i) {
if (srcDims[i + 2] * blockSize != dstDims[i + 2])
THROW_ERROR << "has incompatible spatial dims";

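As a quick sanity check of the relocated blockStep computation above: with blockSize = 2 on a 4D input there are 2 spatial dims, so blockStep = 2^2 = 4, the channel count must be divisible by 4, and the output channel count is C / 4. The computation itself, for illustration:

#include <cmath>
#include <cstddef>

// blockStep = blockSize ^ nSpatialDims, e.g. 2^2 = 4 for a 4D (NCHW) input.
static std::size_t depthToSpaceBlockStep(std::size_t blockSize, std::size_t nSpatialDims) {
    return static_cast<std::size_t>(std::pow(blockSize, nSpatialDims));
}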

@ -49,7 +49,7 @@ MKLDNNDFTNode::MKLDNNDFTNode(const std::shared_ptr<ngraph::Node>& op, const mkld
/* Data */
inputShape = inDims[DATA_INDEX].ToSizeVector();
if (inputShape.size() < 1) {
if (inputShape.size() < 2) {
IE_THROW() << layerErrorPrefix << " has invalid 'data' input tensor with rank: " << inputShape.size();
}


@ -32,7 +32,7 @@ private:
size_t dataTypeSize_;
int strideAxDst_;
int dstAxDim_;
int strideAx1Diff_;
int strideAx1Diff_ = 0;
std::string errorPrefix_;
template <typename dataType>
