Merge remote-tracking branch 'upstream/master'

This commit is contained in:
Steve Yoo 2021-06-17 08:28:23 +09:00
commit 6ce8d8ce66
888 changed files with 36386 additions and 20767 deletions

View File

@ -155,10 +155,9 @@ def getConfigurationsMap() {
CONFIGURATION_WORKFLOW = { configuration ->
node("OpenVINO") {
String workdir = "${HOME}/workspace/${BUILD_NUMBER}_${env.CHANGE_ID}_${configuration.name}"
try {
PROJECT_NAME = "openvino"
String workdir = "${HOME}/workspace/${BUILD_NUMBER}_${env.CHANGE_ID}_${configuration.name}"
stage("Clone repository") {
prepare_repository(workdir)
}
@ -185,10 +184,10 @@ CONFIGURATION_WORKFLOW = { configuration ->
}
finally {
stage("Cleanup") {
deleteDir()
String docker_container_name = get_docker_container_name(configuration)
sh """
docker rm -f ${docker_container_name}
rm -rf ${workdir}
"""
}
}

View File

@ -63,41 +63,3 @@ jobs:
python3 -m xmlrunner discover -p *_test.py --output=../mo-ut-logs
working-directory: model-optimizer
build_wheel:
name: Build Python wheel
runs-on: ubuntu-18.04
steps:
- uses: actions/checkout@v2
- name: Install dependencies
run: |
python3 -m pip install --upgrade pip
python3 -m pip install wheel setuptools
python3 -m pip install tensorflow==2.3.0
- name: Build
run: |
python3 setup.py sdist bdist_wheel
working-directory: model-optimizer
- name: Test package content
run: |
echo "src = open('openvino_mo.egg-info/SOURCES.txt', 'rt').read().split()" | tee -a test_wheel.py
echo "ref = open('automation/package_BOM.txt', 'rt').read().split()" | tee -a test_wheel.py
echo "for name in ref:" | tee -a test_wheel.py
echo " if name.endswith('.py'):" | tee -a test_wheel.py
echo " assert name in src or './' + name in src, name + ' file missed'" | tee -a test_wheel.py
python3 test_wheel.py
working-directory: model-optimizer
- name: Test conversion
run: |
wget -q http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224.tgz
tar -xf mobilenet_v1_1.0_224.tgz
python3 -m pip install model-optimizer/dist/*.whl
python3 -m mo --input_model mobilenet_v1_1.0_224_frozen.pb --input_shape "[1,224,224,3]"
- uses: actions/upload-artifact@v2
with:
name: mo_wheel
path: "model-optimizer/dist/*.whl"

View File

@ -169,10 +169,11 @@ ie_shellcheck_process(DIRECTORY "${OpenVINO_MAIN_SOURCE_DIR}"
"${IE_MAIN_SOURCE_DIR}/thirdparty"
"${IE_MAIN_SOURCE_DIR}/temp"
# TODO fix and enable back:
"${OpenVINO_MAIN_SOURCE_DIR}/scripts/install_dependencies"
"${OpenVINO_MAIN_SOURCE_DIR}/scripts/demo"
"${OpenVINO_MAIN_SOURCE_DIR}/ngraph"
"${IE_MAIN_SOURCE_DIR}/scripts")
"${OpenVINO_MAIN_SOURCE_DIR}/inference-engine/scripts/dependencies.sh"
"${OpenVINO_MAIN_SOURCE_DIR}/scripts/install_dependencies/install_NEO_OCL_driver.sh"
"${OpenVINO_MAIN_SOURCE_DIR}/scripts/install_dependencies/install_openvino_dependencies.sh"
"${OpenVINO_MAIN_SOURCE_DIR}/ngraph/python/tests/test_onnx/model_zoo_preprocess.sh"
)
#
# cpack

View File

@ -11,18 +11,27 @@
* *element_type*
* **Description**: the type of element of output tensor
* **Range of values**: u8, u16, u32, u64, i8, i16, i32, i64, f16, f32, boolean, bf16
* **Type**: string
* **Range of values**: u1, u4, u8, u16, u32, u64, i4, i8, i16, i32, i64, f16, f32, boolean, bf16
* **Type**: `string`
* **Default value**: None
* **Required**: *Yes*
* **Required**: *yes*
* *shape*
* **Description**: the shape of the output tensor
* **Range of values**: list of non-negative integers, empty list is allowed that means 0D or scalar tensor
* **Type**: int[]
* **Range of values**: list of non-negative integers, empty list is allowed, which means 0D or scalar tensor
* **Type**: `int[]`
* **Default value**: None
* **Required**: *Yes*
* **Required**: *yes*
**Outputs**
* **1**: Output tensor of type *T* and shape equal to *shape* attribute.
**Types**
* *T*: any type from *element type* values.
**Example**

View File

@ -8,9 +8,7 @@
**Detailed description**:
The *ExtractImagePatches* operation is similar to the TensorFlow* operation [ExtractImagePatches](https://www.tensorflow.org/api_docs/python/tf/image/extract_patches).
This op extracts patches of shape `sizes` which are `strides` apart in the input image. The output elements are taken from the input at intervals given by the `rate` argument, as in dilated convolutions.
The *ExtractImagePatches* operation extracts patches of shape `sizes` which are `strides` apart in the input image. The output elements are taken from the input at intervals given by the `rate` argument, as in dilated convolutions.
The result is a 4D tensor containing image patches with size `size[0] * size[1] * depth` vectorized in the "depth" dimension.
@ -92,20 +90,23 @@ The "auto_pad" attribute has no effect on the size of each patch, it determines
Image is a `1 x 1 x 10 x 10` array that contains the numbers 1 through 100. We use the symbol `x` to mark output patches.
1. `sizes="3,3", strides="5,5", rates="1,1", auto_pad="valid"`
  x   x   x    4   5   x   x   x   9 10
  x   x   x  14 15   x   x   x 19 20
  x   x   x  24 25   x   x   x 29 30
31 32 33 34 35 36 37 38 39 40
41 42 43 44 45 46 47 48 49 50
  x   x   x  54 55   x   x   x 59 60
  x   x   x  64 65   x   x   x 69 70
  x   x   x  74 75   x   x   x 79 80
81 82 83 84 85 86 87 88 89 90
91 92 93 94 95 96 97 98 99 100
\f[
\begin{bmatrix}
x & x & x & 4 & 5 & x & x & x & 9 & 10 \\
x & x & x & 14 & 15 & x & x & x & 19 & 20 \\
x & x & x & 24 & 25 & x & x & x & 29 & 30 \\
31 & 32 & 33 & 34 & 35 & 36 & 37 & 38 & 39 & 40 \\
41 & 42 & 43 & 44 & 45 & 46 & 47 & 48 & 49 & 50 \\
x & x & x & 54 & 55 & x & x & x & 59 & 60 \\
x & x & x & 64 & 65 & x & x & x & 69 & 70 \\
x & x & x & 74 & 75 & x & x & x & 79 & 80 \\
81 & 82 & 83 & 84 & 85 & 86 & 87 & 88 & 89 & 90 \\
91 & 92 & 93 & 94 & 95 & 96 & 79 & 98 & 99 & 100
\end{bmatrix}
\f]
output:
```
[[[[ 1 6]
[51 56]]
@ -132,24 +133,27 @@ Image is a `1 x 1 x 10 x 10` array that contains the numbers 1 through 100. We u
[[23 28]
[73 78]]]]
```
output shape: `[1, 9, 2, 2]`
2. `sizes="4,4", strides="8,8", rates="1,1", auto_pad="valid"`
  x   x   x   x    5   6   7   8   9 10
  x   x   x   x  15 16 17 18 19 20
  x   x   x   x  25 26 27 28 29 30
  x   x   x   x  35 36 37 38 39 40
41 42 43 44 45 46 47 48 49 50
51 52 53 54 55 56 57 58 59 60
61 62 63 64 65 66 67 68 69 70
71 72 73 74 75 76 77 78 79 80
81 82 83 84 85 86 87 88 89 90
91 92 93 94 95 96 97 98 99 100
\f[
\begin{bmatrix}
x & x & x & x & 5 & 6 & 7 & 8 & 9 & 10 \\
x & x & x & x & 15 & 16 & 17 & 18 & 19 & 20 \\
x & x & x & x & 25 & 26 & 27 & 28 & 29 & 30 \\
x & x & x & x & 35 & 36 & 37 & 38 & 39 & 40 \\
41 & 42 & 43 & 44 & 45 & 46 & 47 & 48 & 49 & 50 \\
51 & 52 & 53 & 54 & 55 & 56 & 57 & 58 & 59 & 60 \\
61 & 62 & 63 & 64 & 65 & 66 & 67 & 68 & 69 & 70 \\
71 & 72 & 73 & 74 & 75 & 76 & 77 & 78 & 79 & 80 \\
81 & 82 & 83 & 84 & 85 & 86 & 87 & 88 & 89 & 90 \\
91 & 92 & 93 & 94 & 95 & 96 & 79 & 98 & 99 & 100
\end{bmatrix}
\f]
output:
```
[[[[ 1]]
[[ 2]]
@ -181,27 +185,29 @@ Image is a `1 x 1 x 10 x 10` array that contains the numbers 1 through 100. We u
[[33]]
[[34]]]]
```
output shape: `[1, 16, 1, 1]`
3. `sizes="4,4", strides="9,9", rates="1,1", auto_pad="same_upper"`
  x   x   x   x    0   0   0   0   0   x   x   x   x
  x   x   x   x    4   5   6   7   8   x   x   x   x
  x   x   x   x  14 15 16 17 18   x   x   x   x
  x   x   x   x  24 25 26 27 28   x   x   x   x
  0 31 32 33 34 35 36 37 38 39 40   0   0
  0 41 42 43 44 45 46 47 48 49 50   0   0
  0 51 52 53 54 55 56 57 58 59 60   0   0
  0 61 62 63 64 65 66 67 68 69 70   0   0
  0 71 72 73 74 75 76 77 78 79 80   0   0
  x   x   x   x  84 85 86 87 88   x   x   x   x
  x   x   x   x  94 95 96 97 98   x   x   x   x
  x   x   x   x    0   0   0   0   0   x   x   x   x
  x   x   x   x    0   0   0   0   0   x   x   x   x
\f[
\begin{bmatrix}
x & x & x & x & 0 & 0 & 0 & 0 & 0 & x & x & x & x\\
x & x & x & x & 4 & 5 & 6 & 7 & 8 & x & x & x & x\\
x & x & x & x & 14 & 15 & 16 & 17 & 18 & x & x & x & x\\
x & x & x & x & 24 & 25 & 26 & 27 & 28 & x & x & x & x\\
0 & 31 & 32 & 33 & 34 & 35 & 36 & 37 & 38 & 39 & 40 & 0 & 0\\
0 & 41 & 42 & 43 & 44 & 45 & 46 & 47 & 48 & 49 & 50 & 0 & 0\\
0 & 51 & 52 & 53 & 54 & 55 & 56 & 57 & 58 & 59 & 60 & 0 & 0\\
0 & 61 & 62 & 63 & 64 & 65 & 66 & 67 & 68 & 69 & 70 & 0 & 0\\
0 & 71 & 72 & 73 & 74 & 75 & 76 & 77 & 78 & 79 & 80 & 0 & 0\\
x & x & x & x & 84 & 85 & 86 & 87 & 88 & x & x & x & x\\
x & x & x & x & 94 & 95 & 96 & 79 & 98 & x & x & x & x\\
x & x & x & x & 0 & 0 & 0 & 0 & 0 & x & x & x & x\\
x & x & x & x & 0 & 0 & 0 & 0 & 0 & x & x & x & x
\end{bmatrix}
\f]
output:
```
[[[[ 0 0]
[ 0 89]]
@ -249,25 +255,28 @@ Image is a `1 x 1 x 10 x 10` array that contains the numbers 1 through 100. We u
[[ 23 0]
[ 0 0]]]]
```
output shape: `[1, 16, 2, 2]`
4. `sizes="3,3", strides="5,5", rates="2,2", auto_pad="valid"`
This time we use the symbols `x`, `y`, `z` and `k` to distinguish the patches:
  x   2   x   4   x   y   7   y   9   y
11 12 13 14 15 16 17 18 19 20
  x  22   x 24   x   y 27   y 29   y
31 32 33 34 35 36 37 38 39 40
  x  42   x 44   x   y 47   y 49   y
  z  52   z 54   z   k 57   k 59   k
61 62 63 64 65 66 67 68 69 70
  z  72   z 74   z   k 77   k 79   k
81 82 83 84 85 86 87 88 89 90
  z  92   z 94   z   k 97   k 99   k
\f[
\begin{bmatrix}
x & 2 & x & 4 & x & y & 7 & y & 9 & y \\
11 & 12 & 13 & 14 & 15 & 16 & 17 & 18 & 19 & 20 \\
x & 22 & x & 24 & x & y & 27 & y & 29 & y \\
31 & 32 & 33 & 34 & 35 & 36 & 37 & 38 & 39 & 40 \\
x & 42 & x & 44 & x & y & 47 & y & 49 & y \\
z & 52 & z & 54 & z & k & 57 & k & 59 & k \\
61 & 62 & 63 & 64 & 65 & 66 & 67 & 68 & 69 & 70 \\
z & 72 & z & 74 & z & k & 77 & k & 79 & k \\
81 & 82 & 83 & 84 & 85 & 86 & 87 & 88 & 89 & 90 \\
z & 92 & z & 94 & z & k & 79 & k & 99 & k
\end{bmatrix}
\f]
output:
```
[[[[ 1 6]
[ 51 56]]
@ -294,26 +303,30 @@ This time we use the symbols `x`, `y`, `z` and `k` to distinguish the patches:
[[ 45 50]
[ 95 100]]]]
```
output_shape: `[1, 9, 2, 2]`
5. `sizes="2,2", strides="3,3", rates="1,1", auto_pad="valid"`
Image is a `1 x 2 x 5 x 5` array that contains two feature maps where feature map with coordinate 0 contains numbers in a range `[1, 25]` and feature map with coordinate 1 contains numbers in a range `[26, 50]`
  x   x   3   x   x
  6   7   8   x   x
11 12 13 14 15
  x   x  18   x   x
  x   x  23   x   x
  x   x  28   x   x
  x   x  33   x   x
36 37 38 39 40
  x   x  43   x   x
  x   x  48   x   x
\f[
\begin{bmatrix}
x & x & 3 & x & x\\
x & x & 8 & x & x\\
11 & 12 & 13 & 14 & 15\\
x & x & 18 & x & x\\
x & x & 23 & x & x
\end{bmatrix}\\
\begin{bmatrix}
x & x & 28 & x & x\\
x & x & 33 & x & x\\
36 & 37 & 38 & 39 & 40\\
x & x & 43 & x & x\\
x & x & 48 & x & x
\end{bmatrix}
\f]
output:
```
[[[[ 1 4]
[16 19]]
@ -337,5 +350,5 @@ Image is a `1 x 2 x 5 x 5` array that contains two feature maps where feature ma
[[32 35]
[47 50]]]]
```
output shape: `[1, 8, 2, 2]`

View File

@ -8,12 +8,37 @@
**Short description**: *ShuffleChannels* permutes data in the channel dimension of the input tensor.
**Detailed description**:
Input tensor of `data_shape` is always interpreted as 4D tensor with the following shape:
dim 0: data_shape[0] * data_shape[1] * ... * data_shape[axis-1]
(or 1 if axis == 0)
dim 1: group
dim 2: data_shape[axis] / group
dim 3: data_shape[axis+1] * data_shape[axis+2] * ... * data_shape[data_shape.size()-1]
(or 1 if axis points to last dimension)
Trailing and leading to `axis` dimensions are flattened and reshaped back to the original shape after channels shuffling.
The operation is equivalent to the following transformation of the input tensor `x` of shape `[N, C, H, W]` and `axis = 1`:
\f[
x' = reshape(x, [N, group, C / group, H * W])\\
x'' = transpose(x', [0, 2, 1, 3])\\
y = reshape(x'', [N, C, H, W])\\
\f]
where `group` is the layer attribute described below.
**Attributes**:
* *axis*
* **Description**: *axis* specifies the index of a channel dimension.
* **Range of values**: an integer number in the range [-4, 3]
* **Range of values**: an integer number in the range `[-rank(data_shape), rank(data_shape) - 1]`
* **Type**: `int`
* **Default value**: 1
* **Required**: *No*
@ -21,30 +46,22 @@
* *group*
* **Description**: *group* specifies the number of groups to split the channel dimension into. This number must evenly divide the channel dimension size.
* **Range of values**: a positive integer
* **Range of values**: a positive integer in the range `[1, data_shape[axis]]`
* **Type**: `int`
* **Default value**: 1
* **Required**: *No*
**Inputs**:
* **1**: 4D input tensor of any supported data type. Required.
* **1**: `data` input tensor of type *T* and rank greater or equal to 1. **Required.**
**Outputs**:
* **1**: 4D input tensor with shape and element type as for the input tensor.
* **1**: Output tensor with element type *T* and same shape as the input tensor.
**Mathematical Formulation**
**Types**
The operation is the equivalent with the following transformation of the input tensor *x* of shape *[N, C, H, W]*:
```
x' = reshape(x, [N, group, C / group, H * W])
x'' = transpose(x', [0, 2, 1, 3])
y = reshape(x'', [N, C, H, W])
```
where `group` is the layer parameter described above and the `axis = 1`.
* *T*: any supported numeric type.
**Example**

View File

@ -8,20 +8,20 @@
**Detailed description**:
The *SpaceToBatch* operation is similar to the TensorFlow* operation [SpaceToBatchND](https://www.tensorflow.org/api_docs/python/tf/space_to_batch_nd)
The operation is equivalent to the following transformation of the input tensor `data` of shape `[batch, D_1, D_2 ... D_{N - 1}]` and `block_shape`, `pads_begin`, `pads_end` of shapes `[N]` to *Y* output tensor.
Zero-pad the start and end of dimensions [D_0, ..., D_{N - 1}] of the input according to `pads_begin` and `pads_end`:
note: P_0 for batch dimension is expected to be 0 (no-padding).
x = [batch + P_0, D_1 + P_1, D_2 + P_2, ..., D_{N - 1} + P_{N - 1}], where P_i = pads_begin[i] + pads_end[i]
Zero-pad the start and end of dimensions \f$[D_0, \dots, D_{N - 1}]\f$ of the input according to `pads_begin` and `pads_end`:
note: B_0 for batch is ignored.
x' = reshape(x, [batch, (D_1 + P_1) / B_1, B_1, (D_2 + P_2) / B_2, B_2, ..., (D_{N - 1} + P_{N - 1}) / B_{N - 1}, B_{N - 1}]), where B_i = block_shape[i]
\f[x = [batch + P_0, D_1 + P_1, D_2 + P_2, \dots, D_{N - 1} + P_{N - 1}]\f]
\f[x' = reshape(x, [batch, \frac{D_1 + P_1}{B_1}, B_1, \frac{D_2 + P_2}{B_2}, B_2, \dots, \frac{D_{N - 1} + P_{N - 1}}{B_{N - 1}}, B_{N - 1}])\f]
\f[x'' = transpose(x', [2, 4, \dots, (N - 1) + (N - 1), 0, 1, 3, \dots, N + (N - 1)])\f]
\f[y = reshape(x'', [batch \times B_1 \times \dots \times B_{N - 1}, \frac{D_1 + P_1}{B_1}, \frac{D_2 + P_2}{B_2}, \dots, \frac{D_{N - 1} + P_{N - 1}}{B_{N - 1}}]\f]
x'' = transpose(x', [2, 4, ..., (N - 1) + (N - 1), 0, 1, 3, ..., N + (N - 1)])
y = reshape(x'', [batch * B_1 * ... * B_{N - 1}, (D_1 + P_1) / B_1, (D_2 + P_2) / B_2, ... , (D_{N - 1} + P_{N - 1}) / B_{N - 1}])
where
- \f$P_i\f$ = pads_begin[i] + pads_end[i]
- \f$B_i\f$ = block_shape[i]
- \f$P_0\f$ for batch dimension is expected to be 0 (no-padding)
- \f$B_0\f$ for batch is ignored
**Attributes**
@ -36,7 +36,7 @@ The operation is equivalent to the following transformation of the input tensor
**Outputs**
* **1**: N-D tensor with shape `[batch * block_shape[0] * block_shape[1] * ... * block_shape[N - 1], (pads_begin[1] + D_1 + pads_end[1]) / block_shape[1], (pads_begin[2] + D_2 + pads_end[2]) / block_shape[2], ..., (pads_begin[N - 1] + D_{N - 1} + pads_end[N - 1]) / block_shape[N - 1]` of the same type as `data` input.
* **1**: N-D tensor with shape `[batch * block_shape[0] * block_shape[1] * ... * block_shape[N - 1], (D_1 + pads_begin[1] + pads_end[1]) / block_shape[1], (D_2 + pads_begin[2] + pads_end[2]) / block_shape[2], ..., (D_{N -1} + pads_begin[N - 1] + pads_end[N - 1]) / block_shape[N - 1]` of the same type as `data` input.
**Types**

View File

@ -5,13 +5,12 @@
**Category**: Data movement operation
**Short description**: *StridedSlice* extracts a strided slice of a tensor.
It is similar to generalized array indexing in Python\*.
**Attributes**
* *begin_mask*
* **Description**: *begin_mask* is a bit mask. *begin_mask[i]* equal to 1 means that the corresponding dimension of the `begin` input is ignored and the 'real' beginning of the tensor is used along corresponding dimension.
* **Description**: *begin_mask* is a bit mask. *begin_mask[i]* equal to `1` means that the corresponding dimension of the `begin` input is ignored and the 'real' beginning of the tensor is used along corresponding dimension.
* **Range of values**: a list of `0`s and `1`s
* **Type**: `int[]`
* **Default value**: None
@ -19,7 +18,7 @@
* *end_mask*
* **Description**: *end_mask* is a bit mask. If *end_mask[i]* is 1, the corresponding dimension of the `end` input is ignored and the real 'end' of the tensor is used along corresponding dimension.
* **Description**: *end_mask* is a bit mask. If *end_mask[i]* is `1`, the corresponding dimension of the `end` input is ignored and the real 'end' of the tensor is used along corresponding dimension.
* **Range of values**: a list of `0`s and `1`s
* **Type**: `int[]`
* **Default value**: None
@ -27,7 +26,7 @@
* *new_axis_mask*
* **Description**: *new_axis_mask* is a bit mask. If *new_axis_mask[i]* is 1, a length 1 dimension is inserted on the `i`-th position of input tensor.
* **Description**: *new_axis_mask* is a bit mask. If *new_axis_mask[i]* is `1`, a length 1 dimension is inserted on the `i`-th position of input tensor.
* **Range of values**: a list of `0`s and `1`s
* **Type**: `int[]`
* **Default value**: `[0]`
@ -35,7 +34,7 @@
* *shrink_axis_mask*
* **Description**: *shrink_axis_mask* is a bit mask. If *shrink_axis_mask[i]* is 1, the dimension on the `i`-th position is deleted.
* **Description**: *shrink_axis_mask* is a bit mask. If *shrink_axis_mask[i]* is `1`, the dimension on the `i`-th position is deleted.
* **Range of values**: a list of `0`s and `1`s
* **Type**: `int[]`
* **Default value**: `[0]`
@ -51,21 +50,83 @@
**Inputs**:
* **1**: Multidimensional input tensor to be sliced. Required.
* **1**: `data` - input tensor to be sliced of type `T` and arbitrary shape. **Required.**
* **2**: `begin` input - 1D input tensor with begin indexes for input tensor slicing. Required.
Out-of-bounds values are silently clamped. If `begin_mask[i]` is 1, the value of `begin[i]` is ignored
and the range of the appropriate dimension starts from 0.
Negative values mean indexing starts from the end. For example, if `foo=[1,2,3]`, `begin[0]=-1` means `begin[0]=3`.
* **2**: `begin` - 1D tensor of type `T_IND` with begin indexes for input tensor slicing. **Required.**
Out-of-bounds values are silently clamped. If `begin_mask[i]` is `1`, the value of `begin[i]` is ignored and the range of the appropriate dimension starts from `0`. Negative values mean indexing starts from the end. For example, if `data=[1,2,3]`, `begin[0]=-1` means `begin[0]=3`.
* **3**: `end` input - 1D input tensor with end indexes for input tensor slicing. Required.
Out-of-bounds values will be silently clamped. If `end_mask[i]` is 1, the value of `end[i]` is ignored
and the full range of the appropriate dimension is used instead.
Negative values mean indexing starts from the end. For example, if `foo=[1,2,3]`, `end[0]=-1` means `end[0]=3`.
* **3**: `end` - 1D tensor of type `T_IND` with end indexes for input tensor slicing. **Required.**
Out-of-bounds values will be silently clamped. If `end_mask[i]` is `1`, the value of `end[i]` is ignored and the full range of the appropriate dimension is used instead. Negative values mean indexing starts from the end. For example, if `data=[1,2,3]`, `end[0]=-1` means `end[0]=3`.
* **4**: `stride` input - 1D input tensor with strides. Optional.
* **4**: `stride` - 1D tensor of type `T_IND` with strides. **Optional.**
**Types**
* *T*: any supported type.
* *T_IND*: any supported integer type.
**Example**
Example of `begin_mask` & `end_mask` usage.
```xml
<layer ... type="StridedSlice" ...>
<data begin_mask="0,1,1" ellipsis_mask="0,0,0" end_mask="1,1,0" new_axis_mask="0,0,0" shrink_axis_mask="0,0,0"/>
<input>
<port id="0">
<dim>2</dim>
<dim>3</dim>
<dim>4</dim>
</port>
<port id="1">
<dim>2</dim> <!-- begin: [1, 0, 0] -->
</port>
<port id="2">
<dim>2</dim> <!-- end: [0, 0, 2] -->
</port>
<port id="3">
<dim>2</dim> <!-- stride: [1, 1, 1] -->
</port>
</input>
<output>
<port id="4">
<dim>1</dim>
<dim>3</dim>
<dim>2</dim>
</port>
</output>
</layer>
```
Example of `new_axis_mask` usage.
```xml
<layer ... type="StridedSlice" ...>
<data begin_mask="0,1,1" ellipsis_mask="0,0,0" end_mask="0,1,1" new_axis_mask="1,0,0" shrink_axis_mask="0,0,0"/>
<input>
<port id="0">
<dim>2</dim>
<dim>3</dim>
<dim>4</dim>
</port>
<port id="1">
<dim>2</dim>
</port>
<port id="2">
<dim>2</dim>
</port>
<port id="3">
<dim>2</dim>
</port>
</input>
<output>
<port id="4">
<dim>1</dim>
<dim>2</dim>
<dim>3</dim>
<dim>4</dim>
</port>
</output>
</layer>
```
Example of `shrink_axis_mask` usage.
```xml
<layer ... type="StridedSlice" ...>
<data begin_mask="1,0,1,1,1" ellipsis_mask="0,0,0,0,0" end_mask="1,0,1,1,1" new_axis_mask="0,0,0,0,0" shrink_axis_mask="0,1,0,0,0"/>

View File

@ -2,9 +2,10 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "single_layer_tests/reshape.hpp"
#include <vector>
#include "single_layer_tests/reshape.hpp"
#include "common_test_utils/test_constants.hpp"
using namespace LayerTestsDefinitions;
@ -14,31 +15,45 @@ const std::vector<InferenceEngine::Precision> netPrecisions = {
InferenceEngine::Precision::FP32,
};
INSTANTIATE_TEST_CASE_P(smoke_ReshapeCheckDynBatch, ReshapeLayerTest,
INSTANTIATE_TEST_CASE_P(
smoke_ReshapeCheckDynBatch, ReshapeLayerTestRevise,
::testing::Combine(
::testing::Values(true),
::testing::ValuesIn(netPrecisions),
::testing::Values(true), ::testing::ValuesIn(netPrecisions),
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
::testing::Values(InferenceEngine::Layout::ANY),
::testing::Values(InferenceEngine::Layout::ANY),
::testing::Values(std::vector<size_t>({30, 30, 30, 30})),
::testing::Values(std::vector<size_t>({30, 30, 30, 30})),
::testing::Values(std::vector<int64_t>({30, 30, 30, 30})),
::testing::Values(CommonTestUtils::DEVICE_TEMPLATE),
::testing::Values(std::map<std::string, std::string>({}))),
ReshapeLayerTest::getTestCaseName);
ReshapeLayerTestRevise::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_ReshapeCheck, ReshapeLayerTest,
INSTANTIATE_TEST_CASE_P(
smoke_ReshapeCheck, ReshapeLayerTestRevise,
::testing::Combine(
::testing::Values(true),
::testing::ValuesIn(netPrecisions),
::testing::Values(true), ::testing::ValuesIn(netPrecisions),
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
::testing::Values(InferenceEngine::Layout::ANY),
::testing::Values(InferenceEngine::Layout::ANY),
::testing::Values(std::vector<size_t>({10, 10, 10, 10})),
::testing::Values(std::vector<size_t>({10, 0, 100})),
::testing::Values(std::vector<int64_t>({10, 0, 100})),
::testing::Values(CommonTestUtils::DEVICE_TEMPLATE),
::testing::Values(std::map<std::string, std::string>({}))),
ReshapeLayerTest::getTestCaseName);
ReshapeLayerTestRevise::getTestCaseName);
INSTANTIATE_TEST_CASE_P(
smoke_ReshapeCheckNegative, ReshapeLayerTestRevise,
::testing::Combine(
::testing::Values(true), ::testing::ValuesIn(netPrecisions),
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
::testing::Values(InferenceEngine::Layout::ANY),
::testing::Values(InferenceEngine::Layout::ANY),
::testing::Values(std::vector<size_t>({10, 10, 10, 10})),
::testing::Values(std::vector<int64_t>({10, -1, 100})),
::testing::Values(CommonTestUtils::DEVICE_TEMPLATE),
::testing::Values(std::map<std::string, std::string>({}))),
ReshapeLayerTestRevise::getTestCaseName);
} // namespace

View File

@ -68,6 +68,10 @@ if(ENABLE_WHEEL)
add_subdirectory(wheel)
endif()
if (NGRAPH_PYTHON_BUILD_ENABLE)
add_dependencies(ie_api _pyngraph)
endif()
# install
ie_cpack_add_component(${PYTHON_VERSION})

View File

@ -55,7 +55,7 @@ fi
if [[ "${APPS_TO_RUN}" -ge 4 ]] ; then
# For more then 4 multidevice testing
for (( VAR = 4; VAR <= ${APPS_TO_RUN}; ++VAR )); do
for (( VAR = 4; VAR <= APPS_TO_RUN; ++VAR )); do
./${APP_NAME} --gtest_filter=*VPURegTest*YOLO*myriad* &
pids+=" $!"
done

View File

@ -33,7 +33,7 @@ target_include_directories(${TARGET_NAME} PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}
$<TARGET_PROPERTY:inference_engine_transformations,INTERFACE_INCLUDE_DIRECTORIES>
${CLDNN__OCL_ICD_INCDIRS}
${CLDNN_TOP_FOLDER})
${CLDNN_TOP_FOLDER}/api)
set_target_properties(${TARGET_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE ${ENABLE_LTO})

View File

@ -5,7 +5,7 @@
#pragma once
#include <ie_layouts.h>
#include <api/layout.hpp>
#include <cldnn/runtime/layout.hpp>
#include "ngraph/type/element_type.hpp"

View File

@ -9,7 +9,7 @@
#include "cldnn_custom_layer.h"
#include <api/network.hpp>
#include <cldnn/graph/network.hpp>
namespace CLDNNPlugin {

View File

@ -10,7 +10,7 @@
#include <map>
#include <ie_common.h>
#include "pugixml.hpp"
#include "api/tensor.hpp"
#include "cldnn/runtime/tensor.hpp"
namespace CLDNNPlugin {

View File

@ -81,6 +81,8 @@
#include "cldnn_itt.h"
#include "gpu/gpu_config.hpp"
#include "cldnn/runtime/device_query.hpp"
#ifdef __linux__
# include <dlfcn.h>
#endif
@ -117,13 +119,13 @@ struct clDNNEngine::impl {
};
cldnn::device_info clDNNEngine::GetDeviceInfo(const std::map<std::string, std::string> &config) const {
auto device_info = device_map.begin()->second.get_info();
auto device_info = device_map.begin()->second->get_info();
if (config.find(PluginConfigParams::KEY_DEVICE_ID) != config.end()) {
auto val = config.at(PluginConfigParams::KEY_DEVICE_ID);
if (device_map.find(val) == device_map.end()) {
IE_THROW() << "Invalid device ID: " << val;
}
device_info = device_map.at(val).get_info();
device_info = device_map.at(val)->get_info();
}
return device_info;
@ -445,7 +447,8 @@ clDNNEngine::clDNNEngine() : m_defaultContext(nullptr) {
RegisterPrimitives();
// try loading clDNN engine and get info from it
{
cldnn::device_query device_query;
// Set OCL runtime which should be always available
cldnn::device_query device_query(cldnn::engine_types::ocl, cldnn::runtime_types::ocl);
device_map = device_query.get_available_devices();
}
// locate global custom kernel config
@ -851,8 +854,8 @@ auto StringRightTrim = [](std::string string, std::string substring, bool case_s
};
static float GetGOPS(cldnn::device_info info, cldnn::data_types dt) {
auto freqGHz = info.core_frequency / 1000.f;
auto numEUs = info.cores_count;
auto freqGHz = info.gpu_frequency / 1000.f;
auto numEUs = info.execution_units_count;
auto opsPerComputeBlock = 0;
auto computeBlockIPC = 1.0f;
switch (dt) {
@ -894,8 +897,8 @@ Parameter clDNNEngine::GetMetric(const std::string& name, const std::map<std::st
auto iter = device_map.find(device_id);
auto device_info = iter != device_map.end() ?
iter->second.get_info() :
device_map.begin()->second.get_info();
iter->second->get_info() :
device_map.begin()->second->get_info();
if (name == METRIC_KEY(SUPPORTED_METRICS)) {
std::vector<std::string> metrics;
@ -931,7 +934,7 @@ Parameter clDNNEngine::GetMetric(const std::string& name, const std::map<std::st
gops[InferenceEngine::Precision::FP32] = GetGOPS(device_info, cldnn::data_types::f32);
IE_SET_METRIC_RETURN(DEVICE_GOPS, gops);
} else if (name == GPU_METRIC_KEY(EXECUTION_UNITS_COUNT)) {
IE_SET_METRIC_RETURN(GPU_EXECUTION_UNITS_COUNT, device_info.cores_count);
IE_SET_METRIC_RETURN(GPU_EXECUTION_UNITS_COUNT, device_info.execution_units_count);
} else if (name == GPU_METRIC_KEY(UARCH_VERSION)) {
std::stringstream s;
if (device_info.gfx_ver.major == 0 && device_info.gfx_ver.minor == 0 && device_info.gfx_ver.revision == 0) {

View File

@ -7,7 +7,7 @@
#include <map>
#include <string>
#include <memory>
#include <api/engine.hpp>
#include <cldnn/runtime/engine.hpp>
#include <cpp_interfaces/interface/ie_iplugin_internal.hpp>
#include <cpp_interfaces/interface/ie_iexecutable_network_internal.hpp>
#include "cldnn_remote_context.h"
@ -22,7 +22,7 @@ class clDNNEngine : public InferenceEngine::IInferencePlugin,
std::shared_ptr<impl> _impl;
// key: device_id, value: cldnn device
std::map<std::string, cldnn::device> device_map;
std::map<std::string, cldnn::device::ptr> device_map;
std::mutex engine_mutex;
mutable CLDNNRemoteCLContext::Ptr m_defaultContext;

View File

@ -2,13 +2,13 @@
// SPDX-License-Identifier: Apache-2.0
//
#include <list>
#include <set>
#include <unordered_set>
#include "ie_metric_helpers.hpp"
#include <api/cldnn.hpp>
#include <api/data.hpp>
#include <chrono>
#include <cmath>
#include <algorithm>
#include "ie_metric_helpers.hpp"
#include <chrono>
#include <cmath>
#include <algorithm>
@ -27,7 +27,6 @@
#include "threading/ie_cpu_streams_executor.hpp"
#include "cpp_interfaces/interface/ie_iinfer_request_internal.hpp"
using namespace InferenceEngine;
using namespace InferenceEngine::details;

View File

@ -2,22 +2,28 @@
// SPDX-License-Identifier: Apache-2.0
//
#include <cldnn/graph/network.hpp>
#include <cldnn/runtime/profiling.hpp>
#include "cldnn_graph.h"
#include "simple_math.h"
#include <cldnn/cldnn_config.hpp>
#include "cldnn_infer_request.h"
#include <description_buffer.hpp>
#include <threading/ie_executor_manager.hpp>
#include <exec_graph_info.hpp>
#include <ie_ngraph_utils.hpp>
#include <ngraph/variant.hpp>
#include <list>
#include <set>
#include <unordered_set>
#include <sstream>
#include <api/cldnn.hpp>
#include <api/network.hpp>
#include <api/profiling.hpp>
#include <api/custom_gpu_primitive.hpp>
#include <chrono>
#include <cmath>
#include <algorithm>
#include "cldnn_graph.h"
#include "simple_math.h"
#include <description_buffer.hpp>
#include "cldnn_infer_request.h"
#include <threading/ie_executor_manager.hpp>
#include <fstream>
#include <utility>
#include <sys/types.h>
@ -71,12 +77,10 @@ void CLDNNGraph::Build() {
for (int b = m_bv_sz - 1; b >= 0; b--) {
auto network = BuildNetwork(m_program->GetCompiledProgram(b));
m_networks.insert(m_networks.begin(), network);
GetEngine()->release_pending_memory(network->get_id());
}
} else {
auto network = BuildNetwork(m_program->GetCompiledProgram());
m_networks.emplace_back(network);
GetEngine()->release_pending_memory(network->get_id());
}
UpdateImplementationsMap();
@ -499,7 +503,7 @@ void CLDNNGraph::UpdatePerfStatistics() {
}
};
std::map<cldnn::primitive_id, cldnn::event> executedPrimitives = GetNetwork()->get_executed_primitives();
std::map<cldnn::primitive_id, cldnn::event::ptr> executedPrimitives = GetNetwork()->get_executed_primitives();
auto allPrimitives = GetNetwork()->get_all_primitives();
// Get profiling info for all layers
@ -521,7 +525,7 @@ void CLDNNGraph::UpdatePerfStatistics() {
auto event = execIter->second;
executedPrimitives.erase(execIter);
cldnn::instrumentation::profiling_info cldnnInfo{profiledID, event.get_profiling_info()};
cldnn::instrumentation::profiling_info cldnnInfo{profiledID, event->get_profiling_info()};
collectTimings(cldnnInfo, perfCount);
perfCount.num++;
@ -534,7 +538,7 @@ void CLDNNGraph::UpdatePerfStatistics() {
pcIter = perfMap.find(executedID.first);
auto& perfCount = pcIter->second.second;
cldnn::instrumentation::profiling_info cldnnInfo{executedID.first, executedID.second.get_profiling_info()};
cldnn::instrumentation::profiling_info cldnnInfo{executedID.first, executedID.second->get_profiling_info()};
collectTimings(cldnnInfo, perfCount);
perfCount.num++;
@ -675,7 +679,7 @@ std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> CLDNNGraph::G
executedPrimitives.find(primId) != executedPrimitives.end()) {
auto event = executedPrimitives.at(primId);
cldnn::instrumentation::profiling_info cldnnInfo{primId, event.get_profiling_info()};
cldnn::instrumentation::profiling_info cldnnInfo{primId, event->get_profiling_info()};
// Collect timings
long long cpuTime = 0;

View File

@ -17,8 +17,8 @@
#include "ie_blob.h"
#include "cpp/ie_cnn_network.h"
#include <api/network.hpp>
#include <api/topology.hpp>
#include <cldnn/graph/network.hpp>
#include <cldnn/graph/topology.hpp>
#include <cpp_interfaces/impl/ie_executable_network_thread_safe_default.hpp>
#include "cldnn_custom_layer.h"
@ -43,7 +43,7 @@ public:
const Config& getConfig() const { return m_config; }
InferenceEngine::gpu::ClContext::Ptr GetContext() { return m_context; }
std::shared_ptr<const cldnn::engine> GetEngine() const { return getContextImpl(m_context)->GetEngine(); }
std::shared_ptr<cldnn::engine> GetEngine() const { return getContextImpl(m_context)->GetEngine(); }
int GetMaxDynamicBatchSize() const { return getConfig().max_dynamic_batch; }
const std::map<std::string, cldnn::layout>& GetInputLayouts() const { return m_program->GetInputLayouts(); }
size_t GetNetworksCount() const { return m_networks.size(); }

View File

@ -19,7 +19,7 @@ using namespace InferenceEngine;
namespace CLDNNPlugin {
const char CLDNNInferRequest::fp32_suffix[] = "_fp32";
const char fp32_suffix[] = "_fp32";
const char str_not_allocated[] = "Input data was not allocated.";
const char cannot_set_compound[] = "cannot set compound blob: supported only for input pre-processing";
const char wrong_nv12_blob[] = "NV12 input blob is expected for input with NV12 color format";
@ -110,7 +110,7 @@ Blob::Ptr CLDNNInferRequest::createOutputBlob(const TensorDesc& desc, uint8_t* m
}
}
void CLDNNInferRequest::input_attach(cldnn::primitive_id name, cldnn::memory& inputMem) {
void CLDNNInferRequest::input_attach(cldnn::primitive_id name, cldnn::memory::ptr inputMem) {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNInferRequest::input_attach");
auto impl = getContextImpl(m_graph->GetContext());
impl->acquire_lock();
@ -127,150 +127,57 @@ void CLDNNInferRequest::input_attach(cldnn::primitive_id name, cldnn::memory& in
// Allocates a device memory buffer matching `layout` on the graph's engine and
// registers it as the input memory for primitive `name` via input_attach().
// NOTE(review): the stale pre-refactor line using the removed
// `cldnn::memory::allocate(...)` value API was diff residue that redeclared
// `input_mem`; only the `cldnn::memory::ptr` engine-API allocation is kept.
void CLDNNInferRequest::input_alloc(cldnn::primitive_id name, const cldnn::layout& layout) {
    OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNInferRequest::input_alloc");
    cldnn::memory::ptr input_mem = m_graph->GetEngine()->allocate_memory(layout);
    input_attach(name, input_mem);
}
void CLDNNInferRequest::copyOutputData(const cldnn::memory& outputMemory,
Blob::Ptr bptr,
buf_info* bi) {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNInferRequest::copyOutputData");
size_t n = (bi == nullptr) ? bptr->size() : bi->buf_size;
template<typename T>
void copyResultToOutputBlob(cldnn::memory::ptr src, Blob::Ptr dst, buf_info* bi, cldnn::stream& stream) {
size_t n = (bi == nullptr) ? dst->size() : bi->buf_size;
size_t offset = (bi == nullptr) ? 0 : bi->buf_offset;
auto layout = outputMemory.get_layout();
auto layout = src->get_layout();
auto size = layout.size;
auto l_padd = layout.data_padding.lower_size();
auto u_padd = layout.data_padding.upper_size();
auto h_padding = u_padd.spatial[0] + l_padd.spatial[0];
auto v_padding_l = (h_padding + size.spatial[0]) * u_padd.spatial[1];
auto v_padding_u = (h_padding + size.spatial[0]) * l_padd.spatial[1];
auto locked = bptr->buffer();
switch (bptr->getTensorDesc().getPrecision()) {
case Precision::FP32: {
auto out_f = locked.as<float*>();
if (out_f == nullptr) {
auto locked_dst = dst->buffer();
auto dst_ptr = locked_dst.as<T*>();
if (dst_ptr == nullptr) {
IE_THROW() << "Invalid output blob";
}
auto resPtr = outputMemory.pointer<float>();
float *resVec = out_f + offset;
cldnn::mem_lock<T> src_lock{ src, stream };
T* src_ptr = src_lock.data();
dst_ptr += offset;
if (h_padding || v_padding_l || v_padding_u) {
size_t i = 0;
if (layout.data_padding) {
for (size_t b = 0; b < size.batch[0]; b++) {
for (size_t f = 0; f < size.feature[0]; f++) {
i += v_padding_l;
for (size_t w = 0; w < size.spatial[3]; w++) {
for (size_t z = 0; z < size.spatial[2]; z++) {
for (size_t y = 0; y < size.spatial[1]; y++) {
i += l_padd.spatial[0];
for (size_t x = 0; x < size.spatial[0]; x++, i++) {
*resVec++ = resPtr[i];
for (size_t x = 0; x < size.spatial[0]; x++) {
*dst_ptr++ = src_ptr[layout.get_linear_offset(cldnn::tensor(b, f, x, y, z, w))];
}
}
}
i += u_padd.spatial[0];
}
i += v_padding_u;
}
}
} else {
for (size_t i = 0; i < n; i++) {
resVec[i] = resPtr[i];
dst_ptr[i] = src_ptr[i];
}
}
}
break;
case Precision::FP16: {
auto out_f = locked.as<uint16_t*>();
if (out_f == nullptr) {
IE_THROW() << "Invalid output blob";
}
auto resPtr = outputMemory.pointer<uint16_t>();
uint16_t* resVec = out_f + offset;
if (h_padding || v_padding_l || v_padding_u) {
size_t i = 0;
for (size_t b = 0; b < size.batch[0]; b++) {
for (size_t f = 0; f < size.feature[0]; f++) {
i += v_padding_l;
for (size_t y = 0; y < size.spatial[1]; y++) {
i += l_padd.spatial[0];
for (size_t x = 0; x < size.spatial[0]; x++, i++) {
*resVec++ = resPtr[i];
}
i += u_padd.spatial[0];
}
i += v_padding_u;
}
}
} else {
for (size_t i = 0; i < n; i++) {
resVec[i] = resPtr[i];
}
}
}
break;
case Precision::I32: {
auto out_f = locked.as<int32_t*>();
if (out_f == nullptr) {
IE_THROW() << "Invalid output blob";
}
auto resPtr = outputMemory.pointer<int32_t>();
int32_t* resVec = out_f + offset;
if (h_padding || v_padding_l || v_padding_u) {
size_t i = 0;
for (size_t b = 0; b < size.batch[0]; b++) {
for (size_t f = 0; f < size.feature[0]; f++) {
i += v_padding_l;
for (size_t y = 0; y < size.spatial[1]; y++) {
i += l_padd.spatial[0];
for (size_t x = 0; x < size.spatial[0]; x++, i++) {
*resVec++ = resPtr[i];
}
i += u_padd.spatial[0];
}
i += v_padding_u;
}
}
} else {
for (size_t i = 0; i < n; i++) {
resVec[i] = resPtr[i];
}
}
}
break;
case Precision::I64: {
auto out_f = locked.as<int64_t*>();
if (out_f == nullptr) {
IE_THROW() << "Invalid output blob";
}
auto resPtr = outputMemory.pointer<int64_t>();
int64_t* resVec = out_f + offset;
if (h_padding || v_padding_l || v_padding_u) {
size_t i = 0;
for (size_t b = 0; b < size.batch[0]; b++) {
for (size_t f = 0; f < size.feature[0]; f++) {
i += v_padding_l;
for (size_t y = 0; y < size.spatial[1]; y++) {
i += l_padd.spatial[0];
for (size_t x = 0; x < size.spatial[0]; x++, i++) {
*resVec++ = resPtr[i];
}
i += u_padd.spatial[0];
}
i += v_padding_u;
}
}
} else {
for (size_t i = 0; i < n; i++) {
resVec[i] = resPtr[i];
}
}
}
break;
default:
IE_THROW() << "The plugin does not support output " << bptr->getTensorDesc().getPrecision() << " precision";
// Dispatches the typed device-to-host output copy according to the
// destination blob's precision; unsupported precisions raise NotImplemented.
void CLDNNInferRequest::copyOutputData(cldnn::memory::ptr src, Blob::Ptr dst, buf_info* bi) {
    OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNInferRequest::copyOutputData");
    auto& stream = m_graph->GetNetwork()->get_stream();
    const auto precision = dst->getTensorDesc().getPrecision();
    if (precision == Precision::FP32) {
        copyResultToOutputBlob<float>(src, dst, bi, stream);
    } else if (precision == Precision::FP16) {
        copyResultToOutputBlob<uint16_t>(src, dst, bi, stream);
    } else if (precision == Precision::I32) {
        copyResultToOutputBlob<int32_t>(src, dst, bi, stream);
    } else if (precision == Precision::I64) {
        copyResultToOutputBlob<int64_t>(src, dst, bi, stream);
    } else {
        IE_THROW(NotImplemented) << "The plugin does not support output " << precision << " precision";
    }
}
@ -279,7 +186,7 @@ void CLDNNInferRequest::copyInputData(std::shared_ptr<cldnn::network> network,
const cldnn::layout& inputLayout,
const Blob &inputBlob, buf_info* bi) {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNInferRequest::copyInputData");
size_t n = (bi == nullptr) ? inputBlob.size() : bi->buf_size;
size_t offset = (bi == nullptr) ? 0 : bi->buf_offset;
cldnn::primitive_id internalName = "parameter:" + inputName;
@ -287,37 +194,37 @@ void CLDNNInferRequest::copyInputData(std::shared_ptr<cldnn::network> network,
switch (inputBlob.getTensorDesc().getPrecision()) {
case Precision::FP32: {
float* blob_ptr = const_cast<float*>(locked.as<const float*>()) + offset;
network->set_input_data(internalName, cldnn::memory::attach(inputLayout, blob_ptr, n));
network->set_input_data(internalName, network->get_engine().attach_memory(inputLayout, blob_ptr));
break;
}
case Precision::I32: {
int32_t* blob_ptr = const_cast<int32_t*>(locked.as<const int32_t*>()) + offset;
network->set_input_data(internalName, cldnn::memory::attach(inputLayout, blob_ptr, n));
network->set_input_data(internalName, network->get_engine().attach_memory(inputLayout, blob_ptr));
break;
}
case Precision::I64: {
int64_t* blob_ptr = const_cast<int64_t*>(locked.as<const int64_t*>()) + offset;
network->set_input_data(internalName, cldnn::memory::attach(inputLayout, blob_ptr, n));
network->set_input_data(internalName, network->get_engine().attach_memory(inputLayout, blob_ptr));
break;
}
case Precision::FP16: {
uint16_t* blob_ptr = const_cast<uint16_t*>(locked.as<const uint16_t*>()) + offset;
network->set_input_data(internalName, cldnn::memory::attach(inputLayout, blob_ptr, n));
network->set_input_data(internalName, network->get_engine().attach_memory(inputLayout, blob_ptr));
break;
}
case Precision::I8: {
int8_t* blob_ptr = const_cast<int8_t*>(locked.as<const int8_t*>()) + offset;
network->set_input_data(internalName, cldnn::memory::attach(inputLayout, blob_ptr, n));
network->set_input_data(internalName, network->get_engine().attach_memory(inputLayout, blob_ptr));
break;
}
case Precision::U8: {
uint8_t* blob_ptr = const_cast<uint8_t*>(locked.as<const uint8_t*>()) + offset;
network->set_input_data(internalName, cldnn::memory::attach(inputLayout, blob_ptr, n));
network->set_input_data(internalName, network->get_engine().attach_memory(inputLayout, blob_ptr));
break;
}
case Precision::BOOL: {
uint8_t* blob_ptr = const_cast<uint8_t*>(locked.as<const uint8_t*>()) + offset;
network->set_input_data(internalName, cldnn::memory::attach(inputLayout, blob_ptr, n));
network->set_input_data(internalName, network->get_engine().attach_memory(inputLayout, blob_ptr));
break;
}
default:
@ -601,6 +508,7 @@ void CLDNNInferRequest::SetBlob(const std::string& name, const Blob::Ptr &data)
void CLDNNInferRequest::AllocateInputs() {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNInferRequest::AllocateInputs");
auto inputLayouts = m_graph->GetInputLayouts();
auto& stream = m_graph->GetNetwork()->get_stream();
// allocate inputs
for (auto& ni : _networkInputs) {
std::string name = ni.first;
@ -623,25 +531,24 @@ void CLDNNInferRequest::AllocateInputs() {
input_alloc(UVName, inputLayouts.at(UVName));
size_t height = desc.getDims()[2], width = desc.getDims()[3];
cldnn::pointer<uint8_t> input_mem_ptr_Y = inputsMemory.at(YName).pointer<uint8_t>();
cldnn::mem_lock<uint8_t> input_mem_ptr_Y{inputsMemory.at(YName), stream};
TensorDesc ydesc(Precision::U8, { 1, 1, height, width }, Layout::NHWC);
auto blobY = createInputBlob(ydesc, input_mem_ptr_Y.data());
cldnn::pointer<uint8_t> input_mem_ptr_UV = inputsMemory.at(UVName).pointer<uint8_t>();
cldnn::mem_lock<uint8_t> input_mem_ptr_UV{ inputsMemory.at(UVName), stream };
TensorDesc uvdesc(Precision::U8, { 1, 2, height / 2, width / 2 }, Layout::NHWC);
auto blobUV = createInputBlob(uvdesc, input_mem_ptr_UV.data());
blobs.push_back(make_shared_blob<NV12Blob>(blobY, blobUV));
}
_inputs[name] = desc.getDims()[0] == 1 ? blobs[0] : make_shared_blob<BatchedBlob>(blobs);
} else {
if (inputLayouts.find(name) == inputLayouts.end()) {
IE_THROW() << "Input layout for " << name << " is not found";
}
cldnn::layout layout = inputLayouts.at(name);
input_alloc(name, layout);
cldnn::pointer<uint8_t> mem_ptr = inputsMemory.at(name).pointer<uint8_t>();
cldnn::mem_lock<uint8_t> mem_ptr{inputsMemory.at(name), stream};
_inputs[name] = createInputBlob(desc, mem_ptr.data());
if (desc.getPrecision() == Precision::I16 || desc.getPrecision() == Precision::U16) {
@ -685,8 +592,8 @@ void CLDNNInferRequest::AllocateOutputs() {
bool can_reuse_internal_mem = !m_useStreams;
for (auto& no : _networkOutputs) {
std::string outputID = m_graph->MapOutputName(no.first);
cldnn::memory output_mem = m_graph->GetNetwork()->get_output_memory(outputID);
cldnn::pointer<uint8_t> output_mem_ptr = output_mem.pointer<uint8_t>();
cldnn::memory::ptr output_mem = m_graph->GetNetwork()->get_output_memory(outputID);
cldnn::mem_lock<uint8_t> output_mem_ptr{output_mem, m_graph->GetNetwork()->get_stream()};
if (output_mem_ptr.data() == nullptr) {
IE_THROW() << "Empty output memory for primitive " << outputID;
}
@ -824,6 +731,7 @@ CLDNNInferRequest::CLDNNInferRequest(InputsDataMap networkInputs, OutputsDataMap
void CLDNNInferRequest::execAndParse() {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNInferRequest::execAndParse");
auto networkOutputs = m_graph->GetNetwork()->execute();
auto& stream = m_graph->GetNetwork()->get_stream();
// Collect outputs as requested by the model
for (auto& no : _networkOutputs) {
@ -835,12 +743,12 @@ void CLDNNInferRequest::execAndParse() {
// mapping remote blobs not needed -
// let the user take care of them explicitly
if (!bptr->is<gpu::ClBlob>()) {
auto out_ptr = outputMemory.pointer<uint8_t>();
cldnn::mem_lock<uint8_t> out_ptr{outputMemory, stream};
auto blob_ptr = bptr->buffer().as<uint8_t*>();
// If Async API is used, copy of output blobs is not needed, unless SetBlob function was called.
// But in the case when old API is used we have to copy data to memory provided by user.
if (blob_ptr != &out_ptr[0]) {
if (blob_ptr != out_ptr.data()) {
copyOutputData(outputMemory, bptr);
}
}
@ -965,19 +873,20 @@ void CLDNNInferRequest::PrepareInput(const cldnn::primitive_id &inputName, const
IE_THROW() << "Input name mismatch.";
}
auto inputLayout = m_graph->GetInputLayouts().at(inputName);
auto is_same_buffer = [](const Blob& blob, const cldnn::memory& memory) -> bool {
auto is_same_buffer = [&](const Blob& blob, cldnn::memory::ptr memory) -> bool {
const std::string str_not_allocated("Input data was not allocated.");
cldnn::pointer<const uint8_t> ptr = memory.pointer<const uint8_t>();
cldnn::mem_lock<uint8_t> ptr{memory, m_graph->GetNetwork()->get_stream()};
const uint8_t* blob_ptr = blob.cbuffer().as<const uint8_t*>();
const uint8_t* mem_ptr = ptr.data();
if (blob_ptr == nullptr || mem_ptr == nullptr) {
IE_THROW() << str_not_allocated;
}
return (blob_ptr == mem_ptr) && (blob.byteSize() == memory.size());
return (blob_ptr == mem_ptr) && (blob.byteSize() == memory->size());
};
cldnn::primitive_id internalName = "parameter:" + inputName;
const cldnn::memory& memory = inputsMemory.at(inputName);
cldnn::memory::ptr memory = inputsMemory.at(inputName);
auto& stream = m_graph->GetNetwork()->get_stream();
auto _nw_ptr = m_graph->GetNetwork();
auto prec = inputBlob.getTensorDesc().getPrecision();
@ -986,8 +895,8 @@ void CLDNNInferRequest::PrepareInput(const cldnn::primitive_id &inputName, const
_nw_ptr->set_input_data(internalName, memory);
} else if (prec == Precision::I16 || prec == Precision::U16) {
// clDNN doesn't support I16 input precision, so we always have to convert input data to fp32 precision
const cldnn::memory& fp32_mem = inputsMemory.at(inputName+fp32_suffix);
cldnn::pointer<float> ptr = fp32_mem.pointer<float>();
cldnn::memory::ptr fp32_mem = inputsMemory.at(inputName+fp32_suffix);
cldnn::mem_lock<float> ptr {fp32_mem, stream};
if (prec == Precision::I16) {
copyToFloat<int16_t>(ptr.data(), &inputBlob);
} else {
@ -1031,4 +940,4 @@ void CLDNNInferRequest::PrepareInputDyn(const cldnn::primitive_id &inputName, co
}
}
}; // namespace CLDNNPlugin
} // namespace CLDNNPlugin

View File

@ -46,7 +46,7 @@ public:
void EnableStreams() { m_useStreams = true; }
protected:
std::map<std::string, cldnn::memory> inputsMemory;
std::map<std::string, cldnn::memory::ptr> inputsMemory;
std::map<std::string, cldnn::primitive_id> outputsMap;
bool m_useProfiling;
@ -60,12 +60,12 @@ protected:
InferenceEngine::Blob::Ptr createInputBlob(const InferenceEngine::TensorDesc& desc, uint8_t* mem_ptr = nullptr);
InferenceEngine::Blob::Ptr createOutputBlob(const InferenceEngine::TensorDesc& desc, uint8_t* mem_ptr = nullptr);
void copyOutputData(const cldnn::memory& outputMemory, InferenceEngine::Blob::Ptr bptr, buf_info* bi = nullptr);
void copyOutputData(cldnn::memory::ptr outputMemory, InferenceEngine::Blob::Ptr bptr, buf_info* bi = nullptr);
void copyInputData(std::shared_ptr<cldnn::network> network, const cldnn::primitive_id &inputName,
const cldnn::layout& inputLayout, const InferenceEngine::Blob &inputBlob,
buf_info* bi = nullptr);
void input_attach(cldnn::primitive_id name, cldnn::memory& inputMem);
void input_attach(cldnn::primitive_id name, cldnn::memory::ptr inputMem);
void input_alloc(cldnn::primitive_id name, const cldnn::layout& layout);
void AllocateInputs();
void AllocateOutputs();
@ -76,9 +76,6 @@ protected:
void PrepareInput(const cldnn::primitive_id &inputName, const InferenceEngine::Blob &inputBlob);
void PrepareInputDyn(const cldnn::primitive_id &inputName, const InferenceEngine::Blob &inputBlob);
private:
static const char fp32_suffix[];
};
}; // namespace CLDNNPlugin

View File

@ -92,7 +92,7 @@ bool Program::CanProcessDynBatch(std::vector<std::shared_ptr<ngraph::Node>> ops,
return true;
}
Program::Program(InferenceEngine::CNNNetwork& network, std::shared_ptr<const cldnn::engine> engine, const Config& config, bool createTopologyOnly)
Program::Program(InferenceEngine::CNNNetwork& network, std::shared_ptr<cldnn::engine> engine, const Config& config, bool createTopologyOnly)
: m_config(config)
, m_engine(engine)
, m_curBatch(-1)
@ -128,11 +128,9 @@ Program::Program(InferenceEngine::CNNNetwork& network, std::shared_ptr<const cld
ChangeInputBatch(1U << static_cast<unsigned>(b));
m_programs.insert(m_programs.begin(), BuildProgram(ops, networkInputs, networkOutputs, createTopologyOnly));
m_engine->release_pending_memory(0);
}
} else {
m_programs.emplace_back(BuildProgram(ops, networkInputs, networkOutputs, createTopologyOnly));
m_engine->release_pending_memory(0);
}
}

View File

@ -15,8 +15,8 @@
#include "cldnn_config.h"
#include <api/engine.hpp>
#include <api/topology.hpp>
#include <cldnn/runtime/engine.hpp>
#include <cldnn/graph/topology.hpp>
// Forward declarations for cldnn part
namespace cldnn {
@ -69,8 +69,8 @@ public:
class Program {
public:
Program(InferenceEngine::CNNNetwork& network, std::shared_ptr<const cldnn::engine> engine, const Config& config, bool createTopologyOnly = false);
Program(std::shared_ptr<const cldnn::engine> engine, const Config& config) : m_config(config), m_engine(engine),
Program(InferenceEngine::CNNNetwork& network, std::shared_ptr<cldnn::engine> engine, const Config& config, bool createTopologyOnly = false);
Program(std::shared_ptr<cldnn::engine> engine, const Config& config) : m_config(config), m_engine(engine),
m_curBatch(-1), queryMode(false), m_max_batch(1) {}
Program() : m_config({}), m_engine(nullptr), m_curBatch(-1), queryMode(false), m_max_batch(1) {}
@ -100,8 +100,8 @@ public:
const std::map<std::string, cldnn::layout>& GetInputLayouts() const { return inputLayouts; }
InferenceEngine::InputsDataMap GetNetworkInputs() const { return m_networkInputs; }
InferenceEngine::OutputsDataMap GetNetworkOutputs() const { return m_networkOutputs; }
const cldnn::engine& GetEngine() const { return *m_engine; }
std::shared_ptr<const cldnn::engine> GetEnginePtr() const { return m_engine; }
cldnn::engine& GetEngine() const { return *m_engine; }
std::shared_ptr<cldnn::engine> GetEnginePtr() const { return m_engine; }
const Config& GetConfig() const { return m_config; }
int GetMaxBatchSizeForSingleProgram();
@ -150,7 +150,7 @@ public:
private:
static factories_map_t factories_map;
std::vector<std::shared_ptr<cldnn::program>> m_programs;
std::shared_ptr<const cldnn::engine> m_engine;
std::shared_ptr<cldnn::engine> m_engine;
Config m_config;
std::shared_ptr<cldnn::topology> m_topology;

View File

@ -6,21 +6,23 @@
#include "cldnn_remote_context.h"
#include "cldnn_itt.h"
#include "cldnn/runtime/device_query.hpp"
using namespace InferenceEngine;
using namespace InferenceEngine::gpu;
using namespace InferenceEngine::details;
namespace CLDNNPlugin {
static const char unsupported_str[] = "Unsupported shared object type ";
CLDNNRemoteAllocator CLDNNRemoteBlobImpl::m_allocator;
// Wraps a (possibly externally shared) GPU memory object behind the RemoteBlob
// interface. Only bookkeeping happens here; the actual cldnn memory object is
// created lazily by allocate()/allocate_if_needed().
// NOTE(review): the stale pre-refactor initializer list (without m_stream) was
// diff residue conflicting with the new stream-aware constructor; removed.
CLDNNRemoteBlobImpl::CLDNNRemoteBlobImpl(ClContext::Ptr context,
    cldnn::stream& stream,
    const cldnn::layout& layout,
    cldnn::shared_handle mem,
    cldnn::shared_surface surf,
    uint32_t plane,
    BlobType mem_type) :
    m_context(context), m_stream(stream), m_layout(layout), m_mem_type(mem_type), m_mem(mem), m_surf(surf), m_plane(plane),
    _handle(nullptr), _allocator(nullptr), m_memObject(nullptr), lockedHolder(nullptr) {
}
@ -67,7 +69,6 @@ ParamMap CLDNNRemoteBlobImpl::getParams() const {
}
// Releases the underlying cldnn memory object, if any.
// Returns true when no memory is held afterwards (always the case here).
// The previous `if (m_memObject != nullptr)` guard was redundant:
// smart-pointer reset() is already a no-op on an empty pointer.
bool CLDNNRemoteBlobImpl::deallocate() noexcept {
    m_memObject.reset();
    return m_memObject == nullptr;
}
@ -86,32 +87,7 @@ void CLDNNRemoteBlobImpl::allocate_if_needed() {
_impl->acquire_lock();
if (m_memObject == nullptr) {
auto eng = _impl->GetEngine();
switch (m_mem_type) {
case BlobType::BT_BUF_INTERNAL:
m_memObject = std::unique_ptr<cldnn::memory>(new cldnn::memory(cldnn::memory::allocate(*eng, m_layout)));
break;
case BlobType::BT_BUF_SHARED:
m_memObject = std::unique_ptr<cldnn::memory>(new cldnn::memory(cldnn::memory::share_buffer(*eng, m_layout, m_mem)));
break;
#ifdef _WIN32
case BlobType::BT_SURF_SHARED:
m_memObject = std::unique_ptr<cldnn::memory>(new cldnn::memory(cldnn::memory::share_surface(*eng, m_layout, m_mem, m_plane)));
break;
case BlobType::BT_DX_BUF_SHARED:
m_memObject = std::unique_ptr<cldnn::memory>(new cldnn::memory(cldnn::memory::share_dx_buffer(*eng, m_layout, m_mem)));
break;
#else
case BlobType::BT_SURF_SHARED:
m_memObject = std::unique_ptr<cldnn::memory>(new cldnn::memory(cldnn::memory::share_surface(*eng, m_layout, m_surf, m_plane)));
break;
#endif
case BlobType::BT_IMG_SHARED:
m_memObject = std::unique_ptr<cldnn::memory>(new cldnn::memory(cldnn::memory::share_image(*eng, m_layout, m_mem)));
break;
default:
IE_THROW() << unsupported_str << m_mem_type;
}
allocate();
}
_impl->release_lock();
@ -120,32 +96,38 @@ void CLDNNRemoteBlobImpl::allocate_if_needed() {
// Creates the cldnn memory object backing this blob according to m_mem_type:
// a fresh internal buffer, or a wrapper around a user-shared OpenCL buffer,
// image, surface, or (Windows) DX buffer. Must be called at most once
// (asserted); unknown types leave the blob without memory.
// NOTE(review): the stale pre-refactor lines (const engine, unique_ptr around
// the removed cldnn::memory::share_* value API) were diff residue interleaved
// with the new engine-API calls; resolved to the new API only.
void CLDNNRemoteBlobImpl::allocate() noexcept {
    assert(m_memObject == nullptr);

    std::shared_ptr<cldnn::engine> eng = getContextImpl(m_context.lock())->GetEngine();

    switch (m_mem_type) {
    case BlobType::BT_BUF_INTERNAL: {
        m_memObject = eng->allocate_memory(m_layout);
        break;
    }
    case BlobType::BT_BUF_SHARED: {
        m_memObject = eng->share_buffer(m_layout, m_mem);
        break;
    }
#ifdef _WIN32
    case BlobType::BT_SURF_SHARED: {
        m_memObject = eng->share_surface(m_layout, m_mem, m_plane);
        break;
    }
    case BlobType::BT_DX_BUF_SHARED: {
        m_memObject = eng->share_dx_buffer(m_layout, m_mem);
        break;
    }
#else
    case BlobType::BT_SURF_SHARED: {
        m_memObject = eng->share_surface(m_layout, m_surf, m_plane);
        break;
    }
#endif
    case BlobType::BT_IMG_SHARED: {
        m_memObject = eng->share_image(m_layout, m_mem);
        break;
    }
    default:
        m_memObject.reset();
    }
}
@ -165,7 +147,7 @@ std::shared_ptr<RemoteContext> CLDNNRemoteBlobImpl::getContext() const noexcept
}
void CLDNNRemoteBlobImpl::lock() const {
lockedHolder = std::unique_ptr<cldnn::pointer<uint8_t>>(new cldnn::pointer<uint8_t>(m_memObject->pointer<uint8_t>()));
lockedHolder = std::unique_ptr<cldnn::mem_lock<uint8_t>>(new cldnn::mem_lock<uint8_t>(m_memObject, m_stream));
auto ptr = lockedHolder->data();
_handle = reinterpret_cast<void*>(ptr);
m_allocator.regLockedBlob(_handle, this);
@ -244,7 +226,11 @@ CLDNNExecutionContextImpl::CLDNNExecutionContextImpl(const std::shared_ptr<IInfe
}
}
cldnn::device_query device_query(_context_id, _va_device);
// TODO: Parameterize this based on plugin config and compilation options
auto engine_type = cldnn::engine_types::ocl;
auto runtime_type = cldnn::runtime_types::ocl;
// Use actual runtime and engine types
cldnn::device_query device_query(engine_type, runtime_type, _context_id, _va_device);
auto device_map = device_query.get_available_devices();
auto iter = device_map.find(m_config.device_id);
@ -252,28 +238,25 @@ CLDNNExecutionContextImpl::CLDNNExecutionContextImpl(const std::shared_ptr<IInfe
{
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNExecutionContextImpl::Create");
m_engine = std::make_shared<cldnn::engine>(dev,
cldnn::engine_configuration((m_config.useProfiling ||
bool enable_profiling = (m_config.useProfiling ||
(m_config.tuningConfig.mode == cldnn::tuning_mode::tuning_tune_and_cache) ||
(m_config.tuningConfig.mode == cldnn::tuning_mode::tuning_retune_and_cache)),
false,
m_config.dumpCustomKernels,
std::string(),
std::string(),
true,
std::string(),
(m_config.tuningConfig.mode == cldnn::tuning_mode::tuning_retune_and_cache));
cldnn::queue_types queue_type = cldnn::queue_types::out_of_order;
bool use_unified_shared_memory = true;
m_engine = cldnn::engine::create(engine_type, runtime_type, dev, cldnn::engine_configuration(enable_profiling,
queue_type,
m_config.sources_dumps_dir,
m_config.queuePriority,
m_config.queueThrottle,
m_config.memory_pool_on,
m_config.throughput_streams,
use_unified_shared_memory,
m_config.kernels_cache_dir,
m_config.n_threads));
}
}
ParamMap CLDNNExecutionContextImpl::getParams() const {
ParamMap ret = { { GPU_PARAM_KEY(OCL_CONTEXT), m_engine->get_context() } };
ParamMap ret = { { GPU_PARAM_KEY(OCL_CONTEXT), m_engine->get_user_context() } };
switch (m_type) {
case OCL:

View File

@ -4,15 +4,11 @@
#pragma once
#include <string>
#include <map>
#include <memory>
#include <atomic>
#include <cldnn/runtime/memory.hpp>
#include <cldnn/runtime/engine.hpp>
#include <ie_parameter.hpp>
#include <cpp_interfaces/interface/ie_iplugin_internal.hpp>
#include "cldnn_config.h"
#include <api/memory.hpp>
#include <api/engine.hpp>
#include "cldnn_common_utils.h"
#ifndef NOMINMAX
@ -25,6 +21,11 @@
# include <gpu/gpu_context_api_va.hpp>
#endif
#include <string>
#include <map>
#include <memory>
#include <atomic>
namespace CLDNNPlugin {
class CLDNNRemoteAllocator;
@ -41,6 +42,7 @@ public:
};
explicit CLDNNRemoteBlobImpl(InferenceEngine::gpu::ClContext::Ptr context,
cldnn::stream& stream,
const cldnn::layout& layout,
cldnn::shared_handle mem,
cldnn::shared_surface surf,
@ -63,11 +65,12 @@ public:
bool is_allocated() const noexcept;
bool is_locked() const noexcept;
void allocate_if_needed();
cldnn::memory& getMemory() { return *m_memObject; }
cldnn::memory::ptr getMemory() { return m_memObject; }
protected:
static CLDNNRemoteAllocator m_allocator;
std::weak_ptr<InferenceEngine::gpu::ClContext> m_context;
cldnn::stream& m_stream;
// constructor stuff
cldnn::shared_handle m_mem;
@ -77,9 +80,9 @@ protected:
cldnn::layout m_layout;
BlobType m_mem_type;
std::unique_ptr<cldnn::memory> m_memObject;
cldnn::memory::ptr m_memObject;
mutable std::unique_ptr<cldnn::pointer<uint8_t>> lockedHolder;
mutable std::unique_ptr<cldnn::mem_lock<uint8_t>> lockedHolder;
mutable void* _handle;
mutable std::shared_ptr<InferenceEngine::IAllocator> _allocator;
@ -93,13 +96,14 @@ public:
using Ptr = std::shared_ptr<typedCLDNNRemoteBlob>;
explicit typedCLDNNRemoteBlob(InferenceEngine::gpu::ClContext::Ptr context,
cldnn::stream& stream,
const InferenceEngine::TensorDesc& desc,
const cldnn::layout& layout,
cldnn::shared_handle mem,
cldnn::shared_surface surf,
uint32_t plane,
CLDNNRemoteBlobImpl::BlobType mem_type)
: _impl(context, layout, mem, surf, plane, mem_type)
: _impl(context, stream, layout, mem, surf, plane, mem_type)
, TpublicAPI(desc) {}
void allocate() noexcept override { _impl.allocate(); }
@ -231,6 +235,7 @@ public:
}
protected:
// TODO: refactor to unique_ptr
std::shared_ptr<cldnn::engine> m_engine;
InferenceEngine::gpu_handle_param m_va_display;
Config m_config;
@ -267,6 +272,7 @@ class typedCLDNNExecutionContext : public TpublicContextAPI,
using namespace InferenceEngine;
using InferenceEngine::gpu::details::param_map_obj_getter;
InferenceEngine::RemoteBlob::Ptr ret = nullptr;
auto& stream = _impl.GetEngine()->get_program_stream();
uint32_t plane = param_map_obj_getter::_ObjFromParamSimple<uint32_t>(params, GPU_PARAM_KEY(VA_PLANE));
#ifdef _WIN32
cldnn::shared_handle mem = param_map_obj_getter::_ObjFromParamSimple<cldnn::shared_handle>(params, GPU_PARAM_KEY(DEV_OBJECT_HANDLE));
@ -290,11 +296,11 @@ class typedCLDNNExecutionContext : public TpublicContextAPI,
std::dynamic_pointer_cast<InferenceEngine::gpu::ClContext>
(std::enable_shared_from_this<typedCLDNNExecutionContext<TpublicContextAPI>>::shared_from_this());
#ifdef _WIN32
ret = std::make_shared<CLDNNRemoteD3DSurface>(smart_this,
ret = std::make_shared<CLDNNRemoteD3DSurface>(smart_this, stream,
tensorDesc, layout, mem, 0, plane,
CLDNNRemoteBlobImpl::BlobType::BT_SURF_SHARED);
#else
ret = std::make_shared<CLDNNRemoteVASurface>(smart_this,
ret = std::make_shared<CLDNNRemoteVASurface>(smart_this, stream,
tensorDesc, layout, nullptr, surf, plane,
CLDNNRemoteBlobImpl::BlobType::BT_SURF_SHARED);
#endif
@ -311,6 +317,7 @@ class typedCLDNNExecutionContext : public TpublicContextAPI,
InferenceEngine::RemoteBlob::Ptr ret = nullptr;
_impl.acquire_lock();
auto& stream = _impl.GetEngine()->get_program_stream();
// try to locate previously shared object
auto itr = shared_obj_reg.find(mem);
@ -327,15 +334,15 @@ class typedCLDNNExecutionContext : public TpublicContextAPI,
switch (blob_type) {
case CLDNNRemoteBlobImpl::BlobType::BT_BUF_SHARED:
ret = std::make_shared<CLDNNRemoteCLbuffer>(smart_this, tensorDesc, layout, mem, 0, 0, blob_type);
ret = std::make_shared<CLDNNRemoteCLbuffer>(smart_this, stream, tensorDesc, layout, mem, 0, 0, blob_type);
break;
case CLDNNRemoteBlobImpl::BlobType::BT_IMG_SHARED:
layout.format = ImageFormatFromLayout(tensorDesc.getLayout());
ret = std::make_shared<CLDNNRemoteCLImage2D>(smart_this, tensorDesc, layout, mem, 0, 0, blob_type);
ret = std::make_shared<CLDNNRemoteCLImage2D>(smart_this, stream, tensorDesc, layout, mem, 0, 0, blob_type);
break;
#ifdef _WIN32
case CLDNNRemoteBlobImpl::BlobType::BT_DX_BUF_SHARED:
ret = std::make_shared<CLDNNRemoteD3DBuffer>(smart_this, tensorDesc, layout, mem, 0, 0, blob_type);
ret = std::make_shared<CLDNNRemoteD3DBuffer>(smart_this, stream, tensorDesc, layout, mem, 0, 0, blob_type);
break;
#endif
default:
@ -354,7 +361,9 @@ class typedCLDNNExecutionContext : public TpublicContextAPI,
CldnnTensorFromIEDims(tensorDesc.getDims()));
auto smart_this = std::dynamic_pointer_cast<InferenceEngine::gpu::ClContext>
(std::enable_shared_from_this<typedCLDNNExecutionContext<TpublicContextAPI>>::shared_from_this());
auto& stream = _impl.GetEngine()->get_program_stream();
return std::make_shared<CLDNNRemoteCLbuffer>(smart_this,
stream,
tensorDesc,
layout,
nullptr, 0, 0,

View File

@ -8,7 +8,7 @@
#include "ngraph/op/batch_to_space.hpp"
#include "ngraph/op/constant.hpp"
#include "api/batch_to_space.hpp"
#include "cldnn/primitives/batch_to_space.hpp"
namespace CLDNNPlugin {

View File

@ -8,9 +8,9 @@
#include "ngraph/op/broadcast.hpp"
#include "ngraph/op/constant.hpp"
#include "api/broadcast.hpp"
#include "api/reorder.hpp"
#include "api/reshape.hpp"
#include "cldnn/primitives/broadcast.hpp"
#include "cldnn/primitives/reorder.hpp"
#include "cldnn/primitives/reshape.hpp"
namespace CLDNNPlugin {

View File

@ -7,7 +7,7 @@
#include "ngraph/op/concat.hpp"
#include "api/concatenation.hpp"
#include "cldnn/primitives/concatenation.hpp"
namespace CLDNNPlugin {

View File

@ -17,7 +17,7 @@
#include "ngraph/op/variadic_split.hpp"
#include "ngraph/op/util/op_types.hpp"
#include "api/data.hpp"
#include "cldnn/primitives/data.hpp"
namespace CLDNNPlugin {
@ -169,9 +169,10 @@ void CreateConstantOp(Program& p, const std::shared_ptr<ngraph::op::v0::Constant
if (bufIter != p.blobMemCache.end()) {
constPrimID = bufIter->second;
} else {
auto mem = cldnn::memory::allocate(p.GetEngine(), constLayout, 0, false);
auto tmpPointer = mem.pointer<char>(); // implicitly maps buffer - unmap in destructor
auto buf = tmpPointer.data();
cldnn::memory::ptr mem = p.GetEngine().allocate_memory(constLayout, false);
auto& stream = p.GetEngine().get_program_stream();
cldnn::mem_lock<char> lock{mem, stream};
auto buf = lock.data();
auto bufSize = constLayout.bytes_count();
// Do actual weights reorder and change O and I channels order

View File

@ -8,7 +8,7 @@
#include "ngraph/op/convert.hpp"
#include "ngraph/op/convert_like.hpp"
#include "api/reorder.hpp"
#include "cldnn/primitives/reorder.hpp"
namespace CLDNNPlugin {

View File

@ -13,11 +13,11 @@
#include "ngraph/op/fake_quantize.hpp"
#include "ngraph/op/util/op_types.hpp"
#include "api/convolution.hpp"
#include "api/deconvolution.hpp"
#include "api/binary_convolution.hpp"
#include "api/permute.hpp"
#include "api/reorder.hpp"
#include "cldnn/primitives/convolution.hpp"
#include "cldnn/primitives/deconvolution.hpp"
#include "cldnn/primitives/binary_convolution.hpp"
#include "cldnn/primitives/permute.hpp"
#include "cldnn/primitives/reorder.hpp"
namespace CLDNNPlugin {

View File

@ -8,9 +8,9 @@
#include "ngraph/op/ctc_greedy_decoder.hpp"
#include "ngraph/op/ctc_greedy_decoder_seq_len.hpp"
#include "api/ctc_greedy_decoder.hpp"
#include "api/reorder.hpp"
#include "api/mutable_data.hpp"
#include "cldnn/primitives/ctc_greedy_decoder.hpp"
#include "cldnn/primitives/reorder.hpp"
#include "cldnn/primitives/mutable_data.hpp"
#include "transformations/utils/utils.hpp"
@ -58,7 +58,7 @@ void CreateCommonCTCGreedyDecoderOp(Program& p, const std::shared_ptr<ngraph::No
std::size_t num_output = op->get_output_size();
std::vector<cldnn::memory> shared_memory;
std::vector<cldnn::memory::ptr> shared_memory;
if (num_output == 2) {
auto mutable_precision = op->get_output_element_type(1);
if (mutable_precision == ngraph::element::i64) {
@ -70,7 +70,7 @@ void CreateCommonCTCGreedyDecoderOp(Program& p, const std::shared_ptr<ngraph::No
DefaultFormatForDims(op->get_output_shape(1).size()),
CldnnTensorFromIEDims(op->get_output_shape(1)));
shared_memory.emplace_back(cldnn::memory::allocate(p.GetEngine(), mutableLayout));
shared_memory.emplace_back(p.GetEngine().allocate_memory(mutableLayout));
cldnn::primitive_id ctc_gd_mutable_id_w = layer_type_name_ID(op) + "_md_write";
auto ctc_gd_mutable_prim = cldnn::mutable_data(ctc_gd_mutable_id_w, shared_memory[0]);

View File

@ -8,7 +8,7 @@
#include "ngraph/op/cum_sum.hpp"
#include "ngraph/op/constant.hpp"
#include "api/cum_sum.hpp"
#include "cldnn/primitives/cum_sum.hpp"
namespace CLDNNPlugin {

View File

@ -9,8 +9,8 @@
#include "ngraph/attribute_visitor.hpp"
#include "ngraph/node.hpp"
#include "api/custom_gpu_primitive.hpp"
#include "api/reorder.hpp"
#include "cldnn/primitives/custom_gpu_primitive.hpp"
#include "cldnn/primitives/reorder.hpp"
namespace CLDNNPlugin {

View File

@ -7,7 +7,7 @@
#include "ngraph/op/depth_to_space.hpp"
#include "api/depth_to_space.hpp"
#include "cldnn/primitives/depth_to_space.hpp"
namespace CLDNNPlugin {

View File

@ -7,7 +7,7 @@
#include "ngraph/op/detection_output.hpp"
#include "api/detection_output.hpp"
#include "cldnn/primitives/detection_output.hpp"
namespace CLDNNPlugin {

View File

@ -25,10 +25,10 @@
#include "ngraph/op/power.hpp"
#include "ngraph/op/floor_mod.hpp"
#include "api/activation.hpp"
#include "api/eltwise.hpp"
#include "api/reorder.hpp"
#include "api/reshape.hpp"
#include "cldnn/primitives/activation.hpp"
#include "cldnn/primitives/eltwise.hpp"
#include "cldnn/primitives/reorder.hpp"
#include "cldnn/primitives/reshape.hpp"
namespace CLDNNPlugin {

View File

@ -9,8 +9,8 @@
#include "ngraph/op/embeddingbag_offsets_sum.hpp"
#include "ngraph/op/embeddingbag_packedsum.hpp"
#include "api/embedding_bag.hpp"
#include "api/reorder.hpp"
#include "cldnn/primitives/embedding_bag.hpp"
#include "cldnn/primitives/reorder.hpp"
#include "transformations/utils/utils.hpp"

View File

@ -7,7 +7,7 @@
#include "ngraph/op/extractimagepatches.hpp"
#include "api/extract_image_patches.hpp"
#include "cldnn/primitives/extract_image_patches.hpp"
namespace CLDNNPlugin {

View File

@ -7,7 +7,7 @@
#include "ngraph/op/fake_quantize.hpp"
#include "api/quantize.hpp"
#include "cldnn/primitives/quantize.hpp"
namespace CLDNNPlugin {

View File

@ -7,8 +7,8 @@
#include "ngraph/op/gather_tree.hpp"
#include "api/gather_tree.hpp"
#include "api/reorder.hpp"
#include "cldnn/primitives/gather_tree.hpp"
#include "cldnn/primitives/reorder.hpp"
namespace CLDNNPlugin {

View File

@ -7,8 +7,8 @@
#include "ngraph/op/gather.hpp"
#include "api/gather.hpp"
#include "api/reorder.hpp"
#include "cldnn/primitives/gather.hpp"
#include "cldnn/primitives/reorder.hpp"
namespace CLDNNPlugin {

View File

@ -8,7 +8,7 @@
#include "ngraph/op/gather_nd.hpp"
#include "ngraph/op/constant.hpp"
#include "api/gather_nd.hpp"
#include "cldnn/primitives/gather_nd.hpp"
namespace CLDNNPlugin {

View File

@ -7,7 +7,7 @@
#include "ngraph/op/grn.hpp"
#include "api/grn.hpp"
#include "cldnn/primitives/grn.hpp"
namespace CLDNNPlugin {

View File

@ -9,7 +9,7 @@
#include "ngraph/op/interpolate.hpp"
#include "ngraph/op/constant.hpp"
#include "api/resample.hpp"
#include "cldnn/primitives/resample.hpp"
namespace CLDNNPlugin {

View File

@ -8,7 +8,7 @@
#include "ngraph/op/lrn.hpp"
#include "ngraph/op/constant.hpp"
#include "api/lrn.hpp"
#include "cldnn/primitives/lrn.hpp"
namespace CLDNNPlugin {

View File

@ -9,11 +9,11 @@
#include "ngraph/op/constant.hpp"
#include "ngraph/op/fake_quantize.hpp"
#include "api/gemm.hpp"
#include "api/fully_connected.hpp"
#include "api/reshape.hpp"
#include "api/reorder.hpp"
#include "api/permute.hpp"
#include "cldnn/primitives/gemm.hpp"
#include "cldnn/primitives/fully_connected.hpp"
#include "cldnn/primitives/reshape.hpp"
#include "cldnn/primitives/reorder.hpp"
#include "cldnn/primitives/permute.hpp"
namespace CLDNNPlugin {

View File

@ -8,7 +8,8 @@
#include "ngraph/op/mvn.hpp"
#include "ngraph/op/constant.hpp"
#include "api/mvn.hpp"
#include "cldnn/primitives/mvn.hpp"
#include <algorithm>
namespace CLDNNPlugin {

View File

@ -9,9 +9,9 @@
#include <ngraph/opsets/opset3.hpp>
#include <ngraph_ops/nms_ie_internal.hpp>
#include "api/reorder.hpp"
#include "api/mutable_data.hpp"
#include "api/non_max_suppression.hpp"
#include "cldnn/primitives/reorder.hpp"
#include "cldnn/primitives/mutable_data.hpp"
#include "cldnn/primitives/non_max_suppression.hpp"
namespace CLDNNPlugin {
@ -62,7 +62,7 @@ void CreateNonMaxSuppressionIEInternalOp(Program& p, const std::shared_ptr<ngrap
std::size_t num_output = op->get_output_size();
std::vector<cldnn::memory> shared_memory;
std::vector<cldnn::memory::ptr> shared_memory;
switch (num_output) {
case 3: {
auto mutable_precision_second = op->get_output_element_type(2);
@ -74,7 +74,7 @@ void CreateNonMaxSuppressionIEInternalOp(Program& p, const std::shared_ptr<ngrap
DefaultFormatForDims(op->get_output_shape(2).size()),
CldnnTensorFromIEDims(op->get_output_shape(2)));
shared_memory.emplace_back(cldnn::memory::allocate(p.GetEngine(), mutableLayoutSecond));
shared_memory.emplace_back(p.GetEngine().allocate_memory(mutableLayoutSecond));
cldnn::primitive_id non_max_supression_mutable_id_w_second = layer_type_name_ID(op) + "_md_write_second";
auto nms_mutable_prim_second = cldnn::mutable_data(non_max_supression_mutable_id_w_second, shared_memory.back());
@ -91,7 +91,7 @@ void CreateNonMaxSuppressionIEInternalOp(Program& p, const std::shared_ptr<ngrap
cldnn::format::bfyx,
cldnn::tensor(outputIndices, 3, 1, 1));
shared_memory.emplace_back(cldnn::memory::allocate(p.GetEngine(), mutableLayoutFirst));
shared_memory.emplace_back(p.GetEngine().allocate_memory(mutableLayoutFirst));
cldnn::primitive_id non_max_supression_mutable_id_w_first = layer_type_name_ID(op) + "_md_write_first";
auto nms_mutable_prim_first = cldnn::mutable_data(non_max_supression_mutable_id_w_first, shared_memory.back());

View File

@ -8,8 +8,8 @@
#include "ngraph/op/normalize_l2.hpp"
#include "ngraph/op/constant.hpp"
#include "api/normalize.hpp"
#include "api/data.hpp"
#include "cldnn/primitives/normalize.hpp"
#include "cldnn/primitives/data.hpp"
namespace CLDNNPlugin {
@ -35,8 +35,8 @@ void CreateNormalizeL2Op(Program& p, const std::shared_ptr<ngraph::op::v0::Norma
// We create fake scale constant and fill it with ones to keep the same behavior as current primitive
auto scale = std::make_shared<ngraph::op::v0::Constant>(op->get_output_element_type(0), ngraph::Shape{1}, std::vector<float>{1.0});
cldnn::layout constLayout = cldnn::layout(DataTypeFromPrecision(op->get_output_element_type(0)), cldnn::format::bfyx, cldnn::tensor{1});
auto mem = cldnn::memory::allocate(p.GetEngine(), constLayout, 0, false);
auto tmpPointer = mem.pointer<char>(); // implicitly maps buffer - unmap in destructor
auto mem = p.GetEngine().allocate_memory(constLayout, false);
cldnn::mem_lock<int8_t> tmpPointer{mem, p.GetEngine().get_program_stream()};
auto buf = tmpPointer.data();
auto bufSize = scale->get_output_tensor(0).size();

View File

@ -8,7 +8,7 @@
#include "ngraph/op/one_hot.hpp"
#include "api/one_hot.hpp"
#include "cldnn/primitives/one_hot.hpp"
namespace CLDNNPlugin {

View File

@ -8,7 +8,7 @@
#include "ngraph/op/pad.hpp"
#include "api/border.hpp"
#include "cldnn/primitives/border.hpp"
namespace CLDNNPlugin {

View File

@ -7,10 +7,10 @@
#include "ngraph/op/parameter.hpp"
#include "api/input_layout.hpp"
#include "api/reorder.hpp"
#include "api/data.hpp"
#include "api/concatenation.hpp"
#include "cldnn/primitives/input_layout.hpp"
#include "cldnn/primitives/reorder.hpp"
#include "cldnn/primitives/data.hpp"
#include "cldnn/primitives/concatenation.hpp"
using namespace InferenceEngine;
@ -158,8 +158,8 @@ void CreateParameterOp(Program& p, const std::shared_ptr<ngraph::op::v0::Paramet
if (bufIter != p.blobMemCache.end()) {
meanBlobID = bufIter->second;
} else {
auto mem = cldnn::memory::allocate(p.GetEngine(), meanBlobLayout, 0, false);
auto tmpPointer = mem.pointer<char>(); // implicitly maps buffer - unmap in destructor
auto mem = p.GetEngine().allocate_memory(meanBlobLayout, false);
cldnn::mem_lock<int8_t> tmpPointer{ mem, p.GetEngine().get_program_stream() };
auto buf = tmpPointer.data();
auto bufSize = meanBlobLayout.bytes_count();

View File

@ -8,7 +8,7 @@
#include "ngraph/op/max_pool.hpp"
#include "ngraph/op/avg_pool.hpp"
#include "api/pooling.hpp"
#include "cldnn/primitives/pooling.hpp"
namespace CLDNNPlugin {

View File

@ -8,7 +8,7 @@
#include "ngraph/op/prior_box.hpp"
#include "ngraph/op/prior_box_clustered.hpp"
#include "api/prior_box.hpp"
#include "cldnn/primitives/prior_box.hpp"
namespace CLDNNPlugin {

View File

@ -7,8 +7,8 @@
#include "ngraph/op/proposal.hpp"
#include "api/proposal.hpp"
#include "api/mutable_data.hpp"
#include "cldnn/primitives/proposal.hpp"
#include "cldnn/primitives/mutable_data.hpp"
namespace CLDNNPlugin {
@ -62,7 +62,7 @@ void CreateProposalOp(Program& p, const std::shared_ptr<ngraph::op::v0::Proposal
DefaultFormatForDims(op->get_output_shape(1).size()),
CldnnTensorFromIEDims(op->get_output_shape(1)));
auto shared_memory = cldnn::memory::allocate(p.GetEngine(), mutableLayout);
auto shared_memory = p.GetEngine().allocate_memory(mutableLayout);
cldnn::primitive_id proposal_mutable_id_w = layer_type_name_ID(op) + "_md_write";
auto argmax_mutable_prim = cldnn::mutable_data(proposal_mutable_id_w, shared_memory);

View File

@ -16,9 +16,9 @@
#include "ngraph/op/max.hpp"
#include "ngraph/op/constant.hpp"
#include "api/reduce.hpp"
#include "api/reorder.hpp"
#include "api/reshape.hpp"
#include "cldnn/primitives/reduce.hpp"
#include "cldnn/primitives/reorder.hpp"
#include "cldnn/primitives/reshape.hpp"
namespace CLDNNPlugin {

View File

@ -7,7 +7,7 @@
#include "ngraph/op/region_yolo.hpp"
#include "api/region_yolo.hpp"
#include "cldnn/primitives/region_yolo.hpp"
namespace CLDNNPlugin {

View File

@ -7,7 +7,7 @@
#include "ngraph/op/reorg_yolo.hpp"
#include "api/reorg_yolo.hpp"
#include "cldnn/primitives/reorg_yolo.hpp"
namespace CLDNNPlugin {

View File

@ -9,8 +9,8 @@
#include "ngraph/op/squeeze.hpp"
#include "ngraph/op/unsqueeze.hpp"
#include "api/reshape.hpp"
#include "api/reorder.hpp"
#include "cldnn/primitives/reshape.hpp"
#include "cldnn/primitives/reorder.hpp"
namespace CLDNNPlugin {

View File

@ -7,7 +7,7 @@
#include "ngraph/op/result.hpp"
#include "api/reorder.hpp"
#include "cldnn/primitives/reorder.hpp"
using namespace InferenceEngine;

View File

@ -7,7 +7,7 @@
#include "ngraph/op/reverse_sequence.hpp"
#include "api/reverse_sequence.hpp"
#include "cldnn/primitives/reverse_sequence.hpp"
namespace CLDNNPlugin {

View File

@ -8,12 +8,12 @@
#include "ngraph/op/lstm_cell.hpp"
#include "ngraph/op/lstm_sequence.hpp"
#include "api/reshape.hpp"
#include "api/reorder.hpp"
#include "api/fully_connected.hpp"
#include "api/lstm.hpp"
#include "api/crop.hpp"
#include "api/concatenation.hpp"
#include "cldnn/primitives/reshape.hpp"
#include "cldnn/primitives/reorder.hpp"
#include "cldnn/primitives/fully_connected.hpp"
#include "cldnn/primitives/lstm.hpp"
#include "cldnn/primitives/crop.hpp"
#include "cldnn/primitives/concatenation.hpp"
namespace CLDNNPlugin {
cldnn::activation_func GetActivationFunc(std::string name) {

View File

@ -9,7 +9,7 @@
#include "ngraph/op/psroi_pooling.hpp"
#include "ngraph/op/deformable_psroi_pooling.hpp"
#include "api/roi_pooling.hpp"
#include "cldnn/primitives/roi_pooling.hpp"
namespace CLDNNPlugin {

View File

@ -8,7 +8,7 @@
#include "ngraph/op/scatter_elements_update.hpp"
#include "ngraph/op/constant.hpp"
#include "api/scatter_elements_update.hpp"
#include "cldnn/primitives/scatter_elements_update.hpp"
namespace CLDNNPlugin {

View File

@ -8,7 +8,7 @@
#include "ngraph/op/scatter_nd_update.hpp"
#include "ngraph/op/constant.hpp"
#include "api/scatter_nd_update.hpp"
#include "cldnn/primitives/scatter_nd_update.hpp"
namespace CLDNNPlugin {

View File

@ -8,7 +8,7 @@
#include "ngraph/op/scatter_update.hpp"
#include "ngraph/op/constant.hpp"
#include "api/scatter_update.hpp"
#include "cldnn/primitives/scatter_update.hpp"
namespace CLDNNPlugin {

View File

@ -7,9 +7,9 @@
#include "ngraph/op/select.hpp"
#include "api/select.hpp"
#include "api/reorder.hpp"
#include "api/reshape.hpp"
#include "cldnn/primitives/select.hpp"
#include "cldnn/primitives/reorder.hpp"
#include "cldnn/primitives/reshape.hpp"
namespace CLDNNPlugin {

View File

@ -7,7 +7,7 @@
#include "ngraph/op/shuffle_channels.hpp"
#include "api/shuffle_channels.hpp"
#include "cldnn/primitives/shuffle_channels.hpp"
namespace CLDNNPlugin {

View File

@ -8,8 +8,8 @@
#include "ngraph/op/softmax.hpp"
#include "ngraph/op/log_softmax.hpp"
#include "api/softmax.hpp"
#include "api/activation.hpp"
#include "cldnn/primitives/softmax.hpp"
#include "cldnn/primitives/activation.hpp"
namespace CLDNNPlugin {

View File

@ -8,7 +8,7 @@
#include "ngraph/op/space_to_batch.hpp"
#include "ngraph/op/constant.hpp"
#include "api/space_to_batch.hpp"
#include "cldnn/primitives/space_to_batch.hpp"
namespace CLDNNPlugin {

View File

@ -7,7 +7,7 @@
#include "ngraph/op/space_to_depth.hpp"
#include "api/space_to_depth.hpp"
#include "cldnn/primitives/space_to_depth.hpp"
namespace CLDNNPlugin {

View File

@ -8,7 +8,7 @@
#include "ngraph/op/split.hpp"
#include "ngraph/op/variadic_split.hpp"
#include "api/crop.hpp"
#include "cldnn/primitives/crop.hpp"
namespace CLDNNPlugin {

View File

@ -8,9 +8,9 @@
#include "ngraph/op/strided_slice.hpp"
#include "ngraph/op/constant.hpp"
#include "api/strided_slice.hpp"
#include "api/reshape.hpp"
#include "api/crop.hpp"
#include "cldnn/primitives/strided_slice.hpp"
#include "cldnn/primitives/reshape.hpp"
#include "cldnn/primitives/crop.hpp"
namespace CLDNNPlugin {

View File

@ -13,11 +13,11 @@
#include "ngraph/op/constant.hpp"
#include "ngraph/op/util/sub_graph_base.hpp"
#include "api/loop.hpp"
#include "api/mutable_data.hpp"
#include "api/data.hpp"
#include "api/reorder.hpp"
#include "api/topology.hpp"
#include "cldnn/primitives/loop.hpp"
#include "cldnn/primitives/mutable_data.hpp"
#include "cldnn/primitives/data.hpp"
#include "cldnn/primitives/reorder.hpp"
#include "cldnn/graph/topology.hpp"
#include <vector>
#include <algorithm>
@ -28,9 +28,8 @@ namespace CLDNNPlugin {
template<class DATA_TYPE>
static DATA_TYPE CreateScalarData(Program &p, const cldnn::primitive_id& id, int64_t num) {
auto mem = cldnn::memory::allocate(p.GetEngine(),
{ cldnn::data_types::i64, cldnn::format::bfyx, { 1, 1, 1, 1 } });
auto ptr = mem.pointer<int64_t>();
auto mem = p.GetEngine().allocate_memory({ cldnn::data_types::i64, cldnn::format::bfyx, { 1, 1, 1, 1 } });
cldnn::mem_lock<int64_t> ptr{mem, p.GetEngine().get_program_stream()};
*ptr.begin() = num;
return {id, mem};
}
@ -42,7 +41,7 @@ static cldnn::mutable_data CreateAdditionalOutputData(Program &p, const std::sha
const auto format = DefaultFormatForDims(op->get_output_shape(output_idx).size());
const auto tensor = CldnnTensorFromIEDims(op->get_output_shape(output_idx));
cldnn::layout output_layout = cldnn::layout(precision, format, tensor);
auto mem = cldnn::memory::allocate(p.GetEngine(), output_layout);
auto mem = p.GetEngine().allocate_memory(output_layout);
auto md = cldnn::mutable_data(id, {input}, mem); // cldnn::data cannot set dependency
return md;
}

View File

@ -7,7 +7,7 @@
#include "ngraph/op/tile.hpp"
#include "api/tile.hpp"
#include "cldnn/primitives/tile.hpp"
namespace CLDNNPlugin {

View File

@ -7,8 +7,8 @@
#include "ngraph/op/topk.hpp"
#include "api/arg_max_min.hpp"
#include "api/mutable_data.hpp"
#include "cldnn/primitives/arg_max_min.hpp"
#include "cldnn/primitives/mutable_data.hpp"
namespace CLDNNPlugin {
@ -71,7 +71,7 @@ void CreateTopKOp(Program& p, const std::shared_ptr<ngraph::op::v1::TopK>& op) {
DefaultFormatForDims(op->get_output_shape(1).size()),
CldnnTensorFromIEDims(op->get_output_shape(1)));
auto shared_memory = cldnn::memory::allocate(p.GetEngine(), mutableLayout);
auto shared_memory = p.GetEngine().allocate_memory(mutableLayout);
cldnn::primitive_id argmax_mutable_id_w = layer_type_name_ID(op) + "_md_write";
auto argmax_mutable_prim = cldnn::mutable_data(argmax_mutable_id_w, shared_memory);

View File

@ -8,7 +8,7 @@
#include "ngraph/op/transpose.hpp"
#include "ngraph/op/constant.hpp"
#include "api/permute.hpp"
#include "cldnn/primitives/permute.hpp"
namespace CLDNNPlugin {

View File

@ -41,7 +41,7 @@
#include "ngraph/op/hsigmoid.hpp"
#include "ngraph/op/round.hpp"
#include "api/activation.hpp"
#include "cldnn/primitives/activation.hpp"
namespace CLDNNPlugin {

View File

@ -52,13 +52,6 @@ inline bool HasTo2DReshapeData(InferenceEngine::CNNLayerPtr layer) {
if (!GNAPluginNS::LayerInfo(layer).isSyntheticScaleShift())
return false;
// Don't reshape the first dnn layer since it breaks groups recognition
auto prevLayer = InferenceEngine::CNNNetPrevLayerSkipCertain(layer, 0, [](InferenceEngine::CNNLayerPtr ptr) {
return LayerInfo(ptr).isNonValuesChangable();
});
IE_ASSERT(prevLayer != nullptr);
if (LayerInfo(prevLayer).isInput()) return false;
// Don't reshape diagonallayers with bias connection
return !GNAPluginNS::LayerInfo(getCreatorLayer(layer->insData.front().lock()).lock()).has32BOutput();
}

View File

@ -85,8 +85,7 @@ static void insertDiagonalLayerBetween(InferenceEngine::CNNLayerPtr prevLayer,
return LayerInfo(ptr).isNonValuesChangable();
});
IE_ASSERT(inputLayer != nullptr);
size_t weightsSize = (LayerInfo(prevLayer).has32BOutput() || LayerInfo(inputLayer).isInput()) ?
nextLayer->outData[0]->getDims().back() :
size_t weightsSize = LayerInfo(prevLayer).has32BOutput() ? nextLayer->outData[0]->getDims().back() :
Get2DReshapedData(nextLayer->outData[0], 8)->getDims()[1];
std::vector<float> weightsValues(weightsSize, fillValue);
IE_ASSERT(diagLayer != nullptr);

View File

@ -42,7 +42,7 @@ static int32_t as_int32_t(T v) {
}
class OstreamHashWrapper final: public std::streambuf {
std::size_t m_res = {};
std::size_t m_res = 0;
public:
std::size_t getResult() const { return m_res; }
std::streamsize xsputn(const char* s, std::streamsize n) override {
@ -65,7 +65,7 @@ public:
//////////////////////////////////////////////////
std::string NetworkCompilationContext::calculateFileInfo(const std::string& filePath) {
size_t seed {};
size_t seed = 0;
auto absPath = filePath;
try {
absPath = FileUtils::absoluteFilePath(filePath);

View File

@ -270,6 +270,12 @@ template <typename T, typename... Args>
std::shared_ptr<Node> fold_reshape(Args&&... args) {
std::shared_ptr<Node> node = std::make_shared<T>(std::forward<Args>(args)...);
if (node->get_output_size() == 1) {
// issue #57985: remove fold_reshape & reuse nGraph implementation
const auto values = as_type_ptr<opset1::Constant>(node->input_value(1).get_node_shared_ptr())->template cast_vector<int64_t>();
if (std::any_of(values.begin(), values.end(), [](const int64_t value) { return (value == 0) || (value == -1); })) {
return fold<opset1::Reshape>(std::forward<Args>(args)...);
}
OutputVector folded;
if (is_type<opset1::Constant>(node->input_value(0).get_node_shared_ptr()) &&
is_type<opset1::Constant>(node->input_value(1).get_node_shared_ptr())) {

View File

@ -683,7 +683,7 @@ std::shared_ptr<Node> NetworkHelper::foldFakeQuantize(
auto levels_1 = fq->get_levels() - 1.f;
const size_t DHW = D * H * W;
const size_t IDHW = IC * D * H * W;
const size_t IDHW = outChannelsShapeIndex == 0 ? IC * D * H * W : OC * D * H * W;
const auto values = constant->cast_vector<float>();
std::vector<float> quantizedValues(OC * IC * D * H * W);

View File

@ -106,7 +106,6 @@ void jit_load_emitter::emit_isa(const Xbyak::Reg64 &reg_src, int offset_byte, In
break;
case Precision::I32:
if ((src_prc == Precision::FP32) || (src_prc == Precision::BF16)) {
h->uni_vroundps(Vmm(out_vec_idx), Vmm(out_vec_idx), 3); // rounding to zero
h->uni_vcvtps2dq(Vmm(out_vec_idx), Vmm(out_vec_idx));
}
break;
@ -511,6 +510,11 @@ size_t jit_store_emitter::aux_vecs_count() const {
size_t jit_store_emitter::get_inputs_num() const { return 1; }
void jit_store_emitter::emit_data() const {
if (emu_vcvtneps2bf16)
emu_vcvtneps2bf16->emit_data();
}
void jit_store_emitter::emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs,
const emitter_context *emit_context) const {
@ -552,7 +556,6 @@ template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
switch (src_prc) {
case Precision::FP32:
if ((dst_prc != Precision::FP32) && (dst_prc != Precision::BF16)) {
h->uni_vroundps(Vmm(in_vec_idx), Vmm(in_vec_idx), 3); // rounding to zero
h->uni_vcvtps2dq(Vmm(in_vec_idx), Vmm(in_vec_idx));
}
break;

View File

@ -18,8 +18,8 @@ struct load_emitter_context : public emitter_context {
load_emitter_context() : src_prc_(Precision::FP32), dst_prc_(Precision::FP32), load_num_(8),
offset_byte_(0), is_fill_(false), fill_value_("zero") {}
load_emitter_context(Precision src_prc, Precision dst_prc, int load_num, bool is_fill = false, std::string fill_value = "zero", int offset_byte = 0):
src_prc_(src_prc), dst_prc_(dst_prc), load_num_(load_num), is_fill_(is_fill), fill_value_(fill_value), offset_byte_(offset_byte) {}
load_emitter_context(Precision src_prc, Precision dst_prc, int load_num, int offset_byte = 0, bool is_fill = false, std::string fill_value = "zero"):
src_prc_(src_prc), dst_prc_(dst_prc), load_num_(load_num), offset_byte_(offset_byte), is_fill_(is_fill), fill_value_(fill_value) {}
int offset_byte_;
int load_num_;
@ -124,6 +124,8 @@ public:
size_t get_inputs_num() const override;
void emit_data() const override;
std::shared_ptr<jit_emu_vcvtneps2bf16> get_emu_vcvtneps2bf16() const {
return emu_vcvtneps2bf16;
}

View File

@ -306,7 +306,7 @@ private:
inline void worker_tail_planar() {
Precision dst_prc = isFloatCompatible(jcp_.src_prc) ? Precision::FP32 : Precision::I32;
load_emitter->emit_code({static_cast<size_t>(reg_src.getIdx())}, {static_cast<size_t>(vmm_val.getIdx())},
std::make_shared<load_emitter_context>(jcp_.src_prc, dst_prc, tail_num, true, "zero"),
std::make_shared<load_emitter_context>(jcp_.src_prc, dst_prc, tail_num, 0, true),
{}, {load_pool_gpr_idxs});
if (jcp_.normalize_variance) {
@ -477,8 +477,7 @@ struct jit_uni_mvn_kernel_f32 : public jit_uni_mvn_kernel, public jit_generator
this->postamble();
load_emitter->emit_data();
if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core) && store_emitter != nullptr && store_emitter->get_emu_vcvtneps2bf16() != nullptr)
store_emitter->get_emu_vcvtneps2bf16()->emit_data();
store_emitter->emit_data();
for (auto& inj : eltwise_injectors)
inj->prepare_table();

View File

@ -88,8 +88,7 @@ struct jit_uni_roi_pooling_kernel_f32 : public jit_uni_roi_pooling_kernel, publi
this->postamble();
load_emitter->emit_data();
if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core) && store_emitter != nullptr && store_emitter->get_emu_vcvtneps2bf16() != nullptr)
store_emitter->get_emu_vcvtneps2bf16()->emit_data();
store_emitter->emit_data();
}
private:
@ -155,7 +154,7 @@ private:
Vmm vmm_max = get_acc_reg(i);
load_emitter->emit_code({static_cast<size_t>(reg_input.getIdx())}, {static_cast<size_t>(vmm_max.getIdx())},
std::make_shared<load_emitter_context>(jpp_.src_prc, Precision::FP32, step, false, "zero", i * src_c_off),
std::make_shared<load_emitter_context>(jpp_.src_prc, Precision::FP32, step, i * src_c_off),
{}, load_pool_gpr_idxs);
}
@ -169,7 +168,7 @@ private:
Vmm vmm_src = get_src_reg(i);
load_emitter->emit_code({static_cast<size_t>(aux_reg_input1.getIdx())}, {static_cast<size_t>(vmm_src.getIdx())},
std::make_shared<load_emitter_context>(jpp_.src_prc, Precision::FP32, step, false, "zero", i * src_c_off),
std::make_shared<load_emitter_context>(jpp_.src_prc, Precision::FP32, step, i * src_c_off),
{}, load_pool_gpr_idxs);
if (isa == cpu::x64::sse41) {
@ -222,7 +221,7 @@ private:
for (int i = 0; i < c_blocks; i++) {
const int src_c_off = i * jpp_.ih * jpp_.iw * jpp_.c_block * jpp_.src_data_size;
const auto load_context = std::make_shared<load_emitter_context>(jpp_.src_prc, Precision::FP32, step, false, "zero", src_c_off);
const auto load_context = std::make_shared<load_emitter_context>(jpp_.src_prc, Precision::FP32, step, src_c_off);
mov(aux_reg_input, reg_input);

View File

@ -12,9 +12,5 @@
NGRAPH_RTTI_DEFINITION(ngraph::pass::MOCTransformations, "MOCTransformations", 0);
bool ngraph::pass::MOCTransformations::run_on_function(std::shared_ptr<ngraph::Function> f) {
ngraph::pass::Manager m(get_pass_config());
m.register_pass<Pruning>();
m.run_passes(f);
return false;
}

View File

@ -90,21 +90,6 @@ void splitRow_32FC4(const float in[], float out0[], float out1[],
splitRow_32FC4_Impl(in, out0, out1, out2, out3, length);
}
void calculate_nv12_to_rgb(const uchar **srcY,
const uchar *srcUV,
uchar **dstRGBx,
int width) {
calculate_nv12_to_rgb_impl(srcY, srcUV, dstRGBx, width);
}
void calculate_i420_to_rgb(const uchar **srcY,
const uchar *srcU,
const uchar *srcV,
uchar **dstRGBx,
int width) {
calculate_i420_to_rgb_impl(srcY, srcU, srcV, dstRGBx, width);
}
void calcRowArea_8U(uchar dst[], const uchar *src[], const Size& inSz,
const Size& outSz, Q0_16 yalpha, const MapperUnit8U &ymap,
int xmaxdf, const short xindex[], const Q0_16 xalpha[],
@ -119,14 +104,6 @@ void calcRowArea_32F(float dst[], const float *src[], const Size& inSz,
calcRowArea_impl(dst, src, inSz, outSz, yalpha, ymap, xmaxdf, xindex, xalpha, vbuf);
}
void copyRow_8U(const uint8_t in[], uint8_t out[], int length) {
copyRow_8U_impl(in, out, length);
}
void copyRow_32F(const float in[], float out[], int length) {
copyRow_32F_impl(in, out, length);
}
// Resize (bi-linear, 32F)
void calcRowLinear_32F(float* dst[],
const float* src0[],
@ -708,6 +685,14 @@ void calcRowLinear_8UC1(uint8_t* dst[],
}
}
} // namespace neon
template void chanToPlaneRowImpl(neon_tag, const uint8_t* in, int chan, int chs, uint8_t* out, const int length);
template void chanToPlaneRowImpl(neon_tag, const float* in, int chan, int chs, float * out, const int length);
template void nv12ToRgbRowImpl(neon_tag, const uint8_t** y_rows, const uint8_t* uv_row, uint8_t** out_rows, const int buf_width);
template void i420ToRgbRowImpl(neon_tag, const uint8_t** y_rows, const uint8_t* u_row,
const uint8_t* v_row, uint8_t** out_rows, const int buf_width);
} // namespace kernels
} // namespace gapi
} // namespace InferenceEngine

View File

@ -167,26 +167,31 @@ void splitRow_32FC4(const float in[],
float out3[],
int length);
void calculate_nv12_to_rgb(const uchar **srcY,
const uchar *srcUV,
uchar **dstRGBx,
int width);
void calculate_i420_to_rgb(const uchar **srcY,
const uchar *srcU,
const uchar *srcV,
uchar **dstRGBx,
int width);
void copyRow_8U(const uint8_t in[],
uint8_t out[],
int length);
void copyRow_32F(const float in[],
float out[],
int length);
} // namespace neon
template<typename isa_tag_t, typename T>
void chanToPlaneRowImpl(isa_tag_t, const T* in, const int chan, const int chs, T* out, const int length);
extern template void chanToPlaneRowImpl(neon_tag, const uint8_t* in, const int chan, const int chs, uint8_t* out, const int length);
extern template void chanToPlaneRowImpl(neon_tag, const float* in, const int chan, const int chs, float * out, const int length);
template<typename isa_tag_t>
void nv12ToRgbRowImpl(isa_tag_t, const uint8_t** y_rows, const uint8_t* uv_row, uint8_t** out_rows, const int buf_width);
extern template void nv12ToRgbRowImpl(neon_tag, const uint8_t** y_rows, const uint8_t* uv_row, uint8_t** out_rows, const int buf_width);
template<typename isa_tag_t>
void i420ToRgbRowImpl(isa_tag_t, const uint8_t** y_rows, const uint8_t* u_row,
const uint8_t* v_row, uint8_t** out_rows, const int buf_width);
extern template void i420ToRgbRowImpl(neon_tag, const uint8_t** y_rows, const uint8_t* u_row,
const uint8_t* v_row, uint8_t** out_rows, const int buf_width);
} // namespace kernels
} // namespace gapi
} // namespace InferenceEngine

View File

@ -107,21 +107,6 @@ void splitRow_32FC4(const float in[], float out0[], float out1[],
splitRow_32FC4_Impl(in, out0, out1, out2, out3, length);
}
void calculate_nv12_to_rgb(const uchar **srcY,
const uchar *srcUV,
uchar **dstRGBx,
int width) {
calculate_nv12_to_rgb_impl(srcY, srcUV, dstRGBx, width);
}
void calculate_i420_to_rgb(const uchar **srcY,
const uchar *srcU,
const uchar *srcV,
uchar **dstRGBx,
int width) {
calculate_i420_to_rgb_impl(srcY, srcU, srcV, dstRGBx, width);
}
void calcRowArea_8U(uchar dst[], const uchar *src[], const Size& inSz,
const Size& outSz, Q0_16 yalpha, const MapperUnit8U &ymap,
int xmaxdf, const short xindex[], const Q0_16 xalpha[],
@ -555,13 +540,6 @@ void calcRowLinear_8U(C4, std::array<std::array<uint8_t*, 4>, 4> &dst,
calcRowLinear_8UC_Impl<chanNum>(dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi);
}
void copyRow_8U(const uint8_t in[], uint8_t out[], int length) {
copyRow_8U_impl(in, out, length);
}
void copyRow_32F(const float in[], float out[], int length) {
copyRow_32F_impl(in, out, length);
}
void calcRowLinear_32F(float *dst[],
const float *src0[],
const float *src1[],
@ -575,6 +553,15 @@ void calcRowLinear_32F(float *dst[],
}
} // namespace avx
template void chanToPlaneRowImpl(avx2_tag, const uint8_t* in, const int chan, const int chs, uint8_t* out, const int length);
template void chanToPlaneRowImpl(avx2_tag, const float* in, const int chan, const int chs, float* out, const int length);
template void nv12ToRgbRowImpl(avx2_tag, const uint8_t** y_rows, const uint8_t* uv_row,
uint8_t** out_rows, const int buf_width);
template void i420ToRgbRowImpl(avx2_tag, const uint8_t** y_rows, const uint8_t* u_row,
const uint8_t* v_row, uint8_t** out_rows, const int buf_width);
} // namespace kernels
} // namespace gapi
} // namespace InferenceEngine

View File

@ -181,27 +181,29 @@ void splitRow_32FC4(const float in[],
float out2[],
float out3[],
int length);
void calculate_nv12_to_rgb(const uchar **srcY,
const uchar *srcUV,
uchar **dstRGBx,
int width);
void calculate_i420_to_rgb(const uchar **srcY,
const uchar *srcU,
const uchar *srcV,
uchar **dstRGBx,
int width);
void copyRow_8U(const uint8_t in[],
uint8_t out[],
int length);
void copyRow_32F(const float in[],
float out[],
int length);
} // namespace avx
template<typename isa_tag_t, typename T>
void chanToPlaneRowImpl(isa_tag_t, const T* in, const int chan, const int chs, T* out, const int length);
extern template void chanToPlaneRowImpl(avx2_tag, const uint8_t* in, const int chan, const int chs, uint8_t* out, const int length);
extern template void chanToPlaneRowImpl(avx2_tag, const float* in, const int chan, const int chs, float * out, const int length);
template<typename isa_tag_t>
void nv12ToRgbRowImpl(isa_tag_t, const uint8_t** y_rows, const uint8_t* uv_row,
uint8_t** out_rows, const int buf_width);
extern template void nv12ToRgbRowImpl(avx2_tag, const uint8_t** y_rows,
const uint8_t* uv_row, uint8_t** out_rows,
const int buf_width);
template<typename isa_tag_t>
void i420ToRgbRowImpl(isa_tag_t, const uint8_t** y_rows, const uint8_t* u_row,
const uint8_t* v_row, uint8_t** out_rows, const int buf_width);
extern template void i420ToRgbRowImpl(avx2_tag, const uint8_t** y_rows, const uint8_t* u_row,
const uint8_t* v_row, uint8_t** out_rows, const int buf_width);
} // namespace kernels
} // namespace gapi
} // namespace InferenceEngine

View File

@ -101,21 +101,6 @@ void splitRow_32FC4(const float in[], float out0[], float out1[],
splitRow_32FC4_Impl(in, out0, out1, out2, out3, length);
}
void calculate_nv12_to_rgb(const uchar **srcY,
const uchar *srcUV,
uchar **dstRGBx,
int width) {
calculate_nv12_to_rgb_impl(srcY, srcUV, dstRGBx, width);
}
void calculate_i420_to_rgb(const uchar **srcY,
const uchar *srcU,
const uchar *srcV,
uchar **dstRGBx,
int width) {
calculate_i420_to_rgb_impl(srcY, srcU, srcV, dstRGBx, width);
}
void calcRowArea_8U(uchar dst[], const uchar *src[], const Size& inSz,
const Size& outSz, Q0_16 yalpha, const MapperUnit8U &ymap,
int xmaxdf, const short xindex[], const Q0_16 xalpha[],
@ -636,14 +621,6 @@ void calcRowLinear_8U(C4, std::array<std::array<uint8_t*, 4>, 4> &dst,
calcRowLinear_8UC_Impl<chanNum>(dst, src0, src1, alpha, clone, mapsx, beta, tmp, inSz, outSz, lpi);
}
void copyRow_8U(const uint8_t in[], uint8_t out[], int length) {
copyRow_8U_impl(in, out, length);
}
void copyRow_32F(const float in[], float out[], int length) {
copyRow_32F_impl(in, out, length);
}
void calcRowLinear_32F(float *dst[],
const float *src0[],
const float *src1[],
@ -657,6 +634,14 @@ void calcRowLinear_32F(float *dst[],
}
} // namespace avx512
template void chanToPlaneRowImpl(avx512_tag, const uint8_t* in, const int chan, const int chs, uint8_t* out, const int length);
template void chanToPlaneRowImpl(avx512_tag, const float* in, const int chan, const int chs, float* out, const int length);
template void nv12ToRgbRowImpl(avx512_tag, const uint8_t** y_rows, const uint8_t* uv_row, uint8_t** out_rows, const int buf_width);
template void i420ToRgbRowImpl(avx512_tag, const uint8_t** y_rows, const uint8_t* u_row,
const uint8_t* v_row, uint8_t** out_rows, const int buf_width);
} // namespace kernels
} // namespace gapi
} // namespace InferenceEngine

View File

@ -180,27 +180,26 @@ void splitRow_32FC4(const float in[],
float out2[],
float out3[],
int length);
void calculate_nv12_to_rgb(const uchar **srcY,
const uchar *srcUV,
uchar **dstRGBx,
int width);
void calculate_i420_to_rgb(const uchar **srcY,
const uchar *srcU,
const uchar *srcV,
uchar **dstRGBx,
int width);
void copyRow_8U(const uint8_t in[],
uint8_t out[],
int length);
void copyRow_32F(const float in[],
float out[],
int length);
} // namespace avx512
template<typename isa_tag_t, typename T>
void chanToPlaneRowImpl(isa_tag_t, const T* in, const int chan, const int chs, T* out, const int length);
extern template void chanToPlaneRowImpl(avx512_tag, const uint8_t* in, const int chan, const int chs, uint8_t* out, const int length);
extern template void chanToPlaneRowImpl(avx512_tag, const float* in, const int chan, const int chs, float* out, const int length);
template<typename isa_tag_t>
void nv12ToRgbRowImpl(isa_tag_t, const uint8_t** y_rows, const uint8_t* uv_row, uint8_t** out_rows, const int buf_width);
extern template void nv12ToRgbRowImpl(avx512_tag, const uint8_t** y_rows, const uint8_t* uv_row, uint8_t** out_rows, const int buf_width);
template<typename isa_tag_t>
void i420ToRgbRowImpl(isa_tag_t, const uint8_t** y_rows, const uint8_t* u_row,
const uint8_t* v_row, uint8_t** out_rows, const int buf_width);
extern template void i420ToRgbRowImpl(avx512_tag, const uint8_t** y_rows, const uint8_t* u_row,
const uint8_t* v_row, uint8_t** out_rows, const int buf_width);
} // namespace kernels
} // namespace gapi
} // namespace InferenceEngine

View File

@ -1365,33 +1365,13 @@ void splitRow_32FC4(const float in[],
splitRow_32FC4_Impl(in, out0, out1, out2, out3, length);
}
void calculate_nv12_to_rgb(const uchar **srcY,
const uchar *srcUV,
uchar **dstRGBx,
int width) {
calculate_nv12_to_rgb_impl(srcY, srcUV, dstRGBx, width);
}
template void chanToPlaneRowImpl(sse42_tag, const uint8_t* in, const int chan, const int chs, uint8_t* out, const int length);
template void chanToPlaneRowImpl(sse42_tag, const float* in, const int chan, const int chs, float* out, const int length);
void calculate_i420_to_rgb(const uchar **srcY,
const uchar *srcU,
const uchar *srcV,
uchar **dstRGBx,
int width) {
calculate_i420_to_rgb_impl(srcY, srcU, srcV, dstRGBx, width);
}
void copyRow_8U(const uint8_t in[],
uint8_t out[],
int length) {
copyRow_8U_impl(in, out, length);
}
void copyRow_32F(const float in[],
float out[],
int length) {
copyRow_32F_impl(in, out, length);
}
template void nv12ToRgbRowImpl(sse42_tag, const uint8_t** y_rows, const uint8_t* uv_row, uint8_t** out_rows, const int buf_width);
template void i420ToRgbRowImpl(sse42_tag, const uint8_t** y_rows, const uint8_t* u_row,
const uint8_t* v_row, uint8_t** out_rows, const int buf_width);
} // namespace kernels
} // namespace gapi
} // namespace InferenceEngine

View File

@ -180,25 +180,25 @@ void splitRow_32FC4(const float in[],
float out3[],
int length);
void calculate_nv12_to_rgb(const uchar **srcY,
const uchar *srcUV,
uchar **dstRGBx,
int width);
template<typename isa_tag_t, typename T>
void chanToPlaneRowImpl(isa_tag_t, const T* in, const int chan, const int chs,
T* out, const int length);
void calculate_i420_to_rgb(const uchar **srcY,
const uchar *srcU,
const uchar *srcV,
uchar **dstRGBx,
int width);
extern template void chanToPlaneRowImpl(sse42_tag, const uint8_t* in, const int chan,
const int chs, uint8_t* out, const int length);
extern template void chanToPlaneRowImpl(sse42_tag, const float* in, const int chan,
const int chs, float* out, const int length);
template<typename isa_tag_t>
void nv12ToRgbRowImpl(isa_tag_t, const uint8_t** y_rows, const uint8_t* uv_row, uint8_t** out_rows, const int buf_width);
void copyRow_8U(const uint8_t in[],
uint8_t out[],
int length);
extern template void nv12ToRgbRowImpl(sse42_tag, const uint8_t** y_rows, const uint8_t* uv_row, uint8_t** out_rows, const int buf_width);
void copyRow_32F(const float in[],
float out[],
int length);
template<typename isa_tag_t>
void i420ToRgbRowImpl(isa_tag_t, const uint8_t** y_rows, const uint8_t* u_row,
const uint8_t* v_row, uint8_t** out_rows, const int buf_width);
extern template void i420ToRgbRowImpl(sse42_tag, const uint8_t** y_rows, const uint8_t* u_row,
const uint8_t* v_row, uint8_t** out_rows, const int buf_width);
} // namespace kernels
} // namespace gapi
} // namespace InferenceEngine

View File

@ -468,15 +468,86 @@ struct type_to_type {};
template <typename typelist>
struct type_dispatch_impl;
//FIXME: add test for type_dispatch
// Implementation detail of type_dispatch: expands the typelist and invokes the
// mapped action for the FIRST matching type only (guarded by `matched`).
// NOTE(review): the merged view contained a stale pre-merge copy of the fold
// expression (no `matched` guard) fused into this body; it is removed here.
template <template<typename ...> class typelist, typename... type>
struct type_dispatch_impl<typelist<type...>> {
    // Dispatch by type id: return type_to_value for the first list element
    // whose id (via type_to_id) equals type_id; otherwise default_value.
    template <typename result_t, typename default_t, typename type_id_t, typename type_to_id_t, typename type_to_value_t>
    static result_t dispatch(type_id_t type_id, type_to_id_t&& type_to_id, type_to_value_t&& type_to_value, default_t default_value) {
        result_t res = default_value;
        bool matched = false;
        // Pack expansion inside a braced initializer_list guarantees
        // left-to-right evaluation; `matched` stops after the first hit.
        std::initializer_list<int> ({
            !matched && (type_id == type_to_id(type_to_type<type>{})) ?
                (matched = true, res = type_to_value(type_to_type<type>{})), 0
                : 0
            ...
        });
        return res;
    }
    // Dispatch by predicate: return type_to_value for the first list element
    // for which pred(type_to_type<type>{}) holds; otherwise default_value.
    template <typename result_t, typename default_t, typename pred_t, typename type_to_value_t>
    static result_t dispatch(pred_t&& pred, type_to_value_t&& type_to_value, default_t default_value) {
        result_t res = default_value;
        bool matched = false;
        std::initializer_list<int> ({
            !matched && pred(type_to_type<type>{}) ?
                (matched = true, res = type_to_value(type_to_type<type>{})), 0
                : 0
        });
        return res;
    }
};
// concat<L, R>: metafunction gluing two typelists into a single list.
template<typename left_typelist, typename right_typelist>
struct concat;

// Convenience alias for concat<...>::type.
template<typename left_typelist, typename right_typelist>
using concat_t = typename concat<left_typelist, right_typelist>::type;

// The result carries the left list's template and holds all left elements
// followed by all right elements.
template<template<typename ...> class left_list, typename ... left_types,
         template<typename ...> class right_list, typename ... right_types>
struct concat<left_list<left_types...>, right_list<right_types...>> {
    using type = left_list<left_types..., right_types...>;
};
// is_same_t<T, U>: std::true_type / std::false_type depending on whether T == U.
template< class T, class U >
using is_same_t = typename std::is_same<T, U>::type;
// Compile-time if: if_c<true, T, E> yields T, if_c<false, T, E> yields E.
template<bool C, class T, class E> struct if_c_impl;
template<class T, class E> struct if_c_impl<true, T, E> {
using type = T;
};
template<class T, class E> struct if_c_impl<false, T, E> {
using type = E;
};
template<bool C, class T, class E>
using if_c = typename if_c_impl<C, T, E>::type;
// Same selection, but the condition is a trait exposing a static `value`.
template<class C, class T, class E>
using if_ = typename if_c_impl<C::value != 0, T, E>::type;
// remove<typelist, type>: metafunction yielding `typelist` with every
// occurrence of `type` filtered out. Used to exclude an ISA tag from the
// candidate set (e.g. dropping avx512_tag).
template<typename typelist, typename type>
struct remove;
template<typename typelist, typename type>
using remove_t = typename remove<typelist, type>::type;
// Recursive case: keep the head unless it equals `t`, then recurse on the tail.
template<template<typename ...> class list, typename head_t, typename ... types, typename t>
struct remove<list<head_t, types...>, t> {
using type = concat_t<
if_<is_same_t<head_t, t>, list<>, list<head_t>>,
remove_t<list<types...>, t>
>;
};
// Base case: removing from the empty list yields the empty list.
template<template<typename ...> class list, typename t>
struct remove<list<>, t> {
using type = list<>;
};
} // namespace
@ -490,6 +561,13 @@ result_t type_dispatch(type_id_t type_id, type_to_id_t&& type_to_id, type_to_val
std::forward<default_t>(default_value));
}
// Predicate-based type_dispatch: walks `typelist` and returns the value the
// functor produces for the first type satisfying `pred`; `default_value`
// otherwise. The result type is deduced from applying the functor to the
// head of the list.
template <typename typelist, typename default_t, typename pred_t, typename type_to_value_t,
typename result_t = decltype(std::declval<type_to_value_t>()(type_to_type<head_t<typelist>> {}))>
result_t type_dispatch(pred_t&& pred, type_to_value_t&& type_to_value, default_t default_value = {}) {
return type_dispatch_impl<typelist>::template dispatch<result_t>(std::forward<pred_t>(pred),
std::forward<type_to_value_t>(type_to_value),
std::forward<default_t>(default_value));
}
namespace {
struct cv_type_id {
@ -668,81 +746,47 @@ GAPI_FLUID_KERNEL(FSplit4, Split4, false) {
};
//----------------------------------------------------------------------
template<typename T>
static void chanToPlaneRow(const uint8_t* in, int chan, int chs, uint8_t* out, int length) {
// AVX512 implementation of wide universal intrinsics is slower than AVX2.
// It is turned off until the cause isn't found out.
#if 0
using isas_set = typelist<
#ifdef HAVE_AVX512
if (with_cpu_x86_avx512f()) {
if (std::is_same<T, uint8_t>::value && chs == 1) {
avx512::copyRow_8U(in, out, length);
return;
}
if (std::is_same<T, float>::value && chs == 1) {
avx512::copyRow_32F(reinterpret_cast<const float*>(in),
reinterpret_cast<float*>(out),
length);
return;
}
}
#endif // HAVE_AVX512
avx512_tag,
#endif
#ifdef HAVE_AVX2
avx2_tag,
#endif
#ifdef HAVE_SSE
sse42_tag,
#endif
#ifdef HAVE_NEON
neon_tag,
#endif
//scalar "ISA" have to be the last one in the list,
//as the search for supported ISA is performed until first match
scalar_tag>;
#ifdef HAVE_AVX512
bool is_present(avx512_tag) { return with_cpu_x86_avx512f(); }
#endif // HAVE_AVX512
#ifdef HAVE_AVX2
if (with_cpu_x86_avx2()) {
if (std::is_same<T, uint8_t>::value && chs == 1) {
avx::copyRow_8U(in, out, length);
return;
}
if (std::is_same<T, float>::value && chs == 1) {
avx::copyRow_32F(reinterpret_cast<const float*>(in),
reinterpret_cast<float*>(out),
length);
return;
}
}
bool is_present(avx2_tag) { return with_cpu_x86_avx2(); }
#endif // HAVE_AVX2
#ifdef HAVE_SSE
if (with_cpu_x86_sse42()) {
if (std::is_same<T, uint8_t>::value && chs == 1) {
copyRow_8U(in, out, length);
return;
}
if (std::is_same<T, float>::value && chs == 1) {
copyRow_32F(reinterpret_cast<const float*>(in),
reinterpret_cast<float*>(out),
length);
return;
}
}
#ifdef HAVE_SSE
bool is_present(sse42_tag) { return with_cpu_x86_sse42(); }
#endif // HAVE_SSE
#ifdef HAVE_NEON
if (std::is_same<T, uint8_t>::value && chs == 1) {
neon::copyRow_8U(in, out, length);
return;
}
if (std::is_same<T, float>::value && chs == 1) {
neon::copyRow_32F(reinterpret_cast<const float*>(in),
reinterpret_cast<float*>(out),
length);
return;
}
bool is_present(neon_tag) { return true; }
#endif // HAVE_NEON
const auto inT = reinterpret_cast<const T*>(in);
auto outT = reinterpret_cast< T*>(out);
//scalar version of kernels is always available
bool is_present(scalar_tag) { return true; }
for (int x = 0; x < length; x++) {
outT[x] = inT[x*chs + chan];
}
struct is_isa_present {
template< typename isa_tag_t>
bool operator()(type_to_type<isa_tag_t>) {
return is_present(isa_tag_t{});
}
};
// GAPI_OCV_KERNEL(OCVChanToPlane, ChanToPlane) {
// static void run(const cv::Mat &in, int chan, cv::Mat &out) {
@ -774,15 +818,225 @@ static void chanToPlaneRow(const uint8_t* in, int chan, int chs, uint8_t* out, i
// }
// };
namespace {
using chan_to_plane_supported_types = typelist<uint8_t, float>;
// Scalar (no-SIMD) extraction of one interleaved channel into a plane:
// out[x] = in[x * chs + chan] for `length` pixels.
template<typename T>
void chanToPlaneRowImpl(scalar_tag, const T* in, int chan, int chs, T* out, int length) {
    const T* src = in + chan;
    for (int x = 0; x < length; ++x, src += chs) {
        out[x] = *src;
    }
}
// type_dispatch action: maps an element type to a type-erased row-function
// pointer that forwards to chanToPlaneRowImpl for the given ISA tag.
template<typename isa_tag_t>
struct typed_chan_to_plane_row {
    using p_f = void (*)(const uint8_t* in, int chan, int chs, uint8_t* out, int length);

    template <typename type>
    p_f operator()(type_to_type<type>) {
        return [](const uint8_t* in, int chan, int chs, uint8_t* out, int length) {
            chanToPlaneRowImpl(isa_tag_t{},
                               reinterpret_cast<const type*>(in), chan, chs,
                               reinterpret_cast<type*>(out), length);
        };
    }
};
} //namespace
namespace {
using nv12_to_rgb_supported_types = typelist<uint8_t>;
// Scalar NV12 -> interleaved RGB for two rows at a time: one interleaved
// (U, V) pair is shared by a 2x2 block of luma samples.
void nv12ToRgbRowImpl(scalar_tag, const uint8_t** y_rows, const uint8_t* uv_row,
                      uint8_t** out_rows, const int buf_width) {
    for (int col = 0; col < buf_width; col += 2) {
        const uint8_t u = uv_row[col];
        const uint8_t v = uv_row[col + 1];
        int ruv = 0, guv = 0, buv = 0;
        uvToRGBuv(u, v, ruv, guv, buv);  // chroma contributions for the 2x2 block
        for (int dy = 0; dy < 2; ++dy) {
            for (int dx = 0; dx < 2; ++dx) {
                const uint8_t luma = y_rows[dy][col + dx];
                uint8_t r = 0, g = 0, b = 0;
                yRGBuvToRGB(luma, ruv, guv, buv, r, g, b);
                uint8_t* px = &out_rows[dy][3 * (col + dx)];
                px[0] = r;
                px[1] = g;
                px[2] = b;
            }
        }
    }
}
// type_dispatch action: maps an element type to a type-erased NV12->RGB row
// function bound to the given ISA tag.
template<typename isa_tag_t>
struct typed_nv12_to_rgb_row {
    using p_f = void (*)(const uint8_t** y_rows, const uint8_t* uv_row,
                         uint8_t** out_rows, const int buf_width);

    template <typename type>
    p_f operator()(type_to_type<type>) {
        return [](const uint8_t** y_rows, const uint8_t* uv_row,
                  uint8_t** out_rows, const int buf_width) {
            nv12ToRgbRowImpl(isa_tag_t{},
                             reinterpret_cast<const type**>(y_rows),
                             reinterpret_cast<const type*>(uv_row),
                             reinterpret_cast<type**>(out_rows),
                             buf_width);
        };
    }
};
} // namespace
namespace {
using i420_to_rgb_supported_types = typelist<uint8_t>;
// Scalar I420 -> interleaved RGB for two rows at a time. U and V come from
// separate half-width planes, so a 2x2 luma block shares u_row[i/2]/v_row[i/2].
// NOTE: the redundant `static` is dropped - the function already lives in an
// anonymous namespace (internal linkage), matching the sibling nv12ToRgbRowImpl;
// locals use uint8_t for consistency with that sibling (uchar is the same type).
void i420ToRgbRowImpl(scalar_tag, const uint8_t** y_rows,
                      const uint8_t* u_row,
                      const uint8_t* v_row,
                      uint8_t** out_rows,
                      const int buf_width) {
    for (int i = 0; i < buf_width; i += 2) {
        const uint8_t u = u_row[i / 2];
        const uint8_t v = v_row[i / 2];
        int ruv, guv, buv;
        uvToRGBuv(u, v, ruv, guv, buv);  // chroma contributions for the 2x2 block
        for (int y = 0; y < 2; y++) {
            for (int x = 0; x < 2; x++) {
                const uint8_t vy = y_rows[y][i + x];
                uint8_t r, g, b;
                yRGBuvToRGB(vy, ruv, guv, buv, r, g, b);
                out_rows[y][3 * (i + x)]     = r;
                out_rows[y][3 * (i + x) + 1] = g;
                out_rows[y][3 * (i + x) + 2] = b;
            }
        }
    }
}
// type_dispatch action: maps an element type to a type-erased I420->RGB row
// function bound to the given ISA tag.
template<typename isa_tag_t>
struct typed_i420_to_rgb_row {
    using p_f = void (*)(const uint8_t** y_rows, const uint8_t* u_row, const uint8_t* v_row,
                         uint8_t** out_rows, const int buf_width);

    template <typename type>
    p_f operator()(type_to_type<type>) {
        return [](const uint8_t** y_rows, const uint8_t* u_row, const uint8_t* v_row,
                  uint8_t** out_rows, const int buf_width) {
            i420ToRgbRowImpl(isa_tag_t{},
                             reinterpret_cast<const type**>(y_rows),
                             reinterpret_cast<const type*>(u_row),
                             reinterpret_cast<const type*>(v_row),
                             reinterpret_cast<type**>(out_rows),
                             buf_width);
        };
    }
};
} // namespace
// Generates the fluid color-conversion kernels for one ISA tag. Each kernel
// picks the right row implementation at run time via type_dispatch on the
// output depth.
// NOTE(review): a stale pre-merge `rowFunc` line (CV_8U ternary referencing
// the removed chanToPlaneRow<T>) was fused into FChanToPlane::run by the
// merged view, duplicating the declaration; it is removed here.
template <typename isa_tag_t>
struct choose_impl {
    GAPI_FLUID_KERNEL(FChanToPlane, ChanToPlane, false) {
        static const int Window = 1;
        static void run(const cv::gapi::fluid::View& in, int chan,
                        cv::gapi::fluid::Buffer& out) {
            GAPI_DbgAssert(is_cv_type_in_list<chan_to_plane_supported_types>(out.meta().depth));
            const auto rowFunc = type_dispatch<chan_to_plane_supported_types>(out.meta().depth, cv_type_id{}, typed_chan_to_plane_row<isa_tag_t>{}, nullptr);
            GAPI_DbgAssert(rowFunc);
            rowFunc(in.InLineB(0), chan, in.meta().chan, out.OutLineB(), in.length());
        }
    };

    GAPI_FLUID_KERNEL(FNV12toRGB, NV12toRGB, false) {
        static const int Window = 1;
        static const int LPI = 2;  // NV12 chroma row serves two luma rows
        static const auto Kind = cv::GFluidKernel::Kind::YUV420toRGB;
        static void run(const cv::gapi::fluid::View & in_y,
                        const cv::gapi::fluid::View & in_uv,
                        cv::gapi::fluid::Buffer & out) {
            GAPI_DbgAssert(is_cv_type_in_list<nv12_to_rgb_supported_types>(out.meta().depth));
            const uchar* uv_row = in_uv.InLineB(0);
            const uchar* y_rows[2] = { in_y.InLineB(0), in_y.InLineB(1) };
            uchar* out_rows[2] = { out.OutLineB(0), out.OutLineB(1) };
            int buf_width = out.length();
            const auto rowFunc = type_dispatch<nv12_to_rgb_supported_types>(out.meta().depth, cv_type_id{}, typed_nv12_to_rgb_row<isa_tag_t>{}, nullptr);
            GAPI_DbgAssert(rowFunc);
            rowFunc(y_rows, uv_row, out_rows, buf_width);
        }
    };

    GAPI_FLUID_KERNEL(FI420toRGB, I420toRGB, false) {
        static const int Window = 1;
        static const int LPI = 2;  // I420 chroma rows serve two luma rows
        static const auto Kind = cv::GFluidKernel::Kind::YUV420toRGB;
        static void run(const cv::gapi::fluid::View & in_y,
                        const cv::gapi::fluid::View & in_u,
                        const cv::gapi::fluid::View & in_v,
                        cv::gapi::fluid::Buffer & out) {
            GAPI_DbgAssert(is_cv_type_in_list<i420_to_rgb_supported_types>(out.meta().depth));
            const uchar* u_row = in_u.InLineB(0);
            const uchar* v_row = in_v.InLineB(0);
            const uchar* y_rows[2] = { in_y.InLineB(0), in_y.InLineB(1) };
            uchar* out_rows[2] = { out.OutLineB(0), out.OutLineB(1) };
            int buf_width = out.length();
            GAPI_DbgAssert(in_u.length() == in_v.length());
            const auto rowFunc = type_dispatch<i420_to_rgb_supported_types>(out.meta().depth, cv_type_id{}, typed_i420_to_rgb_row<isa_tag_t>{}, nullptr);
            GAPI_DbgAssert(rowFunc);
            rowFunc(y_rows, u_row, v_row, out_rows, buf_width);
        }
    };
};
namespace {
// type_dispatch action: registers the three color-conversion fluid kernels
// (I420->RGB, NV12->RGB, chan-to-plane) specialized for one ISA tag into the
// supplied kernel package.
struct ColorConversionISA {
// Non-owning reference to the package being filled.
cv::gapi::GKernelPackage& pckg;
ColorConversionISA(cv::gapi::GKernelPackage& _pckg) : pckg(_pckg) {}
template<typename isa_tag_t>
bool operator()(type_to_type<isa_tag_t>) {
pckg.include<typename choose_impl<isa_tag_t>::FI420toRGB>();
pckg.include<typename choose_impl<isa_tag_t>::FNV12toRGB>();
pckg.include<typename choose_impl<isa_tag_t>::FChanToPlane>();
//at the moment type_dispatch requires something to be returned by the lambda
return true;
}
};
} //namespace
// Builds the color-conversion kernel package: probes the CPU for the first
// available ISA (search order of isas_set) and registers that ISA's kernel
// specializations.
cv::gapi::GKernelPackage FColorConversionChooseISA() {
    // At the moment the AVX512 wide-intrinsics implementation is slower than
    // AVX2, so AVX512 is excluded from the candidate set for now.
    using isas = remove_t<isas_set, avx512_tag>;

    cv::gapi::GKernelPackage pkg;
    ColorConversionISA filler{pkg};
    type_dispatch<isas>(is_isa_present{}, filler, false);
    return pkg;
}
//----------------------------------------------------------------------
G_TYPED_KERNEL(ScalePlane8u, <cv::GMat(cv::GMat, Size, int)>, "com.intel.ie.scale_plane_8u") {
@ -2234,180 +2488,6 @@ GAPI_FLUID_KERNEL(FScalePlaneArea8u, ScalePlaneArea8u, true) {
}
};
static const int ITUR_BT_601_CY = 1220542;
static const int ITUR_BT_601_CUB = 2116026;
static const int ITUR_BT_601_CUG = -409993;
static const int ITUR_BT_601_CVG = -852492;
static const int ITUR_BT_601_CVR = 1673527;
static const int ITUR_BT_601_SHIFT = 20;
// Precomputes the BT.601 chroma contributions for one (U, V) pair in fixed
// point (ITUR_BT_601_SHIFT fractional bits), including the rounding bias.
static inline void uvToRGBuv(const uchar u, const uchar v, int& ruv, int& guv, int& buv) {
    const int uu = static_cast<int>(u) - 128;  // chroma samples are biased by 128
    const int vv = static_cast<int>(v) - 128;
    const int rounding = 1 << (ITUR_BT_601_SHIFT - 1);
    ruv = rounding + ITUR_BT_601_CVR * vv;
    guv = rounding + ITUR_BT_601_CVG * vv + ITUR_BT_601_CUG * uu;
    buv = rounding + ITUR_BT_601_CUB * uu;
}
// Combines one luma sample (16 offset per BT.601) with precomputed chroma
// contributions and clamps each channel into uchar range.
static inline void yRGBuvToRGB(const uchar vy, const int ruv, const int guv, const int buv,
                               uchar& r, uchar& g, uchar& b) {
    const int y = std::max(0, static_cast<int>(vy) - 16) * ITUR_BT_601_CY;
    r = saturate_cast<uchar>((y + ruv) >> ITUR_BT_601_SHIFT);
    g = saturate_cast<uchar>((y + guv) >> ITUR_BT_601_SHIFT);
    b = saturate_cast<uchar>((y + buv) >> ITUR_BT_601_SHIFT);
}
static void calculate_nv12_to_rgb_fallback(const uchar **y_rows,
const uchar *uv_row,
uchar **out_rows,
int buf_width) {
for (int i = 0; i < buf_width; i += 2) {
uchar u = uv_row[i];
uchar v = uv_row[i + 1];
int ruv, guv, buv;
uvToRGBuv(u, v, ruv, guv, buv);
for (int y = 0; y < 2; y++) {
for (int x = 0; x < 2; x++) {
uchar vy = y_rows[y][i + x];
uchar r, g, b;
yRGBuvToRGB(vy, ruv, guv, buv, r, g, b);
out_rows[y][3*(i + x)] = r;
out_rows[y][3*(i + x) + 1] = g;
out_rows[y][3*(i + x) + 2] = b;
}
}
}
}
// Plain C++ I420 -> RGB fallback used when no SIMD path is available.
// U and V come from separate half-width planes; a 2x2 luma block shares
// u_row[i/2] / v_row[i/2].
static void calculate_i420_to_rgb_fallback(const uchar **y_rows,
const uchar *u_row,
const uchar *v_row,
uchar **out_rows,
int buf_width) {
for (int i = 0; i < buf_width; i += 2) {
uchar u = u_row[i / 2];
uchar v = v_row[i / 2];
int ruv, guv, buv;
// Chroma contributions are computed once per 2x2 block.
uvToRGBuv(u, v, ruv, guv, buv);
for (int y = 0; y < 2; y++) {
for (int x = 0; x < 2; x++) {
uchar vy = y_rows[y][i + x];
uchar r, g, b;
yRGBuvToRGB(vy, ruv, guv, buv, r, g, b);
out_rows[y][3*(i + x)] = r;
out_rows[y][3*(i + x) + 1] = g;
out_rows[y][3*(i + x) + 2] = b;
}
}
}
}
// Fluid kernel: NV12 -> RGB. Processes two output rows per iteration
// (LPI = 2) and picks the widest available SIMD implementation at run time,
// falling back to the plain C++ version otherwise.
GAPI_FLUID_KERNEL(FNV12toRGB, NV12toRGB, false) {
static const int Window = 1;
static const int LPI = 2;
static const auto Kind = cv::GFluidKernel::Kind::YUV420toRGB;
static void run(const cv::gapi::fluid::View &in_y,
const cv::gapi::fluid::View &in_uv,
cv::gapi::fluid::Buffer &out) {
// One interleaved UV row serves two luma rows.
const uchar* uv_row = in_uv.InLineB(0);
const uchar* y_rows[2] = {in_y. InLineB(0), in_y. InLineB(1)};
uchar* out_rows[2] = {out.OutLineB(0), out.OutLineB(1)};
int buf_width = out.length();
// AVX512 implementation of wide universal intrinsics is slower than AVX2.
// It is turned off until the cause isn't found out.
#if 0
#ifdef HAVE_AVX512
if (with_cpu_x86_avx512_core()) {
#define CV_AVX_512DQ 1
avx512::calculate_nv12_to_rgb(y_rows, uv_row, out_rows, buf_width);
return;
}
#endif // HAVE_AVX512
#endif
#ifdef HAVE_AVX2
if (with_cpu_x86_avx2()) {
avx::calculate_nv12_to_rgb(y_rows, uv_row, out_rows, buf_width);
return;
}
#endif // HAVE_AVX2
#ifdef HAVE_SSE
if (with_cpu_x86_sse42()) {
calculate_nv12_to_rgb(y_rows, uv_row, out_rows, buf_width);
return;
}
#endif // HAVE_SSE
#ifdef HAVE_NEON
// NEON builds have no run-time feature probe; always take the NEON path.
neon::calculate_nv12_to_rgb(y_rows, uv_row, out_rows, buf_width);
return;
#endif // HAVE_NEON
// Scalar fallback: reached only when no SIMD path applied above.
calculate_nv12_to_rgb_fallback(y_rows, uv_row, out_rows, buf_width);
}
};
// Fluid kernel: I420 -> RGB (separate U and V planes). Processes two output
// rows per iteration (LPI = 2) and picks the widest available SIMD
// implementation at run time, falling back to the plain C++ version otherwise.
GAPI_FLUID_KERNEL(FI420toRGB, I420toRGB, false) {
static const int Window = 1;
static const int LPI = 2;
static const auto Kind = cv::GFluidKernel::Kind::YUV420toRGB;
static void run(const cv::gapi::fluid::View &in_y,
const cv::gapi::fluid::View &in_u,
const cv::gapi::fluid::View &in_v,
cv::gapi::fluid::Buffer &out) {
// One U row and one V row (each half-width) serve two luma rows.
const uchar* u_row = in_u.InLineB(0);
const uchar* v_row = in_v.InLineB(0);
const uchar* y_rows[2] = {in_y. InLineB(0), in_y. InLineB(1)};
uchar* out_rows[2] = {out.OutLineB(0), out.OutLineB(1)};
int buf_width = out.length();
GAPI_DbgAssert(in_u.length() == in_v.length());
// AVX512 implementation of wide universal intrinsics is slower than AVX2.
// It is turned off until the cause isn't found out.
#if 0
#ifdef HAVE_AVX512
if (with_cpu_x86_avx512_core()) {
#define CV_AVX_512DQ 1
avx512::calculate_i420_to_rgb(y_rows, u_row, v_row, out_rows, buf_width);
return;
}
#endif // HAVE_AVX512
#endif
#ifdef HAVE_AVX2
if (with_cpu_x86_avx2()) {
avx::calculate_i420_to_rgb(y_rows, u_row, v_row, out_rows, buf_width);
return;
}
#endif // HAVE_AVX2
#ifdef HAVE_SSE
if (with_cpu_x86_sse42()) {
calculate_i420_to_rgb(y_rows, u_row, v_row, out_rows, buf_width);
return;
}
#endif // HAVE_SSE
#ifdef HAVE_NEON
// NEON builds have no run-time feature probe; always take the NEON path.
neon::calculate_i420_to_rgb(y_rows, u_row, v_row, out_rows, buf_width);
return;
#endif // HAVE_NEON
// Scalar fallback: reached only when no SIMD path applied above.
calculate_i420_to_rgb_fallback(y_rows, u_row, v_row, out_rows, buf_width);
}
};
namespace {
template <typename src_t, typename dst_t>
@ -2520,9 +2600,10 @@ GAPI_FLUID_KERNEL(FDivC, GDivC, false) {
using namespace kernels;
cv::gapi::GKernelPackage preprocKernels() {
return cv::gapi::kernels
< FChanToPlane
, FScalePlanes
return combine(
FColorConversionChooseISA(),
cv::gapi::kernels
<FScalePlanes
, FScalePlanes4
, FScalePlane
, FScalePlane32f
@ -2537,12 +2618,10 @@ cv::gapi::GKernelPackage preprocKernels() {
, FSplit2
, FSplit3
, FSplit4
, FNV12toRGB
, FI420toRGB
, FConvertDepth
, FSubC
, FDivC
>();
>());
}
} // namespace gapi

View File

@ -34,6 +34,12 @@ namespace InferenceEngine {
namespace gapi {
namespace kernels {
// Empty dispatch tags, one per instruction-set family. Used to select the
// ISA-specific kernel implementations via overloading / type_dispatch.
struct avx512_tag {};
struct avx2_tag {};
struct sse42_tag {};
struct neon_tag {};
// Plain C++ implementation; always available.
struct scalar_tag {};
// saturate_cast<DST>(x): OpenCV-style conversion that clamps to DST's
// representable range. Only the specializations these kernels need are defined.
template<typename DST, typename SRC> static inline DST saturate_cast(SRC x);
template<> inline short saturate_cast(int x) { return (std::min)(SHRT_MAX, (std::max)(SHRT_MIN, x)); }
// Rounds to nearest first (std::rint), then clamps via the int specialization.
template<> inline short saturate_cast(float x) { return saturate_cast<short>(static_cast<int>(std::rint(x))); }
@ -116,6 +122,31 @@ static inline Q8_8 mulaw(Q0_16 a, Q8_8 w) { return static_cast<Q8_8>((a * w) >>
static inline float mulas(float a, float s) { return a * s; }
static inline float mulaw(float a, float w) { return a * w; }
static const int ITUR_BT_601_CY = 1220542;
static const int ITUR_BT_601_CUB = 2116026;
static const int ITUR_BT_601_CUG = -409993;
static const int ITUR_BT_601_CVG = -852492;
static const int ITUR_BT_601_CVR = 1673527;
static const int ITUR_BT_601_SHIFT = 20;
// Precomputes the BT.601 chroma contributions (fixed point with
// ITUR_BT_601_SHIFT fractional bits, rounding bias included) for one
// (U, V) pair; the results feed yRGBuvToRGB per luma sample.
static inline void uvToRGBuv(const uchar u, const uchar v, int& ruv, int& guv, int& buv) {
int uu, vv;
uu = static_cast<int>(u) - 128;  // chroma samples are biased by 128
vv = static_cast<int>(v) - 128;
ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * vv;
guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * vv + ITUR_BT_601_CUG * uu;
buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * uu;
}
// Combines one luma sample with precomputed chroma contributions into an RGB
// triple (BT.601). Luma carries a 16 offset; channels are clamped to uchar.
static inline void yRGBuvToRGB(const uchar vy, const int ruv, const int guv, const int buv,
uchar& r, uchar& g, uchar& b) {
int yy = static_cast<int>(vy);
int y = std::max(0, yy - 16) * ITUR_BT_601_CY;
r = saturate_cast<uchar>((y + ruv) >> ITUR_BT_601_SHIFT);
g = saturate_cast<uchar>((y + guv) >> ITUR_BT_601_SHIFT);
b = saturate_cast<uchar>((y + buv) >> ITUR_BT_601_SHIFT);
}
} // namespace kernels
} // namespace gapi
} // namespace InferenceEngine

Some files were not shown because too many files have changed in this diff Show More