Compare commits
167 commits: 2023.1.0.d ... 2021.4.2
| Author | SHA1 | Date |
|---|---|---|
| | 6c4462759e | |
| | a28d93b2bf | |
| | 400e0657cd | |
| | e2a469a345 | |
| | de85202b29 | |
| | 5b283ee6d4 | |
| | ca36a3335f | |
| | b1985ff5fd | |
| | dd0f845a18 | |
| | 3a0d7a28ed | |
| | 3cdebfcb7c | |
| | 6fdc6b4c16 | |
| | 7096c01127 | |
| | 2d28a05421 | |
| | e2ace1da40 | |
| | f5e9b699ce | |
| | 5ec50585df | |
| | 279961332b | |
| | 0d5f86ae33 | |
| | 9b9f5e3eee | |
| | c885b1933d | |
| | 1b6dc06cf0 | |
| | a3afa57bd9 | |
| | ad1efb7f16 | |
| | 7eed1d910b | |
| | 2278100258 | |
| | 361dec7362 | |
| | d7a917d92c | |
| | 326b8ad009 | |
| | cc6321f6d3 | |
| | 5c2a39009e | |
| | a0b4394423 | |
| | 85378ce176 | |
| | e2fd335b70 | |
| | c5175063c8 | |
| | 5fb5057cb6 | |
| | 485bea73b1 | |
| | 105e67573a | |
| | 2282b0ee5c | |
| | f00dc87a92 | |
| | e932a2eda5 | |
| | bd989cee67 | |
| | bdf72bcd88 | |
| | 6d634d09a4 | |
| | 05c641ff7c | |
| | 05a768e357 | |
| | a0b2200408 | |
| | e339afe375 | |
| | 7f47108bb3 | |
| | b13bcbcd0d | |
| | c2bfbf29fb | |
| | 14e67d8663 | |
| | d98cb7bdf8 | |
| | fa18eecfb7 | |
| | 3aa2f02240 | |
| | a4fdc1c947 | |
| | f14f28f32b | |
| | e3baff25a6 | |
| | 708825b439 | |
| | 91d85a88a1 | |
| | f77c3d7fdc | |
| | 36f2e63c9c | |
| | d474617d12 | |
| | 64a896c22e | |
| | 2dcd09055f | |
| | 1a656f4e44 | |
| | 0702b44174 | |
| | ce21344585 | |
| | 3a28ffaf57 | |
| | f63be649eb | |
| | 32a9e98437 | |
| | b76c903745 | |
| | 4714b8edb8 | |
| | 47d1f2147a | |
| | 61aa366706 | |
| | b16ce268eb | |
| | 629de56910 | |
| | dad76527d6 | |
| | b86ab12f0f | |
| | 170e4d2cce | |
| | 2a9eec1c3f | |
| | 92420cd0d5 | |
| | 886254c5b9 | |
| | e75e647ebe | |
| | 6e45b62be6 | |
| | 4a70806d10 | |
| | 761c645f14 | |
| | e19b3befb7 | |
| | fb3ceb6aa4 | |
| | 9a8d8440a5 | |
| | 543ea75813 | |
| | f03763defe | |
| | 114ed1cb4b | |
| | 3117879c54 | |
| | 2f48787fc4 | |
| | 7848ac7a74 | |
| | 62f126cdd2 | |
| | 4d1c358aa3 | |
| | bf51d49ad1 | |
| | 6d9699681f | |
| | 0b248b68dd | |
| | d286e0a9ad | |
| | 21ed761569 | |
| | 2639f35543 | |
| | 1bbd91506b | |
| | 9a31a3d821 | |
| | 9acc3dfe68 | |
| | 205c23b382 | |
| | e48965683b | |
| | eaa5a22979 | |
| | bfdd1a199f | |
| | 096a92dcb3 | |
| | 7a05a12190 | |
| | 5d39724934 | |
| | 7cec19fe6e | |
| | 568096ddeb | |
| | 34bda79333 | |
| | 0da68d9c70 | |
| | a82011199a | |
| | 6bbec510b0 | |
| | 90eaa2666a | |
| | 4eb4ee1882 | |
| | fadeaecb6d | |
| | a5c930eeaa | |
| | 5135425bb9 | |
| | 204c4ba79a | |
| | 640ab71b6a | |
| | 0361fc8e2d | |
| | ccae439943 | |
| | 5cee8bbf29 | |
| | a220a0a7af | |
| | af2fec9a00 | |
| | cca57782ce | |
| | c2e8c3bd92 | |
| | 4833c8db72 | |
| | 3352b483b9 | |
| | c40da68a2b | |
| | 0a959ef8e5 | |
| | cd81789d29 | |
| | 55fb7c6663 | |
| | 1aa89edbf3 | |
| | 6ab6983778 | |
| | fb4d52068b | |
| | 21514fa9d5 | |
| | bb8e2c3137 | |
| | 7a316dcde3 | |
| | abe9005ffb | |
| | c6654b9c81 | |
| | 58dd421d58 | |
| | 64bc081abc | |
| | c5b65f2cb1 | |
| | 59ffa90724 | |
| | cb4dcbce83 | |
| | 5670e9d8d0 | |
| | e47287264c | |
| | fe1563f0f0 | |
| | e87ab16e7c | |
| | cf5c072cf4 | |
| | 6b3a652e54 | |
| | 66eef3c3d9 | |
| | 0accd09c45 | |
| | f339cf70c6 | |
| | 2ec6d9590c | |
| | ca116ab8d1 | |
| | 84e935c0f2 | |
| | 5859d44abc | |
| | 7b67a83d8c | |
@@ -1,3 +1,12 @@
trigger:
branches:
include:
- master
- releases/*
paths:
exclude:
- docs/*

resources:
repositories:
- repository: openvino_contrib

@@ -1,3 +1,12 @@
trigger:
branches:
include:
- master
- releases/*
paths:
exclude:
- docs/*

jobs:
- job: LinCC
# About 150% of total time

@@ -1,22 +1,42 @@
trigger:
branches:
include:
- master
- releases/*
paths:
exclude:
- docs/*

jobs:
- job: nGraph_ONNX_Lin
- job: OpenVINO_ONNX_CI
strategy:
matrix:
Release:
BUILD_TYPE: 'Release'
PROTOBUF_LITE: 'ON'
TOX_COMMAND: 'tox && tox -e zoo_models'
Debug:
BUILD_TYPE: 'Debug'
PROTOBUF_LITE: 'ON'
TOX_COMMAND: 'tox'
maxParallel: 2

# About 300% of total time
timeoutInMinutes: 90

pool:
name: LIN_VMSS_VENV_ONNX_WU2
name: LIN_VMSS_VENV_ONNX_U20_WU2

variables:
system.debug: true
VSTS_HTTP_RETRY: 5
VSTS_HTTP_TIMEOUT: 200
WORKERS_NUMBER: 8
BUILD_TYPE: Release
REPO_DIR: $(Build.Repository.LocalPath)
WORK_DIR: $(Pipeline.Workspace)/_w
MODELS_DIR: /mount/cinfsshare/onnxtestdata
TMP_DIR: /mnt/tmp
ONNX_MODEL_ZOO_SHA: "d58213534f2a4d1c4b19ba62b3bb5f544353256e"

steps:
- script: |
@@ -27,6 +47,7 @@ jobs:
echo Python info ; which python ; python --version
echo Java info ; which java ; java -version
echo gcc info ; which gcc ; gcc --version
echo cmake info ; which cmake ; cmake --version
lsb_release
env
cat /proc/cpuinfo
@@ -40,10 +61,10 @@ jobs:

- script: |
rm -rf $(WORK_DIR) ; mkdir $(WORK_DIR)
sudo rm -rf $(TMP_DIR) ; sudo mkdir $(TMP_DIR) ; sudo chmod 777 -R $(TMP_DIR)
sudo mkdir -p $(MODELS_DIR)
sudo apt --assume-yes install nfs-common
sudo mount -vvv -t nfs cinfsshare.file.core.windows.net:/cinfsshare/onnxtestdata $(MODELS_DIR) -o vers=4,minorversion=1,sec=sys
mkdir -p $(MODELS_DIR)/models_data
displayName: 'Make dirs'

- checkout: self
@@ -52,31 +73,23 @@ jobs:
submodules: recursive
path: openvino

- script: docker build --tag=openvino-onnx-ci-image --file=.ci/openvino-onnx/Dockerfile .
displayName: 'Docker build'

- script: ngraph/python/tests/test_onnx/model_zoo_preprocess.sh -d $(TMP_DIR) -o
displayName: 'Get models'

- script: |
##wget -O "$(TMP_DIR)/msft.zip" https://onnxruntimetestdata.blob.core.windows.net/models/20191107.zip
##unzip "$(TMP_DIR)/msft.zip" -d "$(MODELS_DIR)/msft"
#unzip "/mnt/onnxtestdata/models/20191107.zip" -d "$(MODELS_DIR)/msft"
#mv $(MODELS_DIR)/msft/opset9/LSTM_Seq_lens_unpacked/seq_lens_sorted $(MODELS_DIR)/msft/opset9/LSTM_Seq_lens_unpacked/test_data_set_0
#mv $(MODELS_DIR)/msft/opset9/LSTM_Seq_lens_unpacked/seq_lens_unsorted $(MODELS_DIR)/msft/opset9/LSTM_Seq_lens_unpacked/test_data_set_1
displayName: 'Get MSFT models'
enabled: false
set -e
sudo apt --assume-yes install git-lfs uidmap
curl -fsSL https://get.docker.com -o get-docker.sh
sudo sh get-docker.sh
workingDirectory: $(WORK_DIR)
displayName: 'Install dependencies'

- script: |
ls -alR $(MODELS_DIR)
ls -alR $(TMP_DIR)
displayName: 'List models'
enabled: false
- script: ngraph/python/tests/test_onnx/model_zoo_preprocess.sh -d $(MODELS_DIR)/models_data -o -s "$(ONNX_MODEL_ZOO_SHA)"
displayName: 'Update models'
condition: ne(variables['BUILD_TYPE'], 'Debug')

- script: sudo fallocate -l 48G /swapfile ; sudo mkswap /swapfile ; sudo swapon /swapfile ; df ; free -h
- script: sudo docker build --tag=openvino-onnx-ci-image --file=.ci/openvino-onnx/Dockerfile --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg PROTOBUF_LITE=$(PROTOBUF_LITE) .
displayName: 'Docker build $(BUILD_TYPE) protobuf-lite: $(PROTOBUF_LITE)'

- script: sudo fallocate -l 64G /swapfile ; sudo mkswap /swapfile ; sudo swapon /swapfile ; df ; free -h
displayName: 'Create swap'

- script: |
docker run --name openvino-onnx-ci-container --volume $(TMP_DIR)/model_zoo:/root/.onnx/model_zoo --volume $(MODELS_DIR)/msft:/root/.onnx/model_zoo/MSFT openvino-onnx-ci-image
displayName: 'Docker run'

- script: sudo docker run --name openvino-onnx-ci-container --volume $(MODELS_DIR)/models_data/model_zoo/onnx_model_zoo_$(ONNX_MODEL_ZOO_SHA):/root/.onnx/model_zoo/onnx_model_zoo --volume $(MODELS_DIR)/msft:/root/.onnx/model_zoo/MSFT openvino-onnx-ci-image /bin/bash -c "$(TOX_COMMAND)"
displayName: 'Docker run $(BUILD_TYPE) protobuf-lite: $(PROTOBUF_LITE)'

@@ -1,3 +1,12 @@
trigger:
branches:
include:
- master
- releases/*
paths:
exclude:
- docs/*

jobs:
- job: onnxruntime
timeoutInMinutes: 90
@@ -9,7 +18,6 @@ jobs:
system.debug: true
VSTS_HTTP_RETRY: 5
VSTS_HTTP_TIMEOUT: 200
WORKERS_NUMBER: 8
BUILD_TYPE: Release
REPO_DIR: $(Build.Repository.LocalPath)
ONNXRUNTIME_REPO_DIR: $(REPO_DIR)/../onnxruntime

@@ -1,3 +1,12 @@
trigger:
branches:
include:
- master
- releases/*
paths:
exclude:
- docs/*

resources:
repositories:
- repository: openvino_contrib
@@ -22,7 +31,6 @@ jobs:
system.debug: true
VSTS_HTTP_RETRY: 5
VSTS_HTTP_TIMEOUT: 200
WORKERS_NUMBER: 3
BUILD_TYPE: Release
REPO_DIR: $(Build.Repository.LocalPath)
OPENVINO_CONTRIB_REPO_DIR: $(REPO_DIR)/../openvino_contrib

@@ -1,3 +1,12 @@
trigger:
branches:
include:
- master
- releases/*
paths:
exclude:
- docs/*

resources:
repositories:
- repository: openvino_contrib
@@ -16,13 +25,12 @@ jobs:
timeoutInMinutes: 120

pool:
name: WIN_VMSS_VENV_F8S_WU2
name: WIN_VMSS_VENV_F16S_WU2

variables:
system.debug: true
VSTS_HTTP_RETRY: 5
VSTS_HTTP_TIMEOUT: 200
WORKERS_NUMBER: 8
BUILD_TYPE: Release
REPO_DIR: $(Build.Repository.LocalPath)
OPENVINO_CONTRIB_REPO_DIR: $(REPO_DIR)\..\openvino_contrib
@@ -35,14 +43,13 @@ jobs:
MSVC_COMPILER_PATH: C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Tools\MSVC\14.24.28314\bin\Hostx64\x64\cl.exe
INSTALL_DIR: $(WORK_DIR)\install_pkg
SETUPVARS: $(INSTALL_DIR)\bin\setupvars.bat
IB_DIR: C:\Program Files (x86)\IncrediBuild
IB_TESTCONSOLE: $(IB_DIR)\IBTestConsole.exe
TEST_ENV_PATH: $(REPO_DIR)\inference-engine\temp\tbb\bin;$(REPO_DIR)\inference-engine\temp\opencv_4.5.2\opencv\bin;$(IB_DIR);%PATH%
TEST_ENV_PATH: $(REPO_DIR)\inference-engine\temp\tbb\bin;$(REPO_DIR)\inference-engine\temp\opencv_4.5.2\opencv\bin;%PATH%

steps:
- script: |
powershell -command "Invoke-RestMethod -Headers @{\"Metadata\"=\"true\"} -Method GET -Uri http://169.254.169.254/metadata/instance/compute?api-version=2019-06-01 | format-custom"
where python3
python3 --version
where python
python --version
where java
@@ -60,12 +67,6 @@ jobs:
rd /Q /S $(BUILD_SAMPLES_DIR) & mkdir $(BUILD_SAMPLES_DIR)
displayName: 'Make dir'

- script: |
certutil -urlcache -split -f https://openvinoweb.z5.web.core.windows.net/incredibuild/install_ib_console.bat install_ib_console.bat
call install_ib_console.bat
workingDirectory: $(WORK_DIR)
displayName: 'Install IncrediBuild'

- checkout: self
clean: true
lfs: false
@@ -84,7 +85,8 @@ jobs:
path: testdata

- script: |
certutil -urlcache -split -f https://github.com/ninja-build/ninja/releases/download/v1.10.0/ninja-win.zip ninja-win.zip
rem Speed up build
certutil -urlcache -split -f https://github.com/ninja-build/ninja/releases/download/v1.10.2/ninja-win.zip ninja-win.zip
powershell -command "Expand-Archive -Force ninja-win.zip"
git clone https://github.com/google/gtest-parallel.git
workingDirectory: $(WORK_DIR)
@@ -96,9 +98,10 @@ jobs:
workingDirectory: $(BUILD_DIR)
displayName: 'CMake'

- script: |
set PATH=$(WORK_DIR)\ninja-win;%PATH%
call "$(MSVS_VARS_PATH)" && "C:\Program Files (x86)\IncrediBuild\BuildConsole.exe" /COMMAND="ninja"
- script: dir $(REPO_DIR)\inference-engine\temp\ /s
displayName: 'List temp SDKs'

- script: call "$(MSVS_VARS_PATH)" && $(WORK_DIR)\ninja-win\ninja
workingDirectory: $(BUILD_DIR)
displayName: 'Build Win'

@@ -120,6 +123,9 @@ jobs:
workingDirectory: $(BUILD_SAMPLES_DIR)
displayName: 'Build c samples'

- script: rd /Q /S $(BUILD_DIR)
displayName: 'Clean build dir'

- script: |
set PATH=$(TEST_ENV_PATH)
$(BIN_DIR)\unit-test --gtest_print_time=1 --gtest_filter=-backend_api.config_unsupported:*IE_GPU* --gtest_output=xml:TEST-NGraphUT.xml
@@ -128,7 +134,7 @@ jobs:

- script: |
set PATH=$(TEST_ENV_PATH)
"$(IB_TESTCONSOLE)" $(BIN_DIR)\InferenceEngineUnitTests.exe --gtest_output=xml:TEST-InferenceEngineUnitTests-IB.xml
$(BIN_DIR)\InferenceEngineUnitTests.exe --gtest_output=xml:TEST-InferenceEngineUnitTests.xml
displayName: 'IE UT old - IB'

- script: |
@@ -175,9 +181,8 @@ jobs:

- script: |
set PATH=$(TEST_ENV_PATH)
rem $(BIN_DIR)\cpuFuncTests.exe --gtest_filter=*smoke* --gtest_output=xml:TEST-cpuFuncTests.xml
"$(IB_TESTCONSOLE)" $(BIN_DIR)\cpuFuncTests.exe --gtest_filter=*smoke*:-*CompareWithRefs/base_size=16_pre_nms_topn=100_post_nms_topn=100_nms_thresh=0.7_feat_stride=1_min_size=1_ratio* --gtest_output=xml:TEST-cpuFuncTests-IB.xml /testlevel=24
displayName: 'CPU FuncTests - IB'
$(BIN_DIR)\cpuFuncTests.exe --gtest_filter=*smoke* --gtest_output=xml:TEST-cpuFuncTests.xml
displayName: 'CPU FuncTests'
continueOnError: false

- script: |
@@ -200,8 +205,3 @@ jobs:
buildPlatform: 'x64' # Optional
buildConfiguration: 'Windows' # Optional
#publishRunAttachments: true # Optional

- script: echo Stop IncrediBuild_Agent && net stop IncrediBuild_Agent
displayName: Stop IncrediBuild
continueOnError: true
enabled: false

@@ -1,7 +1,16 @@
trigger:
branches:
include:
- master
- releases/*
paths:
exclude:
- docs/*

jobs:
- job: WinCC
# About 150% of total time
timeoutInMinutes: 120
timeoutInMinutes: 60

pool:
name: WIN_VMSS_VENV_F8S_WU2
@@ -10,26 +19,22 @@ jobs:
system.debug: true
VSTS_HTTP_RETRY: 5
VSTS_HTTP_TIMEOUT: 200
WORKERS_NUMBER: 8
BUILD_TYPE: Release
REPO_DIR: $(Build.Repository.LocalPath)
OPENVINO_CONTRIB_REPO_DIR: $(REPO_DIR)\..\openvino_contrib
MODELS_PATH: $(REPO_DIR)\..\testdata
WORK_DIR: $(Pipeline.Workspace)\_w
BUILD_DIR: D:\build
BIN_DIR: $(REPO_DIR)\bin\intel64
MSVS_VARS_PATH: C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat
MSVC_COMPILER_PATH: C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Tools\MSVC\14.24.28314\bin\Hostx64\x64\cl.exe
INSTALL_DIR: $(WORK_DIR)\install_pkg
SETUPVARS: $(INSTALL_DIR)\bin\setupvars.bat
IB_DIR: C:\Program Files (x86)\IncrediBuild
IB_TESTCONSOLE: $(IB_DIR)\IBTestConsole.exe
TEST_ENV_PATH: $(REPO_DIR)\inference-engine\temp\tbb\bin;$(REPO_DIR)\inference-engine\temp\opencv_4.5.2\opencv\bin;$(IB_DIR);%PATH%

steps:
- script: |
powershell -command "Invoke-RestMethod -Headers @{\"Metadata\"=\"true\"} -Method GET -Uri http://169.254.169.254/metadata/instance/compute?api-version=2019-06-01 | format-custom"
where python3
python3 --version
where python
python --version
where java
@@ -46,12 +51,6 @@ jobs:
rd /Q /S $(BUILD_DIR) & mkdir $(BUILD_DIR)
displayName: 'Make dir'

- script: |
certutil -urlcache -split -f https://openvinoweb.z5.web.core.windows.net/incredibuild/install_ib_console.bat install_ib_console.bat
call install_ib_console.bat
workingDirectory: $(WORK_DIR)
displayName: 'Install IncrediBuild'

- checkout: self
clean: true
lfs: false
@@ -59,7 +58,8 @@ jobs:
path: openvino

- script: |
certutil -urlcache -split -f https://github.com/ninja-build/ninja/releases/download/v1.10.0/ninja-win.zip ninja-win.zip
rem Speed up build
certutil -urlcache -split -f https://github.com/ninja-build/ninja/releases/download/v1.10.2/ninja-win.zip ninja-win.zip
powershell -command "Expand-Archive -Force ninja-win.zip"
workingDirectory: $(WORK_DIR)
displayName: 'Install dependencies'
@@ -70,20 +70,19 @@ jobs:
workingDirectory: $(BUILD_DIR)
displayName: 'CMake'

- script: |
set PATH=$(WORK_DIR)\ninja-win;%PATH%
call "$(MSVS_VARS_PATH)" && "C:\Program Files (x86)\IncrediBuild\BuildConsole.exe" /COMMAND="ninja"
- script: dir $(REPO_DIR)\inference-engine\temp\ /s
displayName: 'List temp SDKs'

- script: call "$(MSVS_VARS_PATH)" && $(WORK_DIR)\ninja-win\ninja
workingDirectory: $(BUILD_DIR)
displayName: 'Build Win'
displayName: 'Build Win CC'

- script: dir $(REPO_DIR)\bin\ /s
displayName: 'List files'
displayName: 'List bin files'

- script: cmake -DCMAKE_INSTALL_PREFIX=$(INSTALL_DIR) -P cmake_install.cmake
workingDirectory: $(BUILD_DIR)
displayName: 'Install'

- script: echo Stop IncrediBuild_Agent && net stop IncrediBuild_Agent
displayName: Stop IncrediBuild
continueOnError: true
enabled: false
- script: dir $(INSTALL_DIR) /s
displayName: 'List install files'

.github/org_control/check_pr.py (20 changes, vendored)
@@ -139,7 +139,7 @@ def update_labels(gh_api, pull, non_org_intel_pr_users, non_org_pr_users):

def get_wrong_commits(pull):
"""Returns commits with incorrect user and email"""
pr_author_email = pull.user.email.lower()
pr_author_email = (pull.user.email or "").lower()
print("GitHub PR author email:", pr_author_email)
print("Check commits:")
wrong_commits = set()
@@ -147,21 +147,29 @@ def get_wrong_commits(pull):
# import pprint; pprint.pprint(commit.raw_data)
print("Commit SHA:", commit.sha)
# Use raw data because commit author can be non GitHub user
commit_email = commit.raw_data["commit"]["author"]["email"].lower()
print(" Commit email:", commit_email)
commit_author_email = (commit.raw_data["commit"]["author"]["email"] or "").lower()
commit_committer_email = (commit.raw_data["commit"]["committer"]["email"] or "").lower()
print(" Commit author email:", commit_author_email)
print(" Commit committer email:", commit_committer_email)
if not github_api.is_valid_user(commit.author):
print(
" ERROR: User with the commit email is absent in GitHub:",
" ERROR: User with the commit author email is absent in GitHub:",
commit.raw_data["commit"]["author"]["name"],
)
wrong_commits.add(commit.sha)
if not github_api.is_valid_user(commit.committer):
print(
" ERROR: User with the commit committer email is absent in GitHub:",
commit.raw_data["commit"]["committer"]["name"],
)
wrong_commits.add(commit.sha)
if not commit.raw_data["commit"]["verification"]["verified"]:
print(
" WARNING: The commit is not verified. Reason:",
commit.raw_data["commit"]["verification"]["reason"],
)
if pr_author_email != commit_email:
print(" WARNING: Commit email and GitHub PR author public email are differnt")
if pr_author_email != commit_author_email or pr_author_email != commit_committer_email:
print(" WARNING: Commit emails and GitHub PR author public email are differnt")
return wrong_commits

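The `(value or "").lower()` change above guards against a missing email, since the GitHub API returns `None` when a user hides their public email address. A minimal, hypothetical sketch of the same pattern, separate from the check_pr.py script:

```python
# Minimal sketch of the defensive pattern used above: an email field coming
# from the GitHub API may be None, so normalize it to "" before calling .lower().
def normalize_email(email):
    """Return a lower-cased email, or "" when the value is missing."""
    return (email or "").lower()

assert normalize_email("User@Example.COM") == "user@example.com"
assert normalize_email(None) == ""  # would raise AttributeError without the guard
```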
.github/workflows/build_doc.yml (5 changes, vendored)
@@ -14,6 +14,7 @@ jobs:

- name: Install dependencies
run: |
sudo apt update
sudo apt --assume-yes install libusb-1.0-0-dev graphviz texlive
python3 -m pip install lxml
# install doxygen
@@ -32,10 +33,10 @@ jobs:
run: |
mkdir build
cd build
cmake -DENABLE_DOCS=ON ..
echo "TBD"

- name: Build doc
run: cmake --build . --target openvino_docs
run: echo "TBD"
working-directory: build

- name: 'Upload doc'

.github/workflows/code_style.yml (9 changes, vendored)
@@ -10,10 +10,13 @@ jobs:
submodules: recursive

- name: Install clang-format-9
run: sudo apt --assume-yes install clang-format-9
run: |
sudo apt update
sudo apt --assume-yes install clang-format-9

- name: Install dependencies
run: |
sudo apt update
sudo apt --assume-yes install libusb-1.0-0-dev
python3 -m pip install --upgrade pip
python3 -m pip install -r ./inference-engine/ie_bridges/python/requirements.txt
@@ -50,7 +53,9 @@ jobs:
submodules: recursive

- name: Install ShellCheck
run: sudo apt --assume-yes install shellcheck
run: |
sudo apt update
sudo apt --assume-yes install shellcheck

- name: Install dependencies
run: |

.github/workflows/mo.yml (1 change, vendored)
@@ -41,6 +41,7 @@ jobs:
pip install -r requirements.txt
pip install -r requirements_dev.txt
# requrements for CMake
sudo apt update
sudo apt --assume-yes install libusb-1.0-0-dev
working-directory: model-optimizer

.gitignore (2 changes, vendored)
@@ -2,6 +2,8 @@
_*
# but ensure we don't skip __init__.py
!__init__.py
# and sphinx documentation folders
!docs/_*

# developer tools
*.idea

@@ -90,7 +90,7 @@ function(build_ngraph)
ngraph_set(NGRAPH_PYTHON_BUILD_ENABLE OFF)
endif()

if(CMAKE_CXX_COMPILER_ID MATCHES "^(Apple)?Clang$")
if(OV_COMPILER_IS_CLANG)
ie_add_compiler_flags(-Wno-error=uninitialized -Wno-error=literal-conversion)
elseif(UNIX)
ie_add_compiler_flags(-Wno-error=maybe-uninitialized -Wno-error=return-type)

@@ -1,5 +1,5 @@
# OpenVINO™ Toolkit
[![Stable release](https://img.shields.io/badge/version-2021.3-green.svg)](https://github.com/openvinotoolkit/openvino/releases/tag/2021.3)
[![Stable release](https://img.shields.io/badge/version-2021.4.2-green.svg)](https://github.com/openvinotoolkit/openvino/releases/tag/2021.4.2)
[![Apache License Version 2.0](https://img.shields.io/badge/license-Apache_2.0-green.svg)](LICENSE)
![GitHub branch checks state](https://img.shields.io/github/checks-status/openvinotoolkit/openvino/master?label=master%20branch)
![Azure DevOps builds (branch)](https://img.shields.io/azure-devops/build/openvinoci/b2bab62f-ab2f-4871-a538-86ea1be7d20f/13?label=public%20CI)
@@ -42,7 +42,7 @@ Please report questions, issues and suggestions using:
---
\* Other names and brands may be claimed as the property of others.

[Open Model Zoo]:https://github.com/opencv/open_model_zoo
[Open Model Zoo]:https://github.com/openvinotoolkit/open_model_zoo
[Inference Engine]:https://software.intel.com/en-us/articles/OpenVINO-InferEngine
[Model Optimizer]:https://software.intel.com/en-us/articles/OpenVINO-ModelOptimizer
[nGraph]:https://docs.openvinotoolkit.org/latest/openvino_docs_nGraph_DG_DevGuide.html

@@ -17,7 +17,7 @@ if (ENABLE_SANITIZER)

if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set(SANITIZER_LINKER_FLAGS "${SANITIZER_LINKER_FLAGS} -fuse-ld=gold")
elseif(CMAKE_CXX_COMPILER_ID MATCHES "^(Apple)?Clang$" AND NOT WIN32)
elseif(OV_COMPILER_IS_CLANG AND NOT WIN32)
if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.0)
set(SANITIZER_LINKER_FLAGS "${SANITIZER_LINKER_FLAGS} -fuse-ld=lld")
endif()
@@ -35,7 +35,7 @@ if (ENABLE_THREAD_SANITIZER)
set(SANITIZER_LINKER_FLAGS "-fsanitize=thread")
set(SANITIZER_LINKER_FLAGS "${SANITIZER_LINKER_FLAGS} -Wl,-z,nodelete")

if(CMAKE_CXX_COMPILER_ID MATCHES "^(Apple)?Clang$" AND NOT WIN32)
if(OV_COMPILER_IS_CLANG AND NOT WIN32)
if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.0)
set(SANITIZER_LINKER_FLAGS "${SANITIZER_LINKER_FLAGS} -fuse-ld=lld")
else()

@@ -23,7 +23,7 @@ if (CMAKE_BUILD_TYPE STREQUAL "Release")
if (NOT ENABLE_SANITIZER)
set(IE_C_CXX_FLAGS "${IE_C_CXX_FLAGS} -s")
endif()
elseif(CMAKE_CXX_COMPILER_ID MATCHES "^(Apple)?Clang$")
elseif(OV_COMPILER_IS_CLANG)
set(IE_C_CXX_FLAGS "${IE_C_CXX_FLAGS} -fstack-protector-all")
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
if (NOT ENABLE_SANITIZER)

@@ -56,7 +56,7 @@ ie_option (VERBOSE_BUILD "shows extra information about build" OFF)

ie_option (ENABLE_UNSAFE_LOCATIONS "skip check for MD5 for dependency" OFF)

ie_dependent_option (ENABLE_FUZZING "instrument build for fuzzing" OFF "CMAKE_CXX_COMPILER_ID MATCHES ^(Apple)?Clang$; NOT WIN32" OFF)
ie_dependent_option (ENABLE_FUZZING "instrument build for fuzzing" OFF "OV_COMPILER_IS_CLANG; NOT WIN32" OFF)

#
# Check features

@@ -55,3 +55,9 @@ endif()
if(UNIX AND NOT APPLE)
set(LINUX ON)
endif()

if(CMAKE_CXX_COMPILER_ID MATCHES "^(Apple)?Clang$")
set(OV_COMPILER_IS_CLANG ON)
else()
set(OV_COMPILER_IS_CLANG OFF)
endif()

@@ -44,327 +44,195 @@ if(NOT ENABLE_DOCKER)
endforeach()
endif()

set(LINKCHECKER_PY "" CACHE FILEPATH "Path to linkchecker.py for documentation check")
set(OMZ_DOCS_DIR "" CACHE PATH "Path to open_model_zoo documentation")
set(WORKBENCH_DOCS_DIR "" CACHE PATH "Path to workbench documentation")
set(POT_DOCS_DIR "" CACHE PATH "Path to post-training-compression-tool documentation")
set(GST_DOCS_DIR "" CACHE PATH "Path to gst-video-analytics documentation")
set(LINKCHECKER_PY "" CACHE FILEPATH "Path to linkchecker.py for documentation check dir.")
set(OMZ_DOCS_DIR "" CACHE PATH "Path to open_model_zoo documentation dir.")
set(WORKBENCH_DOCS_DIR "" CACHE PATH "Path to workbench documentation dir.")
set(POT_DOCS_DIR "" CACHE PATH "Path to post-training-compression-tool documentation dir.")
set(GST_DOCS_DIR "" CACHE PATH "Path to gst-video-analytics documentation dir.")
set(GITHUB_API_TOKEN "" CACHE PATH "Path to file containing github api token.")
set(GRAPH_CSV_DIR "" CACHE PATH "Path to the folder containing csv data for rendering graphs.")

function(build_docs)
find_package(Doxygen REQUIRED dot)
find_package(PythonInterp 3 REQUIRED)
find_package(LATEX REQUIRED)

execute_process(
COMMAND ${PYTHON_EXECUTABLE} -m pip show lxml
RESULT_VARIABLE PIP_EXIT_CODE
OUTPUT_QUIET
)

if (NOT ${PIP_EXIT_CODE} EQUAL 0)
message(FATAL_ERROR "lxml package is not installed. Please use \"pip install lxml\".")
find_program(DOXYREST_EXECUTABLE NAMES doxyrest)
if (NOT DOXYREST_EXECUTABLE)
message(FATAL_ERROR "No doxyrest found. Documentation output is not available")
endif()

set(DOCS_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}")
set(DOXYGEN_DIR "${OpenVINO_MAIN_SOURCE_DIR}/docs/doxygen")
set(IE_SOURCE_DIR "${OpenVINO_MAIN_SOURCE_DIR}/inference-engine")
set(PYTHON_API_IN "${IE_SOURCE_DIR}/ie_bridges/python/src/openvino/inference_engine/ie_api.pyx")
set(PYTHON_API_OUT "${DOCS_BUILD_DIR}/python_api/ie_api.pyx")
set(C_API "${IE_SOURCE_DIR}/ie_bridges/c/include")
set(PLUGIN_API_DIR "${DOCS_BUILD_DIR}/IE_PLUGIN_DG")
set(NGRAPH_DIR "${OpenVINO_MAIN_SOURCE_DIR}/ngraph")
set(NGRAPH_PY_DIR "${NGRAPH_DIR}/python/src/ngraph/")
set(NGRAPH_CPP_DIR "${NGRAPH_DIR}/core/include/" "${NGRAPH_DIR}/frontend/onnx_import/include")
set(DOCS_SOURCE_DIR "${OpenVINO_MAIN_SOURCE_DIR}/docs")
set(DOXYGEN_DIR "${DOCS_SOURCE_DIR}/doxygen")

# API INPUT

set(NGRAPH_DIR "${OpenVINO_MAIN_SOURCE_DIR}/ngraph")

# markdown docs
set(MARKDOWN_INPUT "${DOCS_BUILD_DIR}")

# IE C++ API
set(IE_SOURCE_DIR "${OpenVINO_MAIN_SOURCE_DIR}/inference-engine")

# IE C API
set(IE_C_API "${IE_SOURCE_DIR}/ie_bridges/c/include")

# Preprocessing scripts
set(DOXY_MD_FILTER "${DOXYGEN_DIR}/doxy_md_filter.py")
set(DOXY_LAYOUT_SCRIPT "${DOXYGEN_DIR}/build_main_layout.py")
set(PYNGRAPH_REF_SCRIPT "${DOXYGEN_DIR}/pyngraph_ref.py")
set(DOXY_LOG_SCRIPT "${DOXYGEN_DIR}/log.py")
set(PYX_FILTER "${DOXYGEN_DIR}/pyx_filter.py")

# assets dir
set(ASSETS_DIR "${DOXYGEN_DIR}/assets")

# header and footer
set(HEADER_SOURCE "${DOXYGEN_DIR}/header.html.in")
set(FOOTER_SOURCE "${DOXYGEN_DIR}/footer.html.in")
set(HEADER_BUILD "${DOCS_BUILD_DIR}/header.html")
set(FOOTER_BUILD "${DOCS_BUILD_DIR}/footer.html")

configure_file(${HEADER_SOURCE} ${HEADER_BUILD} @ONLY)
configure_file(${FOOTER_SOURCE} ${FOOTER_BUILD} @ONLY)

file(GLOB_RECURSE doc_source_files
LIST_DIRECTORIES true RELATIVE ${OpenVINO_MAIN_SOURCE_DIR}
"${OpenVINO_MAIN_SOURCE_DIR}/docs/*.md"
"${OpenVINO_MAIN_SOURCE_DIR}/docs/*.png"
"${OpenVINO_MAIN_SOURCE_DIR}/docs/*.gif"
"${OpenVINO_MAIN_SOURCE_DIR}/docs/*.jpg"
"${OpenVINO_MAIN_SOURCE_DIR}/docs/*.svg"
"${OpenVINO_MAIN_SOURCE_DIR}/inference-engine/*.md"
"${OpenVINO_MAIN_SOURCE_DIR}/inference-engine/*.png"
"${OpenVINO_MAIN_SOURCE_DIR}/inference-engine/*.gif"
"${OpenVINO_MAIN_SOURCE_DIR}/inference-engine/*.jpg"
"${OpenVINO_MAIN_SOURCE_DIR}/inference-engine/*.svg")

configure_file(${PYTHON_API_IN} ${PYTHON_API_OUT} @ONLY)

set(NGRAPH_CPP_CONFIG_SOURCE "${DOXYGEN_DIR}/ngraph_cpp_api.config")
set(NGRAPH_PY_CONFIG_SOURCE "${DOXYGEN_DIR}/ngraph_py_api.config")
set(IE_CONFIG_SOURCE "${DOXYGEN_DIR}/ie_docs.config")
set(C_CONFIG_SOURCE "${DOXYGEN_DIR}/ie_c_api.config")
set(PY_CONFIG_SOURCE "${DOXYGEN_DIR}/ie_py_api.config")
set(PLUGIN_CONFIG_SOURCE "${DOXYGEN_DIR}/ie_plugin_api.config")

set(NGRAPH_CPP_CONFIG_BUILD "${DOCS_BUILD_DIR}/ngraph_cpp_api.config")
set(NGRAPH_PY_CONFIG_BUILD "${DOCS_BUILD_DIR}/ngraph_py_api.config")
set(IE_CONFIG_BUILD "${DOCS_BUILD_DIR}/ie_docs.config")
set(C_CONFIG_BUILD "${DOCS_BUILD_DIR}/ie_c_api.config")
set(PY_CONFIG_BUILD "${DOCS_BUILD_DIR}/ie_py_api.config")
set(PLUGIN_CONFIG_BUILD "${DOCS_BUILD_DIR}/ie_plugin_api.config")

set(NGRAPH_CPP_LAYOUT_SOURCE "${DOXYGEN_DIR}/ngraph_cpp_api.xml")
set(NGRAPH_PY_LAYOUT_SOURCE "${DOXYGEN_DIR}/ngraph_py_api.xml")
set(IE_LAYOUT_SOURCE "${DOXYGEN_DIR}/ie_docs.xml")
set(OPENVINO_LAYOUT_SOURCE "${DOXYGEN_DIR}/openvino_docs.xml")
set(C_LAYOUT_SOURCE "${DOXYGEN_DIR}/ie_c_api.xml")
set(PY_LAYOUT_SOURCE "${DOXYGEN_DIR}/ie_py_api.xml")
set(PLUGIN_LAYOUT_SOURCE "${DOXYGEN_DIR}/ie_plugin_api.xml")

set(NGRAPH_CPP_LAYOUT_BUILD "${DOCS_BUILD_DIR}/ngraph_cpp_api.xml")
set(NGRAPH_PY_LAYOUT_BUILD "${DOCS_BUILD_DIR}/ngraph_py_api.xml")
set(IE_LAYOUT_BUILD "${DOCS_BUILD_DIR}/ie_docs.xml")
set(OPENVINO_LAYOUT_BUILD "${DOCS_BUILD_DIR}/openvino_docs.xml")
set(C_LAYOUT_BUILD "${DOCS_BUILD_DIR}/ie_c_api.xml")
set(PY_LAYOUT_BUILD "${DOCS_BUILD_DIR}/ie_py_api.xml")
set(PLUGIN_LAYOUT_BUILD "${DOCS_BUILD_DIR}/ie_plugin_api.xml")
set(PREPARE_XML_SCRIPT "${DOXYGEN_DIR}/prepare_xml.py")
set(REMOVE_XML_SCRIPT "${DOXYGEN_DIR}/remove_xml.py")
set(COPY_IMAGES_SCRIPT "${DOXYGEN_DIR}/copy_images.py")
set(DOXYGEN_MAPPING_SCRIPT "${DOXYGEN_DIR}/create_mapping.py")
set(DOXYGEN_MAPPING_FILE "${DOCS_BUILD_DIR}/mapping.json")

# out dirs
set(OUTPUT_DIRECTORY "${DOCS_BUILD_DIR}/html")
set(IE_OUTPUT "${OUTPUT_DIRECTORY}")
set(C_OUTPUT "${OUTPUT_DIRECTORY}/ie_c_api")
set(PY_OUTPUT "${OUTPUT_DIRECTORY}/ie_python_api")
set(PLUGIN_OUTPUT "${OUTPUT_DIRECTORY}/ie_plugin_api")
set(NGRAPH_CPP_OUTPUT "${OUTPUT_DIRECTORY}/ngraph_cpp_api")
set(NGRAPH_PY_OUTPUT "${OUTPUT_DIRECTORY}/ngraph_python_api")
set(XML_OUTPUT "${DOCS_BUILD_DIR}/xml")
set(RST_OUTPUT "${DOCS_BUILD_DIR}/rst")
set(SPHINX_OUTPUT "${DOCS_BUILD_DIR}/_build")

# Tables of contents
configure_file(${NGRAPH_CPP_LAYOUT_SOURCE} ${NGRAPH_CPP_LAYOUT_BUILD} @ONLY)
configure_file(${NGRAPH_PY_LAYOUT_SOURCE} ${NGRAPH_PY_LAYOUT_BUILD} @ONLY)
configure_file(${IE_LAYOUT_SOURCE} ${IE_LAYOUT_BUILD} @ONLY)
configure_file(${OPENVINO_LAYOUT_SOURCE} ${OPENVINO_LAYOUT_BUILD} @ONLY)
configure_file(${C_LAYOUT_SOURCE} ${C_LAYOUT_BUILD} @ONLY)
configure_file(${PY_LAYOUT_SOURCE} ${PY_LAYOUT_BUILD} @ONLY)
configure_file(${PLUGIN_LAYOUT_SOURCE} ${PLUGIN_LAYOUT_BUILD} @ONLY)
# Sphinx folders, doxyrest templates and config
set(SPHINX_CONF_IN "${DOCS_SOURCE_DIR}/conf.py")
set(SPHINX_CONF_OUT "${RST_OUTPUT}/conf.py")
set(SPHINX_STATIC_IN "${DOCS_SOURCE_DIR}/_static")
set(SPHINX_STATIC_OUT "${RST_OUTPUT}/_static")
set(SPHINX_INDEX_IN "${DOCS_SOURCE_DIR}/index.rst")
set(SPHINX_INDEX_OUT "${RST_OUTPUT}/index.rst")
set(API_DOCS_IN "${DOCS_SOURCE_DIR}/api")
set(API_DOCS_OUT "${RST_OUTPUT}/api")
set(DOXYREST_IN "${DOCS_SOURCE_DIR}/doxyrest")
set(DOXYREST_OUT "${DOCS_BUILD_DIR}/doxyrest")
set(DOXYREST_SPHINX_IN "${DOCS_SOURCE_DIR}/doxyrest-sphinx")
set(DOXYREST_SPHINX_OUT "${RST_OUTPUT}/doxyrest-sphinx")
set(DOXYREST_CONFIG_IN "${DOCS_SOURCE_DIR}/doxyrest-config.lua")
set(DOXYREST_CONFIG_OUT "${DOCS_BUILD_DIR}/doxyrest-config.lua")
configure_file(${DOXYREST_CONFIG_IN} ${DOXYREST_CONFIG_OUT} @ONLY)
configure_file(${SPHINX_CONF_IN} ${SPHINX_CONF_OUT} @ONLY)

# Doxygen config files
configure_file(${NGRAPH_CPP_CONFIG_SOURCE} ${NGRAPH_CPP_CONFIG_BUILD} @ONLY)
configure_file(${NGRAPH_PY_CONFIG_SOURCE} ${NGRAPH_PY_CONFIG_BUILD} @ONLY)
configure_file(${IE_CONFIG_SOURCE} ${IE_CONFIG_BUILD} @ONLY)
configure_file(${C_CONFIG_SOURCE} ${C_CONFIG_BUILD} @ONLY)
configure_file(${PY_CONFIG_SOURCE} ${PY_CONFIG_BUILD} @ONLY)
configure_file(${PLUGIN_CONFIG_SOURCE} ${PLUGIN_CONFIG_BUILD} @ONLY)
# Doxygen config
set(DOXYFILE_SOURCE "${DOXYGEN_DIR}/Doxyfile.config")
set(DOXYFILE_BUILD "${DOCS_BUILD_DIR}/Doxyfile.config")
configure_file(${DOXYFILE_SOURCE} ${DOXYFILE_BUILD} @ONLY)

# Preprocessing scripts
set(DOXY_MD_FILTER "${DOXYGEN_DIR}/doxy_md_filter.py")
set(PYX_FILTER "${DOXYGEN_DIR}/pyx_filter.py")
list(APPEND commands COMMAND ${PYTHON_EXECUTABLE} ${DOXY_MD_FILTER}
--input_dir=${OpenVINO_MAIN_SOURCE_DIR}
--output_dir=${DOCS_BUILD_DIR}/openvino
--exclude_dir=${CMAKE_CURRENT_BINARY_DIR}
--exclude_dir=${OpenVINO_MAIN_SOURCE_DIR}/out)

# nGraph C++ API
# include additional repositories

add_custom_target(ngraph_cpp_api
COMMAND ${CMAKE_COMMAND} -E copy_directory ${ASSETS_DIR} ${NGRAPH_CPP_OUTPUT}/assets
COMMAND ${DOXYGEN_EXECUTABLE} ${NGRAPH_CPP_CONFIG_BUILD}
WORKING_DIRECTORY ${DOCS_BUILD_DIR}
VERBATIM)
# openvino notebooks dir
if(GITHUB_API_TOKEN)
set(NBDOC_SCRIPT "${DOCS_SOURCE_DIR}/nbdoc/nbdoc.py")
list(APPEND commands
COMMAND ${PYTHON_EXECUTABLE} "${NBDOC_SCRIPT}" "${GITHUB_API_TOKEN}" "${RST_OUTPUT}/notebooks"
)
endif()

# nGraph Python API

add_custom_target(ngraph_py_api
COMMAND ${CMAKE_COMMAND} -E copy_directory ${ASSETS_DIR} ${NGRAPH_PY_OUTPUT}/assets
COMMAND ${DOXYGEN_EXECUTABLE} ${NGRAPH_PY_CONFIG_BUILD}
WORKING_DIRECTORY ${DOCS_BUILD_DIR}
VERBATIM)

# C API

add_custom_target(c_api
COMMAND ${CMAKE_COMMAND} -E copy_directory ${ASSETS_DIR} ${C_OUTPUT}/assets
COMMAND ${DOXYGEN_EXECUTABLE} ${C_CONFIG_BUILD}
WORKING_DIRECTORY ${DOCS_BUILD_DIR}
COMMENT "Generating C API Reference"
VERBATIM)

# Python API

add_custom_target(py_api
COMMAND ${CMAKE_COMMAND} -E copy_directory ${ASSETS_DIR} ${PY_OUTPUT}/assets
COMMAND ${DOXYGEN_EXECUTABLE} ${PY_CONFIG_BUILD}
WORKING_DIRECTORY ${DOCS_BUILD_DIR}
COMMENT "Generating Python API Reference"
VERBATIM)

add_custom_command(TARGET py_api
PRE_BUILD
COMMAND ${PYTHON_EXECUTABLE} ${PYX_FILTER} ${PYTHON_API_OUT}
COMMENT "Pre-process Python API")

# Preprocess docs

add_custom_target(preprocess_docs
COMMENT "Pre-process docs"
VERBATIM)

# ovino doc files
file(GLOB_RECURSE ovino_doc_files
LIST_DIRECTORIES true RELATIVE ${OpenVINO_MAIN_SOURCE_DIR}
"${OpenVINO_MAIN_SOURCE_DIR}/docs/*.md"
"${OpenVINO_MAIN_SOURCE_DIR}/docs/*.png"
"${OpenVINO_MAIN_SOURCE_DIR}/docs/*.gif"
"${OpenVINO_MAIN_SOURCE_DIR}/docs/*.jpg"
"${OpenVINO_MAIN_SOURCE_DIR}/inference-engine/*.md"
"${OpenVINO_MAIN_SOURCE_DIR}/inference-engine/*.png"
"${OpenVINO_MAIN_SOURCE_DIR}/inference-engine/*.gif"
"${OpenVINO_MAIN_SOURCE_DIR}/inference-engine/*.jpg")

foreach(source_file ${ovino_doc_files})
list(APPEND commands COMMAND ${CMAKE_COMMAND} -E copy
"${OpenVINO_MAIN_SOURCE_DIR}/${source_file}" "${DOCS_BUILD_DIR}/openvino/${source_file}")
endforeach()
if(GRAPH_CSV_DIR)
set(GRAPH_CSV_DIR_OUT "${RST_OUTPUT}/csv")
list(APPEND commands
COMMAND ${CMAKE_COMMAND} -E copy_directory "${GRAPH_CSV_DIR}" "${GRAPH_CSV_DIR_OUT}"
)
endif()

# omz doc files
if(EXISTS "${OMZ_DOCS_DIR}")
get_filename_component(OMZ_DOCS_DIR "${OMZ_DOCS_DIR}" ABSOLUTE)

file(GLOB_RECURSE omz_doc_files
LIST_DIRECTORIES true RELATIVE ${OMZ_DOCS_DIR}
"${OMZ_DOCS_DIR}/*.md"
"${OMZ_DOCS_DIR}/*.png"
"${OMZ_DOCS_DIR}/*.gif"
"${OMZ_DOCS_DIR}/*.jpg")

foreach(source_file ${omz_doc_files})
list(APPEND commands COMMAND ${CMAKE_COMMAND} -E copy
"${OMZ_DOCS_DIR}/${source_file}" "${DOCS_BUILD_DIR}/omz/${source_file}")
endforeach()
configure_file("${OMZ_DOCS_DIR}/omz_docs.xml" "${DOCS_BUILD_DIR}/omz_docs.xml" @ONLY)
list(APPEND commands
COMMAND ${PYTHON_EXECUTABLE} ${OMZ_DOCS_DIR}/ci/prepare-documentation.py ${CMAKE_BINARY_DIR}/open_model_zoo)
list(APPEND commands COMMAND ${PYTHON_EXECUTABLE} ${DOXY_MD_FILTER}
--input_dir=${CMAKE_BINARY_DIR}/open_model_zoo
--output_dir=${DOCS_BUILD_DIR}/open_model_zoo)
endif()

# workbench doc files
if(EXISTS "${WORKBENCH_DOCS_DIR}")
get_filename_component(WORKBENCH_DOCS_DIR "${WORKBENCH_DOCS_DIR}" ABSOLUTE)

file(GLOB_RECURSE workbench_doc_files
LIST_DIRECTORIES true RELATIVE ${WORKBENCH_DOCS_DIR}
"${WORKBENCH_DOCS_DIR}/*.md"
"${WORKBENCH_DOCS_DIR}/*.png"
"${WORKBENCH_DOCS_DIR}/*.gif"
"${WORKBENCH_DOCS_DIR}/*.jpg")

foreach(source_file ${workbench_doc_files})
list(APPEND commands COMMAND ${CMAKE_COMMAND} -E copy
"${WORKBENCH_DOCS_DIR}/${source_file}" "${DOCS_BUILD_DIR}/workbench/${source_file}")
endforeach()
configure_file("${WORKBENCH_DOCS_DIR}/docs/Workbench_DG/workbench_docs.xml" "${DOCS_BUILD_DIR}/workbench_docs.xml" @ONLY)
list(APPEND commands COMMAND ${PYTHON_EXECUTABLE} ${DOXY_MD_FILTER}
--input_dir=${WORKBENCH_DOCS_DIR}
--output_dir=${DOCS_BUILD_DIR}/workbench)
endif()

# pot doc files
if(EXISTS "${POT_DOCS_DIR}")
get_filename_component(POT_DOCS_DIR "${POT_DOCS_DIR}" ABSOLUTE)

file(GLOB_RECURSE pot_doc_files
LIST_DIRECTORIES true RELATIVE ${POT_DOCS_DIR}
"${POT_DOCS_DIR}/*.md"
"${POT_DOCS_DIR}/*.png"
"${POT_DOCS_DIR}/*.gif"
"${POT_DOCS_DIR}/*.jpg")

foreach(source_file ${pot_doc_files})
list(APPEND commands COMMAND ${CMAKE_COMMAND} -E copy
"${POT_DOCS_DIR}/${source_file}" "${DOCS_BUILD_DIR}/pot/${source_file}")
endforeach()
configure_file("${POT_DOCS_DIR}/docs/pot_docs.xml" "${DOCS_BUILD_DIR}/pot_docs.xml" @ONLY)
list(APPEND commands COMMAND ${PYTHON_EXECUTABLE} ${DOXY_MD_FILTER}
--input_dir=${POT_DOCS_DIR}
--output_dir=${DOCS_BUILD_DIR}/pot)
endif()

# gst doc files
if(EXISTS "${GST_DOCS_DIR}")
get_filename_component(GST_DOCS_DIR "${GST_DOCS_DIR}" ABSOLUTE)

file(GLOB_RECURSE gst_doc_files
LIST_DIRECTORIES true RELATIVE ${GST_DOCS_DIR}
"${GST_DOCS_DIR}/*.md"
"${GST_DOCS_DIR}/*.png"
"${GST_DOCS_DIR}/*.gif"
"${GST_DOCS_DIR}/*.jpg")

foreach(source_file ${gst_doc_files})
list(APPEND commands COMMAND ${CMAKE_COMMAND} -E copy
"${GST_DOCS_DIR}/${source_file}" "${DOCS_BUILD_DIR}/gst/${source_file}")
endforeach()
list(APPEND commands COMMAND ${PYTHON_EXECUTABLE} ${DOXY_MD_FILTER}
--input_dir=${GST_DOCS_DIR}
--output_dir=${DOCS_BUILD_DIR}/gst)
endif()

add_custom_target(preprocess_docs
COMMENT "Preprocess documentation"
VERBATIM)

# Preprocess docs
add_custom_command(TARGET preprocess_docs
PRE_BUILD
${commands}
COMMAND ${PYTHON_EXECUTABLE} ${DOXY_LAYOUT_SCRIPT} --openvino ${OPENVINO_LAYOUT_BUILD}
COMMAND ${PYTHON_EXECUTABLE} ${DOXY_MD_FILTER} ${DOCS_BUILD_DIR}
COMMENT "Pre-process markdown and image links")

# IE dev guide and C++ API

add_custom_target(ie_docs
DEPENDS ngraph_cpp_api preprocess_docs
COMMAND ${CMAKE_COMMAND} -E copy_directory ${ASSETS_DIR} ${IE_OUTPUT}/assets
COMMAND ${DOXYGEN_EXECUTABLE} ${IE_CONFIG_BUILD}
WORKING_DIRECTORY ${DOCS_BUILD_DIR}
VERBATIM)

# Plugin API

add_custom_target(plugin_api
DEPENDS ngraph_cpp_api ie_docs
COMMAND ${CMAKE_COMMAND} -E copy_directory ${ASSETS_DIR} ${PLUGIN_OUTPUT}/assets
COMMAND ${DOXYGEN_EXECUTABLE} ${PLUGIN_CONFIG_BUILD}
WORKING_DIRECTORY ${DOCS_BUILD_DIR}
COMMENT "Generating Plugin API Reference"
VERBATIM)

# Umbrella OpenVINO target

add_custom_target(openvino_docs
DEPENDS ngraph_cpp_api ngraph_py_api c_api py_api ie_docs plugin_api
COMMENT "Generating OpenVINO documentation"
VERBATIM)

set_target_properties(openvino_docs ie_docs c_api py_api preprocess_docs plugin_api
ngraph_py_api ngraph_cpp_api
PROPERTIES FOLDER docs)

add_custom_command(TARGET openvino_docs
POST_BUILD
COMMAND ${PYTHON_EXECUTABLE} ${DOXY_LOG_SCRIPT} --log "${DOCS_BUILD_DIR}/ie_docs.log"
--include_omz $<BOOL:${OMZ_DOCS_DIR}>
--include_wb $<BOOL:${WORKBENCH_DOCS_DIR}>
--include_pot $<BOOL:${POT_DOCS_DIR}>
--include_gst $<BOOL:${GST_DOCS_DIR}>
COMMENT "Parse doxygen log to find errors."
${commands}
WORKING_DIRECTORY ${DOCS_BUILD_DIR}
COMMENT "Preprocess documentation"
VERBATIM)

# added linkcheker
add_custom_target(doxygen_xml
DEPENDS preprocess_docs
COMMAND ${PYTHON_EXECUTABLE} ${REMOVE_XML_SCRIPT} ${XML_OUTPUT}
COMMAND ${DOXYGEN_EXECUTABLE} ${DOXYFILE_BUILD}
WORKING_DIRECTORY ${DOCS_BUILD_DIR}
COMMENT "Generate doxygen XML output"
VERBATIM)

# Post-process docs
add_custom_command(TARGET doxygen_xml
POST_BUILD
COMMAND ${PYTHON_EXECUTABLE} ${PREPARE_XML_SCRIPT} ${XML_OUTPUT}
COMMAND ${CMAKE_COMMAND} -E copy_directory ${DOXYREST_IN} ${DOXYREST_OUT}
COMMAND ${DOXYREST_EXECUTABLE} -c ${DOXYREST_CONFIG_OUT}
COMMAND ${PYTHON_EXECUTABLE} ${COPY_IMAGES_SCRIPT} ${XML_OUTPUT} ${RST_OUTPUT}
COMMAND ${PYTHON_EXECUTABLE} ${DOXYGEN_MAPPING_SCRIPT} ${XML_OUTPUT} ${DOCS_BUILD_DIR} ${OpenVINO_MAIN_SOURCE_DIR}/../
COMMAND ${CMAKE_COMMAND} -E copy ${SPHINX_INDEX_IN} ${SPHINX_INDEX_OUT}
COMMAND ${CMAKE_COMMAND} -E copy_directory ${API_DOCS_IN} ${API_DOCS_OUT}
COMMAND ${CMAKE_COMMAND} -E copy_directory ${DOXYREST_IN} ${DOXYREST_OUT}
COMMAND ${CMAKE_COMMAND} -E copy_directory ${DOXYREST_SPHINX_IN} ${DOXYREST_SPHINX_OUT}
COMMAND ${CMAKE_COMMAND} -E copy_directory ${SPHINX_STATIC_IN} ${SPHINX_STATIC_OUT}
COMMENT "Prepare xml"
VERBATIM)

add_custom_target(sphinx_docs
DEPENDS doxygen_xml
COMMAND sphinx-build -b html ${RST_OUTPUT} ${SPHINX_OUTPUT}
WORKING_DIRECTORY ${RST_OUTPUT}
VERBATIM)

add_custom_target(linkcheck
COMMAND sphinx-build -b linkcheck ${RST_OUTPUT} ${SPHINX_OUTPUT}
WORKING_DIRECTORY ${RST_OUTPUT}
VERBATIM)

set_target_properties(doxygen_xml sphinx_docs
PROPERTIES FOLDER docs)

if(EXISTS "${LINKCHECKER_PY}")
add_custom_target(docs_check
COMMAND ${PYTHON_EXECUTABLE} "${LINKCHECKER_PY}" -v "${DOCS_BUILD_DIR}/html/"
COMMENT "Check links in generated documentation"
WORKING_DIRECTORY "${DOCS_BUILD_DIR}"
VERBATIM)
set_target_properties(docs_check PROPERTIES FOLDER docs)
endif()

find_program(browser NAMES xdg-open)
if(browser)
add_custom_target(ie_docs_open
COMMAND ${browser} "${OpenVINO_MAIN_SOURCE_DIR}/docs/html/index.html"
DEPENDS ie_docs
COMMAND ${browser} "${SPHINX_OUTPUT}/index.html"
DEPENDS sphinx_docs
COMMENT "Open OpenVINO documentation"
VERBATIM)
set_target_properties(ie_docs_open PROPERTIES FOLDER docs)

@@ -12,7 +12,7 @@ Representation (IR) for this model.
This guide illustrates the workflow for running inference on topologies featuring custom operations, allowing you to
plug in your own implementation for existing or completely new operation.

> **NOTE:** *Layer* — The legacy term for an *operation* which came from Caffe\* framework. Currently it is not used.
> **NOTE**: *Layer* — The legacy term for an *operation* which came from Caffe\* framework. Currently it is not used.
> Refer to the [Deep Learning Network Intermediate Representation and Operation Sets in OpenVINO™](../MO_DG/IR_and_opsets.md)
> for more information on the topic.

@@ -44,7 +44,7 @@ plugins to support inference of this operation using a particular target hardwar
To see the operations that are supported by each device plugin for the Inference Engine, refer to the
[Supported Devices](../IE_DG/supported_plugins/Supported_Devices.md).

> **NOTE:** If a device doesn't support a particular operation, an alternative to creating a new operation is to target
> **NOTE**: If a device doesn't support a particular operation, an alternative to creating a new operation is to target
> an additional device using the HETERO plugin. The [Heterogeneous Plugin](../IE_DG/supported_plugins/HETERO.md) may be
> used to run an inference model on multiple devices allowing the unsupported operations on one device to "fallback" to
> run on another device (e.g., CPU) that does support those operations.
@@ -63,7 +63,7 @@ operation and uses corresponding operation class to update graph node attributes
operation. Refer to the "Operation Extractor" section of
[Model Optimizer Extensibility](../MO_DG/prepare_model/customize_model_optimizer/Customize_Model_Optimizer.md) for detailed instructions on how to implement it.

> **NOTE:** In some cases you may need to implement some transformation to support the operation. This topic is covered in the "Graph Transformation Extensions" section of [Model Optimizer Extensibility](../MO_DG/prepare_model/customize_model_optimizer/Customize_Model_Optimizer.md).
> **NOTE**: In some cases you may need to implement some transformation to support the operation. This topic is covered in the "Graph Transformation Extensions" section of [Model Optimizer Extensibility](../MO_DG/prepare_model/customize_model_optimizer/Customize_Model_Optimizer.md).

## Custom Operations Extensions for the Inference Engine

@@ -131,15 +131,26 @@ Firstly, open the model in the TensorBoard or other TensorFlow* model visualizat
batch dimension because the value for the batch dimension is not hardcoded in the model. Model Optimizer need to set all
dynamic dimensions to some specific value to create the IR, therefore specify the command line parameter `-b 1` to set
the batch dimension equal to 1. The actual batch size dimension can be changed at runtime using the Inference Engine API
described in the [Using Shape Inference](../IE_DG/ShapeInference.md). Also refer to
[Converting a Model Using General Conversion Parameters](../MO_DG/prepare_model/convert_model/Converting_Model_General.md)
and [Convert Your TensorFlow* Model](../MO_DG/prepare_model/convert_model/Convert_Model_From_TensorFlow.md)
described in the [Using Shape Inference](../IE_DG/ShapeInference.md). Also refer to the General Conversion Parameters section in [Converting a Model to Intermediate Representation (IR)](../MO_DG/prepare_model/convert_model/Converting_Model.md) and [Convert Your TensorFlow* Model](../MO_DG/prepare_model/convert_model/Convert_Model_From_TensorFlow.md)
for more details and command line parameters used for the model conversion.

```bash
./<MO_INSTALL_DIR>/mo.py --input_model <PATH_TO_MODEL>/wnet_20.pb -b 1
```
> **NOTE:** This conversion guide is applicable for the 2021.3 release of OpenVINO and that starting from 2021.4
@sphinxdirective
.. tab:: Package, Docker, open-source installation

.. code-block:: sh

cd <INSTALL_DIR>/deployment_tools/model_optimizer/
python3 mo.py --input_model <PATH_TO_MODEL>/wnet_20.pb -b 1

.. tab:: pip installation

.. code-block:: sh

mo --input_model <PATH_TO_MODEL>/wnet_20.pb -b 1

@endsphinxdirective

> **NOTE**: This conversion guide is applicable for the 2021.3 release of OpenVINO and that starting from 2021.4
> the OpenVINO supports this model out of the box.

Model Optimizer produces the following error:
@@ -221,7 +232,7 @@ following snippet provides two extractors: one for "IFFT2D", another one for "FF

@snippet FFT_ext.py fft_ext:extractor

> **NOTE:** The graph is in inconsistent state after extracting node attributes because according to original operation
> **NOTE**: The graph is in inconsistent state after extracting node attributes because according to original operation
> "IFFT2D" semantic it should have an input consuming a tensor of complex numbers, but the extractor instantiated an
> operation "FFT" which expects a real tensor with specific layout. But the inconsistency will be resolved during
> applying front phase transformations discussed below.
@@ -239,7 +250,7 @@ information on how this type of transformation works. The code snippet should be

@snippet Complex.py complex:transformation

> **NOTE:** The graph is in inconsistent state because the "ComplexAbs" operation consumes complex value tensor but
> **NOTE**: The graph is in inconsistent state because the "ComplexAbs" operation consumes complex value tensor but
> "FFT" produces real value tensor.

Now lets implement a transformation which replace a "ComplexAbs" operation with a sub-graph of primitive operations
@@ -257,15 +268,27 @@ The implementation should be saved to the file `mo_extensions/front/tf/ComplexAb
@snippet ComplexAbs.py complex_abs:transformation

Now it is possible to convert the model using the following command line:
```bash
./<MO_INSTALL_DIR>/mo.py --input_model <PATH_TO_MODEL>/wnet_20.pb -b 1 --extensions mo_extensions/
```
@sphinxdirective
.. tab:: Package, Docker, open-source installation

.. code-block:: sh

cd <INSTALL_DIR>/deployment_tools/model_optimizer/
python3 mo.py --input_model <PATH_TO_MODEL>/wnet_20.pb -b 1 --extensions mo_extensions/

.. tab:: pip installation

.. code-block:: sh

mo --input_model <PATH_TO_MODEL>/wnet_20.pb -b 1 --extensions mo_extensions/

@endsphinxdirective

The sub-graph corresponding to the originally non-supported one is depicted in the image below:

![](img/converted_subgraph.png)

> **NOTE:** Model Optimizer performed conversion of the model from NHWC to NCHW layout that is why the dimension with
> **NOTE**: Model Optimizer performed conversion of the model from NHWC to NCHW layout that is why the dimension with
> the value 2 moved to another position.

### Inference Engine Extension Implementation
@@ -350,7 +373,7 @@ python3 mri_reconstruction_demo.py \
## Converting Models:

- [Convert Your Caffe* Model](../MO_DG/prepare_model/convert_model/Convert_Model_From_Caffe.md)
- [Convert Your Kaldi* Model](../MO_DG/prepare_model/convert_model/Convert_Model_From_Kaldi.md)
- [Convert Your TensorFlow* Model](../MO_DG/prepare_model/convert_model/Convert_Model_From_TensorFlow.md)
- [Convert Your MXNet* Model](../MO_DG/prepare_model/convert_model/Convert_Model_From_MxNet.md)
- [Convert Your Kaldi* Model](../MO_DG/prepare_model/convert_model/Convert_Model_From_Kaldi.md)
- [Convert Your ONNX* Model](../MO_DG/prepare_model/convert_model/Convert_Model_From_ONNX.md)

@@ -10,10 +10,14 @@ The sections below contain detailed list of changes made to the Inference Engine
|
||||
|
||||
### Deprecated API
|
||||
|
||||
**InferenceEngine::Parameter**
|
||||
|
||||
* InferenceEngine::Parameter(const std::shared_ptr<ngraph::Variant>&)
|
||||
* InferenceEngine::Parameter(std::shared_ptr<ngraph::Variant>& var)
|
||||
* std::shared_ptr<ngraph::Variant> InferenceEngine::Parameter::asVariant() const
|
||||
* InferenceEngine::Parameter::operator std::shared_ptr<ngraph::Variant>() const
|
||||
|
||||
**GPU plugin configuration keys**
|
||||
* KEY_CLDNN_NV12_TWO_INPUTS GPU plugin option. Use KEY_GPU_NV12_TWO_INPUTS instead
|
||||
* KEY_CLDNN_PLUGIN_PRIORITY GPU plugin option. Use KEY_GPU_PLUGIN_PRIORITY instead
|
||||
* KEY_CLDNN_PLUGIN_THROTTLE GPU plugin option. Use KEY_GPU_PLUGIN_THROTTLE instead
|
||||
@@ -24,6 +28,38 @@ The sections below contain detailed list of changes made to the Inference Engine
|
||||
* KEY_TUNING_MODE GPU plugin option
|
||||
* KEY_TUNING_FILE GPU plugin option
|
||||
|
||||
**InferenceEngine::IInferRequest**
|
||||
* IInferRequest interface is deprecated, use InferRequest wrapper:
|
||||
* Constructor for InferRequest from IInferRequest:: Ptr is deprecated
|
||||
* Cast operator for InferRequest to IInferRequest shared pointer is deprecated
|
||||
|
||||
**InferenceEngine::ICNNNetwork**
|
||||
* ICNNNetwork interface is deprecated by means of deprecation of all its methods, use CNNNetwork wrapper
|
||||
* CNNNetwork methods working with ICNNNetwork are deprecated:
|
||||
* Cast to ICNNNetwork shared pointer
|
||||
* Cast to reference to ICNNNetwork interface
|
||||
* Constructor from ICNNNetwork shared pointer
|
||||
|
||||
**InferenceEngine::IExecutableNetwork**
|
||||
* IExecutableNetwork is deprecated, use ExecutableNetwork wrappers:
|
||||
* Constructor of ExecutableNetwork from IExecutableNetwork shared pointer is deprecated
|
||||
* The following ExecutableNetwork methods are deprecated:
|
||||
* ExecutableNetwork::reset
|
||||
* Cast operator to IExecutableNetwork shared pointer
|
||||
* ExecutableNetwork::CreateInferRequestPtr - use ExecutableNetwork::CreateInferRequest instead
|
||||
|
||||
**Extensions API**
|
||||
* InferenceEngine::make_so_pointer which is used to create Extensions library is replaced by std::make_shared<Extension>(..)
|
||||
* InferenceEngine::IExtension::Release is deprecated with no replacement
|
||||
* Use the IE_DEFINE_EXTENSION_CREATE_FUNCTION helper macro instead of an explicit declaration of the CreateExtension function, which creates the extension.
|
||||
|
||||
**Other changes**
|
||||
* Version::ApiVersion structure is deprecated; Inference Engine no longer has an API version
|
||||
* LowLatency - use lowLatency2 instead
|
||||
* CONFIG_KEY(DUMP_EXEC_GRAPH_AS_DOT) - use InferenceEngine::ExecutableNetwork::GetExecGraphInfo::serialize() instead
|
||||
* Core::ImportNetwork with no device - pass device name explicitly.
|
||||
* details::InferenceEngineException - use InferenceEngine::Exception and its derivatives instead.
|
||||
|
||||
## 2021.3
|
||||
|
||||
### New API
|
||||
|
||||
@@ -1,50 +1,57 @@
|
||||
# Bfloat16 Inference {#openvino_docs_IE_DG_Bfloat16Inference}
|
||||
|
||||
## Disclaimer
|
||||
## Bfloat16 Inference Usage (C++)
|
||||
|
||||
Inference Engine with the bfloat16 inference implemented on CPU must support the native `avx512_bf16` instruction and therefore the bfloat16 data format.
|
||||
It is possible to use bfloat16 inference in simulation mode on platforms with Intel® Advanced Vector Extensions 512 (Intel® AVX-512), but it leads to significant performance degradation in comparison with FP32 or native `avx512_bf16` instruction usage.
|
||||
@sphinxdirective
|
||||
.. raw:: html
|
||||
|
||||
## Introduction
|
||||
<div id="switcher-cpp" class="switcher-anchor">C++</div>
|
||||
@endsphinxdirective
|
||||
|
||||
### Disclaimer
|
||||
|
||||
The bfloat16 inference in the Inference Engine is implemented on CPU, and the CPU must support the native *avx512_bf16* instruction and therefore the bfloat16 data format. It is possible to use bfloat16 inference in simulation mode on platforms with Intel® Advanced Vector Extensions 512 (Intel® AVX-512), but it leads to significant performance degradation in comparison with FP32 or native *avx512_bf16* instruction usage.
|
||||
|
||||
### Introduction
|
||||
Bfloat16 computation (referred to as BF16) uses the Brain Floating-Point format with 16 bits. This is a truncated 16-bit version of the 32-bit IEEE 754 single-precision floating-point format FP32. BF16 preserves the same 8 exponent bits as FP32 but reduces the precision of the sign and mantissa from 24 bits to 8 bits.
|
||||
|
||||
![bf16_format]
|
||||
|
||||
Preserving the exponent bits keeps BF16 to the same range as the FP32 (~1e-38 to ~3e38). This simplifies conversion between two data types: you just need to skip or flush to zero 16 low bits.
|
||||
Truncated mantissa leads to occasionally less precision, but according to [investigations](https://cloud.google.com/blog/products/ai-machine-learning/bfloat16-the-secret-to-high-performance-on-cloud-tpus), neural networks are more sensitive to the size of the exponent than the mantissa size. Also, in lots of models, precision is needed close to zero but not so much at the maximum range.
|
||||
Another useful feature of BF16 is possibility to encode INT8 in BF16 without loss of accuracy, because INT8 range completely fits in BF16 mantissa field. It reduces data flow in conversion from INT8 input image data to BF16 directly without intermediate representation in FP32, or in combination of [INT8 inference](Int8Inference.md) and BF16 layers.
|
||||
Preserving the exponent bits keeps BF16 in the same range as FP32 (~1e-38 to ~3e38). This simplifies conversion between the two data types: you just need to skip or flush to zero the 16 low bits. The truncated mantissa occasionally leads to less precision, but according to [investigations](https://cloud.google.com/blog/products/ai-machine-learning/bfloat16-the-secret-to-high-performance-on-cloud-tpus), neural networks are more sensitive to the size of the exponent than to the mantissa size. Also, in lots of models, precision is needed close to zero but not so much at the maximum range. Another useful feature of BF16 is the possibility to encode INT8 in BF16 without loss of accuracy, because the INT8 range completely fits in the BF16 mantissa field. This reduces the data flow in conversion from INT8 input image data to BF16 directly, without an intermediate representation in FP32, or in combination of [INT8 inference](Int8Inference.md) and BF16 layers.
|
||||
|
||||
See the ["BFLOAT16 – Hardware Numerics Definition" white paper"](https://software.intel.com/sites/default/files/managed/40/8b/bf16-hardware-numerics-definition-white-paper.pdf) for more bfloat16 format details.
|
||||
See the [BFLOAT16 – Hardware Numerics Definition white paper](https://software.intel.com/content/dam/develop/external/us/en/documents/bf16-hardware-numerics-definition-white-paper.pdf) for more bfloat16 format details.
|
||||
|
||||
There are two ways to check if CPU device can support bfloat16 computations for models:
|
||||
1. Query the instruction set via system `lscpu | grep avx512_bf16` or `cat /proc/cpuinfo | grep avx512_bf16`.
|
||||
2. Use [Query API](InferenceEngine_QueryAPI.md) with `METRIC_KEY(OPTIMIZATION_CAPABILITIES)`, which should return `BF16` in the list of CPU optimization options:
|
||||
|
||||
1. Query the instruction set using one of these system commands:
|
||||
* `lscpu | grep avx512_bf16`
|
||||
* `cat /proc/cpuinfo | grep avx512_bf16`
|
||||
2. Use the [Query API](InferenceEngine_QueryAPI.md) with `METRIC_KEY(OPTIMIZATION_CAPABILITIES)`, which should return `BF16` in the list of CPU optimization options:
|
||||
|
||||
@snippet snippets/Bfloat16Inference0.cpp part0
|
||||
|
||||
Current Inference Engine solution for bfloat16 inference uses Intel® Math Kernel Library for Deep Neural Networks (Intel® MKL-DNN) and supports inference of the significant number of layers in BF16 computation mode.
|
||||
The current Inference Engine solution for bfloat16 inference uses the Intel® Math Kernel Library for Deep Neural Networks (Intel® MKL-DNN) and supports inference of a significant number of layers in BF16 computation mode.
|
||||
|
||||
## Lowering Inference Precision
|
||||
### Lowering Inference Precision
|
||||
|
||||
Lowering precision to increase performance is [widely used](https://software.intel.com/content/www/us/en/develop/articles/lower-numerical-precision-deep-learning-inference-and-training.html) for optimization of inference. The bfloat16 data type usage on CPU for the first time opens the possibility of default optimization approach.
|
||||
The embodiment of this approach is to use the optimization capabilities of the current platform to achieve maximum performance while maintaining the accuracy of calculations within the acceptable range.
|
||||
Lowering precision to increase performance is [widely used](https://software.intel.com/content/www/us/en/develop/articles/lower-numerical-precision-deep-learning-inference-and-training.html) for optimization of inference. The bfloat16 data type usage on CPU for the first time opens the possibility of a default optimization approach: use the optimization capabilities of the current platform to achieve maximum performance while maintaining the accuracy of calculations within the acceptable range.
|
||||
|
||||
Using Bfloat16 precision provides the following performance benefits:
|
||||
|
||||
Bfloat16 data usage provides the following benefits that increase performance:
|
||||
1. Faster multiplication of two BF16 numbers because of the shorter mantissa of the bfloat16 data.
|
||||
2. No need to support denormals and handle exceptions, as this is a performance optimization.
|
||||
3. Fast conversion of float32 to bfloat16 and vice versa.
|
||||
4. Reduced size of data in memory; as a result, larger models fit in the same memory bounds.
|
||||
5. Reduced amount of data that must be transferred; as a result, reduced data transfer time.
|
||||
|
||||
For default optimization on CPU, source model is converted from FP32 or FP16 to BF16 and executed internally on platforms with native BF16 support. In this case, `KEY_ENFORCE_BF16` is set to `YES`.
|
||||
The code below demonstrates how to check if the key is set:
|
||||
For default optimization on CPU, the source model is converted from FP32 or FP16 to BF16 and executed internally on platforms with native BF16 support. In this case, `KEY_ENFORCE_BF16` is set to `YES` in the `PluginConfigParams` for `GetConfig()`. The code below demonstrates how to check if the key is set:
|
||||
|
||||
@snippet snippets/Bfloat16Inference1.cpp part1
|
||||
|
||||
To disable BF16 internal transformations, set the `KEY_ENFORCE_BF16` to `NO`. In this case, the model infers as is without modifications with precisions that were set on each layer edge.
|
||||
To disable BF16 internal transformations in C++ API, set the `KEY_ENFORCE_BF16` to `NO`. In this case, the model infers as is without modifications with precisions that were set on each layer edge.
|
||||
|
||||
@snippet snippets/Bfloat16Inference2.cpp part2
|
||||
|
||||
To disable BF16 in C API:
|
||||
|
||||
```
|
||||
@@ -52,15 +59,16 @@ ie_config_t config = { "ENFORCE_BF16", "NO", NULL};
|
||||
ie_core_load_network(core, network, device_name, &config, &exe_network);
|
||||
```
|
||||
|
||||
An exception with message `Platform doesn't support BF16 format` is formed in case of setting `KEY_ENFORCE_BF16` to `YES` on CPU without native BF16 support or BF16 simulation mode.
|
||||
An exception with the message `Platform doesn't support BF16 format` is thrown if `KEY_ENFORCE_BF16` is set to `YES` on a CPU without native BF16 support or BF16 simulation mode.
|
||||
|
||||
Low-Precision 8-bit integer models cannot be converted to BF16, even if bfloat16 optimization is set by default.
|
||||
Low-Precision 8-bit integer models cannot be converted to BF16, even if bfloat16 optimization is set by default.
|
||||
|
||||
## Bfloat16 Simulation Mode
|
||||
### Bfloat16 Simulation Mode
|
||||
|
||||
Bfloat16 simulation mode is available on CPU and Intel® AVX-512 platforms that do not support the native `avx512_bf16` instruction. The simulator does not guarantee an adequate performance.
|
||||
To enable Bfloat16 simulator:
|
||||
* In [Benchmark App](../../inference-engine/samples/benchmark_app/README.md), add the `-enforcebf16=true` option
|
||||
Bfloat16 simulation mode is available on CPU and Intel® AVX-512 platforms that do not support the native `avx512_bf16` instruction. The simulator does not guarantee good performance. Note that the CPU must still support the AVX-512 extensions.
|
||||
|
||||
To enable the simulation of Bfloat16:
|
||||
* In the [Benchmark App](../../inference-engine/samples/benchmark_app/README.md), add the `-enforcebf16=true` option
|
||||
* In C++ API, set `KEY_ENFORCE_BF16` to `YES`
|
||||
* In C API:
|
||||
```
|
||||
@@ -68,25 +76,139 @@ ie_config_t config = { "ENFORCE_BF16", "YES", NULL};
|
||||
ie_core_load_network(core, network, device_name, &config, &exe_network);
|
||||
```
|
||||
|
||||
## Performance Counters
|
||||
### Performance Counters
|
||||
|
||||
Information about layer precision is stored in the performance counters that are available from the Inference Engine API. The layers have the following marks:
|
||||
|
||||
Information about layer precision is stored in the performance counters that are
|
||||
available from the Inference Engine API. The layers have the following marks:
|
||||
* Suffix `BF16` for layers that had bfloat16 data type input and were computed in BF16 precision
|
||||
* Suffix `FP32` for layers computed in 32-bit precision
|
||||
|
||||
For example, the performance counters table for the Inception model can look as follows:
|
||||
|
||||
```
|
||||
pool5 EXECUTED layerType: Pooling realTime: 143 cpu: 143 execType: jit_avx512_BF16
|
||||
fc6 EXECUTED layerType: FullyConnected realTime: 47723 cpu: 47723 execType: jit_gemm_BF16
|
||||
relu6 NOT_RUN layerType: ReLU realTime: 0 cpu: 0 execType: undef
|
||||
fc7 EXECUTED layerType: FullyConnected realTime: 7558 cpu: 7558 execType: jit_gemm_BF16
|
||||
relu7 NOT_RUN layerType: ReLU realTime: 0 cpu: 0 execType: undef
|
||||
fc8 EXECUTED layerType: FullyConnected realTime: 2193 cpu: 2193 execType: jit_gemm_BF16
|
||||
prob EXECUTED layerType: SoftMax realTime: 68 cpu: 68 execType: jit_avx512_FP32
|
||||
pool5 EXECUTED layerType: Pooling realTime: 143 cpu: 143 execType: jit_avx512_BF16
|
||||
fc6 EXECUTED layerType: FullyConnected realTime: 47723 cpu: 47723 execType: jit_gemm_BF16
|
||||
relu6 NOT_RUN layerType: ReLU realTime: 0 cpu: 0 execType: undef
|
||||
fc7 EXECUTED layerType: FullyConnected realTime: 7558 cpu: 7558 execType: jit_gemm_BF16
|
||||
relu7 NOT_RUN layerType: ReLU realTime: 0 cpu: 0 execType: undef
|
||||
fc8 EXECUTED layerType: FullyConnected realTime: 2193 cpu: 2193 execType: jit_gemm_BF16
|
||||
prob EXECUTED layerType: SoftMax realTime: 68 cpu: 68 execType: jit_avx512_FP32
|
||||
```
|
||||
|
||||
The `execType` column of the table includes inference primitives with specific suffixes.
|
||||
The **execType** column of the table includes inference primitives with specific suffixes.
|
||||
|
||||
## Bfloat16 Inference Usage (Python)
|
||||
|
||||
@sphinxdirective
|
||||
.. raw:: html
|
||||
|
||||
<div id="switcher-python" class="switcher-anchor">Python</div>
|
||||
@endsphinxdirective
|
||||
|
||||
### Disclaimer
|
||||
|
||||
The bfloat16 inference in the Inference Engine is implemented on CPU, and the CPU must support the native *avx512_bf16* instruction and therefore the bfloat16 data format. It is possible to use bfloat16 inference in simulation mode on platforms with Intel® Advanced Vector Extensions 512 (Intel® AVX-512), but it leads to significant performance degradation in comparison with FP32 or native *avx512_bf16* instruction usage.
|
||||
|
||||
### Introduction
|
||||
Bfloat16 computation (referred to as BF16) uses the Brain Floating-Point format with 16 bits. This is a truncated 16-bit version of the 32-bit IEEE 754 single-precision floating-point format FP32. BF16 preserves the same 8 exponent bits as FP32 but reduces the precision of the sign and mantissa from 24 bits to 8 bits.
|
||||
|
||||
![bf16_format]
|
||||
|
||||
Preserving the exponent bits keeps BF16 in the same range as FP32 (~1e-38 to ~3e38). This simplifies conversion between the two data types: you just need to skip or flush to zero the 16 low bits. The truncated mantissa occasionally leads to less precision, but according to investigations, neural networks are more sensitive to the size of the exponent than to the mantissa size. Also, in lots of models, precision is needed close to zero but not so much at the maximum range. Another useful feature of BF16 is the possibility to encode INT8 in BF16 without loss of accuracy, because the INT8 range completely fits in the BF16 mantissa field. This reduces the data flow in conversion from INT8 input image data to BF16 directly, without an intermediate representation in FP32, or in combination of [INT8 inference](Int8Inference.md) and BF16 layers.
|
||||
|
||||
See the [BFLOAT16 – Hardware Numerics Definition white paper](https://software.intel.com/content/dam/develop/external/us/en/documents/bf16-hardware-numerics-definition-white-paper.pdf) for more bfloat16 format details.
|
||||
|
||||
There are two ways to check if CPU device can support bfloat16 computations for models:
|
||||
|
||||
1. Query the instruction set using one of these system commands:
|
||||
* `lscpu | grep avx512_bf16`
|
||||
* `cat /proc/cpuinfo | grep avx512_bf16`
|
||||
2. Use the Query API with METRIC_KEY(OPTIMIZATION_CAPABILITIES), which should return BF16 in the list of CPU optimization options:
|
||||
|
||||
```python
|
||||
from openvino.inference_engine import IECore
|
||||
|
||||
ie = IECore()
|
||||
net = ie.read_network(path_to_xml_file)
|
||||
cpu_caps = ie.get_metric(metric_name="OPTIMIZATION_CAPABILITIES", device_name="CPU")
|
||||
```
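
For instance, a minimal follow-up check on the list returned above could look like this (a sketch only; `cpu_caps` comes from the previous snippet):

```python
# The metric returns a list of capability strings; "BF16" indicates that
# bfloat16 execution is supported on this CPU.
if "BF16" in cpu_caps:
    print("BF16 execution is supported on this CPU")
```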
|
||||
|
||||
The current Inference Engine solution for bfloat16 inference uses the Intel® Math Kernel Library for Deep Neural Networks (Intel® MKL-DNN) and supports inference of a significant number of layers in BF16 computation mode.
|
||||
|
||||
### Lowering Inference Precision
|
||||
|
||||
Lowering precision to increase performance is widely used for optimization of inference. The bfloat16 data type usage on CPU for the first time opens the possibility of a default optimization approach: use the optimization capabilities of the current platform to achieve maximum performance while maintaining the accuracy of calculations within the acceptable range.
|
||||
|
||||
Using Bfloat16 precision provides the following performance benefits:
|
||||
|
||||
1. Faster multiplication of two BF16 numbers because of the shorter mantissa of the bfloat16 data.
|
||||
2. No need to support denormals and handle exceptions, as this is a performance optimization.
|
||||
3. Fast conversion of float32 to bfloat16 and vice versa.
|
||||
4. Reduced size of data in memory; as a result, larger models fit in the same memory bounds.
|
||||
5. Reduced amount of data that must be transferred; as a result, reduced data transfer time.
|
||||
|
||||
For default optimization on CPU, the source model is converted from FP32 or FP16 to BF16 and executed internally on platforms with native BF16 support. In this case, ENFORCE_BF16 is set to YES. The code below demonstrates how to check if the key is set:
|
||||
|
||||
```python
|
||||
from openvino.inference_engine import IECore
|
||||
|
||||
ie = IECore()
|
||||
net = ie.read_network(path_to_xml_file)
|
||||
exec_net = ie.load_network(network=net, device_name="CPU")
|
||||
exec_net.get_config("ENFORCE_BF16")
|
||||
```
|
||||
|
||||
To enable BF16 internal transformations, set the key "ENFORCE_BF16" to "YES" in the ExecutableNetwork configuration.
|
||||
|
||||
```python
|
||||
bf16_config = {"ENFORCE_BF16" : "YES"}
|
||||
exec_net = ie.load_network(network=net, device_name="CPU", config = bf16_config)
|
||||
```
|
||||
|
||||
To disable BF16 internal transformations, set the key "ENFORCE_BF16" to "NO". In this case, the model is inferred as is, without modifications, using the precisions that were set on each layer edge.
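
A minimal sketch of the corresponding configuration, reusing the `ie` and `net` objects from the snippets above:

```python
# Keep the original per-layer precisions by turning the BF16 transformations off.
bf16_config = {"ENFORCE_BF16": "NO"}
exec_net = ie.load_network(network=net, device_name="CPU", config=bf16_config)
```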
|
||||
|
||||
An exception with the message `Platform doesn't support BF16 format` is thrown if "ENFORCE_BF16" is set to "YES" on a CPU without native BF16 support or BF16 simulation mode.
|
||||
|
||||
Low-Precision 8-bit integer models cannot be converted to BF16, even if bfloat16 optimization is set by default.
|
||||
|
||||
### Bfloat16 Simulation Mode
|
||||
|
||||
Bfloat16 simulation mode is available on CPU and Intel® AVX-512 platforms that do not support the native avx512_bf16 instruction. The simulator does not guarantee good performance. Note that the CPU must still support the AVX-512 extensions.
|
||||
|
||||
#### To Enable the Simulation of Bfloat16
|
||||
|
||||
* In the Benchmark App, add the `-enforcebf16=true` option
|
||||
* In Python, use the following code as an example:
|
||||
|
||||
```python
|
||||
from openvino.inference_engine import IECore
|
||||
|
||||
ie = IECore()
|
||||
net = ie.read_network(path_to_xml_file)
|
||||
bf16_config = {"ENFORCE_BF16" : "YES"}
|
||||
exec_net = ie.load_network(network=net, device_name="CPU", config=bf16_config)
|
||||
```
|
||||
|
||||
### Performance Counters
|
||||
|
||||
Information about layer precision is stored in the performance counters that are available from the Inference Engine API. The layers have the following marks:
|
||||
|
||||
* Suffix *BF16* for layers that had bfloat16 data type input and were computed in BF16 precision
|
||||
* Suffix *FP32* for layers computed in 32-bit precision
|
||||
|
||||
For example, the performance counters table for the Inception model can look as follows:
|
||||
|
||||
```
|
||||
pool5 EXECUTED layerType: Pooling realTime: 143 cpu: 143 execType: jit_avx512_BF16
|
||||
fc6 EXECUTED layerType: FullyConnected realTime: 47723 cpu: 47723 execType: jit_gemm_BF16
|
||||
relu6 NOT_RUN layerType: ReLU realTime: 0 cpu: 0 execType: undef
|
||||
fc7 EXECUTED layerType: FullyConnected realTime: 7558 cpu: 7558 execType: jit_gemm_BF16
|
||||
relu7 NOT_RUN layerType: ReLU realTime: 0 cpu: 0 execType: undef
|
||||
fc8 EXECUTED layerType: FullyConnected realTime: 2193 cpu: 2193 execType: jit_gemm_BF16
|
||||
prob EXECUTED layerType: SoftMax realTime: 68 cpu: 68 execType: jit_avx512_FP32
|
||||
```
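
A short sketch of how such a table can be collected in Python is shown below. It assumes the `exec_net` object from the earlier snippets, that at least one inference has already been run, and that per-layer statistics are exposed through `get_perf_counts()`; the exact field names may vary between releases:

```python
# Hypothetical sketch: print the execution precision of each layer after inference.
perf_counts = exec_net.requests[0].get_perf_counts()
for layer_name, stats in perf_counts.items():
    # The exec_type suffix (for example, jit_avx512_BF16) reveals the precision used.
    print(layer_name, stats["status"], stats["layer_type"], stats["exec_type"])
```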
|
||||
|
||||
|
||||
The **execType** column of the table includes inference primitives with specific suffixes.
|
||||
|
||||
[bf16_format]: img/bf16_format.png
|
||||
|
||||
@@ -1,121 +1,52 @@
|
||||
# Inference Engine Developer Guide {#openvino_docs_IE_DG_Deep_Learning_Inference_Engine_DevGuide}
|
||||
|
||||
> **NOTE:** [Intel® System Studio](https://software.intel.com/content/www/us/en/develop/tools/oneapi/commercial-base-iot.html) (click "Intel® System Studio Users" tab) is an all-in-one, cross-platform tool suite, purpose-built to simplify system bring-up and improve system and IoT device application performance on Intel® platforms. If you are using the Intel® Distribution of OpenVINO™ with Intel® System Studio, go to [Get Started with Intel® System Studio](https://software.intel.com/en-us/articles/get-started-with-openvino-and-intel-system-studio-2019).
|
||||
@sphinxdirective
|
||||
|
||||
This Guide provides an overview of the Inference Engine describing the typical workflow for performing inference of a pre-trained and optimized deep learning model and a set of sample applications.
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:hidden:
|
||||
|
||||
openvino_docs_IE_DG_Integrate_with_customer_application_new_API
|
||||
openvino_docs_deployment_optimization_guide_dldt_optimization_guide
|
||||
openvino_docs_IE_DG_Device_Plugins
|
||||
Direct ONNX Format Support <openvino_docs_IE_DG_ONNX_Support>
|
||||
openvino_docs_IE_DG_Int8Inference
|
||||
openvino_docs_IE_DG_Bfloat16Inference
|
||||
openvino_docs_IE_DG_DynamicBatching
|
||||
openvino_docs_IE_DG_ShapeInference
|
||||
openvino_docs_IE_DG_Model_caching_overview
|
||||
openvino_docs_IE_DG_Extensibility_DG_Intro
|
||||
openvino_docs_IE_DG_Memory_primitives
|
||||
openvino_docs_IE_DG_network_state_intro
|
||||
openvino_docs_IE_DG_API_Changes
|
||||
openvino_docs_IE_DG_Known_Issues_Limitations
|
||||
openvino_docs_IE_DG_Glossary
|
||||
|
||||
@endsphinxdirective
|
||||
|
||||
> **NOTE:** Before you perform inference with the Inference Engine, your models should be converted to the Inference Engine format using the Model Optimizer or built directly in runtime using nGraph API. To learn about how to use Model Optimizer, refer to the [Model Optimizer Developer Guide](../MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md). To learn about the pre-trained and optimized models delivered with the OpenVINO™ toolkit, refer to [Pre-Trained Models](@ref omz_models_group_intel).
|
||||
## Introduction
|
||||
Inference Engine is a set of C++ libraries with C and Python bindings providing a common API to deliver inference solutions on the platform of your choice. Use the Inference Engine API to read the Intermediate Representation (IR) or ONNX model and execute it on devices.
|
||||
|
||||
After you have used the Model Optimizer to create an Intermediate Representation (IR), use the Inference Engine to infer the result for a given input data.
|
||||
Inference Engine uses a plugin architecture. Inference Engine plugin is a software component that contains complete implementation for inference on a certain Intel® hardware device: CPU, GPU, VPU, etc. Each plugin implements the unified API and provides additional hardware-specific APIs.
|
||||
|
||||
The scheme below illustrates the typical workflow for deploying a trained deep learning model:
|
||||
|
||||
Inference Engine is a set of C++ libraries providing a common API to deliver inference solutions on the platform of your choice: CPU, GPU, or VPU. Use the Inference Engine API to read the Intermediate Representation, set the input and output formats, and execute the model on devices. While the C++ libraries are the primary implementation, C libraries and Python bindings are also available.
|
||||

|
||||
|
||||
For Intel® Distribution of OpenVINO™ toolkit, Inference Engine binaries are delivered within release packages.
|
||||
\\* _nGraph_ is the internal graph representation in the OpenVINO™ toolkit. Use it to [build a model from source code](https://docs.openvinotoolkit.org/latest/openvino_docs_nGraph_DG_build_function.html).
|
||||
|
||||
The open source version is available in the [OpenVINO™ toolkit GitHub repository](https://github.com/openvinotoolkit/openvino) and can be built for supported platforms using the <a href="https://github.com/openvinotoolkit/openvino/wiki/BuildingCode">Inference Engine Build Instructions</a>.
|
||||
|
||||
To learn about how to use the Inference Engine API for your application, see the [Integrating Inference Engine in Your Application](Integrate_with_customer_application_new_API.md) documentation.
|
||||
## Video
|
||||
|
||||
For complete API Reference, see the [Inference Engine API References](./api_references.html) section.
|
||||
@sphinxdirective
|
||||
|
||||
Inference Engine uses a plugin architecture. Inference Engine plugin is a software component that contains complete implementation for inference on a certain Intel® hardware device: CPU, GPU, VPU, etc. Each plugin implements the unified API and provides additional hardware-specific APIs.
|
||||
.. list-table::
|
||||
|
||||
## Modules in the Inference Engine component
|
||||
### Core Inference Engine Libraries
|
||||
* - .. raw:: html
|
||||
|
||||
Your application must link to the core Inference Engine libraries:
|
||||
* Linux* OS:
|
||||
- `libinference_engine.so`, which depends on `libinference_engine_transformations.so`, `libtbb.so`, `libtbbmalloc.so` and `libngraph.so`
|
||||
* Windows* OS:
|
||||
- `inference_engine.dll`, which depends on `inference_engine_transformations.dll`, `tbb.dll`, `tbbmalloc.dll` and `ngraph.dll`
|
||||
* macOS*:
|
||||
- `libinference_engine.dylib`, which depends on `libinference_engine_transformations.dylib`, `libtbb.dylib`, `libtbbmalloc.dylib` and `libngraph.dylib`
|
||||
|
||||
The required C++ header files are located in the `include` directory.
|
||||
|
||||
This library contains the classes to:
|
||||
* Create Inference Engine Core object to work with devices and read network (InferenceEngine::Core)
|
||||
* Manipulate network information (InferenceEngine::CNNNetwork)
|
||||
* Execute and pass inputs and outputs (InferenceEngine::ExecutableNetwork and InferenceEngine::InferRequest)
|
||||
|
||||
### Plugin Libraries to Read a Network Object
|
||||
|
||||
Starting from the 2020.4 release, Inference Engine introduced the concept of `CNNNetwork` reader plugins. Such plugins are loaded dynamically by the Inference Engine at runtime, depending on the file format:
|
||||
* Linux* OS:
|
||||
- `libinference_engine_ir_reader.so` to read a network from IR
|
||||
- `libinference_engine_onnx_reader.so` to read a network from ONNX model format
|
||||
* Windows* OS:
|
||||
- `inference_engine_ir_reader.dll` to read a network from IR
|
||||
- `inference_engine_onnx_reader.dll` to read a network from ONNX model format
|
||||
|
||||
### Device-Specific Plugin Libraries
|
||||
|
||||
For each supported target device, Inference Engine provides a plugin — a DLL/shared library that contains complete implementation for inference on this particular device. The following plugins are available:
|
||||
|
||||
| Plugin | Device Type |
|
||||
| ------- | ----------------------------- |
|
||||
|CPU | Intel® Xeon® with Intel® AVX2 and AVX512, Intel® Core™ Processors with Intel® AVX2, Intel® Atom® Processors with Intel® SSE |
|
||||
|GPU | Intel® Processor Graphics, including Intel® HD Graphics and Intel® Iris® Graphics |
|
||||
|MYRIAD | Intel® Neural Compute Stick 2 powered by the Intel® Movidius™ Myriad™ X |
|
||||
|GNA | Intel® Speech Enabling Developer Kit, Amazon Alexa* Premium Far-Field Developer Kit, Intel® Pentium® Silver J5005 Processor, Intel® Pentium® Silver N5000 Processor, Intel® Celeron® J4005 Processor, Intel® Celeron® J4105 Processor, Intel® Celeron® Processor N4100, Intel® Celeron® Processor N4000, Intel® Core™ i3-8121U Processor, Intel® Core™ i7-1065G7 Processor, Intel® Core™ i7-1060G7 Processor, Intel® Core™ i5-1035G4 Processor, Intel® Core™ i5-1035G7 Processor, Intel® Core™ i5-1035G1 Processor, Intel® Core™ i5-1030G7 Processor, Intel® Core™ i5-1030G4 Processor, Intel® Core™ i3-1005G1 Processor, Intel® Core™ i3-1000G1 Processor, Intel® Core™ i3-1000G4 Processor |
|
||||
|HETERO | Automatic splitting of a network inference between several devices (for example, if a device doesn't support certain layers)|
|
||||
|MULTI | Simultaneous inference of the same network on several devices in parallel|
|
||||
|
||||
The table below shows the plugin libraries and additional dependencies for Linux, Windows and macOS platforms.
|
||||
|
||||
| Plugin | Library name for Linux | Dependency libraries for Linux | Library name for Windows | Dependency libraries for Windows | Library name for macOS | Dependency libraries for macOS |
|
||||
|--------|-----------------------------|-------------------------------------------------------------|--------------------------|--------------------------------------------------------------------------------------------------------|------------------------------|---------------------------------------------|
|
||||
| CPU | `libMKLDNNPlugin.so` | `libinference_engine_lp_transformations.so` | `MKLDNNPlugin.dll` | `inference_engine_lp_transformations.dll` | `libMKLDNNPlugin.so` | `inference_engine_lp_transformations.dylib` |
|
||||
| GPU | `libclDNNPlugin.so` | `libinference_engine_lp_transformations.so`, `libOpenCL.so` | `clDNNPlugin.dll` | `OpenCL.dll`, `inference_engine_lp_transformations.dll` | Is not supported | - |
|
||||
| MYRIAD | `libmyriadPlugin.so` | `libusb.so`, | `myriadPlugin.dll` | `usb.dll` | `libmyriadPlugin.so` | `libusb.dylib` |
|
||||
| HDDL | `libHDDLPlugin.so` | `libbsl.so`, `libhddlapi.so`, `libmvnc-hddl.so` | `HDDLPlugin.dll` | `bsl.dll`, `hddlapi.dll`, `json-c.dll`, `libcrypto-1_1-x64.dll`, `libssl-1_1-x64.dll`, `mvnc-hddl.dll` | Is not supported | - |
|
||||
| GNA | `libGNAPlugin.so` | `libgna.so`, | `GNAPlugin.dll` | `gna.dll` | Is not supported | - |
|
||||
| HETERO | `libHeteroPlugin.so` | Same as for selected plugins | `HeteroPlugin.dll` | Same as for selected plugins | `libHeteroPlugin.so` | Same as for selected plugins |
|
||||
| MULTI | `libMultiDevicePlugin.so` | Same as for selected plugins | `MultiDevicePlugin.dll` | Same as for selected plugins | `libMultiDevicePlugin.so` | Same as for selected plugins |
|
||||
|
||||
> **NOTE**: All plugin libraries also depend on core Inference Engine libraries.
|
||||
|
||||
Make sure those libraries are in your computer's path or in the place you pointed to in the plugin loader. Make sure each plugin's related dependencies are in:
|
||||
|
||||
* Linux: `LD_LIBRARY_PATH`
|
||||
* Windows: `PATH`
|
||||
* macOS: `DYLD_LIBRARY_PATH`
|
||||
|
||||
On Linux and macOS, use the script `bin/setupvars.sh` to set the environment variables.
|
||||
|
||||
On Windows, run the `bin\setupvars.bat` batch file to set the environment variables.
|
||||
|
||||
To learn more about supported devices and corresponding plugins, see the [Supported Devices](supported_plugins/Supported_Devices.md) chapter.
|
||||
|
||||
## Common Workflow for Using the Inference Engine API
|
||||
|
||||
The common workflow contains the following steps:
|
||||
|
||||
1. **Create Inference Engine Core object** - Create an `InferenceEngine::Core` object to work with different devices, all device plugins are managed internally by the `Core` object. Register extensions with custom nGraph operations (`InferenceEngine::Core::AddExtension`).
|
||||
|
||||
2. **Read the Intermediate Representation** - Using the `InferenceEngine::Core` class, read an Intermediate Representation file into an object of the `InferenceEngine::CNNNetwork` class. This class represents the network in the host memory.
|
||||
|
||||
3. **Prepare inputs and outputs format** - After loading the network, specify the input and output precision and layout on the network. For these specifications, use `InferenceEngine::CNNNetwork::getInputsInfo()` and `InferenceEngine::CNNNetwork::getOutputsInfo()`.
|
||||
|
||||
4. Pass per device loading configurations specific to this device (`InferenceEngine::Core::SetConfig`), and register extensions to this device (`InferenceEngine::Core::AddExtension`).
|
||||
|
||||
5. **Compile and Load Network to device** - Use the `InferenceEngine::Core::LoadNetwork()` method with specific device (e.g. `CPU`, `GPU`, etc.) to compile and load the network on the device. Pass in the per-target load configuration for this compilation and load operation.
|
||||
|
||||
6. **Set input data** - With the network loaded, you have an `InferenceEngine::ExecutableNetwork` object. Use this object to create an `InferenceEngine::InferRequest` in which you signal the input buffers to use for input and output. Specify a device-allocated memory and copy it into the device memory directly, or tell the device to use your application memory to save a copy.
|
||||
|
||||
7. **Execute** - With the input and output memory now defined, choose your execution mode:
|
||||
|
||||
* Synchronously - `InferenceEngine::InferRequest::Infer()` method. Blocks until inference is completed.
|
||||
* Asynchronously - `InferenceEngine::InferRequest::StartAsync()` method. Check status with the `InferenceEngine::InferRequest::Wait()` method (0 timeout), wait, or specify a completion callback.
|
||||
|
||||
8. **Get the output** - After inference is completed, get the output memory or read the memory you provided earlier. Do this with the `InferenceEngine::IInferRequest::GetBlob()` method.
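
The same workflow can be sketched in Python with the `openvino.inference_engine` bindings. This is only an illustration of the steps above, not a replacement for the samples; the model file names below are placeholders:

```python
from openvino.inference_engine import IECore
import numpy as np

# Steps 1-2: create the Core object and read an IR into a network object.
ie = IECore()
net = ie.read_network(model="model.xml", weights="model.bin")

# Step 3: inspect input/output info to prepare data in the expected layout.
input_name = next(iter(net.input_info))
output_name = next(iter(net.outputs))

# Steps 4-5: pass any device configuration and compile the network for a device.
exec_net = ie.load_network(network=net, device_name="CPU")

# Steps 6-7: set the input data and run inference synchronously.
dummy_input = np.zeros(net.input_info[input_name].input_data.shape, dtype=np.float32)
results = exec_net.infer(inputs={input_name: dummy_input})

# Step 8: read the output blob.
print(results[output_name].shape)
```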
|
||||
|
||||
## Video: Inference Engine Concept
|
||||
[](https://www.youtube.com/watch?v=e6R13V8nbak)
|
||||
\htmlonly
|
||||
<iframe width="560" height="315" src="https://www.youtube.com/embed/e6R13V8nbak" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
|
||||
\endhtmlonly
|
||||
|
||||
## Further Reading
|
||||
|
||||
For more details on the Inference Engine API, refer to the [Integrating Inference Engine in Your Application](Integrate_with_customer_application_new_API.md) documentation.
|
||||
<iframe height="315" width="100%"
|
||||
src="https://www.youtube.com/embed/e6R13V8nbak">
|
||||
</iframe>
|
||||
* - **Inference Engine Concept**. Duration: 3:43
|
||||
|
||||
@endsphinxdirective
|
||||
@@ -1,52 +1,106 @@
|
||||
Using Dynamic Batching {#openvino_docs_IE_DG_DynamicBatching}
|
||||
======================
|
||||
# Using Dynamic Batching {#openvino_docs_IE_DG_DynamicBatching}
|
||||
|
||||
Dynamic Batching feature allows you to dynamically change batch size for inference calls
|
||||
within preset batch size limit.
|
||||
This feature might be useful when batch size is unknown beforehand, and using extra large batch size is
|
||||
undesired or impossible due to resource limitations.
|
||||
For example, face detection with person age, gender, or mood recognition is a typical usage scenario.
|
||||
## Using Dynamic Batching (C++)
|
||||
|
||||
@sphinxdirective
|
||||
.. raw:: html
|
||||
|
||||
<div id="switcher-cpp" class="switcher-anchor">C++</div>
|
||||
@endsphinxdirective
|
||||
|
||||
The Dynamic Batching feature allows you to dynamically change batch size for inference calls
|
||||
within a preset batch size limit. This feature might be useful when batch size is unknown beforehand and using an extra-large batch size is undesirable or impossible due to resource limitations. For example, when applying face detection and then mood labeling to a video, you won't know in advance how many frames will contain a face when you pass the inferencing results to a secondary model.
|
||||
|
||||
|
||||
## Usage
|
||||
|
||||
You can activate Dynamic Batching by setting <code>KEY_DYN_BATCH_ENABLED</code> flag to <code>YES</code> in a configuration map that is
|
||||
You can activate Dynamic Batching by setting `KEY_DYN_BATCH_ENABLED` flag to `YES` in a configuration map that is
|
||||
passed to the plugin while loading a network.
|
||||
This configuration creates an <code>ExecutableNetwork</code> object that will allow setting batch size
|
||||
dynamically in all of its infer requests using <code>SetBatch()</code> method.
|
||||
The batch size that was set in passed <code>CNNNetwork</code> object will be used as a maximum batch size limit.
|
||||
This configuration creates an `ExecutableNetwork` object that will allow setting batch size
|
||||
dynamically in all of its infer requests using `SetBatch()` method.
|
||||
The batch size that was set in the passed `CNNNetwork` object will be used as a maximum batch size limit.
|
||||
|
||||
Here is a code example:
|
||||
|
||||
@snippet snippets/DynamicBatching.cpp part0
|
||||
|
||||
|
||||
## Limitations
|
||||
### Limitations
|
||||
|
||||
Currently, certain limitations for using Dynamic Batching exist:
|
||||
Currently, certain limitations exist for the use of Dynamic Batching:
|
||||
|
||||
* Use Dynamic Batching with CPU and GPU plugins only.
|
||||
|
||||
* Use Dynamic Batching on topologies that consist of certain layers only:
|
||||
* Convolution
|
||||
* Deconvolution
|
||||
* Activation
|
||||
* LRN
|
||||
* Pooling
|
||||
* FullyConnected
|
||||
* SoftMax
|
||||
* Split
|
||||
* Concatenation
|
||||
* Power
|
||||
* Eltwise
|
||||
* Crop
|
||||
* BatchNormalization
|
||||
* Copy
|
||||
|
||||
* Convolution
|
||||
* Deconvolution
|
||||
* Activation
|
||||
* LRN
|
||||
* Pooling
|
||||
* FullyConnected
|
||||
* SoftMax
|
||||
* Split
|
||||
* Concatenation
|
||||
* Power
|
||||
* Eltwise
|
||||
* Crop
|
||||
* BatchNormalization
|
||||
* Copy
|
||||
|
||||
Do not use layers that might arbitrary change tensor shape (such as Flatten, Permute, Reshape),
|
||||
layers specific to object detection topologies (ROIPooling, ProirBox, DetectionOutput), and
|
||||
custom layers.
|
||||
Topology analysis is performed during the process of loading a network into plugin, and if topology is
|
||||
not applicable, an exception is generated.
|
||||
The following types of layers are not supported:
|
||||
|
||||
* Layers that might arbitrarily change tensor shape (such as Flatten, Permute, Reshape)
|
||||
* Layers specific to object detection topologies (ROIPooling, PriorBox, DetectionOutput)
|
||||
* Custom layers
|
||||
|
||||
Topology analysis is performed during the process of loading a network into plugin, and if the topology is not supported, an exception is generated.
|
||||
|
||||
## Using Dynamic Batching (Python)
|
||||
|
||||
@sphinxdirective
|
||||
.. raw:: html
|
||||
|
||||
<div id="switcher-python" class="switcher-anchor">Python</div>
|
||||
@endsphinxdirective
|
||||
|
||||
Dynamic Batching is a feature that allows you to dynamically change batch size for inference calls within a preset batch size limit. This feature might be useful when batch size is unknown beforehand, and using an extra-large batch size is undesirable or impossible due to resource limitations. For example, face detection with person age, gender, or mood recognition is a typical usage scenario.
|
||||
|
||||
You can activate Dynamic Batching by setting the "DYN_BATCH_ENABLED" flag to "YES" in a configuration map that is passed to the plugin while loading a network. This configuration creates an `ExecutableNetwork` object that will allow setting batch size dynamically in all of its infer requests using the [ie_api.batch_size](api/ie_python_api/_autosummary/openvino.inference_engine.IENetwork.html#openvino.inference_engine.IENetwork.batch_size) method. The batch size that was set in the passed CNNNetwork object will be used as a maximum batch size limit.
|
||||
|
||||
```python
|
||||
from openvino.inference_engine import IECore
|
||||
|
||||
ie = IECore()
|
||||
dyn_config = {"DYN_BATCH_ENABLED": "YES"}
|
||||
ie.set_config(config=dyn_config, device_name=device)
|
||||
# Read a network in IR or ONNX format
|
||||
net = ie.read_network(path_to_model)
|
||||
net.batch_size = 32 # set the maximum batch size to 32
|
||||
exec_net = ie.load_network(network=net, device_name=device)
|
||||
```
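
After the network is loaded this way, the batch for a particular inference call can be reduced below the configured maximum. The sketch below assumes that the Python `InferRequest` exposes a `set_batch()` helper mirroring the C++ `SetBatch()`, and that `input_name` and the input shape are placeholders for your own model:

```python
import numpy as np

request = exec_net.requests[0]

# The input blob is allocated for the maximum batch size (32 above), so prepare a
# full-size array and fill only the first 4 entries with real data.
data = np.zeros((32, 3, 224, 224), dtype=np.float32)  # placeholder input shape
data[:4] = 1.0  # stands in for 4 preprocessed frames

request.set_batch(4)               # only the first 4 entries are processed
request.infer({input_name: data})
```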
|
||||
|
||||
### Limitations
|
||||
|
||||
Currently, certain limitations for the use of Dynamic Batching exist:
|
||||
|
||||
* Use Dynamic Batching with CPU and GPU plugins only.
|
||||
* Use Dynamic Batching on topologies that consist of certain layers only:
|
||||
* Convolution
|
||||
* Deconvolution
|
||||
* Activation
|
||||
* LRN
|
||||
* Pooling
|
||||
* FullyConnected
|
||||
* SoftMax
|
||||
* Split
|
||||
* Concatenation
|
||||
* Power
|
||||
* Eltwise
|
||||
* Crop
|
||||
* BatchNormalization
|
||||
* Copy
|
||||
|
||||
The following types of layers are not supported:
|
||||
|
||||
* Layers that might arbitrarily change tensor shape (such as Flatten, Permute, Reshape)
|
||||
* Layers specific to object detection topologies (ROIPooling, PriorBox, DetectionOutput)
|
||||
* Custom layers
|
||||
|
||||
Topology analysis is performed during the process of loading a network into plugin, and if the topology is not supported, an exception is generated.
|
||||
@@ -1,7 +1,9 @@
|
||||
# Custom nGraph Operation {#openvino_docs_IE_DG_Extensibility_DG_AddingNGraphOps}
|
||||
# Custom nGraph Operations {#openvino_docs_IE_DG_Extensibility_DG_AddingNGraphOps}
|
||||
|
||||
Inference Engine Extension API allows you to register operation sets (opsets) with custom nGraph operations to support models with operations which OpenVINO™ does not support out-of-the-box.
|
||||
|
||||
Besides creating custom nGraph operations, to [support custom operations](../../HOWTO/Custom_Layers_Guide.md) in your model you must also create a Model Optimizer extension for the custom operations and an Inference Engine device plugin extension for the device you will use for inference.
|
||||
|
||||
## Operation Class
|
||||
|
||||
To add your custom nGraph operation, create a new class that extends `ngraph::Op`, which is in turn derived from `ngraph::Node`, the base class for all graph operations in nGraph. Follow the steps below to add a custom nGraph operation:
|
||||
@@ -26,8 +28,8 @@ Based on that, declaration of an operation class can look as follows:
|
||||
|
||||
The provided implementation has several fields:
|
||||
|
||||
* `add` of type `int64_t` is an attribute of a custom operation.
|
||||
* `type_info` of type `ngraph::NodeTypeInfo` defines the type and version of an operation.
|
||||
* `add` of type `int64_t` is an attribute of a custom operation
|
||||
* `type_info` of type `ngraph::NodeTypeInfo` defines type and version of an operation
|
||||
|
||||
### Operation Constructors
|
||||
|
||||
@@ -67,14 +69,13 @@ To add custom operations to the [Extension](Extension.md) class, create an opera
|
||||
|
||||
@snippet template_extension/extension.cpp extension:getOpSets
|
||||
|
||||
This method returns a map of opsets that exist in the extension library.
|
||||
|
||||
nGraph provides an opset mechanism to group operations into clusters. S. Different opsets distinguish between different versions of one operation.
|
||||
This method returns a map of opsets that exist in the [extension library](Extension.md).
|
||||
nGraph provides an opset mechanism to group operations into clusters. Different opsets distinguish between different versions of one operation.
|
||||
|
||||
When specifying opset names, follow the rules below:
|
||||
* Use unique opset names.
|
||||
* Do not use the following built-in opset names: `extension`, `experimental`, `opset1`, `opset2`, `opset3`, ... , `opsetN`.
|
||||
* Make sure that the Model Optimizer and your extension use the same opset names.
|
||||
* [Make sure that the Model Optimizer](../../HOWTO/Custom_Layers_Guide.md) and your extension use the same opset names.
|
||||
* IR v10 operations have the mandatory `version` attribute specifying the opset.
|
||||
Operations from the default opset cannot be redefined.
|
||||
|
||||
|
||||
@@ -2,13 +2,13 @@
|
||||
|
||||
Inference Engine build infrastructure provides the Inference Engine Package for application development.
|
||||
|
||||
To build an extension library, use the following CMake script:
|
||||
To configure the build of your extension library, use the following CMake script:
|
||||
|
||||
@snippet template_extension/CMakeLists.txt cmake:extension
|
||||
|
||||
This CMake script finds the Inference Engine and nGraph using the `find_package` CMake command.
|
||||
|
||||
To build an extension library, run the commands below:
|
||||
To build the extension library, run the commands below:
|
||||
|
||||
```sh
|
||||
$ cd template_extension
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
# How to Implement Custom CPU Operations {#openvino_docs_IE_DG_Extensibility_DG_CPU_Kernel}
|
||||
# CPU Kernel Custom Operations {#openvino_docs_IE_DG_Extensibility_DG_CPU_Kernel}
|
||||
|
||||
The primary means of the performance of the CPU codepath in the Inference Engine is the Intel® Math Kernel Library for Deep Neural Networks (Intel® MKL-DNN), and new CPU kernels extend the Inference Engine plugin for the Intel MKL-DNN. Implementing the InferenceEngine::ILayerExecImpl defines a general CPU-side extension. There are no Intel MKL-DNN specifics in the way you need to implement a kernel.
|
||||
To enable operations not supported by OpenVINO™ out of the box, you need a custom extension for Model Optimizer, a custom nGraph operation set, and a custom kernel for the device you will target. This page describes custom kernel support for the CPU device.
|
||||
|
||||
The performance of the CPU codepath in the Inference Engine relies primarily on the Intel® Math Kernel Library for Deep Neural Networks (Intel® MKL-DNN), and new CPU kernels extend the Inference Engine plugin for the Intel MKL-DNN. Implementing the InferenceEngine::ILayerExecImpl interface defines a general CPU-side extension. There are no Intel MKL-DNN specifics in the way you need to implement a kernel.
|
||||
|
||||
## Implementation Class
|
||||
|
||||
@@ -20,31 +22,32 @@ The provided implementation has several fields:
|
||||
|
||||
### Constructor of Implementation
|
||||
|
||||
An implementation constructor checks parameters of an nGraph operation, stores required attributes, and stores an error message in the case of an error.
|
||||
An implementation constructor checks parameters of an nGraph operation, stores required attributes, and stores an error message in case of an error.
|
||||
|
||||
@snippet template_extension/cpu_kernel.cpp cpu_implementation:ctor
|
||||
|
||||
### `getSupportedConfigurations`
|
||||
|
||||
InferenceEngine::ILayerExecImpl::getSupportedConfigurations method returns all supported configuration formats (input/output tensor layouts) for your implementation. To specify formats of data, use InferenceEngine::TensorDesc. Refer to the [Memory Primitives](../Memory_primitives.md) section for instructions.
|
||||
The InferenceEngine::ILayerExecImpl::getSupportedConfigurations method returns all supported configuration formats (input/output tensor layouts) for your implementation. To specify formats of data, use InferenceEngine::TensorDesc. Refer to the [Memory Primitives](../Memory_primitives.md) section for instructions.
|
||||
|
||||
@snippet template_extension/cpu_kernel.cpp cpu_implementation:getSupportedConfigurations
|
||||
|
||||
### `init`
|
||||
|
||||
InferenceEngine::ILayerExecImpl::init method gets a runtime-selected configuration from a vector that is populated from the `getSupportedConfigurations` method and checks the parameters:
|
||||
The InferenceEngine::ILayerExecImpl::init method gets a runtime-selected configuration from a vector that is populated from the `getSupportedConfigurations` method and checks the parameters:
|
||||
|
||||
@snippet template_extension/cpu_kernel.cpp cpu_implementation:init
|
||||
|
||||
### `execute`
|
||||
|
||||
InferenceEngine::ILayerExecImpl::execute method accepts and processes the actual tenors as input/output blobs:
|
||||
The InferenceEngine::ILayerExecImpl::execute method accepts and processes the actual tensors as input/output blobs:
|
||||
|
||||
@snippet template_extension/cpu_kernel.cpp cpu_implementation:execute
|
||||
|
||||
## Register Implementation in `Extension` Class
|
||||
|
||||
To register custom kernel implementation in the [Extension](Extension.md) class, implement the following methods:
|
||||
|
||||
* <a href="#getImpTypes">getImplTypes</a>
|
||||
* <a href="#getImplementation">getImplementation</a>
|
||||
|
||||
@@ -66,4 +69,3 @@ InferenceEngine::IExtension::getImplementation returns the kernel implementation
|
||||
Use the `AddExtension` method of the general plugin interface to load your primitives:
|
||||
|
||||
@snippet snippets/CPU_Kernel.cpp part0
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
# Custom ONNX* Operators {#openvino_docs_IE_DG_Extensibility_DG_Custom_ONNX_Ops}
|
||||
|
||||
The ONNX\* importer provides a mechanism to register custom ONNX operators based on predefined or custom nGraph operations.
|
||||
The function responsible for registering a new operator is called `ngraph::onnx_import::register_operator` and is defined in `onnx_import/onnx_utils.hpp`.
|
||||
The function responsible for registering a new operator is called `ngraph::onnx_import::register_operator` and is defined in [`onnx_import/onnx_utils.hpp`](https://docs.openvinotoolkit.org/latest/ngraph_cpp_api/onnx__utils_8hpp_source.html).
|
||||
|
||||
## Register Custom ONNX Operator Based on Predefined nGraph Operations
|
||||
|
||||
@@ -14,18 +14,22 @@ x < 0 => f(x) = x * beta
|
||||
where `alpha` and `beta` are float constants.
|
||||
|
||||
1. Include headers:
|
||||
|
||||
@snippet onnx_custom_op/onnx_custom_op.cpp onnx_custom_op:headers
|
||||
|
||||
2. Register the CustomRelu operator in the ONNX importer:
|
||||
|
||||
@snippet onnx_custom_op/onnx_custom_op.cpp onnx_custom_op:register_operator
|
||||
|
||||
The `register_operator` function takes four arguments: op_type, opset version, domain, and a function object.
|
||||
The function object is a user-defined function that takes `ngraph::onnx_import::Node` as an input and based on that, returns a graph with nGraph operations.
|
||||
The `ngraph::onnx_import::Node` class represents a node in an ONNX model. It provides functions to fetch input node(s) using `get_ng_inputs`, attribute value using `get_attribute_value`, and many more. See `onnx_import/core/node.hpp` for full class declaration.
|
||||
The `ngraph::onnx_import::Node` class represents a node in an ONNX model. It provides functions to fetch input node(s) using `get_ng_inputs`, attribute value using `get_attribute_value`, and many more. See [`onnx_import/core/node.hpp`](https://docs.openvinotoolkit.org/latest/ngraph_cpp_api/core_2include_2ngraph_2node_8hpp_source.html) for full class declaration.
|
||||
|
||||
New operator registration must happen before an ONNX model is read. For example, if a model uses the `CustomRelu` operator, call `register_operator("CustomRelu", ...)` before InferenceEngine::Core::ReadNetwork.
|
||||
Reregistering ONNX operators within the same process is supported. If you register an existing operator, you get a warning.
|
||||
|
||||
The example below demonstrates an exemplary model that requires a previously created `CustomRelu` operator:
|
||||
|
||||
@snippet onnx_custom_op/onnx_custom_op.cpp onnx_custom_op:model
|
||||
|
||||
|
||||
@@ -33,27 +37,30 @@ To create a graph with nGraph operations, visit [Custom nGraph Operations](Addin
|
||||
For a complete list of predefined nGraph operators, visit [Available Operations Sets](../../ops/opset.md).
|
||||
|
||||
If you do not need an operator anymore, unregister it by calling `unregister_operator`. The function takes three arguments: `op_type`, `version`, and `domain`.
|
||||
|
||||
@snippet onnx_custom_op/onnx_custom_op.cpp onnx_custom_op:unregister_operator
|
||||
|
||||
## Register Custom ONNX Operator Based on Custom nGraph Operations
|
||||
|
||||
The same principles apply when registering a custom ONNX operator based on custom nGraph operations.
|
||||
This example shows how to register a custom ONNX operator based on `Operation` presented in [this tutorial](AddingNGraphOps.md), which is used in [TemplateExtension](Extension.md).
|
||||
This example shows how to register a custom ONNX operator based on `Operation` presented in [this tutorial](AddingNGraphOps.md), which is used in [TemplateExtension](Extension.md):
|
||||
|
||||
@snippet template_extension/extension.cpp extension:ctor
|
||||
|
||||
Here, the `register_operator` function is called in the constructor of Extension. The constructor makes sure that the function is called before InferenceEngine::Core::ReadNetwork, because InferenceEngine::Core::AddExtension must be called before a model with a custom operator is read.
|
||||
|
||||
The example below demonstrates how to unregister an operator from the destructor of Extension:
|
||||
|
||||
@snippet template_extension/extension.cpp extension:dtor
|
||||
|
||||
> **REQUIRED**: It is mandatory to unregister a custom ONNX operator if it is defined in a dynamic shared library.
|
||||
|
||||
## Requirements for Building with CMake
|
||||
|
||||
A program that uses the `register_operator` functionality requires `ngraph` and `onnx_importer` libraries in addition to the Inference Engine.
|
||||
The `onnx_importer` is a component of the `ngraph` package , so `find_package(ngraph REQUIRED COMPONENTS onnx_importer)` can find both.
|
||||
The `ngraph` package exposes two variables, `${NGRAPH_LIBRARIES}` and `${ONNX_IMPORTER_LIBRARIES}`, which reference the `ngraph` and `onnx_importer` libraries.
|
||||
Those variables need to be passed to the `target_link_libraries` command in the CMakeLists.txt file.
|
||||
A program that uses the `register_operator` functionality requires `ngraph::ngraph` and `ngraph::onnx_ngraph_frontend` libraries in addition to the Inference Engine.
|
||||
The `onnx_ngraph_frontend` is a component of the `ngraph` package, so `find_package(ngraph REQUIRED COMPONENTS onnx_ngraph_frontend)` can find both.
|
||||
Those libraries need to be passed to the `target_link_libraries` command in the CMakeLists.txt file.
|
||||
|
||||
See CMakeLists.txt below for reference:
|
||||
|
||||
@snippet onnx_custom_op/CMakeLists.txt cmake:onnx_custom_op
|
||||
|
||||
@@ -1,15 +1,17 @@
|
||||
# How to Implement Custom GPU Operations {#openvino_docs_IE_DG_Extensibility_DG_GPU_Kernel}
|
||||
|
||||
The GPU codepath abstracts many details about OpenCL\*. You need to provide the kernel code in OpenCL C and the configuration file that connects the kernel and its parameters to the parameters of the operation.
|
||||
To enable operations not supported by OpenVINO™ out of the box, you need a custom extension for Model Optimizer, a custom nGraph operation set, and a custom kernel for the device you will target. This page describes custom kernel support for the GPU device.
|
||||
|
||||
There are two options of using the custom operation configuration file:
|
||||
The GPU codepath abstracts many details about OpenCL\*. You need to provide the kernel code in OpenCL C and an XML configuration file that connects the kernel and its parameters to the parameters of the operation.
|
||||
|
||||
There are two options for using the custom operation configuration file:
|
||||
|
||||
* Include a section with your kernels into the global automatically-loaded `cldnn_global_custom_kernels/cldnn_global_custom_kernels.xml` file, which is hosted in the `<INSTALL_DIR>/deployment_tools/inference_engine/bin/intel64/{Debug/Release}` folder
|
||||
* Call the `InferenceEngine::Core::SetConfig()` method from your application with the `InferenceEngine::PluginConfigParams::KEY_CONFIG_FILE` key and the configuration file name as a value before loading the network that uses custom operations to the plugin:
|
||||
|
||||
@snippet snippets/GPU_Kernel.cpp part0
|
||||
|
||||
All Inference Engine samples, except the trivial `hello_classification`,
|
||||
All Inference Engine samples, except the trivial `hello_classification`, and most Open Model Zoo demos
|
||||
feature a dedicated command-line option `-c` to load custom kernels. For example, to load custom operations for the classification sample, run the command below:
|
||||
```sh
|
||||
$ ./classification_sample -m <path_to_model>/bvlc_alexnet_fp16.xml -i ./validation_set/daily/227x227/apron.bmp -d GPU -c <absolute_path_to_config>/custom_layer_example.xml
|
||||
@@ -132,8 +134,8 @@ queuing an OpenCL program for execution.
|
||||
|
||||
## Example Configuration File
|
||||
|
||||
The following code sample provides an example configuration file in the
|
||||
`.xml` format. For information on the configuration file structure, see
|
||||
The following code sample provides an example configuration file in XML
|
||||
format. For information on the configuration file structure, see
|
||||
[Configuration File Format](#config-file-format).
|
||||
```xml
|
||||
<CustomLayer name="ReLU" type="SimpleGPU" version="1">
|
||||
@@ -208,12 +210,12 @@ __kernel void example_relu_kernel(
|
||||
}
|
||||
```
|
||||
|
||||
> **NOTE:** As described in the previous section, all things like
|
||||
> **NOTE**: As described in the previous section, all items like
|
||||
> `INPUT0_TYPE` are actually defined as OpenCL (pre-)compiler inputs by
|
||||
> the Inference Engine for efficiency reasons. See [Debugging
|
||||
> Tips](#debugging-tips) for information on debugging the results.
|
||||
|
||||
> **NOTE**: Several GPU-targeted kernels are also added to the binaries upon samples compilation
|
||||
> **NOTE**: Several GPU-targeted kernels are also added to the binaries upon compilation of samples
|
||||
> so that the sample application can easily load them.
|
||||
> Refer to the `cldnn_global_custom_kernels` folder in the GPU plugin installation directory.
|
||||
|
||||
@@ -221,10 +223,11 @@ __kernel void example_relu_kernel(
|
||||
|
||||
* **Using `printf` in the OpenCL™ Kernels**.
|
||||
To debug the specific values, you can use `printf` in your kernels.
|
||||
However, be careful: for instance, do not output excessively
|
||||
as it would generate too much data. The `printf` output is typical, so
|
||||
However, be careful not to output excessively, which
|
||||
could generate too much data. The `printf` output is typical, so
|
||||
your output can be truncated to fit the buffer. Also, because of
|
||||
buffering, you actually get an entire buffer of output when the
|
||||
execution ends.<br>
|
||||
|
||||
For more information, refer to the [printf
|
||||
Function](https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/printfFunction.html).
|
||||
|
||||
@@ -1,27 +1,39 @@
|
||||
# Inference Engine Extensibility Mechanism {#openvino_docs_IE_DG_Extensibility_DG_Intro}
|
||||
|
||||
Inference Engine Extensibility API enables you to add support for custom operations to the Inference Engine.
|
||||
An extension should contain operation sets with custom operations and execution kernels for those custom operations.
|
||||
Physically, an extension library can be represented as a dynamic library exporting the single `CreateExtension` function
|
||||
that creates a new extension instance.
|
||||
@sphinxdirective
|
||||
|
||||
To load the Extensibility library to the `InferenceEngine::Core` object, use the
|
||||
`InferenceEngine::Core::AddExtension` method.
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:hidden:
|
||||
|
||||
openvino_docs_IE_DG_Extensibility_DG_AddingNGraphOps
|
||||
openvino_docs_IE_DG_Extensibility_DG_Custom_ONNX_Ops
|
||||
CPU Kernels Extensibility <openvino_docs_IE_DG_Extensibility_DG_CPU_Kernel>
|
||||
GPU Kernels Extensibility <openvino_docs_IE_DG_Extensibility_DG_GPU_Kernel>
|
||||
VPU Kernels Extensibility <openvino_docs_IE_DG_Extensibility_DG_VPU_Kernel>
|
||||
openvino_docs_IE_DG_Extensibility_DG_Extension
|
||||
openvino_docs_IE_DG_Extensibility_DG_Building
|
||||
|
||||
@endsphinxdirective
|
||||
|
||||
If your model contains operations not normally supported by OpenVINO, the Inference Engine Extensibility API lets you add support for those custom operations in a library containing custom nGraph operation sets, corresponding extensions to the Model Optimizer, and a device plugin extension. See the overview in the [Custom Operations Guide](../../HOWTO/Custom_Layers_Guide.md) to learn how these work together.
|
||||
|
||||
To load the Extensibility library to the `InferenceEngine::Core` object, use the `InferenceEngine::Core::AddExtension` method.
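A minimal sketch of loading such a library, assuming a hypothetical extension library name:

```cpp
#include <inference_engine.hpp>
#include <memory>

int main() {
    InferenceEngine::Core core;
    // The Extension wrapper loads the shared library and resolves its CreateExtension entry point.
    core.AddExtension(std::make_shared<InferenceEngine::Extension>("libtemplate_extension.so"));
    return 0;
}
```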
|
||||
|
||||
## Inference Engine Extension Library
|
||||
|
||||
Inference Engine Extension dynamic library contains the following components:
|
||||
An Inference Engine Extension dynamic library contains the following components:
|
||||
|
||||
* [Extension Library](Extension.md):
|
||||
- Contains custom operation sets.
|
||||
- Provides CPU implementations for custom operations.
|
||||
- Contains custom operation sets
|
||||
- Provides CPU implementations for custom operations
|
||||
* [Custom nGraph Operation](AddingNGraphOps.md):
|
||||
- Enables the use of `InferenceEngine::Core::ReadNetwork` to read Intermediate Representation (IR) with unsupported
|
||||
operations.
|
||||
- Enables the creation of `ngraph::Function` with unsupported operations.
|
||||
- Provides a shape inference mechanism for custom operations.
|
||||
operations
|
||||
- Enables the creation of `ngraph::Function` with unsupported operations
|
||||
- Provides a shape inference mechanism for custom operations
|
||||
|
||||
> **NOTE**: This documentation is written based on the `Template extension`, which demonstrates extension development details. Find the complete code of the `Template extension`, which is fully compilable and up-to-date, at `<dldt source tree>/docs/template_extension`.
|
||||
> **NOTE**: This documentation is written based on the [Template extension](https://github.com/openvinotoolkit/openvino/tree/master/docs/template_extension), which demonstrates extension development details. You can review the complete code, which is fully compilable and up-to-date, to see how it works.
|
||||
|
||||
## Execution Kernels
|
||||
|
||||
|
||||
@@ -1,10 +1,12 @@
|
||||
# How to Implement Custom Layers for VPU (Intel® Neural Compute Stick 2) {#openvino_docs_IE_DG_Extensibility_DG_VPU_Kernel}
|
||||
|
||||
To enable operations not supported by OpenVINO™ out of the box, you need a custom extension for Model Optimizer, a custom nGraph operation set, and a custom kernel for the device you will target. This page describes custom kernel support for one of the VPUs, the Intel® Neural Compute Stick 2 device, which uses the MYRIAD device plugin.
|
||||
|
||||
> **NOTES:**
|
||||
> * OpenCL\* custom layer support is available in preview mode.
|
||||
> * This section assumes you are familiar with developing kernels using OpenCL.
|
||||
|
||||
To customize your topology with an OpenCL layer, follow the steps below:
|
||||
To customize your topology with an OpenCL layer, carry out the tasks described on this page:
|
||||
|
||||
1. Write and compile your OpenCL code with the standalone offline OpenCL compiler (`clc`).
|
||||
2. Write a configuration file to bind the OpenCL kernel to the topology file (`.xml`) of the model IR.
|
||||
@@ -12,12 +14,12 @@ To customize your topology with an OpenCL layer, follow the steps below:
|
||||
|
||||
## Compile OpenCL code for VPU (Intel® Neural Compute Stick 2)
|
||||
|
||||
> **NOTE:** OpenCL compiler, targeting Intel® Neural Compute Stick 2 for the SHAVE* processor only, is redistributed with OpenVINO.
|
||||
OpenCL support is provided by ComputeAorta*, and is distributed under a license agreement between Intel® and Codeplay* Software Ltd.
|
||||
> **NOTE**: OpenCL compiler, targeting Intel® Neural Compute Stick 2 for the SHAVE* processor only, is redistributed with OpenVINO.
|
||||
OpenCL support is provided by ComputeAorta* and is distributed under a license agreement between Intel® and Codeplay* Software Ltd.
|
||||
|
||||
The OpenCL toolchain for the Intel® Neural Compute Stick 2 supports offline compilation only, so first compile OpenCL C code using the standalone `clc` compiler. You can find the compiler binary at `<INSTALL_DIR>/deployment_tools/tools/cl_compiler`.
|
||||
|
||||
> **NOTE:** By design, custom OpenCL layers support any OpenCL kernels written with 1.2 version assumed. It also supports half float extension and is optimized for this type, because it is a native type for Intel® Movidius™ VPUs.
|
||||
> **NOTE**: By design, custom OpenCL layers support any OpenCL kernels written assuming OpenCL version 1.2. It also supports half float extension and is optimized for this type, because it is a native type for Intel® Movidius™ VPUs.
|
||||
|
||||
1. Prior to running a compilation, make sure that the following variables are set:
|
||||
* `SHAVE_MA2X8XLIBS_DIR=<INSTALL_DIR>/deployment_tools/tools/cl_compiler/lib/`
|
||||
@@ -25,19 +27,19 @@ The OpenCL toolchain for the Intel® Neural Compute Stick 2 supports offline com
|
||||
* `SHAVE_MYRIAD_LD_DIR=<INSTALL_DIR>/deployment_tools/tools/cl_compiler/bin/`
|
||||
* `SHAVE_MOVIASM_DIR=<INSTALL_DIR>/deployment_tools/tools/cl_compiler/bin/`
|
||||
2. Run the compilation with the command below. You should use `--strip-binary-header` to make an OpenCL runtime-agnostic binary runnable with the Inference Engine.
|
||||
```bash
|
||||
cd <INSTALL_DIR>/deployment_tools/tools/cl_compiler/bin
|
||||
./clc --strip-binary-header custom_layer.cl -o custom_layer.bin
|
||||
```
|
||||
```bash
|
||||
cd <INSTALL_DIR>/deployment_tools/tools/cl_compiler/bin
|
||||
./clc --strip-binary-header custom_layer.cl -o custom_layer.bin
|
||||
```
|
||||
|
||||
## Write a Configuration File
|
||||
|
||||
To bind the custom layer in the topology IR to your kernel, prepare a configuration file so that the Inference Engine can find the parameters for your kernel and the description of the execution work grid.
|
||||
For example, given the following OpenCL kernel signature:
|
||||
For example, consider the following OpenCL kernel signature:
|
||||
```cpp
|
||||
__kernel void reorg_nhwc(__global const half *src, __global half *out, int w, int h, int c, int stride);
|
||||
```
|
||||
Configuration file for this kernel might be the following:
|
||||
A configuration file for this kernel might be the following:
|
||||
```xml
|
||||
<CustomLayer name="ReorgYolo" type="MVCL" version="1">
|
||||
<Kernel entry="reorg_nhwc">
|
||||
@@ -62,7 +64,7 @@ Each custom layer is described with the `CustomLayer` node. It has the following
|
||||
- Sub-node `Kernel` must contain the following attributes:
|
||||
- `entry` – The name of your kernel function as you defined it in a source file. In the example above, it is `reorg_nhwc`.
|
||||
- Node `Source` must contain the following attributes:
|
||||
- `filename` – The path to a compiled binary relative to the `.xml` binding file.
|
||||
- `filename` – The path to a compiled binary relative to the XML configuration file.
|
||||
- Sub-node `Parameters` – Describes parameters bindings. For more information, see the description below.
|
||||
- Sub-node `WorkSizes` – Describes local and global work group sizes and the source for dimension deduction as a pair `direction,port`. In the example above, the work group is described relatively to the dimension of the input tensor that comes through port 0 in the IR. `global` and `local` work group configurations support any simple math expressions with +,-,\*,/, and () from `B`(batch), `Y`(height), `X`(width) and `F`(channels).
|
||||
- Sub-node `Where` – Allows you to customize bindings with the `key="value"` attribute. For example, to substitute only 3x3 convolutions, write `<Where kernel="3,3"/>` in the binding XML.
|
||||
@@ -70,8 +72,8 @@ Each custom layer is described with the `CustomLayer` node. It has the following
|
||||
Parameter description supports `Tensor` of one of tensor types such as `input`, `output`, `input_buffer`, `output_buffer` or `data`, `Scalar`, or `Data` nodes and has the following format:
|
||||
- Each `Tensor` node of `input` or `output` type must contain the following attributes:
|
||||
- `arg-name` – The name of a kernel parameter in the kernel signature.
|
||||
- `type` – Node type: `input` or `output` as in the IR.
|
||||
- `port-index` – A number of input/output ports as in the IR.
|
||||
- `type` – Node type: `input` or `output` as specified in the IR.
|
||||
- `port-index` – A number of input/output ports as specified in the IR.
|
||||
- `format` – The channel order in the tensor. Optional conversion layers are generated if the custom layer format is not compatible with formats of neighboring layers. `BFXY`, `BYXF`, and `ANY` formats are supported currently.
|
||||
- Each `Tensor` node of `input_buffer` or `output_buffer` type must contain the following attributes:
|
||||
- `arg-name` – The name of a kernel parameter in the kernel signature.
|
||||
@@ -417,7 +419,7 @@ This decreases the execution time up to 40% against the best performing vectoriz
|
||||
stalls completely on memory access without any prefetch. The same recommendation is applicable for scalar load/store
|
||||
from/to a `__global` pointer, since work-group copying could be done in a vector fashion.
|
||||
|
||||
10. Use a manual DMA extension. Local (on-chip) memory throughput is up to 24x higher than DDR throughput. Starting from OpenVINO™ 2020.1, VPU OpenCL features a manual-DMA kernel extension to copy the sub-tensor used by a work group into local memory and perform computation without involving DDR. Here is a simple GRN kernel implementation that runs over DDR. Local size is equal to (width of the input tensor, 1, 1) to define a large enough work group to get code automatically vectorized and unrolled, while global size is (width of the input tensor, height of the input tensor, 1):
|
||||
10. Use a manual DMA extension. Local (on-chip) memory throughput is up to 24x higher than DDR throughput. Starting from OpenVINO™ 2020.1, VPU OpenCL features a manual-DMA kernel extension to copy the sub-tensor used by a work group into local memory and perform computation without involving DDR. Here is a simple GRN kernel implementation that runs over DDR. Local size is in the form (width of the input tensor, 1, 1) to define a large enough work group to get the code automatically vectorized and unrolled, while global size is (width of the input tensor, height of the input tensor, 1):
|
||||
```cpp
|
||||
__kernel void grn_NCHW(
|
||||
__global const half* restrict src_data,
|
||||
@@ -444,7 +446,9 @@ from/to a `__blobal` pointer since work-group copying could be done in a vector
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
This kernel can be rewritten to introduce the special data binding `__dma_preload` and `__dma_postwrite` intrinsics. This means that instead of one kernel, a group of three kernels should be implemented: `kernelName`, `__dma_preload_kernelName`, and `__dma_postwrite_kernelName`. `__dma_preload_kernelName` for a particular work group `n` is guaranteed to be executed before the `n`-th work group itself, while `__dma_postwrite_kernelName` is guaranteed to be executed after the corresponding work group. You can define either of those functions, which are intended to copy data between `__global` and `__local` memory. The syntax requires an exact functional signature match. The example below illustrates how to prepare your kernel for manual DMA.
|
||||
|
||||
```cpp
|
||||
__kernel void __dma_preload_grn_NCHW(
|
||||
__global const half* restrict src,
|
||||
@@ -453,7 +457,7 @@ This kernel can be rewritten to introduce special data binding `__dma_preload` a
|
||||
__local half* restrict local_dst,
|
||||
int C,
|
||||
float bias)
|
||||
{
|
||||
{
|
||||
// TODO: copy the required piece of the src tensor into local_src
|
||||
}
|
||||
|
||||
@@ -478,9 +482,9 @@ This kernel can be rewritten to introduce special data binding `__dma_preload` a
|
||||
{
|
||||
// same as the example above
|
||||
}
|
||||
```
|
||||
GRN kernel operates on channel-major tensors to compute average over full channel range and then normalizes input elements to produce the output.
|
||||
As a part of manual DMA extension, a group of work group copy functions are introduced in addition to `async_work_group_copy`, which is also mapped to DMA call.
|
||||
```
|
||||
The GRN kernel operates on channel-major tensors to compute average over full channel range and then normalizes input elements to produce the output.
|
||||
As a part of the manual DMA extension, a group of work group copy functions are introduced in addition to `async_work_group_copy`, which is also mapped to a DMA call.
|
||||
|
||||
Here is the list of supported functions:
|
||||
```cpp
|
||||
@@ -613,7 +617,7 @@ __kernel void grn_NCHW(
|
||||
|
||||
Note the `get_local_size` and `get_local_id` usage inside the kernel. A 21x speedup is expected for this kernel on the enet-curbs setup because it was completely limited by memory usage.
|
||||
|
||||
An alternative method of using DMA is to use work item copy extension. Those functions are executed inside a kernel and requires work groups equal to single work item.
|
||||
An alternative method to using DMA is to use the work item copy extension. Those functions are executed inside a kernel and require a work group equal to a single work item.
|
||||
|
||||
Here is the list of supported work item functions:
|
||||
```cpp
|
||||
|
||||
@@ -1,31 +1,39 @@
|
||||
Introduction to Inference Engine Device Query API {#openvino_docs_IE_DG_InferenceEngine_QueryAPI}
|
||||
===============================
|
||||
# Introduction to Inference Engine Device Query API {#openvino_docs_IE_DG_InferenceEngine_QueryAPI}
|
||||
|
||||
This section provides a high-level description of the process of querying of different device properties and configuration values.
|
||||
Refer to the [Hello Query Device Sample](../../inference-engine/samples/hello_query_device/README.md) sources and [Multi-Device Plugin guide](supported_plugins/MULTI.md) for example of using the Inference Engine Query API in user applications.
|
||||
## Inference Engine Query API (C++)
|
||||
|
||||
## Using the Inference Engine Query API in Your Code
|
||||
@sphinxdirective
|
||||
.. raw:: html
|
||||
|
||||
The Inference Engine `Core` class provides the following API to query device information, set or get different device configuration properties:
|
||||
<div id="switcher-cpp" class="switcher-anchor">C++</div>
|
||||
@endsphinxdirective
|
||||
|
||||
* <code>InferenceEngine::Core::GetAvailableDevices</code> - Provides a list of available devices. If there are more than one instance of a specific device, the devices are enumerated with `.suffix` where `suffix` is a unique string identifier. The device name can be passed to all methods of the `InferenceEngine::Core` class that work with devices, for example `InferenceEngine::Core::LoadNetwork`.
|
||||
* <code>InferenceEngine::Core::GetMetric</code> - Provides information about specific device.
|
||||
<code>InferenceEngine::Core::GetConfig</code> - Gets the current value of a specific configuration key.
|
||||
* <code>InferenceEngine::Core::SetConfig</code> - Sets a new value for the configuration key.
|
||||
The OpenVINO™ toolkit supports inferencing with several types of devices (processors or accelerators).
|
||||
This section provides a high-level description of the process of querying different device properties and configuration values at runtime. Refer to the [Hello Query Device C++ Sample](../../inference-engine/samples/hello_query_device/README.md) sources and the [Multi-Device Plugin documentation](supported_plugins/MULTI.md) for examples of using the Inference Engine Query API in user applications.
|
||||
|
||||
### Using the Inference Engine Query API in Your Code
|
||||
|
||||
The `InferenceEngine::Core` class provides the following API to query device information, set or get different device configuration properties:
|
||||
|
||||
* `InferenceEngine::Core::GetAvailableDevices` - Provides a list of available devices. If there are more than one instance of a specific device, the devices are enumerated with `.suffix` where `suffix` is a unique string identifier. The device name can be passed to all methods of the `InferenceEngine::Core` class that work with devices, for example `InferenceEngine::Core::LoadNetwork`.
|
||||
* `InferenceEngine::Core::GetMetric` - Provides information about a specific device.
|
||||
* `InferenceEngine::Core::GetConfig` - Gets the current value of a specific configuration key.
|
||||
* `InferenceEngine::Core::SetConfig` - Sets a new value for the configuration key.
|
||||
|
||||
The `InferenceEngine::ExecutableNetwork` class is also extended to support the Query API:
|
||||
|
||||
* <code>InferenceEngine::ExecutableNetwork::GetMetric</code>
|
||||
* <code>InferenceEngine::ExecutableNetwork::GetConfig</code>
|
||||
* <code>InferenceEngine::ExecutableNetwork::SetConfig</code>
|
||||
* `InferenceEngine::ExecutableNetwork::GetMetric`
|
||||
* `InferenceEngine::ExecutableNetwork::GetConfig`
|
||||
* `InferenceEngine::ExecutableNetwork::SetConfig`
|
||||
|
||||
## Query API in the Core Class
|
||||
### Query API in the Core Class
|
||||
|
||||
### GetAvailableDevices
|
||||
#### GetAvailableDevices
|
||||
|
||||
@snippet snippets/InferenceEngine_QueryAPI0.cpp part0
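If the snippet is not displayed, a minimal sketch of the device enumeration looks like this:

```cpp
#include <inference_engine.hpp>
#include <iostream>
#include <string>

int main() {
    InferenceEngine::Core core;
    // Enumerate all devices visible to the Inference Engine.
    for (const std::string& device : core.GetAvailableDevices()) {
        std::cout << device << std::endl;
    }
    return 0;
}
```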
|
||||
|
||||
The function returns list of available devices, for example:
|
||||
The function returns a list of available devices, for example:
|
||||
|
||||
```
|
||||
MYRIAD.1.2-ma2480
|
||||
MYRIAD.1.4-ma2480
|
||||
@@ -34,24 +42,23 @@ FPGA.1
|
||||
CPU
|
||||
GPU.0
|
||||
GPU.1
|
||||
...
|
||||
```
|
||||
|
||||
Each device name can then be passed to:
|
||||
|
||||
* `InferenceEngine::Core::LoadNetwork` to load the network to a specific device.
|
||||
* `InferenceEngine::Core::GetMetric` to get common or device specific metrics.
|
||||
* All other methods of the `Core` class that accept `deviceName`.
|
||||
* All other methods of the `InferenceEngine::Core` class that accept `deviceName`.
|
||||
|
||||
### GetConfig()
|
||||
#### GetConfig()
|
||||
|
||||
The code below demonstrates how to understand whether `HETERO` device dumps `.dot` files with split graphs during the split stage:
|
||||
The code below demonstrates how to understand whether the `HETERO` device dumps GraphViz `.dot` files with split graphs during the split stage:
|
||||
|
||||
@snippet snippets/InferenceEngine_QueryAPI1.cpp part1
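A minimal sketch of such a query is shown below; the plain-string key `"HETERO_DUMP_GRAPH_DOT"` is assumed here to match `HETERO_CONFIG_KEY(DUMP_GRAPH_DOT)`:

```cpp
#include <inference_engine.hpp>
#include <iostream>
#include <string>

int main() {
    InferenceEngine::Core core;
    // Query the HETERO plugin for the current value of the dump-graph option ("YES" or "NO").
    auto value = core.GetConfig("HETERO", "HETERO_DUMP_GRAPH_DOT").as<std::string>();
    std::cout << "HETERO_DUMP_GRAPH_DOT = " << value << std::endl;
    return 0;
}
```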
|
||||
|
||||
For documentation about common configuration keys, refer to `ie_plugin_config.hpp`. Device specific configuration keys can be found in corresponding plugin folders.
|
||||
|
||||
### GetMetric()
|
||||
#### GetMetric()
|
||||
|
||||
* To extract device properties such as available device, device name, supported configuration keys, and others, use the `InferenceEngine::Core::GetMetric` method:
|
||||
|
||||
@@ -59,26 +66,175 @@ For documentation about common configuration keys, refer to `ie_plugin_config.hp
|
||||
|
||||
A returned value appears as follows: `Intel(R) Core(TM) i7-8700 CPU @ 3.20GHz`.
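For reference, a minimal sketch of the call that produces such a value, assuming the CPU device:

```cpp
#include <inference_engine.hpp>
#include <iostream>
#include <string>

int main() {
    InferenceEngine::Core core;
    // FULL_DEVICE_NAME is one of the device-agnostic metrics declared in ie_plugin_config.hpp.
    std::string full_name = core.GetMetric("CPU", METRIC_KEY(FULL_DEVICE_NAME)).as<std::string>();
    std::cout << full_name << std::endl;
    return 0;
}
```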
|
||||
|
||||
> **NOTE**: All metrics have specific type, which is specified during metric instantiation. The list of common device-agnostic metrics can be found in `ie_plugin_config.hpp`. Device specific metrics (for example, for `HDDL`, `MYRIAD` devices) can be found in corresponding plugin folders.
|
||||
> **NOTE**: All metrics have a type, which is specified during metric instantiation. The list of common device-agnostic metrics can be found in `ie_plugin_config.hpp`. Device specific metrics (for example, for HDDL or MYRIAD devices) can be found in corresponding plugin folders.
|
||||
|
||||
## Query API in the ExecutableNetwork Class
|
||||
### Query API in the ExecutableNetwork Class
|
||||
|
||||
### GetMetric()
|
||||
#### GetMetric()
|
||||
|
||||
The method is used to get executable network specific metric such as `METRIC_KEY(OPTIMAL_NUMBER_OF_INFER_REQUESTS)`:
|
||||
The method is used to get an executable network specific metric such as `METRIC_KEY(OPTIMAL_NUMBER_OF_INFER_REQUESTS)`:
|
||||
|
||||
@snippet snippets/InferenceEngine_QueryAPI3.cpp part3
|
||||
|
||||
Or the current temperature of `MYRIAD` device:
|
||||
Or the current temperature of the `MYRIAD` device:
|
||||
|
||||
@snippet snippets/InferenceEngine_QueryAPI4.cpp part4
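If the snippets are not rendered, a compact sketch of the first query might look like this; the model path and device name are placeholders:

```cpp
#include <inference_engine.hpp>
#include <iostream>

int main() {
    InferenceEngine::Core core;
    InferenceEngine::CNNNetwork network = core.ReadNetwork("model.xml");
    InferenceEngine::ExecutableNetwork exec_network = core.LoadNetwork(network, "CPU");
    // Ask the executable network how many infer requests it can run optimally in parallel.
    unsigned int nireq = exec_network.GetMetric(
        METRIC_KEY(OPTIMAL_NUMBER_OF_INFER_REQUESTS)).as<unsigned int>();
    std::cout << "Optimal number of infer requests: " << nireq << std::endl;
    return 0;
}
```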
|
||||
|
||||
### GetConfig()
|
||||
#### GetConfig()
|
||||
|
||||
The method is used to get information about configuration values the executable network has been created with:
|
||||
|
||||
@snippet snippets/InferenceEngine_QueryAPI5.cpp part5
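If the snippet is not rendered, a minimal sketch of such a query is shown below; treating the returned value as a string is an assumption, since the exact value type depends on the plugin:

```cpp
#include <inference_engine.hpp>
#include <iostream>
#include <string>

int main() {
    InferenceEngine::Core core;
    InferenceEngine::CNNNetwork network = core.ReadNetwork("model.xml");
    InferenceEngine::ExecutableNetwork exec_network = core.LoadNetwork(network, "CPU");
    // Read back the number of CPU threads the executable network was created with.
    std::string nthreads = exec_network.GetConfig(
        InferenceEngine::PluginConfigParams::KEY_CPU_THREADS_NUM).as<std::string>();
    std::cout << "CPU_THREADS_NUM = " << nthreads << std::endl;
    return 0;
}
```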
|
||||
|
||||
### SetConfig()
|
||||
#### SetConfig()
|
||||
|
||||
The only device that supports this method is [Multi-Device](supported_plugins/MULTI.md).
|
||||
|
||||
## Inference Engine Query API (Python)
|
||||
|
||||
@sphinxdirective
|
||||
.. raw:: html
|
||||
|
||||
<div id="switcher-python" class="switcher-anchor">Python</div>
|
||||
@endsphinxdirective
|
||||
|
||||
This section provides a high-level description of the process of querying different device properties and configuration values. Refer to the [Hello Query Device Python Sample](../../inference-engine/ie_bridges/python/sample/hello_query_device/README.md) sources and the [Multi-Device Plugin documentation](supported_plugins/MULTI.md) for examples of using the Inference Engine Query API in user applications.
|
||||
|
||||
### Using the Inference Engine Query API in Your Code
|
||||
|
||||
The Inference Engine [Core](api/ie_python_api/_autosummary/openvino.inference_engine.IECore.html#openvino-inference-engine-iecore) class provides the following API to query device information, set or get different device configuration properties:
|
||||
|
||||
* [ie_api.IECore.available_devices](api/ie_python_api/_autosummary/openvino.inference_engine.IECore.html#openvino.inference_engine.IECore.available_devices) - Provides a list of available devices. If there are more than one instance of a specific device, the devices are enumerated with .suffix where suffix is a unique string identifier. The device name can be passed to all methods of the IECore class that work with devices, for example [ie_api.IECore.load_network](api/ie_python_api/_autosummary/openvino.inference_engine.IECore.html#openvino.inference_engine.IECore.load_network).
|
||||
* [ie_api.IECore.get_metric](api/ie_python_api/_autosummary/openvino.inference_engine.IECore.html#openvino.inference_engine.IECore.get_metric) - Provides information about a specific device.
|
||||
* [ie_api.IECore.get_config](api/ie_python_api/_autosummary/openvino.inference_engine.IECore.html#openvino.inference_engine.IECore.get_config) - Gets the current value of a specific configuration key.
|
||||
* [ie_api.IECore.set_config](api/ie_python_api/_autosummary/openvino.inference_engine.IECore.html#openvino.inference_engine.IECore.set_config) - Sets a new value for the configuration key.
|
||||
|
||||
The [ie_api.ExecutableNetwork](api/ie_python_api/_autosummary/openvino.inference_engine.ExecutableNetwork.html) class is also extended to support the Query API:
|
||||
* [ie_api.ExecutableNetwork.get_metric](api/ie_python_api/_autosummary/openvino.inference_engine.ExecutableNetwork.html#openvino.inference_engine.ExecutableNetwork.get_metric)
|
||||
* [ie_api.ExecutableNetwork.get_config](api/ie_python_api/_autosummary/openvino.inference_engine.ExecutableNetwork.html#openvino.inference_engine.ExecutableNetwork.get_config)
|
||||
* There is no method to call for set_config, but the equivalent action is described below.
|
||||
|
||||
### Query API in the IECore Class
|
||||
|
||||
#### Get Available Devices
|
||||
|
||||
```python
|
||||
from openvino.inference_engine import IECore
|
||||
|
||||
ie = IECore()
|
||||
print(ie.available_devices)
|
||||
```
|
||||
|
||||
This code prints a list of available devices, for example:
|
||||
|
||||
```
|
||||
MYRIAD.1.2-ma2480
|
||||
MYRIAD.1.4-ma2480
|
||||
FPGA.0
|
||||
FPGA.1
|
||||
CPU
|
||||
GPU.0
|
||||
GPU.1
|
||||
```
|
||||
|
||||
Each device name can then be passed to:
|
||||
|
||||
* `IECore.load_network` to load the network to a specific device.
|
||||
* `IECore.get_metric` to get common or device specific metrics.
|
||||
* All other methods of the `IECore` class that accept `deviceName`.
|
||||
|
||||
#### Get Metric
|
||||
|
||||
To extract device properties such as available device, device name, supported configuration keys, and others, use the [IECore.get_metric](api/ie_python_api/_autosummary/openvino.inference_engine.IECore.html#openvino.inference_engine.IECore.get_metric) method:
|
||||
|
||||
```python
|
||||
from openvino.inference_engine import IECore
|
||||
|
||||
ie = IECore()
|
||||
ie.get_metric(device_name="CPU", metric_name="FULL_DEVICE_NAME")
|
||||
```
|
||||
|
||||
A returned value appears as follows: `Intel(R) Core(TM) i7-8700 CPU @ 3.20GHz`.
|
||||
|
||||
To list all supported metrics for a device:
|
||||
|
||||
```python
|
||||
from openvino.inference_engine import IECore
|
||||
|
||||
ie = IECore()
|
||||
ie.get_metric(device_name="GPU", metric_name="SUPPORTED_METRICS")
|
||||
```
|
||||
|
||||
> **NOTE**: All metrics have a specific type, which is set during the metric instantiation. The list of common device-agnostic metrics can be found in ie_plugin_config.hpp. Device specific metrics (for example, for HDDL, MYRIAD devices) can be found in corresponding plugin folders.
|
||||
|
||||
|
||||
#### Get Configuration
|
||||
|
||||
The code below uses the [IECore.get_config](api/ie_python_api/_autosummary/openvino.inference_engine.IECore.html#openvino.inference_engine.IECore.get_config) method and demonstrates how to understand whether the HETERO device dumps .dot files with split graphs during the split stage:
|
||||
|
||||
```python
|
||||
from openvino.inference_engine import IECore
|
||||
|
||||
ie = IECore()
|
||||
ie.get_config(device_name="HETERO", config_name="HETERO_DUMP_GRAPH_DOT")
|
||||
```
|
||||
|
||||
To list all supported configuration keys for a device:
|
||||
|
||||
```python
|
||||
from openvino.inference_engine import IECore
|
||||
|
||||
ie = IECore()
|
||||
ie.get_metric(device_name=device, metric_name="SUPPORTED_CONFIG_KEYS")
|
||||
```
|
||||
|
||||
For documentation about common configuration keys, refer to `ie_plugin_config.hpp`. Device specific configuration keys can be found in corresponding plugin folders.
|
||||
|
||||
|
||||
### Query API in the ExecutableNetwork Class
|
||||
|
||||
#### Get Metric
|
||||
|
||||
To get the name of the loaded network:
|
||||
|
||||
```python
|
||||
from openvino.inference_engine import IECore
|
||||
|
||||
ie = IECore()
|
||||
net = ie.read_network(model=path_to_xml_file)
|
||||
exec_net = ie.load_network(network=net, device_name=device)
|
||||
exec_net.get_metric("NETWORK_NAME")
|
||||
```
|
||||
|
||||
Use `exec_net.get_metric("SUPPORTED_METRICS")` to list all supported metrics for an ExecutableNetwork instance.
|
||||
|
||||
|
||||
#### Get Configuration
|
||||
|
||||
The [IECore.get_config](api/ie_python_api/_autosummary/openvino.inference_engine.IECore.html#openvino.inference_engine.IECore.get_config) method is used to get information about configuration values the executable network has been created with:
|
||||
|
||||
```python
|
||||
from openvino.inference_engine import IECore
|
||||
|
||||
ie = IECore()
|
||||
net = ie.read_network(model=path_to_xml_file)
|
||||
exec_net = ie.load_network(network=net, device_name="CPU")
|
||||
exec_net.get_config("CPU_THREADS_NUM")
|
||||
```
|
||||
|
||||
Or the current temperature of MYRIAD device:
|
||||
|
||||
```python
|
||||
from openvino.inference_engine import IECore
|
||||
|
||||
ie = IECore()
|
||||
net = ie.read_network(model=path_to_xml_file)
|
||||
exec_net = ie.load_network(network=net, device_name="MYRIAD")
|
||||
exec_net.get_config("DEVICE_THERMAL")
|
||||
```
|
||||
|
||||
Use `exec_net.get_metric("SUPPORTED_CONFIG_KEYS")` to list all supported configuration keys.
|
||||
|
||||
#### Set Configuration
|
||||
|
||||
The only device that supports this method in the ExecutableNetwork class is the [Multi-Device](supported_plugins/MULTI.md), where you can change the priorities of the devices for the Multi plugin in real time: `exec_net.set_config({"MULTI_DEVICE_PRIORITIES": "GPU,CPU"})`. See the Multi-Device documentation for more details.
|
||||
@@ -11,31 +11,37 @@ Low-precision 8-bit inference is optimized for:
|
||||
- Intel® processor graphics:
|
||||
- Intel® Iris® Xe Graphics
|
||||
- Intel® Iris® Xe MAX Graphics
|
||||
- A model must be quantized. You can use a quantized model from [OpenVINO™ Toolkit Intel's Pre-Trained Models](@ref omz_models_group_intel) or quantize a model yourself. For quantization, you can use the:
|
||||
- [Post-Training Optimization Tool](@ref pot_README) delivered with the Intel® Distribution of OpenVINO™ toolkit release package.
|
||||
- [Neural Network Compression Framework](https://www.intel.com/content/www/us/en/artificial-intelligence/posts/openvino-nncf.html) available on GitHub: https://github.com/openvinotoolkit/nncf
|
||||
|
||||
## Introduction
|
||||
|
||||
A lot of research has been done in the field of deep learning on using low-precision computations during inference to speed up deep learning pipelines and achieve higher performance. For example, one of the popular approaches is to shrink the precision of activation and weight values from `fp32` to smaller types, for example, to `fp16` or `int8`. For more information about this approach, refer to the
|
||||
**Brief History of Lower Precision in Deep Learning** section in [this whitepaper](https://software.intel.com/en-us/articles/lower-numerical-precision-deep-learning-inference-and-training).
|
||||
For 8-bit integer computation, a model must be quantized. You can use a quantized model from [OpenVINO™ Toolkit Intel's Pre-Trained Models](@ref omz_models_group_intel) or quantize a model yourself. For quantization, you can use the following:
|
||||
- [Post-Training Optimization Tool](@ref pot_docs_LowPrecisionOptimizationGuide) delivered with the Intel® Distribution of OpenVINO™ toolkit release package
|
||||
- [Neural Network Compression Framework](https://www.intel.com/content/www/us/en/artificial-intelligence/posts/openvino-nncf.html) available on GitHub: https://github.com/openvinotoolkit/nncf
|
||||
|
||||
8-bit computations (referred to as `int8`) offer better performance compared to inference in higher precision (for example, `fp32`), because they allow loading more data into a single processor instruction. The usual cost of this significant boost is reduced accuracy. However, the accuracy drop is often negligible and depends on task requirements, so the application engineer can set the maximum accuracy drop that is acceptable.
|
||||
The quantization process adds [FakeQuantize](../ops/quantization/FakeQuantize_1.md) layers on activations and weights for most layers. Read more about mathematical computations in the [Uniform Quantization with Fine-Tuning](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/Quantization.md).
|
||||
|
||||
When you pass the quantized IR to the OpenVINO™ plugin, the plugin automatically recognizes it as a quantized model and performs 8-bit inference. Note that if you pass a quantized model to another plugin that does not support 8-bit inference but supports all operations from the model, the model is inferred in the precision that this plugin supports.
|
||||
|
||||
At runtime, the quantized model is loaded to the plugin. The plugin uses the `Low Precision Transformation` component to update the model to infer it in low precision:
|
||||
- Update `FakeQuantize` layers to have quantized output tensors in low-precision range and add dequantization layers to compensate for the update. Dequantization layers are pushed through as many layers as possible to have more layers in low precision. After that, most layers have quantized input tensors in low-precision range and can be inferred in low precision. Ideally, dequantization layers should be fused in the next `FakeQuantize` layer.
|
||||
- Weights are quantized and stored in `Constant` layers.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
Let's explore quantized [TensorFlow* implementation of the ResNet-50](https://github.com/openvinotoolkit/open_model_zoo/tree/master/models/public/resnet-50-tf) model. Use [Model Downloader](@ref omz_tools_downloader) to download the `FP16` model from [OpenVINO™ Toolkit - Open Model Zoo repository](https://github.com/openvinotoolkit/open_model_zoo):
|
||||
|
||||
Let's explore the quantized [TensorFlow* implementation of ResNet-50](https://github.com/openvinotoolkit/open_model_zoo/tree/master/models/public/resnet-50-tf) model. Use the [Model Downloader](@ref omz_tools_downloader) tool to download the `FP16` model from the [OpenVINO™ Toolkit - Open Model Zoo repository](https://github.com/openvinotoolkit/open_model_zoo):
|
||||
```sh
|
||||
./downloader.py --name resnet-50-tf --precisions FP16-INT8
|
||||
<omz_dir>/tools/downloader/downloader.py --name resnet-50-tf --precisions FP16-INT8
|
||||
```
|
||||
After that you should quantize model by the [Model Quantizer](@ref omz_tools_downloader) tool.
|
||||
After that, quantize the model with the [Model Quantizer](@ref omz_tools_downloader) tool.
|
||||
```sh
|
||||
./quantizer.py --model_dir public/resnet-50-tf --dataset_dir <DATASET_DIR> --precisions=FP16-INT8
|
||||
<omz_dir>/tools/downloader/quantizer.py --model_dir public/resnet-50-tf --dataset_dir <DATASET_DIR> --precisions=FP16-INT8
|
||||
```
|
||||
The simplest way to infer the model and collect performance counters is [C++ Benchmark Application](../../inference-engine/samples/benchmark_app/README.md).
|
||||
The simplest way to infer the model and collect performance counters is the [C++ Benchmark Application](../../inference-engine/samples/benchmark_app/README.md).
|
||||
```sh
|
||||
./benchmark_app -m resnet-50-tf.xml -d CPU -niter 1 -api sync -report_type average_counters -report_folder pc_report_dir
|
||||
```
|
||||
If you infer the model with the OpenVINO™ CPU plugin and collect performance counters, all operations (except last not quantized SoftMax) are executed in INT8 precision.
|
||||
If you infer the model with the OpenVINO™ CPU plugin and collect performance counters, all operations (except the last non-quantized SoftMax) are executed in INT8 precision.
|
||||
|
||||
## Low-Precision 8-bit Integer Inference Workflow
|
||||
|
||||
@@ -70,14 +76,12 @@ available from the Inference Engine API. For example, the part of performance co
|
||||
| resnet\_model/add\_5/fq\_input\_1 | NOT\_RUN | FakeQuantize | undef | 0 | 0 |
|
||||
|
||||
|
||||
> The `exeStatus` column of the table includes possible values:
|
||||
> - `EXECUTED` - layer was executed by standalone primitive,
|
||||
> - `NOT_RUN` - layer was not executed by standalone primitive or was fused with another operation and executed in another layer primitive.
|
||||
>
|
||||
> The `execType` column of the table includes inference primitives with specific suffixes. The layers have the following marks:
|
||||
> * Suffix `I8` for layers that had 8-bit data type input and were computed in 8-bit precision
|
||||
> * Suffix `FP32` for layers computed in 32-bit precision
|
||||
The `exeStatus` column of the table includes possible values:
|
||||
- `EXECUTED` - layer was executed by standalone primitive,
|
||||
- `NOT_RUN` - layer was not executed by standalone primitive or was fused with another operation and executed in another layer primitive.
|
||||
|
||||
The `execType` column of the table includes inference primitives with specific suffixes. The layers have the following marks:
|
||||
* Suffix `I8` for layers that had 8-bit data type input and were computed in 8-bit precision
|
||||
* Suffix `FP32` for layers computed in 32-bit precision
|
||||
|
||||
All `Convolution` layers are executed in int8 precision. Rest layers are fused into Convolutions using post operations optimization technique, which is described in [Internal CPU Plugin Optimizations](supported_plugins/CPU.md).
|
||||
|
||||
[int8_flow]: img/cpu_int8_flow.png
|
||||
All `Convolution` layers are executed in int8 precision. The remaining layers are fused into Convolutions using the post-operations optimization technique, which is described in [Internal CPU Plugin Optimizations](supported_plugins/CPU.md).
|
||||
|
||||
@@ -1,131 +1,252 @@
|
||||
Integrate the Inference Engine with Your Application {#openvino_docs_IE_DG_Integrate_with_customer_application_new_API}
|
||||
===============================
|
||||
# Integrate Inference Engine {#openvino_docs_IE_DG_Integrate_with_customer_application_new_API}
|
||||
|
||||
This section provides a high-level description of the process of integrating the Inference Engine into your application.
|
||||
Refer to the [Hello Classification Sample](../../inference-engine/samples/hello_classification/README.md) sources
|
||||
for an example of using the Inference Engine in applications.
|
||||
## Integrate Inference Engine with Your C++ Application
|
||||
|
||||
## Use the Inference Engine API in Your Code
|
||||
@sphinxdirective
|
||||
.. raw:: html
|
||||
|
||||
The core `libinference_engine.so` library implements loading and parsing a model Intermediate Representation (IR), and triggers inference using a specified device. The core library has the following API:
|
||||
<div id="switcher-cpp" class="switcher-anchor">C++</div>
|
||||
@endsphinxdirective
|
||||
|
||||
* `InferenceEngine::Core`
|
||||
* `InferenceEngine::Blob`, `InferenceEngine::TBlob`,
|
||||
`InferenceEngine::NV12Blob`
|
||||
* `InferenceEngine::BlobMap`
|
||||
* `InferenceEngine::InputsDataMap`, `InferenceEngine::InputInfo`,
|
||||
* `InferenceEngine::OutputsDataMap`
|
||||
The following diagram illustrates the typical Inference Engine C++ API workflow:
|
||||
|
||||
The C++ Inference Engine API wraps the capabilities of the core library:
|
||||
![ie_api_flow_cpp]
|
||||
|
||||
* `InferenceEngine::CNNNetwork`
|
||||
* `InferenceEngine::ExecutableNetwork`
|
||||
* `InferenceEngine::InferRequest`
|
||||
Read the sections below to learn about each item.
|
||||
|
||||
## Integration Steps
|
||||
> **NOTE**: Before you start using the Inference Engine, make sure you set all environment variables during the installation. If you did not, follow the instructions from the _Set the Environment Variables_ section in the installation guides:
|
||||
> * [For Windows* 10](../install_guides/installing-openvino-windows.md)
|
||||
> * [For Linux*](../install_guides/installing-openvino-linux.md)
|
||||
> * [For macOS*](../install_guides/installing-openvino-macos.md)
|
||||
> * To build an open source version, use the [Inference Engine Build Instructions](https://github.com/openvinotoolkit/openvino/wiki/BuildingCode).
|
||||
|
||||
The integration process includes the following steps:
|
||||
![integration_process]
|
||||
### Link with Inference Library
|
||||
|
||||
1) **Create Inference Engine Core** to manage available devices and read network objects:
|
||||
1. **Create a structure** for the project:
|
||||
``` sh
|
||||
project/
|
||||
├── CMakeLists.txt - CMake file to build
|
||||
├── ... - Additional folders like includes/
|
||||
└── src/ - source folder
|
||||
└── main.cpp
|
||||
build/ - build directory
|
||||
...
|
||||
```
|
||||
|
||||
2. **Include Inference Engine, nGraph and OpenCV libraries** in `project/CMakeLists.txt`
|
||||
[OpenCV](https://docs.opencv.org/master/db/df5/tutorial_linux_gcc_cmake.html) integration is needed mostly for pre-processing input data; nGraph is needed for more complex applications that use the [nGraph API](../nGraph_DG/nGraph_dg.md).
|
||||
``` cmake
|
||||
cmake_minimum_required(VERSION 3.0.0)
|
||||
project(project_name)
|
||||
find_package(ngraph REQUIRED)
|
||||
find_package(InferenceEngine REQUIRED)
|
||||
find_package(OpenCV REQUIRED)
|
||||
add_executable(${PROJECT_NAME} src/main.cpp)
|
||||
target_link_libraries(${PROJECT_NAME} PRIVATE ${InferenceEngine_LIBRARIES} ${OpenCV_LIBS} ${NGRAPH_LIBRARIES})
|
||||
```
|
||||
|
||||
### Use Inference Engine API to Implement Inference Pipeline
|
||||
|
||||
This section provides step-by-step instructions to implement a typical inference pipeline with the Inference Engine C++ API:
|
||||
|
||||
![ie_api_use_cpp]
|
||||
#### Step 1. Create Inference Engine Core
|
||||
|
||||
Use the following code to create Inference Engine Core to manage available devices and read network objects:
|
||||
|
||||
@snippet snippets/Integrate_with_customer_application_new_API.cpp part0
|
||||
|
||||
2) **Read a model IR** created by the Model Optimizer (.xml is the supported format):
|
||||
#### Step 2 (Optional). Configure Input and Output of the Model
|
||||
|
||||
@snippet snippets/Integrate_with_customer_application_new_API.cpp part1
|
||||
@sphinxdirective
|
||||
.. raw:: html
|
||||
|
||||
**Or read the model from ONNX format** (.onnx and .prototxt are supported formats). You can find more information about the ONNX format support in the document [ONNX format support in the OpenVINO™](./ONNX_Support.md).
|
||||
<div class="collapsible-section">
|
||||
@endsphinxdirective
|
||||
|
||||
|
||||
@snippet snippets/Integrate_with_customer_application_new_API.cpp part2
|
||||
Optionally, configure input and output of the model using the steps below:
|
||||
|
||||
3) **Configure input and output**. Request input and output information using `InferenceEngine::CNNNetwork::getInputsInfo()`, and `InferenceEngine::CNNNetwork::getOutputsInfo()`
|
||||
methods:
|
||||
1. Load a model to a Core object:
|
||||
@sphinxdirective
|
||||
|
||||
.. tab:: IR
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
auto network = core.ReadNetwork("model.xml");
|
||||
|
||||
@snippet snippets/Integrate_with_customer_application_new_API.cpp part3
|
||||
.. tab:: ONNX
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
auto network = core.ReadNetwork("model.onnx");
|
||||
|
||||
Optionally, set the number format (precision) and memory layout for inputs and outputs. Refer to the
|
||||
[Supported configurations](supported_plugins/Supported_Devices.md) chapter to choose the relevant configuration.
|
||||
You can find more information about the ONNX format support in the document `ONNX format support in the OpenVINO™ <https://docs.openvinotoolkit.org/latest/openvino_docs_IE_DG_ONNX_Support.html>`_
|
||||
|
||||
.. tab:: nGraph
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
std::shared_ptr<Function> createNetwork() {
|
||||
// To construct a network, please follow
|
||||
// https://docs.openvinotoolkit.org/latest/openvino_docs_nGraph_DG_build_function.html
|
||||
}
|
||||
auto network = CNNNetwork(createNetwork());
|
||||
|
||||
You can also allow input of any size. To do this, mark each input as resizable by setting a desired resize algorithm (e.g. `BILINEAR`) inside of the appropriate input info.
|
||||
@endsphinxdirective
|
||||
|
||||
Basic color format conversions are supported as well. By default, the Inference Engine assumes
|
||||
that the input color format is `BGR` and color format conversions are disabled. The Inference
|
||||
Engine supports the following color format conversions:
|
||||
* `RGB->BGR`
|
||||
* `RGBX->BGR`
|
||||
* `BGRX->BGR`
|
||||
* `NV12->BGR`
|
||||
2. Request input and output information using `InferenceEngine::CNNNetwork::getInputsInfo()`, and `InferenceEngine::CNNNetwork::getOutputsInfo()` methods:
|
||||
```cpp
|
||||
/** Take information about all topology inputs **/
|
||||
InferenceEngine::InputsDataMap input_info = network.getInputsInfo();
|
||||
/** Iterate over all input info**/
|
||||
for (auto &item : input_info) {
|
||||
auto input_data = item.second;
|
||||
// Add your input configuration steps here
|
||||
}
|
||||
|
||||
/** Take information about all topology outputs **/
|
||||
InferenceEngine::OutputsDataMap output_info = network.getOutputsInfo();
|
||||
/** Iterate over all output info**/
|
||||
for (auto &item : output_info) {
|
||||
auto output_data = item.second;
|
||||
// Add your output configuration steps here
|
||||
}
|
||||
```
|
||||
Configuring options:
|
||||
1. **Set precision** (number format): FP16, FP32, INT8, etc. Refer to the Supported Configurations section on the [Supported Devices](supported_plugins/Supported_Devices.md) page to choose the relevant configuration.<br>
|
||||
For input (*iterate over all input info*):
|
||||
```cpp
|
||||
input_data->setPrecision(InferenceEngine::Precision::U8);
|
||||
```
|
||||
For output (*iterate over all output info*):
|
||||
```cpp
|
||||
output_data->setPrecision(InferenceEngine::Precision::FP32);
|
||||
```
|
||||
**By default**, the input and output precision is set to `Precision::FP32`.
|
||||
|
||||
where `X` is a channel that will be ignored during inference. To enable the conversions, set a
|
||||
desired color format (for example, `RGB`) for each input inside of the appropriate input info.
|
||||
2. **Set layout** (e.g. NCHW).<br>
|
||||
For input (*iterate over all input info*):
|
||||
```cpp
|
||||
input_data->setLayout(InferenceEngine::Layout::NCHW);
|
||||
```
|
||||
**By default**, the input layout is set to `Layout::NCHW`.<br>
|
||||
For output (*iterate over all output info*):
|
||||
```cpp
|
||||
output_data->setLayout(InferenceEngine::Layout::NC);
|
||||
```
|
||||
**By default**, the output layout depends on the number of its dimensions:<br>
|
||||
|Number of dimensions | 5 | 4 | 3 | 2 | 1 |
|
||||
|:--------------------|-------|------|-----|----|----|
|
||||
|Layout | NCDHW | NCHW | CHW | NC | C |
|
||||
3. **Set resize algorithm for inputs** (Bilinear). You can allow input of any size. To do this, mark each input as resizable by setting a desired resize algorithm (e.g. `BILINEAR`) inside of the appropriate input info (*Iterate over all input info*):
|
||||
```cpp
|
||||
input_data->getPreProcess().setResizeAlgorithm(InferenceEngine::RESIZE_BILINEAR);
|
||||
```
|
||||
**By default**, no resize algorithm is set for inputs.
|
||||
|
||||
If you want to run inference for multiple images at once, you can use the built-in batch
|
||||
pre-processing functionality.
|
||||
4. **Set color format** (BGR, RGB, NV12). Basic color format conversions are supported as well. **By default**, the Inference Engine assumes that the input color format is BGR and color format conversions are disabled. Set `ColorFormat::RAW` input color format if the input does not need color conversions. The Inference Engine supports the following color format conversions:
|
||||
* RGB->BGR
|
||||
* RGBX->BGR
|
||||
* BGRX->BGR
|
||||
* NV12->BGR
|
||||
where X is a channel that will be ignored during inference. To enable the conversions, set a desired color format (for example, RGB) for each input inside of the appropriate input info (*iterate over all input info*):
|
||||
```cpp
|
||||
input_data->getPreProcess().setColorFormat(InferenceEngine::ColorFormat::RGB);
|
||||
```
|
||||
> **NOTE**: NV12 input color format pre-processing differs from other color conversions. In case of NV12, Inference Engine expects two separate image planes (Y and UV). You must use a specific `InferenceEngine::NV12Blob` object instead of default blob object and set this blob to the Inference Engine Infer Request using `InferenceEngine::InferRequest::SetBlob()`. Refer to [Hello NV12 Input Classification C++ Sample](../../inference-engine/samples/hello_nv12_input_classification/README.md) for more details.
|
||||
|
||||
5. **Run on multiple images** by setting the batch size. If you want to run inference for multiple images at once, you can use the built-in batch pre-processing functionality.
|
||||
|
||||
**NOTE** : Batch pre-processing is not supported if input color format is set to `ColorFormat::NV12`.
|
||||
|
||||
> **NOTE**: Batch pre-processing is not supported if input color format is set to `ColorFormat::NV12`.
|
||||
@sphinxdirective
|
||||
.. raw:: html
|
||||
|
||||
You can use the following code snippet to configure input and output:
|
||||
</div>
|
||||
@endsphinxdirective
|
||||
|
||||
@snippet snippets/Integrate_with_customer_application_new_API.cpp part4
|
||||
#### Step 3. Load the Model to the Device
|
||||
|
||||
> **NOTE**: NV12 input color format pre-processing differs from other color conversions. In case of NV12,
|
||||
> Inference Engine expects two separate image planes (Y and UV). You must use a specific
|
||||
> `InferenceEngine::NV12Blob` object instead of default blob object and set this blob to
|
||||
> the Inference Engine Infer Request using `InferenceEngine::InferRequest::SetBlob()`.
|
||||
> Refer to [Hello NV12 Input Classification C++ Sample](../../inference-engine/samples/hello_nv12_input_classification/README.md)
|
||||
> for more details.
|
||||
Load the model to the device using `InferenceEngine::Core::LoadNetwork()`:
|
||||
|
||||
If you skip this step, the default values are set:
|
||||
|
||||
* no resize algorithm is set for inputs
|
||||
* input color format - `ColorFormat::RAW` meaning that input does not need color
|
||||
conversions
|
||||
* input and output precision - `Precision::FP32`
|
||||
* input layout - `Layout::NCHW`
|
||||
* output layout depends on number of its dimensions:
|
||||
@sphinxdirective
|
||||
|
||||
.. tab:: IR
|
||||
|
||||
|Number of dimensions | 5 | 4 | 3 | 2 | 1 |
|
||||
|:--------------------|-------|------|-----|----|----|
|
||||
|Layout | NCDHW | NCHW | CHW | NC | C |
|
||||
.. code-block:: c
|
||||
|
||||
4) **Load the model** to the device using `InferenceEngine::Core::LoadNetwork()`:
|
||||
executable_network = core.LoadNetwork("model.xml", "CPU");
|
||||
|
||||
@snippet snippets/Integrate_with_customer_application_new_API.cpp part5
|
||||
.. tab:: ONNX
|
||||
|
||||
It creates an executable network from a network object. The executable network is associated with single hardware device.
|
||||
It is possible to create as many networks as needed and to use them simultaneously (up to the limitation of the hardware resources).
|
||||
Third parameter is a configuration for plugin. It is map of pairs: (parameter name, parameter value). Choose device from
|
||||
[Supported devices](supported_plugins/Supported_Devices.md) page for more details about supported configuration parameters.
|
||||
.. code-block:: c
|
||||
|
||||
executable_network = core.LoadNetwork("model.onnx", "CPU");
|
||||
|
||||
.. tab:: nGraph
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
std::shared_ptr<Function> createNetwork() {
|
||||
// To construct a network, please follow
|
||||
// https://docs.openvinotoolkit.org/latest/openvino_docs_nGraph_DG_build_function.html
|
||||
}
|
||||
auto network = CNNNetwork(createNetwork());
|
||||
executable_network = core.LoadNetwork(network, "CPU");
|
||||
|
||||
.. tab:: Model From Step 2
|
||||
|
||||
Follow this step only if you went through optional "Step 2 (Optional). Configure Input and Output of the Model", otherwise use another tab for your model type: IR (OpenVINO Intermediate Representation), ONNX or nGraph.
|
||||
|
||||
.. code-block:: c
|
||||
|
||||
executable_network = core.LoadNetwork(network, "CPU");
|
||||
|
||||
@endsphinxdirective
|
||||
|
||||
|
||||
This call creates an executable network from a network object. The executable network is associated with a single hardware device.
|
||||
It is possible to create as many networks as needed and to use them simultaneously (up to the limitation of the hardware resources).
|
||||
|
||||
The third parameter is a configuration for the plugin, specified as a map of pairs (parameter name, parameter value). See the
|
||||
[Supported Devices](supported_plugins/Supported_Devices.md) page for details about supported configuration parameters.
|
||||
|
||||
@snippet snippets/Integrate_with_customer_application_new_API.cpp part6
|
||||
|
||||
5) **Create an infer request**:
|
||||
#### Step 4. Create an Inference Request
|
||||
|
||||
Create an infer request using the following code:
|
||||
|
||||
@snippet snippets/Integrate_with_customer_application_new_API.cpp part7
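If the snippet is not shown, a compact end-to-end sketch of steps 1 through 4 might look as follows; the model path and device name are placeholders:

```cpp
#include <inference_engine.hpp>

int main() {
    InferenceEngine::Core core;                                                          // Step 1
    InferenceEngine::CNNNetwork network = core.ReadNetwork("model.xml");                 // Step 2 configuration omitted
    InferenceEngine::ExecutableNetwork exec_network = core.LoadNetwork(network, "CPU");  // Step 3
    InferenceEngine::InferRequest infer_request = exec_network.CreateInferRequest();     // Step 4
    return 0;
}
```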
|
||||
|
||||
6) **Prepare input**. You can use one of the following options to prepare input:
|
||||
* **Optimal way for a single network.** Get blobs allocated by an infer request using `InferenceEngine::InferRequest::GetBlob()` and feed an image and the input data to the blobs. In this case, input data must be aligned (resized manually) with a given blob size and have a correct color format.
|
||||
#### Step 5. Prepare Input
|
||||
|
||||
@snippet snippets/Integrate_with_customer_application_new_API.cpp part8
|
||||
You can use one of the following options to prepare input:
|
||||
|
||||
* **Optimal way for a cascade of networks (output of one network is input for another).** Get output blob from the first request using `InferenceEngine::InferRequest::GetBlob()` and set it as input for the second request using `InferenceEngine::InferRequest::SetBlob()`.
|
||||
* **Optimal way for a single network.** Get blobs allocated by an infer request using `InferenceEngine::InferRequest::GetBlob()` and feed an image and the input data to the blobs. In this case, input data must be aligned (resized manually) with a given blob size and have the correct color format. A sketch illustrating this option is shown after this list.
|
||||
|
||||
@snippet snippets/Integrate_with_customer_application_new_API.cpp part9
|
||||
@snippet snippets/Integrate_with_customer_application_new_API.cpp part8
|
||||
|
||||
* **Optimal way to handle ROI (an ROI object located inside the input of one network is the input of another).** Several networks can re-use a shared input. You do not need to allocate a separate input blob for a network if it processes an ROI object located inside an already allocated input of a previous network. For instance, the first network may detect objects on a video frame (stored as an input blob), while the second network accepts the detected bounding boxes (ROIs inside the frame) as input. In this case, the second network can re-use the pre-allocated input blob of the first network and just crop the ROI, without allocating new memory, by calling `InferenceEngine::make_shared_blob()` with `InferenceEngine::Blob::Ptr` and `InferenceEngine::ROI` as parameters.
|
||||
* **Optimal way for a cascade of networks (output of one network is input for another).** Get output blob from the first request using `InferenceEngine::InferRequest::GetBlob()` and set it as input for the second request using `InferenceEngine::InferRequest::SetBlob()`.
|
||||
|
||||
@snippet snippets/Integrate_with_customer_application_new_API.cpp part10
|
||||
@snippet snippets/Integrate_with_customer_application_new_API.cpp part9
|
||||
|
||||
Make sure that shared input is kept valid during execution of each network. Otherwise, ROI blob may be corrupted if the original input blob (that ROI is cropped from) has already been rewritten.
|
||||
* **Optimal way to handle ROI (an ROI object located inside the input of one network is the input of another).** Several networks can re-use a shared input. You do not need to allocate a separate input blob for a network if it processes an ROI object located inside an already allocated input of a previous network. For instance, the first network may detect objects on a video frame (stored as an input blob), while the second network accepts the detected bounding boxes (ROIs inside the frame) as input. In this case, the second network can re-use the pre-allocated input blob of the first network and just crop the ROI, without allocating new memory, by calling `InferenceEngine::make_shared_blob()` with `InferenceEngine::Blob::Ptr` and `InferenceEngine::ROI` as parameters.
|
||||
|
||||
* Allocate input blobs of the appropriate types and sizes, feed an image and the input data to the blobs, and call `InferenceEngine::InferRequest::SetBlob()` to set these blobs for an infer request:
|
||||
@snippet snippets/Integrate_with_customer_application_new_API.cpp part10
|
||||
|
||||
@snippet snippets/Integrate_with_customer_application_new_API.cpp part11
|
||||
Make sure that shared input is kept valid during execution of each network. Otherwise, ROI blob may be corrupted if the original input blob (that ROI is cropped from) has already been rewritten.
|
||||
|
||||
A blob can be filled before and after `SetBlob()`.
|
||||
* Allocate input blobs of the appropriate types and sizes, feed an image and the input data to the blobs, and call `InferenceEngine::InferRequest::SetBlob()` to set these blobs for an infer request:
|
||||
|
||||
> **NOTE:**
|
||||
@snippet snippets/Integrate_with_customer_application_new_API.cpp part11
|
||||
|
||||
A blob can be filled before and after `SetBlob()`.
|
||||
|
||||
> **NOTE**:
|
||||
>
|
||||
> * The `SetBlob()` method compares precision and layout of an input blob with the ones defined in step 3 and
|
||||
> throws an exception if they do not match. It also compares the size of the input blob with the input
|
||||
@@ -141,22 +262,28 @@ Make sure that shared input is kept valid during execution of each network. Othe
|
||||
> corresponding values of the read network. No pre-processing will happen for this blob. If you
|
||||
> call `GetBlob()` after `SetBlob()`, you will get the blob you set in `SetBlob()`.
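For illustration, here is a minimal sketch of the options above (variable names such as `infer_request`, `input_name`, `output_name`, and `second_request` are assumptions carried over from the surrounding steps, and the dimensions are just an example):

```cpp
// Option 1 (single network): fill the blob already allocated by the request.
InferenceEngine::Blob::Ptr input = infer_request.GetBlob(input_name);
auto minput = InferenceEngine::as<InferenceEngine::MemoryBlob>(input);
auto holder = minput->wmap();
float* data = holder.as<float*>();
// ... copy the pre-processed image into `data` here ...

// Option 2 (cascade): feed the output of one request as the input of another.
second_request.SetBlob(second_input_name, infer_request.GetBlob(output_name));

// Option 3 (ROI): crop a region of an already allocated input without copying.
InferenceEngine::ROI roi{0 /*id*/, 100 /*posX*/, 50 /*posY*/, 200 /*sizeX*/, 150 /*sizeY*/};
InferenceEngine::Blob::Ptr roi_blob = InferenceEngine::make_shared_blob(input, roi);
second_request.SetBlob(second_input_name, roi_blob);

// Option 4: allocate your own blob and pass it with SetBlob().
InferenceEngine::TensorDesc tdesc(InferenceEngine::Precision::FP32,
                                  {1, 3, 224, 224}, InferenceEngine::Layout::NCHW);
InferenceEngine::Blob::Ptr my_blob = InferenceEngine::make_shared_blob<float>(tdesc);
my_blob->allocate();
infer_request.SetBlob(input_name, my_blob);
```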
|
||||
|
||||
7) **Do inference** by calling the `InferenceEngine::InferRequest::StartAsync` and `InferenceEngine::InferRequest::Wait` methods for asynchronous request:
|
||||
#### Step 6. Start Inference
|
||||
|
||||
@snippet snippets/Integrate_with_customer_application_new_API.cpp part12
|
||||
Start inference in asynchronous or synchronous mode. Using the Async API can improve the overall frame rate of the application: rather than waiting for inference to complete, the app can keep working on the host while the accelerator is busy.
|
||||
|
||||
or by calling the `InferenceEngine::InferRequest::Infer` method for synchronous request:
|
||||
* For synchronous inference request:
|
||||
```cpp
|
||||
infer_request.Infer();
|
||||
```
|
||||
|
||||
@snippet snippets/Integrate_with_customer_application_new_API.cpp part13
|
||||
* For asynchronous inference request:
|
||||
```cpp
|
||||
infer_request.StartAsync();
|
||||
infer_request.Wait(InferenceEngine::InferRequest::WaitMode::RESULT_READY);
|
||||
```
|
||||
`StartAsync` returns immediately and starts inference without blocking the main thread; `Infer` blocks the main thread and returns when inference is completed. Call `Wait` to wait for the result of an asynchronous request to become available.
|
||||
|
||||
|
||||
|
||||
There are three ways to use it:
* Specify a maximum duration in milliseconds to block for. The method blocks until the specified timeout elapses or the result becomes available, whichever comes first (see the sketch after this list).
* `InferenceEngine::InferRequest::WaitMode::RESULT_READY` - waits until the inference result becomes available.
* `InferenceEngine::InferRequest::WaitMode::STATUS_ONLY` - immediately returns the request status. It does not block or interrupt the current thread.
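A minimal sketch of the three usages (assuming the `infer_request` object created earlier):

```cpp
// 1. Block for at most 500 ms.
infer_request.Wait(500);
// 2. Block until the result is ready.
infer_request.Wait(InferenceEngine::InferRequest::WaitMode::RESULT_READY);
// 3. Query the status without blocking.
InferenceEngine::StatusCode status =
    infer_request.Wait(InferenceEngine::InferRequest::WaitMode::STATUS_ONLY);
```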
|
||||
|
||||
|
||||
|
||||
Both types of request are thread-safe: they can be called from different threads without risking corruption or failures.
|
||||
|
||||
@@ -165,42 +292,21 @@ Multiple requests for single `ExecutableNetwork` are executed sequentially one b
|
||||
While a request is ongoing, all of its methods except `InferenceEngine::InferRequest::Wait` will throw an
|
||||
exception.
|
||||
|
||||
8) Go over the output blobs and **process the results**.
|
||||
Note that casting `Blob` to `TBlob` via `std::dynamic_pointer_cast` is not the recommended way. It's better to access data via the `buffer()` and `as()` methods as follows:
|
||||
#### Step 7. Process the Inference Results
|
||||
|
||||
Go over the output blobs and process the inference results. Note that casting `Blob` to `TBlob` via `std::dynamic_pointer_cast` is not the recommended way. It's better to access data via the `buffer()` and `as()` methods as follows:
|
||||
|
||||
@snippet snippets/Integrate_with_customer_application_new_API.cpp part14
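The referenced snippet is not reproduced here. A minimal sketch of reading an output blob (the `output_name` variable is an assumption, taken from the network's output info):

```cpp
InferenceEngine::Blob::Ptr output_blob = infer_request.GetBlob(output_name);
// Access the data through buffer()/as() instead of dynamic_pointer_cast to TBlob.
float* output_data = output_blob->buffer().as<float*>();
```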
|
||||
|
||||
## Build Your Application
|
||||
### Build Your Application
|
||||
|
||||
For details about building your application, refer to the CMake files for the sample applications.
|
||||
The source code of all samples is located in the `<INSTALL_DIR>/openvino/inference_engine/samples` directory, where `INSTALL_DIR` is the OpenVINO™ installation directory.
|
||||
|
||||
### CMake project creation
|
||||
To build your project using CMake with the default build tools currently available on your machine, execute the following commands:
|
||||
|
||||
1. **Create a structure** for the project:
|
||||
``` sh
|
||||
project/
|
||||
├── CMakeLists.txt - CMake file to build
|
||||
├── ... - Additional folders like includes/
|
||||
└── src/ - source folder
|
||||
└── main.cpp
|
||||
build/ - build directory
|
||||
...
|
||||
```
|
||||
> **NOTE**: Make sure you set environment variables first by running `<INSTALL_DIR>/bin/setupvars.sh` (or `setupvars.bat` for Windows). Otherwise the `InferenceEngine_DIR` and `OpenCV_DIR` variables won't be configured properly to pass `find_package` calls.
|
||||
|
||||
2. **Include Inference Engine, nGraph and OpenCV libraries** in `project/CMakeLists.txt`
|
||||
[OpenCV](https://docs.opencv.org/master/db/df5/tutorial_linux_gcc_cmake.html) integration is needed mostly for pre-processing input data; nGraph is needed for more complex applications that use the [ngraph API](../nGraph_DG/nGraph_dg.md).
|
||||
``` cmake
|
||||
cmake_minimum_required(VERSION 3.0.0)
|
||||
project(project_name)
|
||||
find_package(ngraph REQUIRED)
|
||||
find_package(InferenceEngine REQUIRED)
|
||||
find_package(OpenCV REQUIRED)
|
||||
add_executable(${PROJECT_NAME} src/main.cpp)
|
||||
target_link_libraries(${PROJECT_NAME} PRIVATE ${InferenceEngine_LIBRARIES} ${OpenCV_LIBS} ${NGRAPH_LIBRARIES})
|
||||
```
|
||||
3. **To build your project** using CMake with the default build tools currently available on your machine, execute the following commands:
|
||||
> **NOTE**: Make sure you set environment variables first by running `<INSTALL_DIR>/bin/setupvars.sh` (or `setupvars.bat` for Windows). Otherwise the `InferenceEngine_DIR` and `OpenCV_DIR` variables won't be configured properly to pass `find_package` calls.
|
||||
```sh
|
||||
cd build/
|
||||
cmake ../project
|
||||
@@ -217,4 +323,172 @@ Redistributable and Intel® C++ Compiler 2017 Redistributable packages are insta
|
||||
`<INSTALL_DIR>/bin/intel64/Release/*.dll` files are placed to the
|
||||
application folder or accessible via `%PATH%` environment variable.
|
||||
|
||||
[integration_process]: img/integration_process.png
|
||||
## Integrate Inference Engine with Your Python Application
|
||||
|
||||
@sphinxdirective
|
||||
.. raw:: html
|
||||
|
||||
<div id="switcher-python" class="switcher-anchor">Python</div>
|
||||
@endsphinxdirective
|
||||
|
||||
This document explains how to integrate and use the Inference Engine API with your Python application.
|
||||
|
||||
The following diagram illustrates the typical Inference Engine Python API workflow:
|
||||
![ie_api_flow_python]
|
||||
|
||||
Read the sections below to learn about each item.
|
||||
|
||||
### Import Inference Module
|
||||
|
||||
To make use of the Inference Engine functionality, import IECore to your application:
|
||||
|
||||
```py
|
||||
from openvino.inference_engine import IECore
|
||||
```
|
||||
|
||||
### Use Inference Engine API
|
||||
|
||||
This section provides step-by-step instructions to implement a typical inference pipeline with the Inference Engine API:
|
||||
|
||||
![ie_api_use_python]
|
||||
|
||||
#### Step 1. Create Inference Engine Core
|
||||
|
||||
Use the following code to create Inference Engine Core to manage available devices and read network objects:
|
||||
```py
|
||||
ie = IECore()
|
||||
```
|
||||
#### Step 2 (Optional). Read model. Configure Input and Output of the Model
|
||||
|
||||
@sphinxdirective
|
||||
.. raw:: html
|
||||
|
||||
<div class="collapsible-section">
|
||||
@endsphinxdirective
|
||||
|
||||
Optionally, configure input and output of the model using the steps below:
|
||||
|
||||
1. Read model
|
||||
@sphinxdirective
|
||||
|
||||
.. tab:: IR
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
net = ie.read_network(model="model.xml")
|
||||
|
||||
.. tab:: ONNX
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
net = ie.read_network(model="model.onnx")
|
||||
|
||||
.. tab:: nGraph
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
# Basic example of nGraph model creation
# (assumes: import ngraph as ng; from ngraph.impl import Function, Shape, Type;
#  from ngraph.impl.op import Parameter; from openvino.inference_engine import IENetwork)
|
||||
param = Parameter(Type.f32, Shape([1, 3, 22, 22]))
|
||||
relu = ng.relu(param)
|
||||
func = Function([relu], [param], 'test')
|
||||
caps = Function.to_capsule(func)
|
||||
net = IENetwork(caps)
|
||||
|
||||
@endsphinxdirective
|
||||
|
||||
2. Request input and output information using `input_info` and `outputs`:
|
||||
```py
|
||||
inputs = net.input_info
|
||||
input_name = next(iter(net.input_info))
|
||||
|
||||
outputs = net.outputs
|
||||
output_name = next(iter(net.outputs))
|
||||
```
|
||||
Information about this input layer is stored in `input_info`. The next snippet prints the input layout, precision, and shape.
|
||||
```py
|
||||
print("Inputs:")
|
||||
for name, info in net.input_info.items():
|
||||
print("\tname: {}".format(name))
|
||||
print("\tshape: {}".format(info.tensor_desc.dims))
|
||||
print("\tlayout: {}".format(info.layout))
|
||||
print("\tprecision: {}\n".format(info.precision))
|
||||
```
|
||||
This cell output tells us that the model expects inputs with a shape of [1,3,224,224], and that this is in NCHW layout. This means that the model expects input data with a batch size (N) of 1, 3 channels (C), and images of a height (H) and width (W) of 224. The input data is expected to be of FP32 (floating point) precision.
|
||||
|
||||
Getting the output layout, precision and shape is similar to getting the input layout, precision and shape.
|
||||
```py
|
||||
print("Outputs:")
|
||||
for name, info in net.outputs.items():
|
||||
print("\tname: {}".format(name))
|
||||
print("\tshape: {}".format(info.shape))
|
||||
print("\tlayout: {}".format(info.layout))
|
||||
print("\tprecision: {}\n".format(info.precision))
|
||||
```
|
||||
This cell output shows that the model returns outputs with a shape of [1, 1001], where 1 is the batch size (N) and 1001 the number of classes (C). The output is returned as 32-bit floating point.
|
||||
|
||||
@sphinxdirective
|
||||
.. raw:: html
|
||||
|
||||
</div>
|
||||
@endsphinxdirective
|
||||
|
||||
#### Step 3. Load model to the Device
|
||||
|
||||
Load the model to the device using `load_network()`:
|
||||
|
||||
@sphinxdirective
|
||||
|
||||
.. tab:: IR
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
exec_net = ie.load_network(network= "model.xml", device_name="CPU")
|
||||
.. tab:: ONNX
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
exec_net = ie.load_network(network= "model.onnx", device_name="CPU")
|
||||
|
||||
.. tab:: Model from step 2
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
exec_net = ie.load_network(network=net, device_name="CPU")
|
||||
|
||||
@endsphinxdirective
|
||||
|
||||
This example is designed for the CPU device; refer to the [Supported Devices](../IE_DG/supported_plugins/Supported_Devices.md) page for other supported devices.
|
||||
|
||||
#### Step 4. Prepare input
|
||||
```py
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
image = cv2.imread("image.png")
|
||||
|
||||
# Resize the image with OpenCV if needed to match the network input shape
|
||||
# N, C, H, W = net.input_info[input_name].tensor_desc.dims
|
||||
# image = cv2.resize(src=image, dsize=(W, H))
|
||||
|
||||
# Convert the image to NCHW layout with FP32 precision
|
||||
input_data = np.expand_dims(np.transpose(image, (2, 0, 1)), 0).astype(np.float32)
|
||||
```
|
||||
|
||||
#### Step 5. Start Inference
|
||||
```py
|
||||
result = exec_net.infer({input_name: input_data})
|
||||
```
|
||||
|
||||
#### Step 6. Process the Inference Results
|
||||
```py
|
||||
output = result[output_name]
|
||||
```
|
||||
|
||||
### Run Your Application
|
||||
|
||||
Congratulations, you have created your first Python application with the OpenVINO™ toolkit. Now you can run it.
|
||||
|
||||
[ie_api_flow_cpp]: img/BASIC_IE_API_workflow_Cpp.svg
|
||||
[ie_api_use_cpp]: img/IMPLEMENT_PIPELINE_with_API_C.svg
|
||||
[ie_api_flow_python]: img/BASIC_IE_API_workflow_Python.svg
|
||||
[ie_api_use_python]: img/IMPLEMENT_PIPELINE_with_API_Python.svg
|
||||
@@ -1,4 +1,4 @@
|
||||
# Introduction to the Performance Topics {#openvino_docs_IE_DG_Intro_to_Performance}
|
||||
# Runtime Optimization Guide {#openvino_docs_IE_DG_Intro_to_Performance}
|
||||
|
||||
This section is a shorter version of the
|
||||
[Optimization Guide](../optimization_guide/dldt_optimization_guide.md) for the Intel® Distribution of OpenVINO™ Toolkit.
|
||||
@@ -31,6 +31,12 @@ input images to achieve optimal throughput. However, high batch size also comes
|
||||
latency penalty. So, for more real-time oriented usages, lower batch sizes (as low as a single input) are used.
|
||||
Refer to the [Benchmark App](../../inference-engine/samples/benchmark_app/README.md) sample, which allows latency vs. throughput measuring.
|
||||
|
||||
## Using Caching API for first inference latency optimization
|
||||
Since the 2021.4 release, the Inference Engine provides the ability to enable internal caching of loaded networks.
This can significantly reduce the LoadNetwork latency for some devices at application startup.
Internally, caching uses the plugin's Export/ImportNetwork flow, as is done for the [Compile tool](../../inference-engine/tools/compile_tool/README.md), but through the regular ReadNetwork/LoadNetwork API.
Refer to the [Model Caching Overview](Model_caching_overview.md) for a more detailed explanation.
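As a minimal sketch, enabling the cache only requires setting the `CACHE_DIR` configuration key before loading a network (the directory path below is an assumption):

```cpp
InferenceEngine::Core core;
core.SetConfig({{"CACHE_DIR", "my_cache_dir"}});       // "CACHE_DIR" corresponds to CONFIG_KEY(CACHE_DIR)
auto exec_net = core.LoadNetwork("model.xml", "CPU");  // subsequent loads of the same model reuse the cache
```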
|
||||
|
||||
## Using Async API
|
||||
To gain better performance on accelerators, such as VPU, the Inference Engine uses the asynchronous approach (see
|
||||
[Integrating Inference Engine in Your Application (current API)](Integrate_with_customer_application_new_API.md)).
|
||||
|
||||
@@ -72,8 +72,8 @@ For the list of supported models refer to the framework or format specific page:
|
||||
* [Supported Caffe* models](../MO_DG/prepare_model/convert_model/Convert_Model_From_Caffe.md)
|
||||
* [Supported TensorFlow* models](../MO_DG/prepare_model/convert_model/Convert_Model_From_TensorFlow.md)
|
||||
* [Supported MXNet* models](../MO_DG/prepare_model/convert_model/Convert_Model_From_MxNet.md)
|
||||
* [Supported ONNX* models](../MO_DG/prepare_model/convert_model/Convert_Model_From_ONNX.md)
|
||||
* [Supported Kaldi* models](../MO_DG/prepare_model/convert_model/Convert_Model_From_Kaldi.md)
|
||||
* [Supported ONNX* models](../MO_DG/prepare_model/convert_model/Convert_Model_From_ONNX.md)
|
||||
|
||||
|
||||
## Intermediate Representation
|
||||
|
||||
@@ -3,7 +3,8 @@
|
||||
## Multiple OpenMP Loadings
|
||||
|
||||
If the application uses the Inference Engine with third-party components that depend on Intel OpenMP, multiple loadings of the libiomp library may occur and cause OpenMP runtime initialization conflicts. This may happen, for example, if the application uses Intel® Math Kernel Library (Intel® MKL) through the “Single Dynamic Library” (<code>libmkl_rt.so</code>) mechanism and calls Intel MKL after loading the Inference Engine plugin.
|
||||
The error log looks as follows:
|
||||
The error log looks like this:
|
||||
|
||||
```sh
|
||||
OMP: Error #15: Initializing libiomp5.so, but found libiomp5.so already initialized.
|
||||
OMP: Hint: This means that multiple copies of the OpenMP runtime have been linked into the program. That is dangerous, since it can degrade performance or cause incorrect results. The best thing to do is to ensure that only a single OpenMP runtime is linked into the process, e.g. by avoiding static linking of the OpenMP runtime in any library. As an unsafe, unsupported, undocumented workaround you can set the environment variable KMP_DUPLICATE_LIB_OK=TRUE to allow the program to continue to execute, but that may cause crashes or silently produce incorrect results. For more information, please see http://www.intel.com/software/products/support/.
|
||||
@@ -12,20 +13,20 @@ OMP: Hint: This means that multiple copies of the OpenMP runtime have been linke
|
||||
Possible workarounds:
|
||||
|
||||
* Preload the OpenMP runtime using the <code>LD_PRELOAD</code> variable:
|
||||
```sh
|
||||
LD_PRELOAD=<path_to_libiomp5.so> <path_to your_executable>
|
||||
```
|
||||
```sh
|
||||
LD_PRELOAD=<path_to_libiomp5.so> <path_to your_executable>
|
||||
```
|
||||
This eliminates multiple loadings of libiomp, and makes all the components use this specific version of OpenMP.
|
||||
|
||||
* Alternatively, you can set <code>KMP_DUPLICATE_LIB_OK=TRUE</code>. However, performance degradation or results incorrectness may occur in this case.
|
||||
* Alternatively, you can set <code>KMP_DUPLICATE_LIB_OK=TRUE</code>. However, performance degradation or incorrect results may occur in this case.
|
||||
|
||||
|
||||
## Old proto compiler breaks protobuf library
|
||||
|
||||
With python protobuf library version 3.5.1 the following incompatibility can happen.
|
||||
The known case is for Cent OS 7.4
|
||||
With python protobuf library version 3.5.1, the following incompatibility can happen.
|
||||
The known case is CentOS 7.4.
|
||||
|
||||
The error log looks as follows:
|
||||
The error log looks like this:
|
||||
|
||||
```sh
|
||||
File "../lib64/python3.5/site-packages/google/protobuf/descriptor.py", line 829, in _new_
|
||||
@@ -33,25 +34,24 @@ return _message.default_pool.AddSerializedFile(serialized_pb)
|
||||
TypeError: expected bytes, str found
|
||||
```
|
||||
|
||||
Possible workaround is to upgrade default protobuf compiler (libprotoc 2.5.0) to newer version, for example
|
||||
libprotoc 2.6.1.
|
||||
A possible workaround is to upgrade the default protobuf compiler (libprotoc 2.5.0) to a newer version, for example libprotoc 2.6.1.
|
||||
|
||||
[protobuf_issue]: https://github.com/google/protobuf/issues/4272
|
||||
|
||||
## Dynamic batching
|
||||
Refer to the **Limitations** section of [Dynamic batching page](DynamicBatching.md)
|
||||
Refer to the **Limitations** section of the [Dynamic batching page](DynamicBatching.md).
|
||||
|
||||
## Static Shape Infer
|
||||
Refer to the **Limitations** section of [Static Shape Infer page](ShapeInference.md)
|
||||
Refer to the **Limitations** section of the [Static Shape Infer page](ShapeInference.md).
|
||||
|
||||
|
||||
## Image Pre-Processing Performance Optimization Issue
|
||||
|
||||
As described in [documentation for new API](Integrate_with_customer_application_new_API.md), you can set an image blob of any size to an
|
||||
infer request using resizable input. Resize is executed during inference using configured resize algorithm.
|
||||
As described in [documentation for the new API](Integrate_with_customer_application_new_API.md), you can set an image blob of any size to an
|
||||
infer request using resizable input. Resize is executed during inference using the configured resize algorithm.
|
||||
|
||||
But currently resize algorithms are not completely optimized. So expect performance degradation if resizable input is
|
||||
specified and an input blob (to be resized) is set (`SetBlob()` is used). Required performance is met for
|
||||
But currently, resize algorithms are not completely optimized. So expect performance degradation if resizable input is
|
||||
specified and an input blob (to be resized) is set using `SetBlob()`. The best performance is for the
|
||||
[CPU](supported_plugins/CPU.md) plugin only (because the enabled OpenMP* runtime provides parallelism).
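For reference, a minimal sketch of configuring a resizable input on the read network (the variable name `network` is an assumption carried over from the earlier steps):

```cpp
InferenceEngine::InputInfo::Ptr input_info = network.getInputsInfo().begin()->second;
input_info->getPreProcess().setResizeAlgorithm(InferenceEngine::RESIZE_BILINEAR);
// With this set, a blob of a different spatial size can be passed via SetBlob()
// and it will be resized during inference.
```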
|
||||
|
||||
Another limitation is that currently, resize algorithms support NCHW layout only. So if you set NHWC layout for an input
|
||||
|
||||
@@ -1,12 +0,0 @@
|
||||
# Legal Information {#openvino_docs_IE_DG_Legal_Information}
|
||||
|
||||
<sup>No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document.</sup><br/>
|
||||
<sup>Intel disclaims all express and implied warranties, including without limitation, the implied warranties of merchantability, fitness for a particular purpose, and non-infringement, as well as any warranty arising from course of performance, course of dealing, or usage in trade.</sup><br/>
|
||||
<sup>This document contains information on products, services and/or processes in development. All information provided here is subject to change without notice. Contact your Intel representative to obtain the latest forecast, schedule, specifications and roadmaps.</sup><br/>
|
||||
<sup>The products and services described may contain defects or errors known as errata which may cause deviations from published specifications. Current characterized errata are available on request.</sup><br/>
|
||||
<sup>Copies of documents which have an order number and are referenced in this document may be obtained by calling 1-800-548-4725 or by visiting [<b>www.intel.com/design/literature.htm</b>](http://www.intel.com/design/literature.htm).</sup><br/>
|
||||
<sup>Intel, Intel logo, Intel Core, VTune, Xeon are trademarks of Intel Corporation in the U.S. and other countries.</sup><br/>
|
||||
<sup>\* Other names and brands may be claimed as the property of others.</sup><br/>
|
||||
<sup>Copyright © 2016-2018 Intel Corporation.</sup><br/>
|
||||
<sup>This software and the related documents are Intel copyrighted materials, and your use of them is governed by the express license under which they were provided to you (License). Unless the License provides otherwise, you may not use, modify, copy, publish, distribute, disclose or transmit this software or the related documents without Intel's prior written permission.</sup><br/>
|
||||
<sup>This software and the related documents are provided as is, with no express or implied warranties, other than those that are expressly stated in the License.</sup><br/>
|
||||
@@ -1,5 +1,12 @@
|
||||
Inference Engine Memory primitives {#openvino_docs_IE_DG_Memory_primitives}
|
||||
=====================================================================
|
||||
# Inference Engine Memory Primitives {#openvino_docs_IE_DG_Memory_primitives}
|
||||
|
||||
## Inference Memory Primitives (C++)
|
||||
|
||||
@sphinxdirective
|
||||
.. raw:: html
|
||||
|
||||
<div id="switcher-cpp" class="switcher-anchor">C++</div>
|
||||
@endsphinxdirective
|
||||
|
||||
## Blobs
|
||||
|
||||
@@ -18,7 +25,7 @@ InferenceEngine::Blob::Ptr blob = InferenceEngine::make_shared_blob<float>(tdesc
|
||||
|
||||
This class allows you to create planar layouts using the standard formats (such as <code>InferenceEngine::Layout::NCDHW</code>, <code>InferenceEngine::Layout::NCHW</code>, <code>InferenceEngine::Layout::NC</code>, and <code>InferenceEngine::Layout::C</code>) and also non-planar layouts using <code>InferenceEngine::BlockingDesc</code>.
|
||||
|
||||
In order to create a complex layout you should use <code>InferenceEngine::BlockingDesc</code> which allows to define the blocked memory with offsets and strides.
|
||||
In order to create a complex layout you should use <code>InferenceEngine::BlockingDesc</code>, which allows you to define the blocked memory with offsets and strides.
|
||||
|
||||
## Examples
|
||||
|
||||
@@ -27,13 +34,12 @@ In order to create a complex layout you should use <code>InferenceEngine::Blocki
|
||||
InferenceEngine::BlockingDesc({1, 20, 20, 25}, {0, 2, 3, 1}); // or
|
||||
InferenceEngine::BlockingDesc({1, 20, 20, 25}, InferenceEngine::Layout::NHWC);
|
||||
</pre>
|
||||
2. If you have a memory with real dimensions {N: 1, C: 25, H: 20, W: 20} but with channels which are blocked by 8, you can define it using next parameters:<br/>
|
||||
2. If you have a memory with real dimensions {N: 1, C: 25, H: 20, W: 20} but with channels that are blocked by 8, you can define it using the following parameters:<br/>
|
||||
<pre class="brush:cpp">
|
||||
InferenceEngine::BlockingDesc({1, 4, 20, 20, 8}, {0, 1, 2, 3, 1})
|
||||
</pre>
|
||||
3. You can also set strides and offsets if the layout contains them.
|
||||
4. If you have a complex blob layout and you don't want to calculate the real offset to data you can use methods
|
||||
<code>InferenceEngine::TensorDesc::offset(size_t l)</code> or <code>InferenceEngine::TensorDesc::offset(SizeVector v)</code>.<br/>
|
||||
4. If you have a complex blob layout and you don't want to calculate the real offset to data you can use the <code>InferenceEngine::TensorDesc::offset(size_t l)</code> or <code>InferenceEngine::TensorDesc::offset(SizeVector v)</code> methods.<br/>
|
||||
For example:
|
||||
<pre class="brush:cpp">
|
||||
InferenceEngine::BlockingDesc blk({1, 4, 20, 20, 8}, {0, 1, 2, 3, 1});
|
||||
@@ -43,8 +49,7 @@ tdesc.offset(1); // = 8
|
||||
tdesc.offset({0, 0, 0, 2}); // = 16
|
||||
tdesc.offset({0, 1, 0, 2}); // = 17
|
||||
</pre>
|
||||
5. If you would like to create a TensorDesc with a planar format and for N dimensions (N can be different 1, 2, 4 and etc), you can use the method
|
||||
<code>InferenceEngine::TensorDesc::getLayoutByDims</code>.
|
||||
5. If you would like to create a TensorDesc with a planar format for N dimensions (N can be 1, 2, 4, etc.), you can use the <code>InferenceEngine::TensorDesc::getLayoutByDims</code> method.
|
||||
<pre class="brush:cpp">
|
||||
InferenceEngine::TensorDesc::getLayoutByDims({1}); // InferenceEngine::Layout::C
|
||||
InferenceEngine::TensorDesc::getLayoutByDims({1, 2}); // InferenceEngine::Layout::NC
|
||||
@@ -52,4 +57,4 @@ InferenceEngine::TensorDesc::getLayoutByDims({1, 2, 3, 4}); // InferenceEngine::
|
||||
InferenceEngine::TensorDesc::getLayoutByDims({1, 2, 3}); // InferenceEngine::Layout::BLOCKED
|
||||
InferenceEngine::TensorDesc::getLayoutByDims({1, 2, 3, 4, 5}); // InferenceEngine::Layout::NCDHW
|
||||
InferenceEngine::TensorDesc::getLayoutByDims({1, 2, 3, 4, 5, ...}); // InferenceEngine::Layout::BLOCKED
|
||||
</pre>
|
||||
</pre>
|
||||
|
||||
446
docs/IE_DG/Model_Downloader.md
Normal file
@@ -0,0 +1,446 @@
|
||||
# Model Downloader {#openvino_docs_IE_DG_Tools_Model_Downloader}
|
||||
|
||||
|
||||
This directory contains scripts that automate certain model-related tasks
|
||||
based on configuration files in the models' directories.
|
||||
|
||||
* Model Downloader: `downloader.py` downloads model files from online sources
|
||||
and, if necessary, patches them to make them more usable with Model
|
||||
Optimizer;
|
||||
|
||||
* Model Converter: `converter.py` converts the models that are not in the
|
||||
Inference Engine IR format into that format using Model Optimizer.
|
||||
|
||||
* Model Quantizer: `quantizer.py` quantizes full-precision models in the IR
|
||||
format into low-precision versions using Post-Training Optimization Toolkit.
|
||||
|
||||
* Model Information Dumper: `info_dumper.py` prints information about the models
|
||||
in a stable machine-readable format.
|
||||
|
||||
|
||||
> **TIP**: You can get a quick start with the Model Downloader inside the OpenVINO™ Deep Learning Workbench (DL Workbench). DL Workbench is the OpenVINO™ toolkit UI that enables you to import a model, analyze its performance and accuracy, visualize the outputs, and optimize and prepare the model for deployment on various Intel® platforms.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
1. Install Python (version 3.6 or higher)
|
||||
2. Install the tools' dependencies with the following command:
|
||||
|
||||
```sh
|
||||
python3 -mpip install --user -r ./requirements.in
|
||||
```
|
||||
|
||||
For the model converter, you will also need to install the OpenVINO™
|
||||
toolkit and the prerequisite libraries for Model Optimizer. See the
|
||||
[OpenVINO toolkit documentation](https://docs.openvinotoolkit.org/) for details.
|
||||
|
||||
To convert models from certain frameworks, you will also need to install
|
||||
additional dependencies.
|
||||
|
||||
@sphinxdirective
|
||||
|
||||
.. tab:: Caffe2
|
||||
|
||||
.. code-block:: sh
|
||||
|
||||
python3 -mpip install --user -r ./requirements-caffe2.in
|
||||
|
||||
.. tab:: Pytorch
|
||||
|
||||
.. code-block:: sh
|
||||
|
||||
python3 -mpip install --user -r ./requirements-pytorch.in
|
||||
|
||||
.. tab:: TensorFlow
|
||||
|
||||
.. code-block:: sh
|
||||
|
||||
python3 -mpip install --user -r ./requirements-tensorflow.in
|
||||
|
||||
@endsphinxdirective
|
||||
|
||||
|
||||
## Model Downloader
|
||||
|
||||
The basic usage is to run the script like this:
|
||||
|
||||
```sh
|
||||
./downloader.py --all
|
||||
|
||||
```
|
||||
This will download all models. The `--all` option can be replaced with
|
||||
other filter options to download only a subset of models. See the "Shared options"
|
||||
section.
|
||||
|
||||
### Model Downloader Starting Parameters
|
||||
|
||||
@sphinxdirective
|
||||
|
||||
+---------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+
|
||||
| Parameter | Explanation | Example |
|
||||
+===========================+==================================================================================================================================================================================================================================================================================================================================================================================================+=====================================================================================+
|
||||
| ``-o``/``--output_dir`` | By default, the script will download models into a directory tree rooted in the current directory. Use this parameter to download into a different directory. | ``./downloader.py --all --output_dir my/download/directory`` |
|
||||
+---------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+
|
||||
| ``--precisions`` | Specify comma separated precisions of weights to be downloaded | ``./downloader.py --name face-detection-retail-0004 --precisions FP16,FP16-INT8`` |
|
||||
+---------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+
|
||||
| ``--num_attempts`` | By default, the script will attempt to download each file only once. Use this parameter to change that and increase the robustness of the download process | ``./downloader.py --all --num_attempts 5 # attempt each download five times`` |
|
||||
+---------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+
|
||||
| ``--cache_dir`` | Make the script use the specified directory as a cache. The script will place a copy of each downloaded file in the cache, or, if it is already there, retrieve it from the cache instead of downloading it again. The cache format is intended to remain compatible in future Open Model Zoo versions, so you can use a cache to avoid redownloading most files when updating Open Model Zoo. | ``./downloader.py --all --cache_dir my/cache/directory`` |
|
||||
+---------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+
|
||||
| ``-j``/``--jobs`` | The script downloads files for multiple models concurrently. | ``./downloader.py --all -j8 # download up to 8 models at a time`` |
|
||||
+---------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+
|
||||
| ``--progress_format`` | By default, the script outputs progress information as unstructured, human-readable text. Use this option, if you want to consume progress information programmatically. | ``./downloader.py --all --progress_format=json`` |
|
||||
+---------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+
|
||||
@endsphinxdirective
|
||||
|
||||
When this option is set to `json`, the script's standard output is replaced by
|
||||
a machine-readable progress report, whose format is documented in the
|
||||
"JSON progress report format" section. This option does not affect errors and
|
||||
warnings, which will still be printed to the standard error stream in a
|
||||
human-readable format.
|
||||
|
||||
You can also set this option to `text` to explicitly request the default text
|
||||
format.
|
||||
|
||||
See the "Shared options" section for information on other options accepted by
|
||||
the script.
|
||||
|
||||
### JSON progress report format
|
||||
|
||||
This section documents the format of the progress report produced by the script
|
||||
when the `--progress_format=json` option is specified.
|
||||
|
||||
The report consists of a sequence of events, where each event is represented
|
||||
by a line containing a JSON-encoded object. Each event has a member with the
|
||||
name `$type` whose value determines the type of the event, as well as which
|
||||
additional members it contains.
|
||||
|
||||
@sphinxdirective
|
||||
|
||||
+------------------------------------+-------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| Event type | Additional members | Explanation |
|
||||
+====================================+=========================================================================+================================================================================================================================================================================================================================================================================================================================================+
|
||||
| ``model_download_begin`` | ``model`` (string), ``num_files`` (integer) | The script started downloading the model named by ``model``. ``num_files`` is the number of files that will be downloaded for this model. This event will always be followed by a corresponding ``model_download_end`` event. |
|
||||
+------------------------------------+-------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| ``model_download_end`` | ``model`` (string), ``successful`` (boolean) | The script stopped downloading the model named by ``model``. ``successful`` is true if every file was downloaded successfully. |
|
||||
+------------------------------------+-------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| ``model_file_download_begin`` | ``model`` (string), ``model_file`` (string), ``size`` (integer) | The script started downloading the file named by ``model_file`` of the model named by ``model``. ``size`` is the size of the file in bytes. This event will always occur between ``model_download_begin`` and ``model_download_end`` events for the model, and will always be followed by a corresponding ``model_file_download_end`` event. |
|
||||
+------------------------------------+-------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| ``model_file_download_end`` | ``model`` (string), ``model_file`` (string), ``successful`` (boolean) | The script stopped downloading the file named by ``model_file`` of the model named by ``model``. ``successful`` is true if the file was downloaded successfully. |
|
||||
+------------------------------------+-------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| ``model_file_download_progress`` | ``model`` (string), ``model_file`` (string), ``size`` (integer) | The script downloaded ``size`` bytes of the file named by ``model_file`` of the model named by ``model`` so far. Note that ``size`` can decrease in a subsequent event if the download is interrupted and retried. This event will always occur between ``model_file_download_begin`` and ``model_file_download_end`` events for the file. |
|
||||
+------------------------------------+-------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| ``model_postprocessing_begin`` | ``model`` | The script started post-download processing on the model named by ``model``. This event will always be followed by a corresponding ``model_postprocessing_end`` event. |
|
||||
+------------------------------------+-------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| ``model_postprocessing_end`` | ``model`` | The script stopped post-download processing on the model named by ``model``. |
|
||||
+------------------------------------+-------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
@endsphinxdirective
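For illustration only, a single event line in this format might look like the following (the model name and values are hypothetical):

```json
{"$type": "model_file_download_progress", "model": "some-model", "model_file": "some-model.caffemodel", "size": 1048576}
```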
|
||||
|
||||
|
||||
Additional event types and members may be added in the future.
|
||||
|
||||
Tools parsing the machine-readable format should avoid relying on undocumented details.
|
||||
In particular:
|
||||
|
||||
* Tools should not assume that any given event will occur for a given model/file
|
||||
(unless specified otherwise above) or will only occur once.
|
||||
|
||||
* Tools should not assume that events will occur in a certain order beyond
|
||||
the ordering constraints specified above. In particular, when the `--jobs` option
|
||||
is set to a value greater than 1, event sequences for different files or models
|
||||
may get interleaved.
|
||||
|
||||
## Model Converter
|
||||
|
||||
The basic usage is to run the script like this:
|
||||
|
||||
```sh
|
||||
./converter.py --all
|
||||
```
|
||||
|
||||
This will convert all models into the Inference Engine IR format. Models that
|
||||
were originally in that format are ignored. Models in PyTorch and Caffe2 formats will be
|
||||
converted to ONNX format first.
|
||||
|
||||
The `--all` option can be replaced with other filter options to convert only
|
||||
a subset of models. See the "Shared options" section.
|
||||
|
||||
### Model Converter Starting Parameters
|
||||
|
||||
@sphinxdirective
|
||||
|
||||
+-----------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------+
|
||||
| Parameter | Explanation | Example |
|
||||
+=============================+==================================================================================================================================================================================================================================================================+==================================================================================================+
|
||||
| ``-d``/``--download_dir`` | The current directory must be the root of a download tree created by the model downloader. Use this parameter to specify a different download tree path. | ``./converter.py --all --download_dir my/download/directory`` |
|
||||
+-----------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------+
|
||||
| ``-o``/``--output_dir`` | By default, the script will download models into a directory tree rooted in the current directory. Use this parameter to download into a different directory. Note: models in intermediate format are placed to this directory too. | ``./converter.py --all --output_dir my/output/directory`` |
|
||||
+-----------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------+
|
||||
| ``--precisions`` | By default, the script will produce models in every precision that is supported for conversion. Use this parameter to only produce models in a specific precision. If the specified precision is not supported for a model, that model will be skipped. | ``./converter.py --all --precisions=FP16`` |
|
||||
+-----------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------+
|
||||
| ``--add_mo_arg`` | Add extra Model Optimizer arguments to the ones specified in the model configuration. The option can be repeated to add multiple arguments | ``./converter.py --name=caffenet --add_mo_arg=--reverse_input_channels --add_mo_arg=--silent`` |
|
||||
+-----------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------+
|
||||
| ``-j``/``--jobs`` | Run multiple conversion commands concurrently. The argument to the option must be either a maximum number of concurrently executed commands, or "auto", in which case the number of CPUs in the system is used. By default, all commands are run sequentially. | ``./converter.py --all -j8 # run up to 8 commands at a time`` |
|
||||
+-----------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------+
|
||||
| ``--dry_run`` | Print the conversion commands without actually running them.. | ``./converter.py --all --dry_run`` |
|
||||
+-----------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------+
|
||||
| ``-p``/``--python`` | By default, the script will run Model Optimizer using the same Python executable that was used to run the script itself. Apply this parameter to use a different Python executable. | ``./converter.py --all --python my/python`` |
|
||||
+-----------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------+
|
||||
@endsphinxdirective
|
||||
|
||||
|
||||
The Python script will attempt to locate Model Optimizer using several methods:
|
||||
|
||||
1. If the `--mo` option was specified, then its value will be used as the path
|
||||
to the script to run:
|
||||
|
||||
```sh
|
||||
./converter.py --all --mo my/openvino/path/model_optimizer/mo.py
|
||||
```
|
||||
|
||||
2. Otherwise, if the selected Python executable can import the `mo` package,
|
||||
then that package will be used.
|
||||
|
||||
3. Otherwise, if the OpenVINO™ toolkit's `setupvars.sh`/`setupvars.bat`
|
||||
script has been executed, the environment variables set by that script will
|
||||
be used to locate Model Optimizer within the toolkit.
|
||||
|
||||
4. Otherwise, the script will fail.
|
||||
|
||||
|
||||
See the "Shared options" section for information on other options accepted by
|
||||
the script.
|
||||
|
||||
## Model Quantizer
|
||||
|
||||
Before you run the model quantizer, you must prepare a directory with
|
||||
the datasets required for the quantization process. This directory will be
|
||||
referred to as `<DATASET_DIR>` below. You can find more detailed information
|
||||
about dataset preparation in the [Dataset Preparation Guide](../../data/datasets.md).
|
||||
|
||||
The basic usage is to run the script like this:
|
||||
|
||||
```sh
|
||||
./quantizer.py --all --dataset_dir <DATASET_DIR>
|
||||
```
|
||||
|
||||
This will quantize all models for which quantization is supported. Other models
|
||||
are ignored.
|
||||
|
||||
The `--all` option can be replaced with other filter options to quantize only
|
||||
a subset of models. See the "Shared options" section.
|
||||
|
||||
### Model Quantizer Starting Parameters
|
||||
|
||||
@sphinxdirective
|
||||
|
||||
+---------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------+
|
||||
| Parameter | Explanation | Example |
|
||||
+===========================+=====================================================================================================================================================================================================================================================================================================================+=========================================================================================+
|
||||
| ``--model_dir`` | The current directory must be the root of a tree of model files create by the model converter. Use this parameter to specify a different model tree path | ``./quantizer.py --all --dataset_dir <DATASET_DIR> --model_dir my/model/directory`` |
|
||||
+---------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------+
|
||||
| ``-o``/``--output_dir`` | By default, the script will download models into a directory tree rooted in the current directory. Use this parameter to download into a different directory. Note: models in intermediate format are placed to this directory too. | ``./quantizer.py --all --dataset_dir <DATASET_DIR> --output_dir my/output/directory`` |
|
||||
+---------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------+
|
||||
| ``--precisions`` | By default, the script will produce models in every precision that is supported as a quantization output. Use this parameter to only produce models in a specific precision. | ``./quantizer.py --all --dataset_dir <DATASET_DIR> --precisions=FP16-INT8`` |
|
||||
+---------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------+
|
||||
| ``--target_device``       | It's possible to specify a target device for Post-Training Optimization Toolkit to optimize for. The supported values are those accepted by the "target\_device" option in Post-Training Optimization Toolkit's config files. If this option is unspecified, Post-Training Optimization Toolkit's default is used. | ``./quantizer.py --all --dataset_dir <DATASET_DIR> --target_device VPU``               |
|
||||
+---------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------+
|
||||
| ``--dry_run`` | The script can print the quantization commands without actually running them. With this option specified, the configuration file for Post-Training Optimization Toolkit will still be created, so that you can inspect it. | ``./quantizer.py --all --dataset_dir <DATASET_DIR> --dry_run`` |
|
||||
+---------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------+
|
||||
| ``-p``/``--python`` | By default, the script will run Model Optimizer using the same Python executable that was used to run the script itself. Apply this parameter to use a different Python executable. | ``./quantizer.py --all --dataset_dir <DATASET_DIR> --python my/python`` |
|
||||
+---------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------+
|
||||
@endsphinxdirective
|
||||
|
||||
|
||||
The script will attempt to locate Post-Training Optimization Toolkit using several methods:
|
||||
|
||||
1. If the `--pot` option was specified, then its value will be used as the path
|
||||
to the script to run:
|
||||
|
||||
```sh
|
||||
./quantizer.py --all --dataset_dir <DATASET_DIR> --pot my/openvino/path/post_training_optimization_toolkit/main.py
|
||||
```
|
||||
|
||||
2. Otherwise, if the selected Python executable can import the `pot` package,
|
||||
then that package will be used.
|
||||
|
||||
3. Otherwise, if the OpenVINO™ toolkit's `setupvars.sh`/`setupvars.bat`
|
||||
script has been executed, the environment variables set by that script will
|
||||
be used to locate Post-Training Optimization Toolkit within the OpenVINO toolkit.
|
||||
|
||||
4. Otherwise, the script will fail.
|
||||
|
||||
|
||||
See the "Shared options" section for information on other options accepted by
|
||||
the script.
|
||||
|
||||
## Model Information Dumper
|
||||
|
||||
The basic usage is to run the script like this:
|
||||
|
||||
```sh
|
||||
./info_dumper.py --all
|
||||
```
|
||||
|
||||
This will print to standard output information about all models.
|
||||
|
||||
The only options accepted by the script are those described in the "Shared options"
|
||||
section.
|
||||
|
||||
The script's output is a JSON array, each element of which is a JSON object
|
||||
describing a single model. Each such object has the following keys:
|
||||
|
||||
@sphinxdirective
|
||||
|
||||
+--------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| Parameter | Explanation |
|
||||
+======================================+=====================================================================================================================================================================================================================================================================================+
|
||||
| ``name`` | the identifier of the model, as accepted by the ``--name`` option. |
|
||||
+--------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| ``composite_model_name``             | the identifier of the composite model, if the model is part of a composition of several models (e.g. encoder-decoder); otherwise ``null``                                                                                                                                          |
|
||||
+--------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| ``description`` | text describing the model. Paragraphs are separated by line feed characters. |
|
||||
+--------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| ``framework`` | a string identifying the framework whose format the model is downloaded in. Current possible values are ``dldt`` (Inference Engine IR), ``caffe``, ``caffe2``, ``mxnet``, ``onnx``, ``pytorch`` and ``tf`` (TensorFlow). Additional possible values might be added in the future. |
|
||||
+--------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| ``license_url`` | a URL for the license that the model is distributed under. |
|
||||
+--------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| ``quantization_output_precisions`` | the list of precisions that the model can be quantized to by the model quantizer. Current possible values are ``FP16-INT8`` and ``FP32-INT8``; additional possible values might be added in the future. |
|
||||
+--------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| ``subdirectory`` | the subdirectory of the output tree into which the downloaded or converted files will be placed by the downloader or the converter, respectively. |
|
||||
+--------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
@endsphinxdirective
|
||||
|
||||
|
||||
* `precisions`: the list of precisions that the model has IR files for. For models downloaded
|
||||
in a format other than the Inference Engine IR format, these are the precisions that the model
|
||||
converter can produce IR files in. Current possible values are:
|
||||
|
||||
* `FP16`
|
||||
* `FP16-INT1`
|
||||
* `FP16-INT8`
|
||||
* `FP32`
|
||||
* `FP32-INT1`
|
||||
* `FP32-INT8`
|
||||
|
||||
Additional possible values might be added in the future.
|
||||
|
||||
|
||||
* `task_type`: a string identifying the type of task that the model performs. Current possible values
|
||||
are:
|
||||
|
||||
@sphinxdirective
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<div class="collapsible-section" data-title="Current possible values">
|
||||
|
||||
@endsphinxdirective
|
||||
|
||||
|
||||
|
||||
* `action_recognition`
|
||||
* `classification`
|
||||
* `colorization`
|
||||
* `detection`
|
||||
* `face_recognition`
|
||||
* `feature_extraction`
|
||||
* `head_pose_estimation`
|
||||
* `human_pose_estimation`
|
||||
* `image_inpainting`
|
||||
* `image_processing`
|
||||
* `image_translation`
|
||||
* `instance_segmentation`
|
||||
* `machine_translation`
|
||||
* `monocular_depth_estimation`
|
||||
* `named_entity_recognition`
|
||||
* `noise_suppression`
|
||||
* `object_attributes`
|
||||
* `optical_character_recognition`
|
||||
* `place_recognition`
|
||||
* `question_answering`
|
||||
* `salient_object_detection`
|
||||
* `semantic_segmentation`
|
||||
* `sound_classification`
|
||||
* `speech_recognition`
|
||||
* `style_transfer`
|
||||
* `text_to_speech`
|
||||
* `time_series`
|
||||
* `token_recognition`
|
||||
|
||||
@sphinxdirective
|
||||
|
||||
.. raw:: html
|
||||
|
||||
</div>
|
||||
|
||||
@endsphinxdirective
|
||||
|
||||
|
||||
Additional possible values might be added in the future.
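For illustration, the following is a minimal sketch of consuming this output from Python. It assumes the script is invoked from its own directory and relies only on the keys documented above:

```python
import json
import subprocess

# Run the information dumper and parse its JSON output.
raw = subprocess.run(
    ["./info_dumper.py", "--all"],
    check=True, capture_output=True, text=True,
).stdout
models = json.loads(raw)

# Example: list detection models that can be quantized to FP16-INT8.
for model in models:
    if model["task_type"] == "detection" and "FP16-INT8" in model["quantization_output_precisions"]:
        print(model["name"])
```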
|
||||
|
||||
## Shared options
|
||||
|
||||
There are certain options that all tools accept.
|
||||
|
||||
`-h`/`--help` can be used to print a help message:
|
||||
|
||||
```sh
|
||||
./TOOL.py --help
|
||||
```
|
||||
There are several mutually exclusive filter options that select the models the
|
||||
tool will process:
|
||||
|
||||
@sphinxdirective
|
||||
|
||||
+--------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------+
|
||||
| Parameter | Explanation | Example |
|
||||
+==============+===================================================================================================================================================================================================================================================================================+===========================================+
|
||||
| ``--all`` | Selects all models | ``./TOOL.py --all`` |
|
||||
+--------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------+
|
||||
| ``--name`` | takes a comma-separated list of patterns and selects models that match at least one of these patterns. The patterns may contain shell-style wildcards. For composite models, the name of composite model is accepted, as well as the names of individual models it consists of. | ``./TOOL.py --name 'mtcnn,densenet-*'`` |
|
||||
+--------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------+
|
||||
@endsphinxdirective
|
||||
|
||||
|
||||
See https://docs.python.org/3/library/fnmatch.html for a full description of
|
||||
the pattern syntax.
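For illustration, a short sketch of how these shell-style patterns behave in Python's `fnmatch` module (expansion of composite model names is handled by the tools themselves, not by `fnmatch`):

```python
import fnmatch

# Wildcard patterns match as in the shell.
print(fnmatch.fnmatch("densenet-121", "densenet-*"))  # True
# Without wildcards, only an exact match succeeds; the tools additionally
# accept a composite model name such as "mtcnn" to select its parts.
print(fnmatch.fnmatch("mtcnn-p", "mtcnn"))            # False
```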
|
||||
|
||||
`--list` takes a path to a file that must contain a list of patterns and
|
||||
selects models that match at least one of those patterns.
|
||||
For composite models, the name of composite model is accepted, as well as the names
|
||||
of individual models it consists of.
|
||||
|
||||
```sh
|
||||
./TOOL.py --list my.lst
|
||||
```
|
||||
|
||||
The file must contain one pattern per line. The pattern syntax is the same
|
||||
as for the `--name` option. Blank lines and comments starting with `#` are
|
||||
ignored. For example:
|
||||
|
||||
```
|
||||
mtcnn # get all three models: mtcnn-o, mtcnn-p, mtcnn-r
|
||||
densenet-* # get all DenseNet variants
|
||||
```
|
||||
|
||||
To see the available models, you can use the `--print_all` option. When this
|
||||
option is specified, the tool will print all model names defined in the
|
||||
configuration file and exit:
|
||||
|
||||
```
|
||||
$ ./TOOL.py --print_all
|
||||
action-recognition-0001-decoder
|
||||
action-recognition-0001-encoder
|
||||
age-gender-recognition-retail-0013
|
||||
driver-action-recognition-adas-0002-decoder
|
||||
driver-action-recognition-adas-0002-encoder
|
||||
emotions-recognition-retail-0003
|
||||
face-detection-adas-0001
|
||||
face-detection-retail-0004
|
||||
face-detection-retail-0005
|
||||
[...]
|
||||
```
|
||||
|
||||
Either `--print_all` or one of the filter options must be specified.
|
||||
docs/IE_DG/Model_caching_overview.md (new file)
@@ -0,0 +1,136 @@
|
||||
# Model Caching Overview {#openvino_docs_IE_DG_Model_caching_overview}
|
||||
|
||||
## Introduction (C++)
|
||||
|
||||
@sphinxdirective
|
||||
.. raw:: html
|
||||
|
||||
<div id="switcher-cpp" class="switcher-anchor">C++</div>
|
||||
@endsphinxdirective
|
||||
|
||||
As described in the [Inference Engine Developer Guide](Deep_Learning_Inference_Engine_DevGuide.md), a common application flow consists of the following steps:
|
||||
|
||||
1. **Create an Inference Engine Core object**: First step to manage available devices and read network objects
|
||||
|
||||
2. **Read the Intermediate Representation**: Read an Intermediate Representation file into an object of the `InferenceEngine::CNNNetwork`
|
||||
|
||||
3. **Prepare inputs and outputs**: If needed, manipulate precision, memory layout, size or color format
|
||||
|
||||
4. **Set configuration**: Pass device-specific loading configurations to the device
|
||||
|
||||
5. **Compile and Load Network to device**: Use the `InferenceEngine::Core::LoadNetwork()` method with a specific device
|
||||
|
||||
6. **Set input data**: Specify input blob
|
||||
|
||||
7. **Execute**: Carry out inference and process results
|
||||
|
||||
Step 5 can potentially perform several time-consuming device-specific optimizations and network compilations,
|
||||
and such delays can lead to a bad user experience on application startup. To avoid this, some devices offer
|
||||
import/export network capability, and it is possible to either use the [Compile tool](../../inference-engine/tools/compile_tool/README.md)
|
||||
or enable model caching to export compiled network automatically. Reusing cached networks can significantly reduce load network time.
|
||||
|
||||
### Set "CACHE_DIR" config option to enable model caching
|
||||
|
||||
To enable model caching, the application must specify a folder to store cached blobs, which is done like this:
|
||||
|
||||
@snippet snippets/InferenceEngine_Caching0.cpp part0
|
||||
|
||||
With this code, if the device specified by `LoadNetwork` supports import/export network capability, a cached blob is automatically created inside the `myCacheFolder` folder.
|
||||
The `CACHE_DIR` config is set on the Core object. If the device does not support the import/export capability, the cache is not created and no error is thrown.
|
||||
|
||||
Depending on your device, total time for loading network on application startup can be significantly reduced.
|
||||
Also note that the very first LoadNetwork (when cache is not yet created) takes slightly longer time to "export" the compiled blob into a cache file:
|
||||
|
||||
![caching_enabled]
|
||||
|
||||
### Even faster: use LoadNetwork(modelPath)
|
||||
|
||||
In some cases, applications do not need to customize inputs and outputs every time. Such an application always
|
||||
calls `cnnNet = ie.ReadNetwork(...)`, then `ie.LoadNetwork(cnnNet, ..)`, and this flow can be further optimized.
|
||||
For these cases, the 2021.4 release introduces a more convenient API that loads the network in a single call, skipping the explicit `ReadNetwork` step:
|
||||
|
||||
@snippet snippets/InferenceEngine_Caching1.cpp part1
|
||||
|
||||
With model caching enabled, the total load time is even smaller, since the `ReadNetwork` step is optimized as well.
|
||||
|
||||
@snippet snippets/InferenceEngine_Caching2.cpp part2
|
||||
|
||||
![caching_times]
|
||||
|
||||
### Advanced Examples
|
||||
|
||||
Not every device supports network import/export capability. For those that don't, enabling caching has no effect.
|
||||
To check in advance if a particular device supports model caching, your application can use the following code:
|
||||
|
||||
@snippet snippets/InferenceEngine_Caching3.cpp part3
|
||||
|
||||
## Introduction (Python)
|
||||
|
||||
@sphinxdirective
|
||||
.. raw:: html
|
||||
|
||||
<div id="switcher-python" class="switcher-anchor">Python</div>
|
||||
@endsphinxdirective
|
||||
|
||||
As described in Inference Engine Developer Guide, a common application flow consists of the following steps:
|
||||
|
||||
1. **Create an Inference Engine Core Object**
|
||||
2. **Read the Intermediate Representation** - Read an Intermediate Representation file into an object of the [ie_api.IENetwork](api/ie_python_api/_autosummary/openvino.inference_engine.IENetwork.html)
|
||||
3. **Prepare inputs and outputs**
|
||||
4. **Set configuration** - Pass device-specific loading configurations to the device
|
||||
5. **Compile and Load Network to device** - Use the `IECore.load_network()` method and specify the target device
|
||||
6. **Set input data**
|
||||
7. **Execute the model** - Run inference
|
||||
|
||||
Step #5 can potentially perform several time-consuming device-specific optimizations and network compilations, and such delays can lead to bad user experience on application startup. To avoid this, some devices offer Import/Export network capability, and it is possible to either use the [Compile tool](../../inference-engine/tools/compile_tool/README.md) or enable model caching to export the compiled network automatically. Reusing cached networks can significantly reduce load network time.
|
||||
|
||||
### Set the “CACHE_DIR” config option to enable model caching
|
||||
|
||||
To enable model caching, the application must specify the folder where to store cached blobs. It can be done using [IECore.set_config](api/ie_python_api/_autosummary/openvino.inference_engine.IECore.html#openvino.inference_engine.IECore.set_config).
|
||||
|
||||
``` python
|
||||
from openvino.inference_engine import IECore
|
||||
|
||||
ie = IECore()
|
||||
ie.set_config(config={"CACHE_DIR": path_to_cache}, device_name=device)
|
||||
net = ie.read_network(model=path_to_xml_file)
|
||||
exec_net = ie.load_network(network=net, device_name=device)
|
||||
```
|
||||
|
||||
With this code, if a device supports the Import/Export network capability, a cached blob is automatically created inside the `path_to_cache` directory when the `CACHE_DIR` config is set on the Core object. If the device does not support the Import/Export capability, the cache is simply not created and no error is thrown.
|
||||
|
||||
Depending on your device, total time for loading network on application startup can be significantly reduced. Please also note that very first [IECore.load_network](api/ie_python_api/_autosummary/openvino.inference_engine.IECore.html#openvino.inference_engine.IECore.load_network) (when the cache is not yet created) takes slightly longer time to ‘export’ the compiled blob into a cache file.
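As a rough illustration, the difference can be observed by timing two consecutive `load_network` calls. This is only a sketch: the device name and paths are placeholders, and the device must report the Import/Export capability for caching to take effect.

```python
import time
from openvino.inference_engine import IECore

path_to_xml_file = "model.xml"   # placeholder model path
path_to_cache = "model_cache"    # placeholder cache folder
device = "MYRIAD"                # placeholder; must support import/export

ie = IECore()
ie.set_config(config={"CACHE_DIR": path_to_cache}, device_name=device)

start = time.perf_counter()
ie.load_network(network=path_to_xml_file, device_name=device)  # first call also exports the blob
print("first load:", time.perf_counter() - start)

start = time.perf_counter()
ie.load_network(network=path_to_xml_file, device_name=device)  # reuses the cached blob
print("cached load:", time.perf_counter() - start)
```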
|
||||
|
||||
![caching_enabled]
|
||||
|
||||
|
||||
### Even Faster: Use IECore.load_network(path_to_xml_file)
|
||||
|
||||
In some cases, applications do not need to customize inputs and outputs every time. These applications always call [IECore.read_network](api/ie_python_api/_autosummary/openvino.inference_engine.IECore.html#openvino.inference_engine.IECore.read_network), then `IECore.load_network(model=path_to_xml_file)`, and may be further optimized. For such cases, it's more convenient to load the network in a single call to `ie.load_network()`.
|
||||
A model can be loaded directly to the device, with model caching enabled:
|
||||
|
||||
``` python
|
||||
from openvino.inference_engine import IECore
|
||||
|
||||
ie = IECore()
|
||||
ie.set_config(config={"CACHE_DIR" : path_to_cache}, device_name=device)
|
||||
ie.load_network(network=path_to_xml_file, device_name=device)
|
||||
```
|
||||
|
||||
![caching_times]
|
||||
|
||||
### Advanced Examples
|
||||
|
||||
Not every device supports the network import/export capability; enabling caching for such devices has no effect. To check in advance if a particular device supports model caching, your application can use the following code:
|
||||
|
||||
```python
|
||||
all_metrics = ie.get_metric(device_name=device, metric_name="SUPPORTED_METRICS")
|
||||
# Find the 'IMPORT_EXPORT_SUPPORT' metric in supported metrics
|
||||
allows_caching = "IMPORT_EXPORT_SUPPORT" in all_metrics
|
||||
```
|
||||
|
||||
> **NOTE**: The GPU plugin does not have the IMPORT_EXPORT_SUPPORT capability, and does not support model caching yet. However, the GPU plugin supports caching kernels (see the [GPU plugin documentation](supported_plugins/GPU.md)). Kernel caching for the GPU plugin can be accessed the same way as model caching: by setting the `CACHE_DIR` configuration key to a folder where the cache should be stored.
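For illustration, a minimal sketch of enabling GPU kernel caching this way (the cache folder and model path are placeholders):

```python
from openvino.inference_engine import IECore

path_to_xml_file = "model.xml"   # placeholder model path

ie = IECore()
# The same CACHE_DIR key enables kernel caching for the GPU plugin.
ie.set_config(config={"CACHE_DIR": "gpu_kernel_cache"}, device_name="GPU")
net = ie.read_network(model=path_to_xml_file)
exec_net = ie.load_network(network=net, device_name="GPU")
```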
|
||||
|
||||
|
||||
[caching_enabled]: ../img/caching_enabled.png
|
||||
[caching_times]: ../img/caching_times.png
|
||||
@@ -1,41 +1,37 @@
|
||||
# ONNX format support in the OpenVINO™ {#openvino_docs_IE_DG_ONNX_Support}
|
||||
# ONNX Format Support {#openvino_docs_IE_DG_ONNX_Support}
|
||||
|
||||
Starting from the 2020.4 release, OpenVINO™ supports reading native ONNX models.
|
||||
`Core::ReadNetwork()` method provides a uniform way to read models from IR or ONNX format, it is a recommended approach to reading models.
|
||||
Example:
|
||||
## Introduction (C++)
|
||||
|
||||
@sphinxdirective
|
||||
.. raw:: html
|
||||
|
||||
<div id="switcher-cpp" class="switcher-anchor">C++</div>
|
||||
@endsphinxdirective
|
||||
|
||||
Starting with the 2020.4 release, OpenVINO™ supports reading native ONNX models. The `Core::ReadNetwork()` method provides a uniform way to read models from IR or ONNX format; it is the recommended approach for reading models. Example:
|
||||
|
||||
```cpp
|
||||
InferenceEngine::Core core;
|
||||
auto network = core.ReadNetwork("model.onnx");
|
||||
```
|
||||
|
||||
**Reshape feature:**
|
||||
### Reshape Feature
|
||||
OpenVINO™ does not provide a mechanism to specify pre-processing (like mean values subtraction, reverse input channels) for the ONNX format. If an ONNX model contains dynamic shapes for input, please use the `CNNNetwork::reshape` method to reshape the model.
|
||||
|
||||
OpenVINO™ doesn't provide a mechanism to specify pre-processing (like mean values subtraction, reverse input channels) for the ONNX format.
|
||||
If an ONNX model contains dynamic shapes for input, please use the `CNNNetwork::reshape` method for shape specialization.
|
||||
### Weights Saved in External Files
|
||||
|
||||
**Weights saved in external files:**
|
||||
|
||||
OpenVINO™ supports ONNX models that store weights in external files. It is especially useful for models larger than 2GB because of protobuf limitations.
|
||||
To read such models, use the `ReadNetwork` overload which takes `modelPath` as input parameter (both `std::string` and `std::wstring`).
|
||||
Note that the `binPath` argument of `ReadNetwork` should be empty in this case, because paths to external weights are saved directly in an ONNX model.
|
||||
Otherwise, a runtime exception is thrown.
|
||||
Reading models with external weights is not supported by the `ReadNetwork(const std::string& model, const Blob::CPtr& weights)` overload.
|
||||
OpenVINO™ supports ONNX models that store weights in external files. It is especially useful for models larger than 2GB because of protobuf limitations. To read such models, use the `ReadNetwork` overload which takes `modelPath` as input parameter (both `std::string` and `std::wstring`). Note that the `binPath` argument of `ReadNetwork` should be empty in this case, because paths to external weights are saved directly in an ONNX model.
|
||||
Otherwise, a runtime exception is thrown. Reading models with external weights is not supported by the `ReadNetwork(const std::string& model, const Blob::CPtr& weights)` overload.
|
||||
|
||||
Paths to external weight files are saved in an ONNX model; these paths are relative to the model's directory path.
|
||||
It means that if a model is located at:
|
||||
`home/user/workspace/models/model.onnx`
|
||||
and a file that contains external weights:
|
||||
`home/user/workspace/models/data/weights.bin`
|
||||
the path saved in model should be:
|
||||
`data/weights.bin`.
|
||||
It means that if a model is located at `home/user/workspace/models/model.onnx` and a file that contains external weights is in `home/user/workspace/models/data/weights.bin`, then the path saved in the model should be:
|
||||
`data/weights.bin`
|
||||
|
||||
**NOTE**
|
||||
> **NOTE**: A single model can use many external weights files.
|
||||
|
||||
* A single model can use many external weights files.
|
||||
* Data of many tensors can be stored in a single external weights file (it is processed using offset and length values, which can be also saved in a model).
|
||||
> **NOTE**: Data of many tensors can be stored in a single external weights file (it is processed using offset and length values, which can be also saved in a model).
|
||||
|
||||
The described mechanism is the only possibility to read weights from external files. The following input parameters of the `ReadNetwork` function overloads are NOT supported for ONNX models and should be passed as empty:
|
||||
The described mechanism is the only way to read weights from external files. The following input parameters of the `ReadNetwork` function overloads are NOT supported for ONNX models and should be passed as empty:
|
||||
* `const std::wstring& binPath`
|
||||
* `const std::string& binPath`
|
||||
* `const Blob::CPtr& weights`
|
||||
@@ -43,8 +39,53 @@ The described mechanism is the only possibility to read weights from external fi
|
||||
You can find more details about the external data mechanism in [ONNX documentation](https://github.com/onnx/onnx/blob/master/docs/ExternalData.md).
|
||||
To convert a model to use the external data feature, you can use [ONNX helper functions](https://github.com/onnx/onnx/blob/master/onnx/external_data_helper.py).
|
||||
|
||||
**Unsupported types of tensors:**
|
||||
Unsupported types of tensors:
|
||||
* string
|
||||
* complex64
|
||||
* complex128
|
||||
|
||||
* `string`,
|
||||
* `complex64`,
|
||||
* `complex128`.
|
||||
## Introduction (Python)
|
||||
|
||||
@sphinxdirective
|
||||
.. raw:: html
|
||||
|
||||
<div id="switcher-python" class="switcher-anchor">Python</div>
|
||||
@endsphinxdirective
|
||||
|
||||
Starting with the 2020.4 release, OpenVINO™ supports reading native ONNX models. The `IECore.read_network()` method provides a uniform way to read models from IR or ONNX format; it is the recommended approach for reading models. Example:
|
||||
|
||||
```python
|
||||
from openvino.inference_engine import IECore
|
||||
|
||||
ie = IECore()
|
||||
net = ie.read_network(model=path_to_onnx_file)
|
||||
```
|
||||
|
||||
### Reshape Feature
|
||||
OpenVINO™ does not provide a mechanism to specify pre-processing (like mean values subtraction, reverse input channels) for the ONNX format. If an ONNX model contains dynamic shapes for input, please use the [IENetwork.reshape](api/ie_python_api/_autosummary/openvino.inference_engine.IENetwork.html#openvino.inference_engine.IENetwork.reshape) method to reshape the model.
|
||||
|
||||
```python
|
||||
from openvino.inference_engine import IECore
|
||||
|
||||
ie = IECore()
|
||||
net = ie.read_network(model=path_to_onnx_file)
|
||||
input_layer = next(iter(net.input_info))
|
||||
net.reshape({input_layer: new_shape})
|
||||
```
|
||||
|
||||
### Weights Saved in External Files
|
||||
|
||||
OpenVINO™ supports ONNX models that store weights in external files. It is especially useful for models larger than 2GB because of protobuf limitations. To read such models, use the `model` parameter in the `IECore.read_network(model=path_to_onnx_file)` method. Note that the parameter for the path to the binary weight file, `weights=` should be empty in this case, because paths to external weights are saved directly in an ONNX model. Otherwise, a runtime exception is thrown. Reading models with external weights is **NOT** supported by the `read_network(weights=path_to_bin_file)` parameter.
|
||||
|
||||
Paths to external weight files are saved in an ONNX model; these paths are relative to the model's directory path. This means that if a model is located at `$HOME/workspace/models/model.onnx` and a file that contains external weights is at `$HOME/workspace/models/data/weights.bin`, the path saved in the model should be `data/weights.bin`.
|
||||
|
||||
**NOTE**:
|
||||
* A single model can use many external weights files.
|
||||
* Data of many tensors can be stored in a single external weights file (it is processed using offset and length values, which can be also saved in a model).
|
||||
|
||||
The described mechanism is the only way to read weights from external files. The `weights` input parameter of the [IECore.read_network](api/ie_python_api/_autosummary/openvino.inference_engine.IECore.html#openvino.inference_engine.IECore.read_network) function is NOT supported for ONNX models and should either not be passed or be set to `None`.
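For illustration, a minimal sketch of reading such a model (the path is a placeholder); only the `model` parameter is passed, since the locations of the external weight files are stored in the ONNX model itself:

```python
from openvino.inference_engine import IECore

ie = IECore()
# No weights= argument: external weight paths are resolved relative to the model's directory.
net = ie.read_network(model="workspace/models/model.onnx")
```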
|
||||
|
||||
Unsupported types of tensors:
|
||||
* string
|
||||
* complex64
|
||||
* complex128
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
# ONNX* Importer API Tutorial {#openvino_docs_IE_DG_OnnxImporterTutorial}
|
||||
# [DEPRECATED] ONNX* Importer API Tutorial {#openvino_docs_IE_DG_OnnxImporterTutorial}
|
||||
|
||||
> **NOTE**: This tutorial is deprecated. Since OpenVINO™ 2020.4 version, Inference Engine enables reading ONNX models via the Inference Engine Core API
|
||||
> and there is no need to use directly the low-level ONNX* Importer API anymore.
|
||||
|
||||
docs/IE_DG/Operations_specifications.md (new file)
@@ -0,0 +1,186 @@
|
||||
# Operations Specifications {#openvino_docs_operations_specifications}
|
||||
|
||||
@sphinxdirective
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
|
||||
openvino_docs_ops_arithmetic_Abs_1
|
||||
openvino_docs_ops_arithmetic_Acos_1
|
||||
openvino_docs_ops_arithmetic_Acosh_3
|
||||
openvino_docs_ops_pooling_AdaptiveAvgPool_8
|
||||
openvino_docs_ops_pooling_AdaptiveMaxPool_8
|
||||
openvino_docs_ops_arithmetic_Add_1
|
||||
openvino_docs_ops_arithmetic_Asin_1
|
||||
openvino_docs_ops_arithmetic_Asinh_3
|
||||
openvino_docs_ops_infrastructure_Assign_3
|
||||
openvino_docs_ops_arithmetic_Atan_1
|
||||
openvino_docs_ops_arithmetic_Atanh_3
|
||||
openvino_docs_ops_pooling_AvgPool_1
|
||||
openvino_docs_ops_normalization_BatchNormInference_1
|
||||
openvino_docs_ops_normalization_BatchNormInference_5
|
||||
openvino_docs_ops_movement_BatchToSpace_2
|
||||
openvino_docs_ops_convolution_BinaryConvolution_1
|
||||
openvino_docs_ops_movement_Broadcast_1
|
||||
openvino_docs_ops_movement_Broadcast_3
|
||||
openvino_docs_ops_condition_Bucketize_3
|
||||
openvino_docs_ops_sequence_CTCGreedyDecoder_1
|
||||
openvino_docs_ops_sequence_CTCGreedyDecoderSeqLen_6
|
||||
openvino_docs_ops_arithmetic_Ceiling_1
|
||||
openvino_docs_ops_activation_Clamp_1
|
||||
openvino_docs_ops_movement_Concat_1
|
||||
openvino_docs_ops_infrastructure_Constant_1
|
||||
openvino_docs_ops_type_ConvertLike_1
|
||||
openvino_docs_ops_type_Convert_1
|
||||
openvino_docs_ops_convolution_ConvolutionBackpropData_1
|
||||
openvino_docs_ops_convolution_Convolution_1
|
||||
openvino_docs_ops_arithmetic_Cos_1
|
||||
openvino_docs_ops_arithmetic_Cosh_1
|
||||
openvino_docs_ops_sequence_CTCLoss_4
|
||||
openvino_docs_ops_arithmetic_CumSum_3
|
||||
openvino_docs_ops_convolution_DeformableConvolution_8
|
||||
openvino_docs_ops_detection_DeformablePSROIPooling_1
|
||||
openvino_docs_ops_movement_DepthToSpace_1
|
||||
openvino_docs_ops_detection_DetectionOutput_1
|
||||
openvino_docs_ops_signals_DFT_7
|
||||
openvino_docs_ops_arithmetic_Divide_1
|
||||
openvino_docs_ops_matrix_Einsum_7
|
||||
openvino_docs_ops_activation_Elu_1
|
||||
openvino_docs_ops_sparse_EmbeddingBagOffsetsSum_3
|
||||
openvino_docs_ops_sparse_EmbeddingBagPackedSum_3
|
||||
openvino_docs_ops_sparse_EmbeddingSegmentsSum_3
|
||||
openvino_docs_ops_comparison_Equal_1
|
||||
openvino_docs_ops_arithmetic_Erf_1
|
||||
openvino_docs_ops_activation_Exp_1
|
||||
openvino_docs_ops_detection_ExperimentalDetectronDetectionOutput_6
|
||||
openvino_docs_ops_detection_ExperimentalDetectronGenerateProposalsSingleImage_6
|
||||
openvino_docs_ops_detection_ExperimentalDetectronPriorGridGenerator_6
|
||||
openvino_docs_ops_detection_ExperimentalDetectronROIFeatureExtractor_6
|
||||
openvino_docs_ops_sort_ExperimentalDetectronTopKROIs_6
|
||||
openvino_docs_ops_movement_ExtractImagePatches_3
|
||||
openvino_docs_ops_quantization_FakeQuantize_1
|
||||
openvino_docs_ops_arithmetic_FloorMod_1
|
||||
openvino_docs_ops_arithmetic_Floor_1
|
||||
openvino_docs_ops_normalization_GRN_1
|
||||
openvino_docs_ops_sequence_GRUCell_3
|
||||
openvino_docs_ops_sequence_GRUSequence_5
|
||||
openvino_docs_ops_movement_GatherTree_1
|
||||
openvino_docs_ops_movement_Gather_1
|
||||
openvino_docs_ops_movement_Gather_7
|
||||
openvino_docs_ops_movement_Gather_8
|
||||
openvino_docs_ops_movement_GatherElements_6
|
||||
openvino_docs_ops_movement_GatherND_5
|
||||
openvino_docs_ops_activation_GELU_2
|
||||
openvino_docs_ops_activation_GELU_7
|
||||
openvino_docs_ops_comparison_GreaterEqual_1
|
||||
openvino_docs_ops_comparison_Greater_1
|
||||
openvino_docs_ops_convolution_GroupConvolutionBackpropData_1
|
||||
openvino_docs_ops_convolution_GroupConvolution_1
|
||||
openvino_docs_ops_activation_HardSigmoid_1
|
||||
openvino_docs_ops_activation_HSigmoid_5
|
||||
openvino_docs_ops_activation_HSwish_4
|
||||
openvino_docs_ops_signals_IDFT_7
|
||||
openvino_docs_ops_condition_If_8
|
||||
openvino_docs_ops_image_Interpolate_1
|
||||
openvino_docs_ops_image_Interpolate_4
|
||||
openvino_docs_ops_normalization_LRN_1
|
||||
openvino_docs_ops_sequence_LSTMCell_1
|
||||
openvino_docs_ops_sequence_LSTMSequence_1
|
||||
openvino_docs_ops_comparison_LessEqual_1
|
||||
openvino_docs_ops_comparison_Less_1
|
||||
openvino_docs_ops_arithmetic_Log_1
|
||||
openvino_docs_ops_logical_LogicalAnd_1
|
||||
openvino_docs_ops_logical_LogicalNot_1
|
||||
openvino_docs_ops_logical_LogicalOr_1
|
||||
openvino_docs_ops_logical_LogicalXor_1
|
||||
openvino_docs_ops_activation_LogSoftmax_5
|
||||
openvino_docs_ops_infrastructure_Loop_5
|
||||
openvino_docs_ops_normalization_MVN_1
|
||||
openvino_docs_ops_normalization_MVN_6
|
||||
openvino_docs_ops_matrix_MatMul_1
|
||||
openvino_docs_ops_sort_MatrixNonMaxSuppression_8
|
||||
openvino_docs_ops_pooling_MaxPool_1
|
||||
openvino_docs_ops_arithmetic_Maximum_1
|
||||
openvino_docs_ops_arithmetic_Minimum_1
|
||||
openvino_docs_ops_activation_Mish_4
|
||||
openvino_docs_ops_arithmetic_Mod_1
|
||||
openvino_docs_ops_sort_MulticlassNonMaxSuppression_8
|
||||
openvino_docs_ops_arithmetic_Multiply_1
|
||||
openvino_docs_ops_arithmetic_Negative_1
|
||||
openvino_docs_ops_sort_NonMaxSuppression_1
|
||||
openvino_docs_ops_sort_NonMaxSuppression_3
|
||||
openvino_docs_ops_sort_NonMaxSuppression_4
|
||||
openvino_docs_ops_sort_NonMaxSuppression_5
|
||||
openvino_docs_ops_condition_NonZero_3
|
||||
openvino_docs_ops_normalization_NormalizeL2_1
|
||||
openvino_docs_ops_comparison_NotEqual_1
|
||||
openvino_docs_ops_sequence_OneHot_1
|
||||
openvino_docs_ops_activation_PReLU_1
|
||||
openvino_docs_ops_detection_PSROIPooling_1
|
||||
openvino_docs_ops_movement_Pad_1
|
||||
openvino_docs_ops_infrastructure_Parameter_1
|
||||
openvino_docs_ops_arithmetic_Power_1
|
||||
openvino_docs_ops_detection_PriorBoxClustered_1
|
||||
openvino_docs_ops_detection_PriorBox_1
|
||||
openvino_docs_ops_detection_Proposal_1
|
||||
openvino_docs_ops_detection_Proposal_4
|
||||
openvino_docs_ops_generation_RandomUniform_8
|
||||
openvino_docs_ops_generation_Range_1
|
||||
openvino_docs_ops_generation_Range_4
|
||||
openvino_docs_ops_infrastructure_ReadValue_3
|
||||
openvino_docs_ops_activation_ReLU_1
|
||||
openvino_docs_ops_reduction_ReduceL1_4
|
||||
openvino_docs_ops_reduction_ReduceL2_4
|
||||
openvino_docs_ops_reduction_ReduceLogicalAnd_1
|
||||
openvino_docs_ops_reduction_ReduceLogicalOr_1
|
||||
openvino_docs_ops_reduction_ReduceMax_1
|
||||
openvino_docs_ops_reduction_ReduceMean_1
|
||||
openvino_docs_ops_reduction_ReduceMin_1
|
||||
openvino_docs_ops_reduction_ReduceProd_1
|
||||
openvino_docs_ops_reduction_ReduceSum_1
|
||||
openvino_docs_ops_detection_RegionYolo_1
|
||||
openvino_docs_ops_detection_ReorgYolo_1
|
||||
openvino_docs_ops_shape_Reshape_1
|
||||
openvino_docs_ops_infrastructure_Result_1
|
||||
openvino_docs_ops_movement_Reverse_1
|
||||
openvino_docs_ops_movement_ReverseSequence_1
|
||||
openvino_docs_ops_sequence_RNNCell_3
|
||||
openvino_docs_ops_sequence_RNNSequence_5
|
||||
openvino_docs_ops_detection_ROIAlign_3
|
||||
openvino_docs_ops_detection_ROIPooling_1
|
||||
openvino_docs_ops_movement_Roll_7
|
||||
openvino_docs_ops_arithmetic_Round_5
|
||||
openvino_docs_ops_movement_ScatterElementsUpdate_3
|
||||
openvino_docs_ops_movement_ScatterNDUpdate_3
|
||||
openvino_docs_ops_movement_ScatterUpdate_3
|
||||
openvino_docs_ops_condition_Select_1
|
||||
openvino_docs_ops_activation_Selu_1
|
||||
openvino_docs_ops_shape_ShapeOf_1
|
||||
openvino_docs_ops_shape_ShapeOf_3
|
||||
openvino_docs_ops_movement_ShuffleChannels_1
|
||||
openvino_docs_ops_activation_Sigmoid_1
|
||||
openvino_docs_ops_arithmetic_Sign_1
|
||||
openvino_docs_ops_arithmetic_Sin_1
|
||||
openvino_docs_ops_arithmetic_Sinh_1
|
||||
openvino_docs_ops_activation_SoftMax_1
|
||||
openvino_docs_ops_activation_SoftPlus_4
|
||||
openvino_docs_ops_movement_SpaceToBatch_2
|
||||
openvino_docs_ops_movement_SpaceToDepth_1
|
||||
openvino_docs_ops_movement_Split_1
|
||||
openvino_docs_ops_arithmetic_Sqrt_1
|
||||
openvino_docs_ops_arithmetic_SquaredDifference_1
|
||||
openvino_docs_ops_shape_Squeeze_1
|
||||
openvino_docs_ops_movement_StridedSlice_1
|
||||
openvino_docs_ops_arithmetic_Subtract_1
|
||||
openvino_docs_ops_activation_Swish_4
|
||||
openvino_docs_ops_arithmetic_Tan_1
|
||||
openvino_docs_ops_arithmetic_Tanh_1
|
||||
openvino_docs_ops_infrastructure_TensorIterator_1
|
||||
openvino_docs_ops_movement_Tile_1
|
||||
openvino_docs_ops_sort_TopK_1
|
||||
openvino_docs_ops_sort_TopK_3
|
||||
openvino_docs_ops_movement_Transpose_1
|
||||
openvino_docs_ops_shape_Unsqueeze_1
|
||||
openvino_docs_ops_movement_VariadicSplit_1
|
||||
|
||||
@endsphinxdirective
|
||||
@@ -1,5 +1,36 @@
|
||||
# Inference Engine Samples {#openvino_docs_IE_DG_Samples_Overview}
|
||||
|
||||
@sphinxdirective
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:hidden:
|
||||
|
||||
openvino_inference_engine_samples_classification_sample_async_README
|
||||
openvino_inference_engine_ie_bridges_python_sample_classification_sample_async_README
|
||||
openvino_inference_engine_samples_hello_classification_README
|
||||
openvino_inference_engine_ie_bridges_c_samples_hello_classification_README
|
||||
openvino_inference_engine_ie_bridges_python_sample_hello_classification_README
|
||||
openvino_inference_engine_samples_hello_reshape_ssd_README
|
||||
openvino_inference_engine_ie_bridges_python_sample_hello_reshape_ssd_README
|
||||
openvino_inference_engine_samples_hello_nv12_input_classification_README
|
||||
openvino_inference_engine_ie_bridges_c_samples_hello_nv12_input_classification_README
|
||||
openvino_inference_engine_samples_hello_query_device_README
|
||||
openvino_inference_engine_ie_bridges_python_sample_hello_query_device_README
|
||||
openvino_inference_engine_samples_ngraph_function_creation_sample_README
|
||||
openvino_inference_engine_ie_bridges_python_sample_ngraph_function_creation_sample_README
|
||||
openvino_inference_engine_samples_object_detection_sample_ssd_README
|
||||
openvino_inference_engine_ie_bridges_python_sample_object_detection_sample_ssd_README
|
||||
openvino_inference_engine_ie_bridges_c_samples_object_detection_sample_ssd_README
|
||||
openvino_inference_engine_samples_speech_sample_README
|
||||
openvino_inference_engine_ie_bridges_python_sample_speech_sample_README
|
||||
openvino_inference_engine_samples_style_transfer_sample_README
|
||||
openvino_inference_engine_ie_bridges_python_sample_style_transfer_sample_README
|
||||
openvino_inference_engine_samples_benchmark_app_README
|
||||
openvino_inference_engine_tools_benchmark_tool_README
|
||||
|
||||
@endsphinxdirective
|
||||
|
||||
The Inference Engine sample applications are simple console applications that show how to utilize specific Inference Engine capabilities within an application and assist developers in executing specific tasks such as loading a model, running inference, and querying specific device capabilities.
|
||||
|
||||
After installation of Intel® Distribution of OpenVINO™ toolkit, C, C++ and Python* sample applications are available in the following directories, respectively:
|
||||
|
||||
@@ -1,27 +1,37 @@
|
||||
Using Shape Inference {#openvino_docs_IE_DG_ShapeInference}
|
||||
==========================================
|
||||
# Using the Reshape Inference Feature {#openvino_docs_IE_DG_ShapeInference}
|
||||
|
||||
OpenVINO™ provides the following methods for runtime model reshaping:
|
||||
## Introduction (C++)
|
||||
|
||||
* **Set a new input shape** with the `InferenceEngine::CNNNetwork::reshape` method.<br>
|
||||
The `InferenceEngine::CNNNetwork::reshape` method updates input shapes and propagates them down to the outputs of the model through all intermediate layers.
|
||||
@sphinxdirective
|
||||
.. raw:: html
|
||||
|
||||
<div id="switcher-cpp" class="switcher-anchor">C++</div>
|
||||
@endsphinxdirective
|
||||
|
||||
OpenVINO™ provides two methods for runtime model reshaping: setting a new input shape and setting a new batch dimension value.
|
||||
|
||||
### Set a new input shape with the reshape() method
|
||||
|
||||
The `InferenceEngine::CNNNetwork::reshape` method updates input shapes and propagates them down to the outputs of the model through all intermediate layers.
|
||||
|
||||
> **NOTES**:
|
||||
> - Starting with the 2021.1 release, the Model Optimizer converts topologies keeping shape-calculating sub-graphs by default, which enables correct shape propagation during reshaping in most cases.
|
||||
> - Older versions of IRs are not guaranteed to reshape successfully. Please regenerate them with the Model Optimizer of the latest version of OpenVINO™.<br>
|
||||
> - If an ONNX model does not have a fully defined input shape and the model was imported with the ONNX importer, reshape the model before loading it to the plugin.
|
||||
|
||||
* **Set a new batch dimension value** with the `InferenceEngine::CNNNetwork::setBatchSize` method.<br>
|
||||
The meaning of a model batch may vary depending on the model design.
|
||||
This method does not deduce batch placement for inputs from the model architecture.
|
||||
It assumes that the batch is placed at the zero index in the shape for all inputs and uses the `InferenceEngine::CNNNetwork::reshape` method to propagate updated shapes through the model.
|
||||
### Set a new batch dimension value with the setBatchSize() method
|
||||
|
||||
The method transforms the model before a new shape propagation to relax a hard-coded batch dimension in the model, if any.
|
||||
The meaning of a model batch may vary depending on the model design.
|
||||
This method does not deduce batch placement for inputs from the model architecture.
|
||||
It assumes that the batch is placed at the zero index in the shape for all inputs and uses the `InferenceEngine::CNNNetwork::reshape` method to propagate updated shapes through the model.
|
||||
|
||||
Use `InferenceEngine::CNNNetwork::reshape` instead of `InferenceEngine::CNNNetwork::setBatchSize` to set new input shapes for the model in case the model has:
|
||||
* Multiple inputs with different zero-index dimension meanings
|
||||
* Input without a batch dimension
|
||||
* 0D, 1D, or 3D shape
|
||||
The method transforms the model before a new shape propagation to relax a hard-coded batch dimension in the model, if any.
|
||||
|
||||
Use `InferenceEngine::CNNNetwork::reshape` instead of `InferenceEngine::CNNNetwork::setBatchSize` to set new input shapes for the model if the model has one of the following:
|
||||
|
||||
* Multiple inputs with different zero-index dimension meanings
|
||||
* Input without a batch dimension
|
||||
* 0D, 1D, or 3D shape
|
||||
|
||||
The `InferenceEngine::CNNNetwork::setBatchSize` method is a high-level API method that wraps the `InferenceEngine::CNNNetwork::reshape` method call and works for trivial models from the batch placement standpoint.
|
||||
Use `InferenceEngine::CNNNetwork::reshape` for other models.
|
||||
@@ -37,10 +47,11 @@ Inference Engine takes three kinds of a model description as an input, which are
|
||||
3. [nGraph function](../nGraph_DG/nGraph_dg.md) through the constructor of `InferenceEngine::CNNNetwork`
|
||||
|
||||
`InferenceEngine::CNNNetwork` keeps an `ngraph::Function` object with the model description internally.
|
||||
The object should have fully defined input shapes to be successfully loaded to the Inference Engine plugins.
|
||||
To resolve undefined input dimensions of a model, call the `CNNNetwork::reshape` method providing new input shapes before loading to the Inference Engine plugin.
|
||||
The object should have fully-defined input shapes to be successfully loaded to Inference Engine plugins.
|
||||
To resolve undefined input dimensions of a model, call the `CNNNetwork::reshape` method to provide new input shapes before loading to the Inference Engine plugin.
|
||||
|
||||
Run the following code right after `InferenceEngine::CNNNetwork` creation to explicitly check for model input names and shapes:
|
||||
|
||||
```cpp
|
||||
CNNNetwork network = ... // read IR / ONNX model or create from nGraph::Function explicitly
|
||||
const auto parameters = network.getFunction()->get_parameters();
|
||||
@@ -55,37 +66,13 @@ To feed input data of a shape that is different from the model input shape, resh
|
||||
|
||||
Once the input shape of `InferenceEngine::CNNNetwork` is set, call the `InferenceEngine::Core::LoadNetwork` method to get an `InferenceEngine::ExecutableNetwork` object for inference with updated shapes.
|
||||
|
||||
There are other approaches to reshape the model during the stage of <a href="_docs_MO_DG_prepare_model_convert_model_Converting_Model_General.html#when_to_specify_input_shapes">IR generation</a> or [nGraph::Function creation](../nGraph_DG/build_function.md).
|
||||
There are other approaches to reshape the model during the stage of <a href="_docs_MO_DG_prepare_model_convert_model_Converting_Model.html#when_to_specify_input_shapes">IR generation</a> or [nGraph::Function creation](../nGraph_DG/build_function.md).
|
||||
|
||||
Practically, some models are not ready to be reshaped. In this case, a new input shape cannot be set with the Model Optimizer or the `InferenceEngine::CNNNetwork::reshape` method.
|
||||
|
||||
## Troubleshooting Reshape Errors
|
||||
### Usage of Reshape Method <a name="usage_of_reshape_method"></a>
|
||||
|
||||
Operation semantics may impose restrictions on input shapes of the operation.
|
||||
Shape collision during shape propagation may be a sign that a new shape does not satisfy the restrictions.
|
||||
Changing the model input shape may result in intermediate operations shape collision.
|
||||
|
||||
Examples of such operations:
|
||||
- [Reshape](../ops/shape/Reshape_1.md) operation with a hard-coded output shape value
|
||||
- [MatMul](../ops/matrix/MatMul_1.md) operation with the `Const` second input cannot be resized by spatial dimensions due to operation semantics
|
||||
|
||||
Model structure and logic should not change significantly after model reshaping.
|
||||
- The Global Pooling operation is commonly used to reduce output feature map of classification models output.
|
||||
Having the input of the shape [N, C, H, W], Global Pooling returns the output of the shape [N, C, 1, 1].
|
||||
Model architects usually express Global Pooling with the help of the `Pooling` operation with the fixed kernel size [H, W].
|
||||
During spatial reshape, having the input of the shape [N, C, H1, W1], Pooling with the fixed kernel size [H, W] returns the output of the shape [N, C, H2, W2], where H2 and W2 are commonly not equal to `1`.
|
||||
It breaks the classification model structure.
|
||||
For example, [publicly available Inception family models from TensorFlow*](https://github.com/tensorflow/models/tree/master/research/slim#pre-trained-models) have this issue.
|
||||
|
||||
- Changing the model input shape may significantly affect its accuracy.
|
||||
For example, Object Detection models from TensorFlow have resizing restrictions by design.
|
||||
To keep the model valid after the reshape, choose a new input shape that satisfies conditions listed in the `pipeline.config` file.
|
||||
For details, refer to the <a href="_docs_MO_DG_prepare_model_convert_model_tf_specific_Convert_Object_Detection_API_Models.html#tf_od_custom_input_shape">Tensorflow Object Detection API models resizing techniques</a>.
|
||||
|
||||
## Usage of Reshape Method <a name="usage_of_reshape_method"></a>
|
||||
|
||||
The primary method of the feature is `InferenceEngine::CNNNetwork::reshape`.
|
||||
It gets new input shapes and propagates it from input to output for all intermediates layers of the given network.
|
||||
The primary method of the feature is `InferenceEngine::CNNNetwork::reshape`. It takes new input shapes and propagates them from input to output through all intermediate layers of the given network.
|
||||
The method takes `InferenceEngine::ICNNNetwork::InputShapes` - a map of pairs: the name of input data and its dimensions.
|
||||
|
||||
The algorithm for resizing the network is the following:
|
||||
@@ -100,9 +87,140 @@ Here is a code example:
|
||||
|
||||
@snippet snippets/ShapeInference.cpp part0
|
||||
|
||||
Shape Inference feature is used in [Smart Classroom Demo](@ref omz_demos_smart_classroom_demo_cpp).
|
||||
The Shape Inference feature is used in [Smart Classroom Demo](@ref omz_demos_smart_classroom_demo_cpp).
|
||||
|
||||
## Extensibility
|
||||
### Troubleshooting Reshape Errors
|
||||
|
||||
Inference Engine provides a special mechanism that allows to add the support of shape inference for custom operations.
|
||||
This mechanism is described in the [Extensibility documentation](Extensibility_DG/Intro.md)
|
||||
Operation semantics may impose restrictions on input shapes of the operation.
|
||||
Shape collision during shape propagation may be a sign that a new shape does not satisfy the restrictions.
|
||||
Changing the model input shape may result in intermediate operations shape collision.
|
||||
|
||||
Examples of such operations:
|
||||
* [Reshape](../ops/shape/Reshape_1.md) operation with a hard-coded output shape value
|
||||
* [MatMul](../ops/matrix/MatMul_1.md) operation with the `Const` second input cannot be resized by spatial dimensions due to operation semantics
|
||||
|
||||
Model structure and logic should not change significantly after model reshaping.
|
||||
- The Global Pooling operation is commonly used to reduce the output feature map of classification models.
|
||||
Having the input of the shape [N, C, H, W], Global Pooling returns the output of the shape [N, C, 1, 1].
|
||||
Model architects usually express Global Pooling with the help of the `Pooling` operation with the fixed kernel size [H, W].
|
||||
During spatial reshape, having the input of the shape [N, C, H1, W1], Pooling with the fixed kernel size [H, W] returns the output of the shape [N, C, H2, W2], where H2 and W2 are commonly not equal to `1`.
|
||||
It breaks the classification model structure.
|
||||
For example, [publicly available Inception family models from TensorFlow*](https://github.com/tensorflow/models/tree/master/research/slim#pre-trained-models) have this issue.
|
||||
|
||||
- Changing the model input shape may significantly affect its accuracy.
|
||||
For example, Object Detection models from TensorFlow have resizing restrictions by design.
|
||||
To keep the model valid after the reshape, choose a new input shape that satisfies conditions listed in the `pipeline.config` file.
|
||||
For details, refer to the <a href="_docs_MO_DG_prepare_model_convert_model_tf_specific_Convert_Object_Detection_API_Models.html#tf_od_custom_input_shape">Tensorflow Object Detection API models resizing techniques</a>.
|
||||
|
||||
### Extensibility
|
||||
The Inference Engine provides a special mechanism that allows adding support of shape inference for custom operations. This mechanism is described in the [Extensibility documentation](Extensibility_DG/Intro.md)
|
||||
|
||||
## Introduction (Python)
|
||||
|
||||
@sphinxdirective
|
||||
.. raw:: html
|
||||
|
||||
<div id="switcher-python" class="switcher-anchor">Python</div>
|
||||
@endsphinxdirective
|
||||
|
||||
OpenVINO™ provides the following methods for runtime model reshaping:
|
||||
|
||||
* Set a new input shape with the [IENetwork.reshape](api/ie_python_api/_autosummary/openvino.inference_engine.IENetwork.html#openvino.inference_engine.IENetwork.reshape) method.
|
||||
|
||||
The [IENetwork.reshape](api/ie_python_api/_autosummary/openvino.inference_engine.IENetwork.html#openvino.inference_engine.IENetwork.reshape) method updates input shapes and propagates them down to the outputs of the model through all intermediate layers.
|
||||
|
||||
**NOTES**:
|
||||
* Model Optimizer converts topologies keeping shape-calculating sub-graphs by default, which enables correct shape propagation during reshaping in most cases.
|
||||
* Older versions of IRs are not guaranteed to reshape successfully. Please regenerate them with the Model Optimizer of the latest version of OpenVINO™.
|
||||
* If an ONNX model does not have a fully defined input shape and the model was imported with the ONNX importer, reshape the model before loading it to the plugin.
|
||||
|
||||
|
||||
* Set a new batch dimension value with the [IENetwork.batch_size](api/ie_python_api/_autosummary/openvino.inference_engine.IENetwork.html#openvino.inference_engine.IENetwork.batch_size) method.
|
||||
|
||||
The meaning of a model batch may vary depending on the model design. This method does not deduce batch placement for inputs from the model architecture. It assumes that the batch is placed at the zero index in the shape for all inputs and uses the [IENetwork.reshape](api/ie_python_api/_autosummary/openvino.inference_engine.IENetwork.html#openvino.inference_engine.IENetwork.reshape) method to propagate updated shapes through the model.
|
||||
|
||||
The method transforms the model before a new shape propagation to relax a hard-coded batch dimension in the model, if any.
|
||||
|
||||
Use [IENetwork.reshape](api/ie_python_api/_autosummary/openvino.inference_engine.IENetwork.html#openvino.inference_engine.IENetwork.reshape) rather than [IENetwork.batch_size](api/ie_python_api/_autosummary/openvino.inference_engine.IENetwork.html#openvino.inference_engine.IENetwork.batch_size) to set new input shapes for the model if the model has:
|
||||
|
||||
* Multiple inputs with different zero-index dimension meanings
|
||||
* Input without a batch dimension
|
||||
* 0D, 1D, or 3D shape
|
||||
|
||||
The [IENetwork.batch_size](api/ie_python_api/_autosummary/openvino.inference_engine.IENetwork.html#openvino.inference_engine.IENetwork.batch_size) method is a high-level API method that wraps the [IENetwork.reshape](api/ie_python_api/_autosummary/openvino.inference_engine.IENetwork.html#openvino.inference_engine.IENetwork.reshape) method call and works for trivial models from the batch placement standpoint. Use [IENetwork.reshape](api/ie_python_api/_autosummary/openvino.inference_engine.IENetwork.html#openvino.inference_engine.IENetwork.reshape) for other models.
|
||||
|
||||
Using the [IENetwork.batch_size](api/ie_python_api/_autosummary/openvino.inference_engine.IENetwork.html#openvino.inference_engine.IENetwork.batch_size) method for models with a non-zero index batch placement or for models with inputs that do not have a batch dimension may lead to undefined behaviour.
|
||||
|
||||
You can change input shapes multiple times using the `IENetwork.reshape` and `IENetwork.batch_size` methods in any order. If a model has a hard-coded batch dimension, use `IENetwork.batch_size` first to change the batch, then call `IENetwork.reshape` to update other dimensions, if needed.
|
||||
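For illustration, a minimal sketch of that order of calls, assuming `net` is an IENetwork whose inputs all carry the batch at the zero index and that has a hypothetical input named "data" with an NCHW layout:

```python
# Change the hard-coded batch dimension first...
net.batch_size = 8
# ...then update the remaining dimensions, if needed (shape values are illustrative)
net.reshape({"data": [8, 3, 320, 320]})
```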
|
||||
The Inference Engine accepts three kinds of model description as an input, each of which is converted into an IENetwork object:
|
||||
|
||||
1. Intermediate Representation (IR) through `IECore.read_network`
|
||||
2. ONNX model through `IECore.read_network`
|
||||
3. nGraph function through the constructor of IENetwork
|
||||
|
||||
IENetwork keeps an `ngraph::Function` object with the model description internally. The object should have fully defined input shapes to be successfully loaded to the Inference Engine plugins. To resolve undefined input dimensions of a model, call the `IENetwork.reshape` method providing new input shapes before loading to the Inference Engine plugin.
|
||||
|
||||
Run the following code right after IENetwork creation to explicitly check for model input names and shapes:
|
||||
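A minimal sketch of such a check, assuming `net` is the IENetwork object that was just created:

```python
# Print the name and current shape of every model input
for input_name, input_info in net.input_info.items():
    print(f"{input_name}: {input_info.tensor_desc.dims}")
```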
|
||||
To feed input data of a shape that is different from the model input shape, reshape the model first.
|
||||
|
||||
Once the input shape of IENetwork is set, call the `IECore.load_network` method to get an ExecutableNetwork object for inference with updated shapes.
|
||||
|
||||
There are other approaches to reshape the model during the stage of IR generation or [nGraph function](https://docs.openvinotoolkit.org/latest/openvino_docs_nGraph_DG_PythonAPI.html#create_an_ngraph_function_from_a_graph) creation.
|
||||
|
||||
In practice, some models are not ready to be reshaped. In this case, a new input shape cannot be set with the Model Optimizer or the `IENetwork.reshape` method.
|
||||
|
||||
### Troubleshooting Reshape Errors
|
||||
Operation semantics may impose restrictions on input shapes of the operation. Shape collision during shape propagation may be a sign that a new shape does not satisfy the restrictions. Changing the model input shape may result in intermediate operations shape collision.
|
||||
|
||||
Examples of such operations:
|
||||
|
||||
* Reshape operation with a hard-coded output shape value
|
||||
* MatMul operation with a Const second input, which cannot be resized by spatial dimensions due to operation semantics
|
||||
|
||||
A model's structure and logic should not significantly change after model reshaping.
|
||||
|
||||
* The Global Pooling operation is commonly used to reduce output feature map of classification models output. Having the input of the shape [N, C, H, W], Global Pooling returns the output of the shape [N, C, 1, 1]. Model architects usually express Global Pooling with the help of the Pooling operation with the fixed kernel size [H, W]. During spatial reshape, having the input of the shape [N, C, H1, W1], Pooling with the fixed kernel size [H, W] returns the output of the shape [N, C, H2, W2], where H2 and W2 are commonly not equal to 1. It breaks the classification model structure. For example, publicly available Inception family models from TensorFlow* have this issue.
|
||||
|
||||
* Changing the model input shape may significantly affect its accuracy. For example, Object Detection models from TensorFlow have resizing restrictions by design. To keep the model valid after the reshape, choose a new input shape that satisfies conditions listed in the `pipeline.config` file. For details, refer to the TensorFlow Object Detection API models resizing techniques.
|
||||
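In practice, such a collision surfaces as an error raised by `IENetwork.reshape`; the Python API typically reports it as a RuntimeError. A minimal sketch of detecting it, assuming `net` and `input_layer` are defined as in the example in the next section and the new shape is illustrative:

```python
try:
    net.reshape({input_layer: [1, 3, 544, 544]})
except RuntimeError as error:
    print(f"Reshape failed: {error}")
```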
|
||||
|
||||
### Usage of the Reshape Method
|
||||
|
||||
The primary method of the feature is `IENetwork.reshape`. It gets new input shapes and propagates them from input to output through all intermediate layers of the given network. Use `IENetwork.input_info` to get the names of the input layers and `.tensor_desc.dims` to get the current network input shape.
|
||||
|
||||
The following code example shows how to reshape a model to the size of an input image.
|
||||
|
||||
```python
|
||||
import cv2
|
||||
import numpy as np
|
||||
from openvino.inference_engine import IECore
|
||||
|
||||
ie = IECore()
|
||||
|
||||
# Read an input image and transpose the input to NCHW
|
||||
image = cv2.imread(path_to_image_file)
|
||||
input_image = image.transpose((2, 0, 1))
|
||||
input_image = np.expand_dims(input_image, axis=0)
|
||||
|
||||
# Load the model and get input info
|
||||
# Note that this model must support arbitrary input shapes
|
||||
net = ie.read_network(model=path_to_xml_file)
|
||||
input_layer = next(iter(net.input_info))
|
||||
print(f"Input shape: {net.input_info[input_blob].tensor_desc.dims}")
|
||||
|
||||
# Call reshape
|
||||
net.reshape({input_layer: input_image.shape})
|
||||
print(f"New input shape: {net.input_info[input_blob].tensor_desc.dims}")
|
||||
|
||||
# Load the model to the device and proceed with inference
|
||||
exec_net = ie.load_network(network=net, device_name="CPU")
|
||||
```
|
||||
|
||||
### Extensibility
|
||||
The Inference Engine provides a special mechanism that allows adding support of shape inference for custom operations. This mechanism is described in the [Extensibility documentation](Extensibility_DG/Intro.md).
|
||||
|
||||
### See Also:
|
||||
|
||||
[Hello Reshape Python Sample](../../inference_engine/ie_bridges/python/sample/hello_reshape_ssd/README.html)
|
||||
32
docs/IE_DG/img/BASIC_FLOW_IE_C.svg
Normal file
|
After Width: | Height: | Size: 49 KiB |
32
docs/IE_DG/img/BASIC_FLOW_MO.svg
Normal file
|
After Width: | Height: | Size: 49 KiB |
19
docs/IE_DG/img/BASIC_FLOW_MO_simplified.svg
Normal file
|
After Width: | Height: | Size: 42 KiB |
15
docs/IE_DG/img/BASIC_IE_API_workflow_Cpp.svg
Normal file
|
After Width: | Height: | Size: 47 KiB |
10
docs/IE_DG/img/BASIC_IE_API_workflow_Python.svg
Normal file
|
After Width: | Height: | Size: 32 KiB |
63
docs/IE_DG/img/DEVELOPMENT_FLOW_V3_crunch.svg
Normal file
|
After Width: | Height: | Size: 246 KiB |
29
docs/IE_DG/img/IMPLEMENT_PIPELINE_with_API_C.svg
Normal file
|
After Width: | Height: | Size: 96 KiB |
26
docs/IE_DG/img/IMPLEMENT_PIPELINE_with_API_Python.svg
Normal file
|
After Width: | Height: | Size: 79 KiB |
60
docs/IE_DG/img/LATENCY_VS_THROUGHPUT.svg
Normal file
|
After Width: | Height: | Size: 118 KiB |
3
docs/IE_DG/img/Latency_Throughput.png
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:17298610e06aed2ccfdad44c22a84a410ff4511456b28f6953091d9e593ab7fd
|
||||
size 53668
|
||||
22
docs/IE_DG/img/WHAT_TO_USE.svg
Normal file
|
After Width: | Height: | Size: 203 KiB |
3
docs/IE_DG/img/applying_low_latency_2.png
Executable file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:26ff5d3d42b9838a14481425af8fe8aed791b26fc00a062b91128ba9d5528549
|
||||
size 743788
|
||||
3
docs/IE_DG/img/cpu_streams_explained.png
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:d5cf2212b3634a264722b386899197a7f0fa56fbdad97c017d2733cc0d2694d4
|
||||
size 105457
|
||||
3
docs/IE_DG/img/development_deployment.png
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:dc7766d83d95b40e1dc6b5a63f7a801adf9780664f9ae6d4c88676c2f5b88688
|
||||
size 107123
|
||||
179
docs/IE_DG/img/diagram_workflow.svg
Normal file
|
After Width: | Height: | Size: 162 KiB |
3
docs/IE_DG/img/ie_api_cpp.png
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:c5bcecb8ddef4fdc797474da92ab6a9977a43d0d68e6fec75b2e0a41441042c2
|
||||
size 22993
|
||||
3
docs/IE_DG/img/ie_api_integration_cpp.png
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:9831807da09a8b1d75b5c20e5390526efdbacc7ccf9120221225d633abeb13be
|
||||
size 47476
|
||||
3
docs/IE_DG/img/ie_api_integration_python.png
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:4e9464a77d206d7fd642110a8c409f90b3885fb85e7d132012e86a5edfc72aeb
|
||||
size 41636
|
||||
3
docs/IE_DG/img/ie_api_python.png
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:72b8f5803b63465d349e5757fe59938269619f761d9a67dcfc03ab8e54bc0c3d
|
||||
size 19158
|
||||
3
docs/IE_DG/img/ie_workflow_steps.png
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:62dac902ffb5f26ee034203910a77112ca87c7fc80a511cbd500930bb2919118
|
||||
size 17983
|
||||
@@ -1,3 +0,0 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:9fff52e5faaf108371db87e53959453216554152b15ca0432b1541f94def297e
|
||||
size 19145
|
||||
3
docs/IE_DG/img/latency_throughput_1.png
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:17298610e06aed2ccfdad44c22a84a410ff4511456b28f6953091d9e593ab7fd
|
||||
size 53668
|
||||
3
docs/IE_DG/img/llt2_use_const_initializer.png
Executable file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:9858dbc95426c44d8f11a86936f586ebf4f1d0b8c88ba389d9f89c2948f58ea3
|
||||
size 62051
|
||||
3
docs/IE_DG/img/output_trimmed.png
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:e47c2b259bea6d3539f0c4556de4cc3a07f6d60af54e1cf32002b4a7bc2cc90a
|
||||
size 25231
|
||||
3
docs/IE_DG/img/resnet_269.png
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:92d36b9527a3e316cd9eb2b6f5054c312466df004e4aa9c3458e165330bc6561
|
||||
size 24157
|
||||
3
docs/IE_DG/img/vtune_async.png
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:c47ede993681ba3f0a3e3f4274369ee1854365b1bcd1b5cb0f649a781fdf51bd
|
||||
size 6215
|
||||
3
docs/IE_DG/img/vtune_option.png
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:4a82b414dbc4f7ce2eae625bb7c9c7b88c154a7c476374683dd9886564560f67
|
||||
size 7951
|
||||
3
docs/IE_DG/img/vtune_regular.png
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:9a4fce51076df19fbca04a36d6886765771f8ffc174bebbd751bfc77d91ab1f2
|
||||
size 7081
|
||||
3
docs/IE_DG/img/vtune_timeline.png
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:c514316f78f04e8c000f6b95dc579d8c63c57f00c4c980ea4d358a6a4f1b9d7e
|
||||
size 8744
|
||||
3
docs/IE_DG/img/vtune_topdown_view.jpg
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:40c4b9096ef264807d930fe64d427f53a69ce2247c836415e64c5aa72d9f245e
|
||||
size 36468
|
||||
@@ -1,3 +0,0 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:5e22bc22d614c7335ae461a8ce449ea8695973d755faca718cf74b95972c94e2
|
||||
size 19773
|
||||
@@ -1,11 +1,11 @@
|
||||
# Introduction to Inference Engine {#openvino_docs_IE_DG_inference_engine_intro}
|
||||
|
||||
> **NOTE:** [Intel® System Studio](https://software.intel.com/en-us/system-studio) is an all-in-one, cross-platform tool suite, purpose-built to simplify system bring-up and improve system and IoT device application performance on Intel® platforms. If you are using the Intel® Distribution of OpenVINO™ with Intel® System Studio, go to [Get Started with Intel® System Studio](https://software.intel.com/en-us/articles/get-started-with-openvino-and-intel-system-studio-2019).
|
||||
> **NOTE**: [Intel® System Studio](https://software.intel.com/en-us/system-studio) is an all-in-one, cross-platform tool suite, purpose-built to simplify system bring-up and improve system and IoT device application performance on Intel® platforms. If you are using the Intel® Distribution of OpenVINO™ with Intel® System Studio, go to [Get Started with Intel® System Studio](https://software.intel.com/en-us/articles/get-started-with-openvino-and-intel-system-studio-2019).
|
||||
|
||||
This Guide provides an overview of the Inference Engine describing the typical workflow for performing
|
||||
inference of a pre-trained and optimized deep learning model and a set of sample applications.
|
||||
|
||||
> **NOTE:** Before you perform inference with the Inference Engine, your models should be converted to the Inference Engine format using the Model Optimizer or built directly in run-time using nGraph API. To learn about how to use Model Optimizer, refer to the [Model Optimizer Developer Guide](../MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md). To learn about the pre-trained and optimized models delivered with the OpenVINO™ toolkit, refer to [Pre-Trained Models](@ref omz_models_intel_index).
|
||||
> **NOTE**: Before you perform inference with the Inference Engine, your models should be converted to the Inference Engine format using the Model Optimizer or built directly in run-time using nGraph API. To learn about how to use Model Optimizer, refer to the [Model Optimizer Developer Guide](../MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md). To learn about the pre-trained and optimized models delivered with the OpenVINO™ toolkit, refer to [Pre-Trained Models](@ref omz_models_group_intel).
|
||||
|
||||
After you have used the Model Optimizer to create an Intermediate Representation (IR), use the Inference Engine to infer the result for a given input data.
|
||||
|
||||
|
||||
152
docs/IE_DG/integrate_with_customer_application_python.md
Normal file
@@ -0,0 +1,152 @@
|
||||
# Integrate Inference Engine with Your Python Application {#openvino_docs_IE_DG_integrate_with_customer_application_python}
|
||||
|
||||
This document explains how to integrate and use the Inference Engine API with your Python application.
|
||||
|
||||
The following diagram illustrates the typical Inference Engine Python API workflow:
|
||||
![ie_api_flow_python]
|
||||
|
||||
Read the sections below to learn about each item.
|
||||
|
||||
## Link with Inference Engine Library
|
||||
|
||||
To make use of the Inference Engine functionality, import IECore to your application:
|
||||
|
||||
```py
|
||||
from openvino.inference_engine import IECore
|
||||
```
|
||||
|
||||
## Use Inference Engine API to Implement Inference Pipeline
|
||||
|
||||
This section provides step-by-step instructions to implement a typical inference pipeline with the Inference Engine Python API:
|
||||
|
||||
![ie_api_use_python]
|
||||
|
||||
### Step 1. Create Inference Engine Core
|
||||
|
||||
Use the following code to create Inference Engine Core to manage available devices and read network objects:
|
||||
```py
|
||||
ie = IECore()
|
||||
```
|
||||
### Step 2 (Optional). Read the Model. Configure Input and Output of the Model
|
||||
|
||||
@sphinxdirective
|
||||
.. raw:: html
|
||||
|
||||
<div class="collapsible-section">
|
||||
@endsphinxdirective
|
||||
|
||||
Optionally, configure input and output of the model using the steps below:
|
||||
|
||||
1. Read the model
|
||||
@sphinxdirective
|
||||
|
||||
.. tab:: IR
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
net = ie.read_network(model="model.xml")
|
||||
|
||||
.. tab:: ONNX
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
net = ie.read_network(model="model.onnx")
|
||||
|
||||
.. tab:: nGraph
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
# TBD
|
||||
|
||||
@endsphinxdirective
|
||||
|
||||
2. Request input and output information using `input_info` and `outputs`
|
||||
```py
|
||||
input_name = next(iter(net.input_info))
|
||||
|
||||
output_name = next(iter(net.outputs))
|
||||
```
|
||||
Information for this input layer is stored in `input_info`. The following code prints the input layout, precision, and shape.
|
||||
```py
|
||||
print(f"input layout: {net.input_info[input_layer].layout}")
|
||||
print(f"input precision: {net.input_info[input_layer].precision}")
|
||||
print(f"input shape: {net.input_info[input_layer].tensor_desc.dims}")
|
||||
```
|
||||
This output tells us that the model expects inputs with a shape of [1,3,224,224], and that this is in NCHW layout. This means that the model expects input data with a batch size (N) of 1, 3 channels (C), and images of a height (H) and width (W) of 224. The input data is expected to be of FP32 (floating point) precision.
|
||||
|
||||
Getting the output layout, precision and shape is similar to getting the input layout, precision and shape.
|
||||
```py
|
||||
print(f"output layout: {net.outputs[output_layer].layout}")
|
||||
print(f"output precision: {net.outputs[output_layer].precision}")
|
||||
print(f"output shape: {net.outputs[output_layer].shape}")
|
||||
```
|
||||
This output shows that the model returns outputs with a shape of [1, 1001], where 1 is the batch size (N) and 1001 the number of classes (C). The output is returned as 32-bit floating point.
|
||||
|
||||
@sphinxdirective
|
||||
.. raw:: html
|
||||
|
||||
</div>
|
||||
@endsphinxdirective
|
||||
|
||||
### Step 3. Load the Model to the Device
|
||||
|
||||
Load the model to the device using `load_network()`:
|
||||
|
||||
@sphinxdirective
|
||||
|
||||
.. tab:: IR
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
exec_net = ie.load_network(network= "model.xml", device_name="CPU")
|
||||
.. tab:: ONNX
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
exec_net = ie.load_network(network= "model.onnx", device_name="CPU")
|
||||
|
||||
.. tab:: nGraph
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
# TBD
|
||||
|
||||
.. tab:: Model from Step 2
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
exec_net = ie.load_network(network=net, device_name="CPU")
|
||||
|
||||
@endsphinxdirective
|
||||
|
||||
### Step 4. Prepare Input
|
||||
```py
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
image = cv2.imread("image.png")
|
||||
|
||||
# Resize your image with OpenCV if needed to match the network input shape
|
||||
# res_image = cv2.resize(src=image, dsize=(W, H))
|
||||
|
||||
# Convert the image to NCHW format with FP32 type
|
||||
input_data = np.expand_dims(np.transpose(image, (2, 0, 1)), 0).astype(np.float32)
|
||||
```
|
||||
|
||||
### Step 5. Start Inference
|
||||
```py
|
||||
input_name = next(iter(net.input_info))
|
||||
result = exec_net.infer({input_name: input_data})
|
||||
```
|
||||
|
||||
### Step 6. Process the Inference Results
|
||||
```py
|
||||
output_name = next(iter(net.outputs))
|
||||
output = result[output_name]
|
||||
```
|
||||
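As a follow-up, a minimal sketch of interpreting the result for a classification model like the one described in Step 2, assuming a [1, 1001] output and `np` imported as in Step 4:

```py
# Pick the index of the highest-scoring class
top_class = int(np.argmax(output))
print(f"Predicted class index: {top_class}")
```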
|
||||
## Run Application
|
||||
|
||||
[ie_api_flow_python]: img/ie_api_python.png
|
||||
[ie_api_use_python]: img/ie_api_integration_python.png
|
||||
|
||||
@@ -7,7 +7,7 @@ This section describes how to work with stateful networks in OpenVINO toolkit, s
|
||||
|
||||
The section additionally provides small examples of a stateful network and code to infer it.
|
||||
|
||||
## What is a Stateful Network
|
||||
## What is a Stateful Network?
|
||||
|
||||
Several use cases require processing of data sequences. When the length of a sequence is known and small enough,
|
||||
we can process it with RNN-like networks that contain a cycle inside. But in some cases, like online speech recognition of time series
|
||||
@@ -209,9 +209,135 @@ Descriptions can be found in [Samples Overview](./Samples_Overview.md)
|
||||
[state_network_example]: ./img/state_network_example.png
|
||||
|
||||
|
||||
## LowLatency Transformation
|
||||
## LowLatency Transformations
|
||||
|
||||
If the original framework does not have a special API for working with states, after importing the model, OpenVINO representation will not contain Assign/ReadValue layers. For example, if the original ONNX model contains RNN operations, IR will contain TensorIterator operations and the values will be obtained only after the execution of whole TensorIterator primitive, intermediate values from each iteration will not be available. To be able to work with these intermediate values of each iteration and receive them with a low latency after each infer request, a special LowLatency transformation was introduced.
|
||||
If the original framework does not have a special API for working with states, after importing the model, OpenVINO representation will not contain Assign/ReadValue layers. For example, if the original ONNX model contains RNN operations, IR will contain TensorIterator operations and the values will be obtained only after execution of the whole TensorIterator primitive. Intermediate values from each iteration will not be available. To enable you to work with these intermediate values of each iteration and receive them with a low latency after each infer request, special LowLatency and LowLatency2 transformations were introduced.
|
||||
|
||||
### How to get TensorIterator/Loop operations from different frameworks via Model Optimizer
|
||||
|
||||
**ONNX and frameworks supported via ONNX format:** *LSTM, RNN, GRU* original layers are converted to the TensorIterator operation. The TensorIterator body contains LSTM/RNN/GRU Cell. Peephole and InputForget modifications are not supported; the optional sequence_lengths input is supported.
|
||||
*ONNX Loop* layer is converted to the OpenVINO Loop operation.
|
||||
|
||||
**MXNet:** *LSTM, RNN, GRU* original layers are converted to the TensorIterator operation; the TensorIterator body contains LSTM/RNN/GRU Cell operations.
|
||||
|
||||
**TensorFlow:** *BlockLSTM* is converted to the TensorIterator operation; the TensorIterator body contains an LSTM Cell operation. Peephole and InputForget modifications are not supported.
|
||||
The *While* layer is converted to TensorIterator; the TensorIterator body can contain any supported operations. However, dynamic cases, where the iteration count cannot be calculated at shape inference (Model Optimizer conversion) time, are not supported.
|
||||
|
||||
**TensorFlow2:** *While* layer is converted to Loop operation. Loop body can contain any supported operations.
|
||||
|
||||
**Kaldi:** Kaldi models already contain Assign/ReadValue (Memory) operations after model conversion. TensorIterator/Loop operations are not generated.
|
||||
|
||||
## LowLatency2
|
||||
|
||||
LowLatency2 transformation changes the structure of the network containing [TensorIterator](../ops/infrastructure/TensorIterator_1.md) and [Loop](../ops/infrastructure/Loop_5.md) by adding the ability to work with the state, inserting the Assign/ReadValue layers as it is shown in the picture below.
|
||||
|
||||
### The differences between LowLatency and LowLatency2
|
||||
|
||||
* Unrolling of TensorIterator/Loop operations became a part of LowLatency2, not a separate transformation. After invoking the transformation, the network can be serialized and inferred without re-invoking the transformation.
|
||||
* Added support for TensorIterator and Loop operations with multiple iterations inside. TensorIterator/Loop will not be unrolled in this case.
|
||||
* Resolved the ‘Parameters connected directly to ReadValues’ limitation. To apply the previous version of the transformation in this case, additional manual manipulations were required; now the case is processed automatically.
|
||||
#### Example of applying LowLatency2 transformation:
|
||||

|
||||
|
||||
After applying the transformation, ReadValue operations can receive other operations as an input, as shown in the picture above. These inputs should set the initial value for initialization of ReadValue operations. However, such initialization is not supported in the current State API implementation. Input values are ignored and the initial values for the ReadValue operations are set to zeros unless otherwise specified by the user via [State API](#openvino-state-api).
|
||||
|
||||
### Steps to apply LowLatency2 Transformation
|
||||
|
||||
1. Get CNNNetwork. Either way is acceptable:
|
||||
|
||||
* [from IR or ONNX model](./Integrate_with_customer_application_new_API.md)
|
||||
* [from nGraph Function](../nGraph_DG/build_function.md)
|
||||
|
||||
2. Change the number of iterations inside TensorIterator/Loop nodes in the network using the [Reshape](ShapeInference.md) feature.
|
||||
|
||||
For example, if the *sequence_lengths* dimension of the network input is greater than 1, the TensorIterator layer has number_of_iterations > 1. You can reshape the inputs of the network to set *sequence_dimension* to exactly 1.
|
||||
|
||||
```cpp
|
||||
|
||||
// Network before reshape: Parameter (name: X, shape: [2 (sequence_lengths), 1, 16]) -> TensorIterator (num_iteration = 2, axis = 0) -> ...
|
||||
|
||||
cnnNetwork.reshape({"X" : {1, 1, 16}});
|
||||
|
||||
// Network after reshape: Parameter (name: X, shape: [1 (sequence_lengths), 1, 16]) -> TensorIterator (num_iteration = 1, axis = 0) -> ...
|
||||
|
||||
```
|
||||
**Unrolling**: If the LowLatency2 transformation is applied to a network containing TensorIterator/Loop nodes with exactly one iteration inside, these nodes are unrolled; otherwise, the nodes remain as they are. Please see [the picture](#example-of-applying-lowlatency2-transformation) for more details.
|
||||
|
||||
3. Apply LowLatency2 transformation
|
||||
```cpp
|
||||
#include "ie_transformations.hpp"
|
||||
|
||||
...
|
||||
|
||||
InferenceEngine::lowLatency2(cnnNetwork); // 2nd argument 'use_const_initializer = true' by default
|
||||
```
|
||||
**The `use_const_initializer` argument**
|
||||
|
||||
By default, the LowLatency2 transformation inserts a constant subgraph of the same shape as the previous input node, filled with zeros, as the initializing value for ReadValue nodes; see the picture below. You can disable insertion of this subgraph by passing `false` for the `use_const_initializer` argument.
|
||||
|
||||
```cpp
|
||||
InferenceEngine::lowLatency2(cnnNetwork, false);
|
||||
```
|
||||
|
||||

|
||||
|
||||
**State naming rule:** the name of a state is a concatenation of names: the original TensorIterator operation, the Parameter of the body, and the additional suffix "variable_" + id (0-based indexing, restarted for each TensorIterator). You can use these rules to predict the name of the inserted State after the transformation is applied. For example:
|
||||
```cpp
|
||||
// Precondition in ngraph::function.
|
||||
// Created TensorIterator and Parameter in body of TensorIterator with names
|
||||
std::string tensor_iterator_name = "TI_name";
|
||||
std::string body_parameter_name = "param_name";
|
||||
std::string idx = "0"; // it is the first variable in the network
|
||||
|
||||
// The State will be named "TI_name/param_name/variable_0"
|
||||
auto state_name = tensor_iterator_name + "//" + body_parameter_name + "//" + "variable_" + idx;
|
||||
|
||||
InferenceEngine::CNNNetwork cnnNetwork = InferenceEngine::CNNNetwork{function};
|
||||
InferenceEngine::lowLatency2(cnnNetwork);
|
||||
|
||||
InferenceEngine::ExecutableNetwork executableNetwork = core->LoadNetwork(/*cnnNetwork, targetDevice, configuration*/);
|
||||
|
||||
// Try to find the Variable by name
|
||||
auto states = executableNetwork.QueryState();
|
||||
for (auto& state : states) {
|
||||
auto name = state.GetName();
|
||||
if (name == state_name) {
|
||||
// some actions
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
4. Use state API. See sections [OpenVINO state API](#openvino-state-api), [Example of stateful network inference](#example-of-stateful-network-inference).
|
||||
|
||||
### Known Limitations
|
||||
1. Unable to execute [Reshape](ShapeInference.md) to change the number of iterations of TensorIterator/Loop layers to apply the transformation correctly due to hardcoded values of shapes somewhere in the network.
|
||||
|
||||
The only way to change the number of iterations of a TensorIterator/Loop layer is to use the Reshape feature. However, networks can be non-reshapable; the most common reason is that the values of shapes are hardcoded in a constant somewhere in the network.
|
||||
|
||||

|
||||
|
||||
**Current solution:** Trim non-reshapable layers via [ModelOptimizer CLI](../MO_DG/prepare_model/convert_model/Converting_Model.md) `--input`, `--output`. For example, the parameter and the problematic constant in the picture above can be trimmed using the following command line option:
|
||||
`--input Reshape_layer_name`. The problematic constant can be also replaced using ngraph, as shown in the example below.
|
||||
|
||||
```cpp
|
||||
// nGraph example. How to replace a Constant with hardcoded values of shapes in the network with another one with the new values.
|
||||
// Assume we know which Constant (const_with_hardcoded_shape) prevents the reshape from being applied.
|
||||
// Then we can find this Constant by name on the network and replace it with a new one with the correct shape.
|
||||
auto func = cnnNetwork.getFunction();
|
||||
// Creating the new Constant with a correct shape.
|
||||
// For the example shown in the picture above, the new values of the Constant should be 1, 1, 10 instead of 1, 49, 10
|
||||
auto new_const = std::make_shared<ngraph::opset6::Constant>( /*type, shape, value_with_correct_shape*/ );
|
||||
for (const auto& node : func->get_ops()) {
|
||||
// Trying to find the problematic Constant by name.
|
||||
if (node->get_friendly_name() == "name_of_non_reshapable_const") {
|
||||
auto const_with_hardcoded_shape = std::dynamic_pointer_cast<ngraph::opset6::Constant>(node);
|
||||
// Replacing the problematic Constant with a new one. Do this for all the problematic Constants in the network, then
|
||||
// you can apply the reshape feature.
|
||||
ngraph::replace_node(const_with_hardcoded_shape, new_const);
|
||||
}
|
||||
}
|
||||
```
|
||||
## [DEPRECATED] LowLatency
|
||||
|
||||
LowLatency transformation changes the structure of the network containing [TensorIterator](../ops/infrastructure/TensorIterator_1.md) and [Loop](../ops/infrastructure/Loop_5.md) by adding the ability to work with the state, inserting the Assign/ReadValue layers as it is shown in the picture below.
|
||||
|
||||
@@ -277,7 +403,7 @@ InferenceEngine::LowLatency(cnnNetwork);
|
||||
4. Use state API. See sections [OpenVINO state API](#openvino-state-api), [Example of stateful network inference](#example-of-stateful-network-inference).
|
||||
|
||||
|
||||
### Known Limitations
|
||||
### Known Limitations for LowLatency [DEPRECATED]
|
||||
1. Parameters connected directly to ReadValues (States) after the transformation is applied are not allowed.
|
||||
|
||||
Unnecessary parameters may remain on the graph after applying the transformation. The automatic handling of this case inside the transformation is not possible now. Such Parameters should be removed manually from `ngraph::Function` or replaced with a Constant.
|
||||
@@ -285,7 +411,7 @@ InferenceEngine::LowLatency(cnnNetwork);
|
||||

|
||||
|
||||
**Current solutions:**
|
||||
* Replace Parameter with Constant (freeze) with the value [0, 0, 0 … 0] via [ModelOptimizer CLI](../MO_DG/prepare_model/convert_model/Converting_Model_General.md) `--input` or `--freeze_placeholder_with_value`.
|
||||
* Replace Parameter with Constant (freeze) with the value [0, 0, 0 … 0] via [ModelOptimizer CLI](../MO_DG/prepare_model/convert_model/Converting_Model.md) `--input` or `--freeze_placeholder_with_value`.
|
||||
* Use ngraph API to replace Parameter with Constant.
|
||||
|
||||
```cpp
|
||||
@@ -310,7 +436,7 @@ InferenceEngine::LowLatency(cnnNetwork);
|
||||
|
||||

|
||||
|
||||
**Current solution:** trim non-reshapable layers via [ModelOptimizer CLI](../MO_DG/prepare_model/convert_model/Converting_Model_General.md) `--input`, `--output`. For example, we can trim the Parameter and the problematic Constant in the picture above, using the following command line option:
|
||||
**Current solution:** trim non-reshapable layers via [ModelOptimizer CLI](../MO_DG/prepare_model/convert_model/Converting_Model.md) `--input`, `--output`. For example, we can trim the Parameter and the problematic Constant in the picture above, using the following command line option:
|
||||
`--input Reshape_layer_name`. We can also replace the problematic Constant using ngraph, as shown in the example below.
|
||||
|
||||
```cpp
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
# Using Encrypted Models with OpenVINO™ {#openvino_docs_IE_DG_protecting_model_guide}
|
||||
|
||||
Deploying deep-learning capabilities to edge devices can present security
|
||||
challenges. For example, ensuring inference integrity or providing copyright
|
||||
challenges, for example, ensuring inference integrity or providing copyright
|
||||
protection of your deep-learning models.
|
||||
|
||||
One possible solution is to use cryptography to protect models as they are
|
||||
@@ -14,43 +14,41 @@ This guide demonstrates how to use OpenVINO securely with protected models.
|
||||
|
||||
## Secure Model Deployment
|
||||
|
||||
After a model is optimized by the OpenVINO Model Optimizer, it's then deployed
|
||||
After a model is optimized by the OpenVINO Model Optimizer, it's deployed
|
||||
to target devices in the Intermediate Representation (IR) format. An optimized
|
||||
model is stored on an edge device and executed by the Inference Engine.
|
||||
model is stored on an edge device and executed by the Inference Engine.
|
||||
(ONNX and nGraph models can also be read natively by the Inference Engine.)
|
||||
|
||||
To protect deep-learning models, you can encrypt an optimized model before
|
||||
deploying it to the edge device. The edge device should keep the stored model
|
||||
protected at all times and have the model decrypted **in runtime only** for use
|
||||
by the Inference Engine.
|
||||
|
||||
![deploy_encrypted_model]
|
||||

|
||||
|
||||
## Loading Encrypted Models
|
||||
|
||||
The OpenVINO Inference Engine requires model decryption before loading. Allocate
|
||||
a temporary memory block for model decryption, and use
|
||||
`InferenceEngine::Core::ReadNetwork` method to load the model from memory buffer.
|
||||
For more information, see the `InferenceEngine::Core` Class
|
||||
Reference Documentation.
|
||||
a temporary memory block for model decryption and use the
|
||||
`InferenceEngine::Core::ReadNetwork` method to load the model from a memory buffer.
|
||||
For more information, see the `InferenceEngine::Core` Class Reference Documentation.
|
||||
|
||||
@snippet snippets/protecting_model_guide.cpp part0
|
||||
|
||||
Hardware-based protection, such as Intel® Software Guard Extensions
|
||||
(Intel® SGX), can be utilized to protect decryption operation secrets and
|
||||
Hardware-based protection such as Intel® Software Guard Extensions
|
||||
(Intel® SGX) can be utilized to protect decryption operation secrets and
|
||||
bind them to a device. For more information, go to [Intel® Software Guard
|
||||
Extensions](https://software.intel.com/en-us/sgx).
|
||||
|
||||
Use `InferenceEngine::Core::ReadNetwork()` to set model representations and
|
||||
weights respectively.
|
||||
|
||||
Currently there are no possibility to read external weights from memory for ONNX models.
|
||||
Currently there is no way to read external weights from memory for ONNX models.
|
||||
The `ReadNetwork(const std::string& model, const Blob::CPtr& weights)` function
|
||||
should be called with `weights` passed as an empty `Blob`.
|
||||
|
||||
@snippet snippets/protecting_model_guide.cpp part1
|
||||
|
||||
[deploy_encrypted_model]: img/deploy_encrypted_model.png
|
||||
|
||||
## Additional Resources
|
||||
|
||||
- Intel® Distribution of OpenVINO™ toolkit home page: [https://software.intel.com/en-us/openvino-toolkit](https://software.intel.com/en-us/openvino-toolkit)
|
||||
|
||||
332
docs/IE_DG/supported_plugins/AUTO.md
Normal file
@@ -0,0 +1,332 @@
|
||||
# Auto-Device Plugin {#openvino_docs_IE_DG_supported_plugins_AUTO}
|
||||
|
||||
## Auto-Device Plugin Execution (C++)
|
||||
|
||||
@sphinxdirective
|
||||
.. raw:: html
|
||||
|
||||
<div id="switcher-cpp" class="switcher-anchor">C++</div>
|
||||
@endsphinxdirective
|
||||
|
||||
The AUTO device is a new, special "virtual" or "proxy" device in the OpenVINO™ toolkit.
|
||||
|
||||
Use "AUTO" as the device name to delegate selection of an actual accelerator to OpenVINO. The Auto-device plugin internally recognizes and selects devices from among CPU, integrated GPU and discrete Intel GPUs (when available) depending on the device capabilities and the characteristics of CNN models (for example, precision). Then the Auto-device assigns inference requests to the selected device.
|
||||
|
||||
From the application's point of view, this is just another device that handles all accelerators in the full system.
|
||||
|
||||
With the 2021.4 release, Auto-device setup is done in three major steps:
|
||||
1. Configure each device as usual (for example, via the conventional `SetConfig()` method)
|
||||
2. Load a network to the Auto-device plugin. This is the only change needed in your application.
|
||||
3. As with any other executable network resulting from `LoadNetwork()`, create as many requests as needed to saturate the devices.
|
||||
|
||||
These steps are covered below in detail.
|
||||
|
||||
### Defining and Configuring the Auto-Device Plugin
|
||||
Following the OpenVINO convention for device names, the Auto-device uses the label "AUTO". The only configuration option for Auto-device is a limited device list:
|
||||
|
||||
| Parameter name | Parameter values | Default | Description |
|
||||
| :--- | :--- | :--- |:-----------------------------------------------------------------------------|
|
||||
| "AUTO_DEVICE_LIST" | comma-separated device names with no spaces| N/A | Device candidate list to be selected|
|
||||
|
||||
You can use the configuration name directly as a string or use `IE::KEY_AUTO_DEVICE_LIST` from `ie_plugin_config.hpp`, which defines the same string.
|
||||
|
||||
There are two ways to use Auto-device:
|
||||
1. Directly indicate device by "AUTO" or an empty string:
|
||||
@snippet snippets/AUTO0.cpp part0
|
||||
|
||||
2. Use the Auto-device configuration:
|
||||
@snippet snippets/AUTO1.cpp part1
|
||||
|
||||
Both methods allow limiting the list of device candidates for the AUTO plugin.
|
||||
|
||||
> **NOTE**: The Inference Engine lets you use "GPU" as an alias for "GPU.0" in function calls.
|
||||
|
||||
The Auto-device plugin supports querying device optimization capabilities as a metric.
|
||||
|
||||
| Parameter name | Parameter values |
|
||||
| :--- | :--- |
|
||||
| "OPTIMIZATION_CAPABILITIES" | Auto-Device capabilities |
|
||||
|
||||
### Enumerating Devices and Selection Logic
|
||||
|
||||
The Inference Engine now features a dedicated API to enumerate devices and their capabilities.
|
||||
See [Hello Query Device C++ Sample](../../../inference-engine/samples/hello_query_device/README.md).
|
||||
This is the example output from the sample (truncated to device names only):
|
||||
|
||||
```sh
|
||||
./hello_query_device
|
||||
Available devices:
|
||||
Device: CPU
|
||||
...
|
||||
Device: GPU.0
|
||||
...
|
||||
Device: GPU.1
|
||||
```
|
||||
|
||||
### Default Auto-Device Selection Logic
|
||||
|
||||
With the 2021.4 release, the Auto-Device selects the most suitable device using the following default logic:
|
||||
|
||||
1. Check if dGPU (discrete), iGPU (integrated) and CPU devices are available
|
||||
2. Get the precision of the input model, such as FP32
|
||||
3. According to the priority of dGPU, iGPU, and CPU (in this order), if the device supports the precision of the input network, select it as the most suitable device
|
||||
|
||||
For example, CPU, dGPU and iGPU can support the following precision and optimization capabilities:
|
||||
|
||||
| Device | OPTIMIZATION_CAPABILITIES |
|
||||
| :--- | :--- |
|
||||
| CPU | WINOGRAD FP32 FP16 INT8 BIN |
|
||||
| dGPU | FP32 BIN BATCHED_BLOB FP16 INT8 |
|
||||
| iGPU | FP32 BIN BATCHED_BLOB FP16 INT8 |
|
||||
|
||||
* When the application uses the Auto-device to run FP16 IR on a system with CPU, dGPU and iGPU, Auto-device will offload this workload to dGPU.
|
||||
* When the application uses the Auto-device to run FP16 IR on a system with CPU and iGPU, Auto-device will offload this workload to iGPU.
|
||||
* When the application uses the Auto-device to run WINOGRAD-enabled IR on a system with CPU, dGPU and iGPU, Auto-device will offload this workload to CPU.
|
||||
|
||||
If loading the network to dGPU or iGPU fails, CPU is the fallback choice.
|
||||
|
||||
According to the Auto-device selection logic from the previous section, tell the Inference Engine
|
||||
to use the most suitable device from available devices as follows:
|
||||
|
||||
@snippet snippets/AUTO2.cpp part2
|
||||
|
||||
You can also use the Auto-device plugin to choose a device from a limited choice of devices, in this example CPU and GPU:
|
||||
|
||||
@snippet snippets/AUTO3.cpp part3
|
||||
|
||||
### Configuring the Individual Devices and Creating the Auto-Device on Top
|
||||
|
||||
It is possible to configure each individual device as usual and create the "AUTO" device on top:
|
||||
|
||||
@snippet snippets/AUTO4.cpp part4
|
||||
|
||||
Alternatively, you can combine all the individual device settings into a single config and load it, allowing the Auto-device plugin to parse it and apply the settings to the right devices. See the code example below:
|
||||
|
||||
@snippet snippets/AUTO5.cpp part5
|
||||
|
||||
### Using the Auto-Device with OpenVINO Samples and Benchmark App
|
||||
|
||||
Note that every OpenVINO sample or application that supports the "-d" (which stands for "device") command-line option transparently accepts the Auto-device. The Benchmark Application is the best example of the optimal usage of the Auto-device: you do not need to set the number of requests and CPU threads, as the application provides optimal out-of-the-box performance. Below is an example command line for evaluating AUTO performance with the Benchmark Application:
|
||||
|
||||
@sphinxdirective
|
||||
.. tab:: Package, Docker, open-source installation
|
||||
|
||||
.. code-block:: sh
|
||||
|
||||
./benchmark_app.py –d AUTO –m <model>
|
||||
|
||||
.. tab:: pip installation
|
||||
|
||||
.. code-block:: sh
|
||||
|
||||
benchmark_app –d AUTO –m <model>
|
||||
|
||||
@endsphinxdirective
|
||||
|
||||
|
||||
You can also use the Auto-device with a limited device choice:
|
||||
|
||||
@sphinxdirective
|
||||
.. tab:: Package, Docker, open-source installation
|
||||
|
||||
.. code-block:: sh
|
||||
|
||||
./benchmark_app.py –d AUTO:CPU,GPU –m <model>
|
||||
|
||||
.. tab:: pip installation
|
||||
|
||||
.. code-block:: sh
|
||||
|
||||
benchmark_app –d AUTO:CPU,GPU –m <model>
|
||||
|
||||
@endsphinxdirective
|
||||
|
||||
**NOTES:**
|
||||
* The default CPU stream is 1 if using `-d AUTO`.
|
||||
* You can use the FP16 IR to work with Auto-device.
|
||||
* No demos are fully optimized for the Auto-device yet, that is, to select the most suitable device, use GPU streams/throttling, and so on.
|
||||
|
||||
## Auto-Device Plugin Execution (Python)
|
||||
|
||||
@sphinxdirective
|
||||
.. raw:: html
|
||||
|
||||
<div id="switcher-python" class="switcher-anchor">Python</div>
|
||||
@endsphinxdirective
|
||||
|
||||
The AUTO device is a new, special "virtual" or "proxy" device in the OpenVINO™ toolkit.
|
||||
|
||||
Use "AUTO" as the device name to delegate selection of an actual accelerator to OpenVINO. The Auto-device plugin internally recognizes and selects devices from among CPU, integrated GPU and discrete Intel GPUs (when available) depending on the device capabilities and the characteristics of CNN models (for example, precision). Then the Auto-device assigns inference requests to the selected device.
|
||||
|
||||
From the application's point of view, this is just another device that handles all accelerators in the full system.
|
||||
|
||||
With the 2021.4 release, Auto-device setup is done in three major steps:
|
||||
|
||||
1. Configure each device as usual (for example, via the conventional [IECore.set_config](https://docs.openvinotoolkit.org/latest/ie_python_api/classie__api_1_1IECore.html#a2c738cee90fca27146e629825c039a05) method).
|
||||
2. Load a network to the Auto-device plugin. This is the only change needed in your application.
|
||||
3. As with any other executable network resulting from [IECore.load_network](https://docs.openvinotoolkit.org/latest/ie_python_api/classie__api_1_1IECore.html#ac9a2e043d14ccfa9c6bbf626cfd69fcc), create as many requests as needed to saturate the devices.
|
||||
|
||||
These steps are covered below in detail.
|
||||
|
||||
### Defining and Configuring the Auto-Device Plugin
|
||||
Following the OpenVINO convention for device names, the Auto-device uses the label "AUTO". The only configuration option for Auto-device is a limited device list:
|
||||
|
||||
| Parameter name | Parameter values | Default | Description |
|
||||
| -------------- | ---------------- | ------- | ----------- |
|
||||
| "AUTO_DEVICE_LIST" | comma-separated device names with no spaces | N/A | Device candidate list to be selected
|
||||
|
||||
There are two ways to use the Auto-device plugin:
|
||||
|
||||
1. Directly indicate device by "AUTO" or an empty string.
|
||||
2. Use the Auto-device configuration
|
||||
|
||||
Both methods allow limiting the list of device candidates for the AUTO plugin.
|
||||
|
||||
```python
|
||||
from openvino.inference_engine import IECore
|
||||
|
||||
ie = IECore()
|
||||
# Read a network in IR or ONNX format
|
||||
net = ie.read_network(model=path_to_model)
|
||||
|
||||
# Load a network on the "AUTO" device
|
||||
exec_net = ie.load_network(network=net, device_name="AUTO")
|
||||
|
||||
# Optionally specify the list of device candidates for the AUTO plugin
|
||||
# The following two lines are equivalent
|
||||
exec_net = ie.load_network(network=net, device_name="AUTO:CPU,GPU")
|
||||
exec_net = ie.load_network(network=net, device_name="AUTO",
|
||||
config={"AUTO_DEVICE_LIST": "CPU,GPU"})
|
||||
```
|
||||
|
||||
The Auto-device plugin supports querying device optimization capabilities as a metric.
|
||||
|
||||
| Parameter name | Parameter values |
|
||||
| --- | --- |
|
||||
| "OPTIMIZATION_CAPABILITIES" | Auto-Device capabilities |
|
||||
|
||||
### Enumerating Devices and Selection Logic
|
||||
|
||||
The Inference Engine now features a dedicated API to enumerate devices and their capabilities. See the [Hello Query Device Python Sample](../../../inference_engine/ie_bridges/python/sample_hello_query_device_README.html) for code.
|
||||
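A minimal sketch of enumerating the devices directly from Python via `IECore.available_devices`:

```python
from openvino.inference_engine import IECore

ie = IECore()
print("Available devices:")
for device in ie.available_devices:
    print(f"Device: {device}")
```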
|
||||
This is the example output from the sample (truncated to device names only):
|
||||
|
||||
```sh
|
||||
./hello_query_device
|
||||
|
||||
Available devices:
|
||||
Device: CPU
|
||||
...
|
||||
Device: GPU.0
|
||||
...
|
||||
Device: GPU.1
|
||||
```
|
||||
|
||||
### Default Auto-Device Selection Logic
|
||||
|
||||
With the 2021.4 release, the Auto-Device selects the most suitable device using the following default logic:
|
||||
|
||||
1. Check if dGPU (discrete), iGPU (integrated) and CPU devices are available
|
||||
2. Get the precision of the input model, such as FP32
|
||||
3. According to the priority of dGPU, iGPU, and CPU (in this order), if the device supports the precision of the input network, select it as the most suitable device
|
||||
|
||||
For example, CPU, dGPU and iGPU can support the following precision and optimization capabilities:
|
||||
|
||||
| Device | OPTIMIZATION_CAPABILITIES |
|
||||
| --- | --- |
|
||||
| CPU | WINOGRAD FP32 FP16 INT8 BIN |
|
||||
| dGPU | FP32 BIN BATCHED_BLOB FP16 INT8 |
|
||||
| iGPU | FP32 BIN BATCHED_BLOB FP16 INT8 |
|
||||
|
||||
* When the application uses the Auto-device to run FP16 IR on a system with CPU, dGPU and iGPU, Auto-device will offload this workload to dGPU.
|
||||
* When the application uses the Auto-device to run FP16 IR on a system with CPU and iGPU, Auto-device will offload this workload to iGPU.
|
||||
* When the application uses the Auto-device to run WINOGRAD-enabled IR on a system with CPU, dGPU and iGPU, Auto-device will offload this workload to CPU.
|
||||
|
||||
If loading the network to dGPU or iGPU fails, CPU is the fallback choice.
|
||||
|
||||
To show the capabilities for a specific device, query the OPTIMIZATION_CAPABILITIES metric:
|
||||
|
||||
|
||||
```python
|
||||
from openvino.inference_engine import IECore
|
||||
|
||||
ie = IECore()
|
||||
ie.get_metric(device_name=device,
|
||||
metric_name="OPTIMIZATION_CAPABILITIES")
|
||||
```
|
||||
|
||||
### Configuring the Individual Devices and Creating the Auto-Device on Top
|
||||
|
||||
It is possible to configure each individual device as usual and create the "AUTO" device on top:
|
||||
|
||||
```python
|
||||
from openvino.inference_engine import IECore
|
||||
|
||||
ie = IECore()
|
||||
net = ie.read_network(model=path_to_model)
|
||||
|
||||
cpu_config = {}
|
||||
gpu_config = {}
|
||||
|
||||
ie.set_config(config=cpu_config, device_name="CPU")
|
||||
ie.set_config(config=gpu_config, device_name="GPU")
|
||||
|
||||
# Load the network to the AUTO device
|
||||
exec_net = ie.load_network(network=net, device_name="AUTO")
|
||||
```
|
||||
|
||||
Alternatively, you can combine all the individual device settings into a single config and load it, allowing the Auto-device plugin to parse it and apply the settings to the right devices. See the code example below:
|
||||
|
||||
```python
|
||||
from openvino.inference_engine import IECore
|
||||
|
||||
# Init the Inference Engine Core
|
||||
ie = IECore()
|
||||
|
||||
# Read a network in IR or ONNX format
|
||||
net = ie.read_network(model=path_to_model)
|
||||
|
||||
full_config = {}
|
||||
|
||||
# Load the network to the AUTO device
|
||||
exec_net = ie.load_network(network=net, device_name="AUTO", config=full_config)
|
||||
```
|
||||
|
||||
### Using the Auto-Device with OpenVINO Samples and Benchmark App
|
||||
|
||||
Note that every OpenVINO sample or application that supports the "-d" (which stands for "device") command-line option transparently accepts the Auto-device. The Benchmark Application is the best example of the optimal usage of the Auto-device: you do not need to set the number of requests and CPU threads, as the application provides optimal out-of-the-box performance. Below is an example command line for evaluating AUTO performance with the Benchmark Application:
|
||||
|
||||
@sphinxdirective
|
||||
.. tab:: Package, Docker, open-source installation
|
||||
|
||||
.. code-block:: sh
|
||||
|
||||
./benchmark_app.py –d AUTO –m <model>
|
||||
|
||||
.. tab:: pip installation
|
||||
|
||||
.. code-block:: sh
|
||||
|
||||
benchmark_app –d AUTO –m <model>
|
||||
|
||||
@endsphinxdirective
|
||||
|
||||
You can also use the Auto-device with a limited device choice:
|
||||
|
||||
@sphinxdirective
|
||||
.. tab:: Package, Docker, open-source installation
|
||||
|
||||
.. code-block:: sh
|
||||
|
||||
./benchmark_app.py –d AUTO:CPU,GPU –m <model>
|
||||
|
||||
.. tab:: pip installation
|
||||
|
||||
.. code-block:: sh
|
||||
|
||||
benchmark_app –d AUTO:CPU,GPU –m <model>
|
||||
|
||||
@endsphinxdirective
|
||||
|
||||
> **NOTE**: If you installed OpenVINO with pip, use `benchmark_app -d AUTO:CPU,GPU -m <model>`
|
||||
@@ -1,8 +1,8 @@
|
||||
CPU Plugin {#openvino_docs_IE_DG_supported_plugins_CPU}
|
||||
=======
|
||||
|
||||
## Introducing CPU Plugin
|
||||
The CPU plugin was developed in order to provide opportunity for high performance scoring of neural networks on CPU, using the Intel® Math Kernel Library for Deep Neural Networks (Intel® MKL-DNN).
|
||||
## Introducing the CPU Plugin
|
||||
The CPU plugin was developed to achieve high performance of neural networks on CPU, using the Intel® Math Kernel Library for Deep Neural Networks (Intel® MKL-DNN).
|
||||
|
||||
Currently, the CPU plugin uses Intel® Threading Building Blocks (Intel® TBB) in order to parallelize calculations. Please refer to the [Optimization Guide](../../optimization_guide/dldt_optimization_guide.md) for associated performance considerations.
|
||||
|
||||
@@ -10,56 +10,59 @@ The set of supported layers can be expanded with [the Extensibility mechanism](.
|
||||
|
||||
## Supported Platforms
|
||||
|
||||
OpenVINO™ toolkit is officially supported and validated on the following platforms:
|
||||
OpenVINO™ toolkit, including the CPU plugin, is officially supported and validated on the following platforms:
|
||||
|
||||
| Host | OS (64-bit) |
|
||||
| :--- | :--- |
|
||||
| Development | Ubuntu* 18.04, CentOS* 7.5, MS Windows* 10 |
|
||||
| Target | Ubuntu* 18.04, CentOS* 7.5, MS Windows* 10 |
|
||||
| Development | Ubuntu* 18.04 or 20.04, CentOS* 7.6, MS Windows* 10, macOS* 10.15 |
|
||||
| Target | Ubuntu* 18.04 or 20.04, CentOS* 7.6, MS Windows* 10, macOS* 10.15 |
|
||||
|
||||
The CPU Plugin supports inference on Intel® Xeon® with Intel® Advanced Vector Extensions 2 (Intel® AVX2), Intel® Advanced Vector Extensions 512 (Intel® AVX-512), and AVX512_BF16, Intel® Core™
|
||||
The CPU plugin supports inference on Intel® Xeon® with Intel® Advanced Vector Extensions 2 (Intel® AVX2), Intel® Advanced Vector Extensions 512 (Intel® AVX-512), and AVX512_BF16, Intel® Core™
|
||||
Processors with Intel® AVX2, Intel Atom® Processors with Intel® Streaming SIMD Extensions (Intel® SSE).
|
||||
|
||||
You can use `-pc` the flag for samples to know which configuration is used by some layer.
|
||||
This flag shows execution statistics that you can use to get information about layer name,
|
||||
execution status, layer type, execution time, and the type of the execution primitive.
|
||||
You can use the `-pc` flag for samples to know which configuration is used by a layer.
|
||||
This flag shows execution statistics that you can use to get information about layer name, layer type,
|
||||
execution status, execution time, and the type of the execution primitive.
|
||||
|
||||
## Internal CPU Plugin Optimizations
|
||||
|
||||
CPU plugin supports several graph optimization algorithms, such as fusing or removing layers.
|
||||
The CPU plugin supports several graph optimization algorithms, such as fusing or removing layers.
|
||||
Refer to the sections below for details.
|
||||
|
||||
> **NOTE**: For layer descriptions, see the [IR Notation Reference](../../ops/opset.md).
|
||||
|
||||
### Lowering Inference Precision
|
||||
|
||||
CPU plugin follows default optimization approach. This approach means that inference is made with lower precision if it is possible on a given platform to reach better performance with acceptable range of accuracy.
|
||||
The CPU plugin follows a default optimization approach. This approach means that inference is made with lower precision if it is possible on a given platform to reach better performance with an acceptable range of accuracy.
|
||||
|
||||
> **NOTE**: For details, see the [Using Bfloat16 Inference](../Bfloat16Inference.md).
|
||||
|
||||
### Fusing Convolution and Simple Layers
|
||||
|
||||
Merge of a Convolution layer and any of the simple layers listed below:
|
||||
Merge of a convolution layer and any of the simple layers listed below:
|
||||
- Activation: ReLU, ELU, Sigmoid, Clamp
|
||||
- Depthwise: ScaleShift, PReLU
|
||||
- FakeQuantize
|
||||
|
||||
> **NOTE**: You can have any number and order of simple layers.
|
||||
|
||||
A combination of a Convolution layer and simple layers results in a single fused layer called
|
||||
*Convolution*:
|
||||
A combination of a convolution layer and simple layers results in a single fused layer called
|
||||
*Convolution*:
|
||||
|
||||
![conv_simple_01]
|
||||
|
||||
|
||||
### Fusing Pooling and FakeQuantize Layers
|
||||
|
||||
A combination of Pooling and FakeQuantize layers results in a single fused layer called *Pooling*:
|
||||
|
||||
![pooling_fakequant_01]
|
||||
|
||||
### Fusing FullyConnected and Activation Layers
|
||||
|
||||
A combination of FullyConnected and Activation layers results in a single fused layer called
|
||||
*FullyConnected*:
|
||||
*FullyConnected*:
|
||||
|
||||
![fullyconnected_activation_01]
|
||||
|
||||
|
||||
@@ -76,16 +79,19 @@ layer and simple layers results in a single layer called *Convolution* (or *Bina
|
||||
|
||||
### Fusing Convolution and Sum Layers
|
||||
|
||||
A combination of Convolution, Simple, and Eltwise layers with the sum operation results in a single layer called *Convolution*:
|
||||
A combination of convolution, simple, and Eltwise layers with the sum operation results in a single layer called *Convolution*:
|
||||
|
||||
![conv_sum_relu_01]
|
||||
|
||||
### Fusing a Group of Convolutions
|
||||
|
||||
If a topology contains the following pipeline, a CPU plugin merges Split, Convolution, and Concatenation layers into a single Convolution layer with the group parameter:
|
||||
> **NOTE**: Parameters of the Convolution layers must coincide.
|
||||
If a topology contains the following pipeline, a CPU plugin merges split, convolution, and concatenation layers into a single convolution layer with the group parameter:
|
||||
|
||||
![group_convolutions_01]
|
||||
|
||||
> **NOTE**: Parameters of the convolution layers must coincide.
|
||||
|
||||
|
||||
### Removing a Power Layer
|
||||
|
||||
CPU plugin removes a Power layer from a topology if it has the following parameters:
|
||||
@@ -97,7 +103,7 @@ CPU plugin removes a Power layer from a topology if it has the following paramet
|
||||
## Supported Configuration Parameters
|
||||
|
||||
The plugin supports the configuration parameters listed below.
|
||||
All parameters must be set with the <code>InferenceEngine::Core::LoadNetwork()</code> method.
|
||||
All parameters must be set with the `InferenceEngine::Core::LoadNetwork()` method.
|
||||
When specifying key values as raw strings (that is, when using Python API), omit the `KEY_` prefix.
|
||||
Refer to the OpenVINO samples for usage examples: [Benchmark App](../../../inference-engine/samples/benchmark_app/README.md).
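For example, a minimal C++ sketch of passing a configuration map together with `LoadNetwork()` might look as follows; the model path is a placeholder and enabling `KEY_PERF_COUNT` is only an illustration:

```cpp
#include <inference_engine.hpp>
#include <map>
#include <string>

int main() {
    InferenceEngine::Core ie;
    auto network = ie.ReadNetwork("model.xml");  // placeholder path
    // Enable per-layer performance counters; any other supported key is passed the same way.
    std::map<std::string, std::string> config = {
        {InferenceEngine::PluginConfigParams::KEY_PERF_COUNT,
         InferenceEngine::PluginConfigParams::YES}
    };
    auto executableNetwork = ie.LoadNetwork(network, "CPU", config);
    return 0;
}
```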
|
||||
|
||||
|
||||
35
docs/IE_DG/supported_plugins/Device_Plugins.md
Normal file
@@ -0,0 +1,35 @@
|
||||
# Device Plugin Support {#openvino_docs_IE_DG_Device_Plugins}
|
||||
|
||||
@sphinxdirective
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:hidden:
|
||||
|
||||
openvino_docs_IE_DG_InferenceEngine_QueryAPI
|
||||
openvino_docs_IE_DG_supported_plugins_CPU
|
||||
openvino_docs_IE_DG_supported_plugins_GPU
|
||||
openvino_docs_IE_DG_supported_plugins_VPU
|
||||
openvino_docs_IE_DG_supported_plugins_GNA
|
||||
openvino_docs_IE_DG_supported_plugins_AUTO
|
||||
openvino_docs_IE_DG_supported_plugins_HETERO
|
||||
openvino_docs_IE_DG_supported_plugins_MULTI
|
||||
|
||||
@endsphinxdirective
|
||||
|
||||
Inference Engine uses a plugin architecture. An Inference Engine plugin is a software component that contains a complete implementation for inference on a particular Intel® hardware device: CPU, GPU, VPU, GNA, and so on. Each plugin implements the unified API and provides additional hardware-specific APIs.
|
||||
|
||||
The Inference Engine provides capabilities to infer deep learning models on the following device types with corresponding plugins:
|
||||
|
||||
| Plugin | Device types |
|
||||
|------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
|[GPU plugin](GPU.md) |Intel® Processor Graphics, including Intel® HD Graphics and Intel® Iris® Graphics |
|
||||
|[CPU plugin](CPU.md) |Intel® Xeon® with Intel® Advanced Vector Extensions 2 (Intel® AVX2), Intel® Advanced Vector Extensions 512 (Intel® AVX-512), and AVX512_BF16, Intel® Core™ Processors with Intel® AVX2, Intel® Atom® Processors with Intel® Streaming SIMD Extensions (Intel® SSE) |
|
||||
|[VPU plugins](VPU.md) (available in the Intel® Distribution of OpenVINO™ toolkit) |Intel® Neural Compute Stick 2 powered by the Intel® Movidius™ Myriad™ X, Intel® Vision Accelerator Design with Intel® Movidius™ VPUs |
|
||||
|[GNA plugin](GNA.md) (available in the Intel® Distribution of OpenVINO™ toolkit) |Intel® Speech Enabling Developer Kit, Amazon Alexa* Premium Far-Field Developer Kit, Intel® Pentium® Silver J5005 Processor, Intel® Pentium® Silver N5000 Processor, Intel® Celeron® J4005 Processor, Intel® Celeron® J4105 Processor, Intel® Celeron® Processor N4100, Intel® Celeron® Processor N4000, Intel® Core™ i3-8121U Processor, Intel® Core™ i7-1065G7 Processor, Intel® Core™ i7-1060G7 Processor, Intel® Core™ i5-1035G4 Processor, Intel® Core™ i5-1035G7 Processor, Intel® Core™ i5-1035G1 Processor, Intel® Core™ i5-1030G7 Processor, Intel® Core™ i5-1030G4 Processor, Intel® Core™ i3-1005G1 Processor, Intel® Core™ i3-1000G1 Processor, Intel® Core™ i3-1000G4 Processor|
|
||||
|[Multi-Device plugin](MULTI.md) |Multi-Device plugin enables simultaneous inference of the same network on several Intel® devices in parallel |
|
||||
|[Auto-Device plugin](AUTO.md) |Auto-Device plugin enables selecting Intel® device for inference automatically |
|
||||
|[Heterogeneous plugin](HETERO.md) |Heterogeneous plugin enables automatic inference splitting between several Intel® devices (for example if a device doesn't [support certain layers](#supported-layers)). |
|
||||
|
||||
Devices similar to the ones we have used for benchmarking can be accessed using [Intel® DevCloud for the Edge](https://devcloud.intel.com/edge/), a remote development environment with access to Intel® hardware and the latest versions of the Intel® Distribution of the OpenVINO™ Toolkit. [Learn more](https://devcloud.intel.com/edge/get_started/devcloud/) or [Register here](https://inteliot.force.com/DevcloudForEdge/s/).
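As an illustrative sketch, the devices and plugins available on a particular machine can be listed at run time through the Query API:

```cpp
#include <inference_engine.hpp>
#include <iostream>
#include <string>

int main() {
    InferenceEngine::Core ie;
    // Prints device names such as "CPU", "GPU", or "GNA", depending on the system.
    for (const std::string& device : ie.GetAvailableDevices()) {
        std::cout << device << std::endl;
    }
    return 0;
}
```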
|
||||
|
||||
@@ -1,12 +1,11 @@
|
||||
# GNA Plugin {#openvino_docs_IE_DG_supported_plugins_GNA}
|
||||
|
||||
## Introducing the GNA Plugin
|
||||
|
||||
Intel® Gaussian & Neural Accelerator is a low-power neural coprocessor for continuous inference at the edge.
|
||||
The Intel® Gaussian & Neural Accelerator is a low-power neural coprocessor for continuous inference at the edge.
|
||||
|
||||
Intel® GNA is not intended to replace classic inference devices such as
|
||||
CPU, graphics processing unit (GPU), or vision processing unit (VPU). It is designed for offloading
|
||||
continuous inference workloads including but not limited to noise reduction or speech recognition
|
||||
Intel® GNA is not intended to replace typical inference devices such as the
|
||||
CPU, graphics processing unit (GPU), or vision processing unit (VPU). It is designed for offloading
|
||||
continuous inference workloads including but not limited to noise reduction or speech recognition
|
||||
to save power and free CPU resources.
|
||||
|
||||
The GNA plugin provides a way to run inference on Intel® GNA, as well as in the software execution mode on CPU.
|
||||
@@ -19,33 +18,185 @@ Devices with Intel® GNA support:
|
||||
|
||||
* [Amazon Alexa\* Premium Far-Field Developer Kit](https://developer.amazon.com/en-US/alexa/alexa-voice-service/dev-kits/amazon-premium-voice)
|
||||
|
||||
* [Intel® Pentium® Silver Processors N5xxx, J5xxx and Intel® Celeron® Processors N4xxx, J4xxx](https://ark.intel.com/content/www/us/en/ark/products/codename/83915/gemini-lake.html):
|
||||
- Intel® Pentium® Silver J5005 Processor
|
||||
- Intel® Pentium® Silver N5000 Processor
|
||||
- Intel® Celeron® J4005 Processor
|
||||
- Intel® Celeron® J4105 Processor
|
||||
- Intel® Celeron® Processor N4100
|
||||
- Intel® Celeron® Processor N4000
|
||||
* [Intel® Pentium® Processors N6xxx, J6xxx, Intel® Celeron® Processors N6xxx, J6xxx and Intel Atom® x6xxxxx (formerly codenamed Elkhart Lake)](https://ark.intel.com/content/www/us/en/ark/products/codename/128825/products-formerly-elkhart-lake.html)
|
||||
|
||||
* [Intel® Core™ Processors (formerly codenamed Cannon Lake)](https://ark.intel.com/content/www/us/en/ark/products/136863/intel-core-i3-8121u-processor-4m-cache-up-to-3-20-ghz.html):
|
||||
Intel® Core™ i3-8121U Processor
|
||||
* [Intel® Core™ Processors (formerly codenamed Cannon Lake)](https://ark.intel.com/content/www/us/en/ark/products/136863/intel-core-i3-8121u-processor-4m-cache-up-to-3-20-ghz.html)
|
||||
|
||||
* [10th Generation Intel® Core™ Processors (formerly codenamed Ice Lake)](https://ark.intel.com/content/www/us/en/ark/products/codename/74979/ice-lake.html):
|
||||
- Intel® Core™ i7-1065G7 Processor
|
||||
- Intel® Core™ i7-1060G7 Processor
|
||||
- Intel® Core™ i5-1035G4 Processor
|
||||
- Intel® Core™ i5-1035G7 Processor
|
||||
- Intel® Core™ i5-1035G1 Processor
|
||||
- Intel® Core™ i5-1030G7 Processor
|
||||
- Intel® Core™ i5-1030G4 Processor
|
||||
- Intel® Core™ i3-1005G1 Processor
|
||||
- Intel® Core™ i3-1000G1 Processor
|
||||
- Intel® Core™ i3-1000G4 Processor
|
||||
|
||||
* [11th Generation Intel® Core™ Processors (formerly codenamed Tiger Lake)](https://ark.intel.com/content/www/us/en/ark/products/codename/88759/tiger-lake.html).
|
||||
|
||||
* All [11th Generation Intel® Core™ Processors (formerly codenamed Tiger Lake)](https://ark.intel.com/content/www/us/en/ark/products/codename/88759/tiger-lake.html).
|
||||
* [12th Generation Intel® Core™ Processors (formerly codenamed Alder Lake)](https://ark.intel.com/content/www/us/en/ark/products/codename/147470/products-formerly-alder-lake.html).
|
||||
|
||||
> **NOTE**: On platforms where Intel® GNA is not enabled in the BIOS, the driver cannot be installed, so the GNA plugin uses the software emulation mode only.
|
||||
|
||||
## Intel® GNA Generational Differences
|
||||
|
||||
The first and second versions of Intel® GNA found in 10th and 11th generation Intel® Core™ Processors may be considered functionally equivalent. Intel® GNA 2.0 provided a performance improvement over Intel® GNA 1.0. Starting with 12th Generation Intel® Core™ Processors (formerly codenamed Alder Lake), support for Intel® GNA 3.0 features is being added.
|
||||
|
||||
In the rest of this documentation, "GNA 2.0" refers to Intel® GNA hardware delivered on 10th and 11th generation Intel® Core™ processors, and "GNA 3.0" refers to GNA hardware delivered on 12th generation Intel® Core™ processors.
|
||||
|
||||
Initially, a limited subset of Intel® GNA 3.0 features is added to the previous feature set, including the following:
|
||||
|
||||
* **2D VALID Convolution With Small 2D Kernels:** Two-dimensional convolutions with the following kernel dimensions [H,W] are supported: [1,1], [2,2], [3,3], [2,1], [3,1], [4,1], [5,1], [6,1], [7,1], [1,2], or [1,3]. Input tensor dimensions are limited to [1,8,16,16] <= [N,C,H,W] <= [1,120,384,240]. Up to 384 channels C may be used with a subset of kernel sizes (see table below). Up to 256 kernels (output channels) are supported. Pooling is limited to pool shapes of [1,1], [2,2], or [3,3]. Not all combinations of kernel shape and input tensor shape are supported (see the tables below for exact limitations).
|
||||
|
||||
The tables below show that the exact limitation on the input tensor width W depends on the number of input channels C (indicated as Ci below) and the kernel shape. There is much more freedom to choose the input tensor height and number of output channels.
|
||||
|
||||
## Initially Supported Subset of Intel® GNA 2D Convolutions
|
||||
|
||||
The following tables provide a more explicit representation of the Intel® GNA 3.0 2D convolution operations initially supported. The limits depend strongly on the number of input tensor channels (Ci) and the input tensor width (W). Other factors are kernel height (KH), kernel width (KW), pool height (PH), pool width (PW), horizontal pool step (SH), and vertical pool step (SW). For example, the first table shows that for a 3x3 kernel with max pooling, only square pools are supported, and W is limited to 87 when there are 64 input channels.
|
||||
|
||||
**Table of Maximum Input Tensor Widths (W) vs. Rest of Parameters** (Input and Kernel Precision: 2 bytes)
|
||||
|
||||
|KH|KW|PH|PW|SH|SW|H|W<br>Ci=8<br>Co=256|W<br>Ci=16<br>Co=256|W<br>Ci=32<br>Co=256|W<br>Ci=64<br>Co=256|W<br>Ci=128<br>Co=256|W<br>Ci=256<br>Co=256|W<br>Ci=384<br>Co=256|
|
||||
|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|
|
||||
|1|1|1|1|1|1|128|240|240|240|240|240|240|170|
|
||||
|1|1|1|1|1|1|256|240|240|240|240|240|128|85|
|
||||
|1|1|1|1|1|1|384|240|240|240|240|170|85|56|
|
||||
|1|2|1|1|1|1|128|240|240|240|240| | | |
|
||||
|1|2|1|1|1|1|256|240|240|240|240| | | |
|
||||
|1|2|1|1|1|1|384|240|240|240|240| | | |
|
||||
|1|3|1|1|1|1|128|240|240|240|240| | | |
|
||||
|1|3|1|1|1|1|256|240|240|240|240| | | |
|
||||
|1|3|1|1|1|1|384|240|240|240|240| | | |
|
||||
|2|1|1|1|1|1|128|192|192|192|192|192|192|128|
|
||||
|2|1|1|1|1|1|256|192|192|192|192|192|128|85|
|
||||
|2|1|1|1|1|1|384|192|192|192|192|170|85|56|
|
||||
|2|2|1|1|1|1|128|193|193|193|193| | | |
|
||||
|2|2|1|1|1|1|256|193|193|193|193| | | |
|
||||
|2|2|1|1|1|1|384|193|193|193|193| | | |
|
||||
|2|2|2|2|1|1|128|193|193|192|179| | | |
|
||||
|2|2|2|2|1|1|256|193|193|192|179| | | |
|
||||
|2|2|2|2|1|1|384|193|193|192|179| | | |
|
||||
|2|2|2|2|1|2|128|193|193|192|179| | | |
|
||||
|2|2|2|2|1|2|256|193|193|192|179| | | |
|
||||
|2|2|2|2|1|2|384|193|193|192|179| | | |
|
||||
|2|2|2|2|2|1|128|193|193|192|179| | | |
|
||||
|2|2|2|2|2|1|256|193|193|192|179| | | |
|
||||
|2|2|2|2|2|1|384|193|193|192|179| | | |
|
||||
|2|2|2|2|2|2|128|193|193|192|179| | | |
|
||||
|2|2|2|2|2|2|256|193|193|192|179| | | |
|
||||
|2|2|2|2|2|2|384|193|193|192|179| | | |
|
||||
|3|1|1|1|1|1|128|128|128|128|128|128|85|42|
|
||||
|3|1|1|1|1|1|256|128|128|128|128|128|85|42|
|
||||
|3|1|1|1|1|1|384|128|128|128|128|128|85|42|
|
||||
|3|3|1|1|1|1|128|130|130|130|87| | | |
|
||||
|3|3|1|1|1|1|256|130|130|130|87| | | |
|
||||
|3|3|1|1|1|1|384|130|130|130|87| | | |
|
||||
|3|3|2|2|1|1|128|130|130|126|87| | | |
|
||||
|3|3|2|2|1|1|256|130|130|126|87| | | |
|
||||
|3|3|2|2|1|1|384|130|130|126|87| | | |
|
||||
|3|3|2|2|1|2|128|130|130|126|87| | | |
|
||||
|3|3|2|2|1|2|256|130|130|126|87| | | |
|
||||
|3|3|2|2|1|2|384|130|130|126|87| | | |
|
||||
|3|3|2|2|2|1|128|130|130|126|87| | | |
|
||||
|3|3|2|2|2|1|256|130|130|126|87| | | |
|
||||
|3|3|2|2|2|1|384|130|130|126|87| | | |
|
||||
|3|3|2|2|2|2|128|130|130|126|87| | | |
|
||||
|3|3|2|2|2|2|256|130|130|126|87| | | |
|
||||
|3|3|2|2|2|2|384|130|130|126|87| | | |
|
||||
|3|3|3|3|1|1|128|130|128|118|87| | | |
|
||||
|3|3|3|3|1|1|256|130|128|118|87| | | |
|
||||
|3|3|3|3|1|1|384|130|128|118|87| | | |
|
||||
|3|3|3|3|1|2|128|130|128|118|87| | | |
|
||||
|3|3|3|3|1|2|256|130|128|118|87| | | |
|
||||
|3|3|3|3|1|2|384|130|128|118|87| | | |
|
||||
|3|3|3|3|1|3|128|130|128|118|87| | | |
|
||||
|3|3|3|3|1|3|256|130|128|118|87| | | |
|
||||
|3|3|3|3|1|3|384|130|128|118|87| | | |
|
||||
|3|3|3|3|2|1|128|130|128|118|87| | | |
|
||||
|3|3|3|3|2|1|256|130|128|118|87| | | |
|
||||
|3|3|3|3|2|1|384|130|128|118|87| | | |
|
||||
|3|3|3|3|2|2|128|130|128|118|87| | | |
|
||||
|3|3|3|3|2|2|256|130|128|118|87| | | |
|
||||
|3|3|3|3|2|2|384|130|128|118|87| | | |
|
||||
|3|3|3|3|2|3|128|130|128|118|87| | | |
|
||||
|3|3|3|3|2|3|256|130|128|118|87| | | |
|
||||
|3|3|3|3|2|3|384|130|128|118|87| | | |
|
||||
|3|3|3|3|3|1|128|130|128|118|87| | | |
|
||||
|3|3|3|3|3|1|256|130|128|118|87| | | |
|
||||
|3|3|3|3|3|1|384|130|128|118|87| | | |
|
||||
|3|3|3|3|3|2|128|130|128|118|87| | | |
|
||||
|3|3|3|3|3|2|256|130|128|118|87| | | |
|
||||
|3|3|3|3|3|2|384|130|128|118|87| | | |
|
||||
|3|3|3|3|3|3|128|130|128|118|87| | | |
|
||||
|3|3|3|3|3|3|256|130|128|118|87| | | |
|
||||
|3|3|3|3|3|3|384|130|128|118|87| | | |
|
||||
|4|1|1|1|1|1|128|96|96|96|96|96|64|32|
|
||||
|4|1|1|1|1|1|256|96|96|96|96|96|64|32|
|
||||
|4|1|1|1|1|1|384|96|96|96|96|96|64|32|
|
||||
|5|1|1|1|1|1|128|76|76|76|76|51|25| |
|
||||
|5|1|1|1|1|1|256|76|76|76|76|51|25| |
|
||||
|5|1|1|1|1|1|384|76|76|76|76|51|25| |
|
||||
|6|1|1|1|1|1|128|64|64|64|64|42|21| |
|
||||
|6|1|1|1|1|1|256|64|64|64|64|42|21| |
|
||||
|6|1|1|1|1|1|384|64|64|64|64|42|21| |
|
||||
|7|1|1|1|1|1|128|54|54|54|54|36| | |
|
||||
|7|1|1|1|1|1|256|54|54|54|54|36| | |
|
||||
|7|1|1|1|1|1|384|54|54|54|54|36| | |
|
||||
|
||||
**Table of Maximum Input Tensor Widths (W) vs. Rest of Parameters** (Input and Kernel Precision: 1 byte)
|
||||
|
||||
|KH|KW|PH|PW|SH|SW|H|W<br>Ci=8<br>Co=256|W<br>Ci=16<br>Co=256|W<br>Ci=32<br>Co=256|W<br>Ci=64<br>Co=256|W<br>Ci=128<br>Co=256|W<br>Ci=256<br>Co=256|W<br>Ci=384<br>Co=256|
|
||||
|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|
|
||||
|1|1|1|1|1|1|128|240|240|240|240|240|240|240|
|
||||
|1|1|1|1|1|1|256|240|240|240|240|240|240|170|
|
||||
|1|1|1|1|1|1|384|240|240|240|240|240|170|113|
|
||||
|1|2|1|1|1|1|128|240|240|240|240|240|240|240|
|
||||
|1|2|1|1|1|1|256|240|240|240|240|240|240|170|
|
||||
|1|2|1|1|1|1|384|240|240|240|240|240|170|113|
|
||||
|1|3|1|1|1|1|128|240|240|240|240|240| | |
|
||||
|1|3|1|1|1|1|256|240|240|240|240|240| | |
|
||||
|1|3|1|1|1|1|384|240|240|240|240|240| | |
|
||||
|2|1|1|1|1|1|128|192|192|192|192|192|192|192|
|
||||
|2|1|1|1|1|1|256|192|192|192|192|192|192|170|
|
||||
|2|1|1|1|1|1|384|192|192|192|192|192|170|113|
|
||||
|2|2|1|1|1|1|128|193|193|193|193|193|193|129|
|
||||
|2|2|1|1|1|1|256|193|193|193|193|193|193|129|
|
||||
|2|2|1|1|1|1|384|193|193|193|193|193|170|113|
|
||||
|3|1|1|1|1|1|128|128|128|128|128|128|128|85|
|
||||
|3|1|1|1|1|1|256|128|128|128|128|128|128|85|
|
||||
|3|1|1|1|1|1|384|128|128|128|128|128|128|85|
|
||||
|3|3|1|1|1|1|128|130|130|130|130|87 | | |
|
||||
|3|3|1|1|1|1|256|130|130|130|130|87 | | |
|
||||
|3|3|1|1|1|1|384|130|130|130|130|87 | | |
|
||||
|4|1|1|1|1|1|128|96|96|96|96|96|96|64|
|
||||
|4|1|1|1|1|1|256|96|96|96|96|96|96|64|
|
||||
|4|1|1|1|1|1|384|96|96|96|96|96|96|64|
|
||||
|5|1|1|1|1|1|128|76|76|76|76|76|51|51|
|
||||
|5|1|1|1|1|1|256|76|76|76|76|76|51|51|
|
||||
|5|1|1|1|1|1|384|76|76|76|76|76|51|51|
|
||||
|6|1|1|1|1|1|128|64|64|64|64|64|42|21|
|
||||
|6|1|1|1|1|1|256|64|64|64|64|64|42|21|
|
||||
|6|1|1|1|1|1|384|64|64|64|64|64|42|21|
|
||||
|7|1|1|1|1|1|128|54|54|54|54|54|36|18|
|
||||
|7|1|1|1|1|1|256|54|54|54|54|54|36|18|
|
||||
|7|1|1|1|1|1|384|54|54|54|54|54|36|18|
|
||||
|
||||
|
||||
> **NOTE**: The above limitations only apply to the new hardware 2D convolution operation. When possible, the Intel® GNA plugin graph compiler flattens 2D convolutions so that the second generation Intel® GNA 1D convolution operations (without these limitations) may be used. The plugin also flattens 2D convolutions regardless of their sizes if the GNA 2.0 compilation target is selected (see below).
|
||||
|
||||
## Intel® GNA Forward and Backward Compatibility
|
||||
|
||||
In the general case, there is no guarantee that a model compiled for GNA 2.0 will run on GNA 3.0, or vice versa.
|
||||
|
||||
However, in most cases, networks compiled for GNA 2.0 will run as expected on GNA 3.0, although the performance may be worse compared to the case when a network is compiled specifically for the latter. The exception is networks with convolutions with the number of filters greater than 8192 (see the <a href="#models-and-layers-limitations">Models and Layers Limitations</a> section).
|
||||
|
||||
Networks compiled for GNA 3.0 should run on GNA 2.0 with incompatible layers emulated on CPU.
|
||||
|
||||
You can use the `KEY_GNA_EXEC_TARGET` and `KEY_GNA_COMPILE_TARGET` options to check interoperability (see the <a href="#supported-configuration-parameters">Supported Configuration Parameters</a> section below):
|
||||
|
||||
@sphinxdirective
|
||||
.. tab:: C++
|
||||
|
||||
``KEY_GNA_EXEC_TARGET``, ``KEY_GNA_COMPILE_TARGET``
|
||||
|
||||
.. tab:: Python
|
||||
|
||||
``GNA_EXEC_TARGET``, ``GNA_COMPILE_TARGET``
|
||||
|
||||
@endsphinxdirective
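A possible C++ sketch of such a check, assuming the key and value strings listed in the configuration tables below and a placeholder model path:

```cpp
#include <inference_engine.hpp>
#include <map>
#include <string>

int main() {
    InferenceEngine::Core ie;
    auto network = ie.ReadNetwork("model.xml");  // placeholder path
    // Check how a model compiled for GNA 2.0 behaves on a GNA 3.0 execution target;
    // the key and value strings follow the configuration tables in this document.
    std::map<std::string, std::string> config = {
        {"GNA_EXEC_TARGET", "TARGET_3_0"},
        {"GNA_COMPILE_TARGET", "TARGET_2_0"}
    };
    auto executableNetwork = ie.LoadNetwork(network, "GNA", config);
    return 0;
}
```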
|
||||
|
||||
## Drivers and Dependencies
|
||||
|
||||
Intel® GNA hardware requires a driver to be installed on the system.
|
||||
@@ -56,11 +207,10 @@ Intel® GNA hardware requires a driver to be installed on the system.
|
||||
* Windows\* OS:
|
||||
Intel® GNA driver for Windows is available through Windows Update\*
|
||||
|
||||
## Models and Layers Limitations
|
||||
## <a name="models-and-layers-limitations">Models and Layers Limitations</a>
|
||||
|
||||
Because of the specifics of its hardware architecture, Intel® GNA supports a limited set of layers, layer kinds, and their combinations.
|
||||
For example, you should not expect the GNA Plugin to be able to run computer vision models, except those specifically adapted
|
||||
for the GNA Plugin, because the plugin does not fully support 2D convolutions.
|
||||
For example, you should not expect the GNA Plugin to be able to run computer vision models, except those specifically adapted for the GNA Plugin, because the plugin does not fully support 2D convolutions.
|
||||
|
||||
For the list of supported layers, see the **GNA** column of the **Supported Layers** section in [Supported Devices](Supported_Devices.md).
|
||||
|
||||
@@ -68,12 +218,13 @@ Limitations include:
|
||||
|
||||
- Only 1D convolutions are natively supported.
|
||||
- The number of output channels for convolutions must be a multiple of 4.
|
||||
- The maximum number of filters is 65532 for GNA 2.0 and 8192 for GNA 3.0.
|
||||
- Permute layer support is limited to the cases where no data reordering is needed or when reordering is happening for two dimensions, at least one of which is not greater than 8.
|
||||
- Splits and concatenations are supported for continuous portions of memory (e.g., split of 1,2,3,4 to 1,1,3,4 and 1,1,3,4 or concats of 1,2,3,4 and 1,2,3,5 to 2,2,3,4).
|
||||
|
||||
#### Experimental Support for 2D Convolutions
|
||||
### Support for 2D Convolutions in Previous Generations of GNA Hardware
|
||||
|
||||
The Intel® GNA hardware natively supports only 1D convolutions.
|
||||
The Intel® GNA 1.0 and 2.0 hardware natively supports only 1D convolutions.
|
||||
|
||||
However, 2D convolutions can be mapped to 1D when a convolution kernel moves in a single direction. GNA Plugin performs such a transformation for Kaldi `nnet1` convolution. From this perspective, the Intel® GNA hardware convolution operation accepts an `NHWC` input and produces an `NHWC` output. Because OpenVINO™ only supports the `NCHW` layout, you may need to insert `Permute` layers before or after convolutions.
|
||||
|
||||
@@ -83,54 +234,157 @@ For example, the Kaldi model optimizer inserts such a permute after convolution
|
||||
|
||||
Intel® GNA essentially operates in the low-precision mode, which represents a mix of 8-bit (`I8`), 16-bit (`I16`), and 32-bit (`I32`) integer computations. Outputs calculated using a reduced integer precision are different from the scores calculated using the floating point format, for example, `FP32` outputs calculated on CPU using the Inference Engine [CPU Plugin](CPU.md).
|
||||
|
||||
Unlike other plugins supporting low-precision execution, the GNA plugin calculates quantization factors at the model loading time, so you can run a model without calibration.
|
||||
Unlike other plugins supporting low-precision execution, the GNA plugin can calculate quantization factors at the model loading time, so you can run a model without calibration using the [Post-Training Optimization Tool](@ref pot_README).
|
||||
However, this mode may not provide satisfactory accuracy because the internal quantization algorithm is based on heuristics which may or may not be efficient, depending on the model and dynamic range of input data.
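For illustration, the load-time quantization can be steered with the scale factor and precision keys listed in the configuration tables below; the model path and values in this sketch are placeholders only:

```cpp
#include <inference_engine.hpp>
#include <map>
#include <string>

int main() {
    InferenceEngine::Core ie;
    auto network = ie.ReadNetwork("model.xml");  // placeholder path
    // Illustrative values: an explicit input scale factor and 16-bit integer weights.
    std::map<std::string, std::string> config = {
        {"GNA_SCALE_FACTOR", "2048"},
        {"GNA_PRECISION", "I16"}
    };
    auto executableNetwork = ie.LoadNetwork(network, "GNA", config);
    return 0;
}
```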
|
||||
|
||||
Starting with the 2021.4 release of OpenVINO, GNA plugin users are encouraged to use the [POT API Usage sample for GNA](@ref pot_sample_speech_README) to get a model with quantization hints based on statistics for the provided dataset.
|
||||
|
||||
## <a name="execution-modes">Execution Modes</a>
|
||||
|
||||
| Mode | Description |
|
||||
| :---------------------------------| :---------------------------------------------------------|
|
||||
| `GNA_AUTO` | Uses Intel® GNA if available, otherwise uses software execution mode on CPU. |
|
||||
| `GNA_HW` | Uses Intel® GNA if available, otherwise raises an error. |
|
||||
| `GNA_SW` | *Deprecated*. Executes the GNA-compiled graph on CPU performing calculations in the same precision as the Intel® GNA, but not in the bit-exact mode. |
|
||||
| `GNA_SW_EXACT` | Executes the GNA-compiled graph on CPU performing calculations in the same precision as the Intel® GNA in the bit-exact mode. |
|
||||
| `GNA_SW_FP32` | Executes the GNA-compiled graph on CPU but substitutes parameters and calculations from low precision to floating point (`FP32`). |
|
||||
@sphinxdirective
|
||||
.. tab:: C++
|
||||
|
||||
## Supported Configuration Parameters
|
||||
============================ ==============================================================================================================================================
|
||||
Mode Description
|
||||
============================ ==============================================================================================================================================
|
||||
``KEY_GNA_AUTO`` Uses Intel® GNA if available, otherwise uses software execution mode on CPU.
|
||||
``KEY_GNA_HW`` Uses Intel® GNA if available, otherwise raises an error.
|
||||
``KEY_GNA_SW`` *Deprecated*. Executes the GNA-compiled graph on CPU performing calculations in the same precision as the Intel® GNA, but not in the bit-exact mode.
|
||||
``KEY_GNA_SW_EXACT`` Executes the GNA-compiled graph on CPU performing calculations in the same precision as the Intel® GNA in the bit-exact mode.
|
||||
``KEY_GNA_HW_WITH_SW_FBACK`` Uses Intel® GNA if available, otherwise raises an error. If the hardware queue is not empty, automatically falls back to CPU in the bit-exact mode.
|
||||
``KEY_GNA_SW_FP32`` Executes the GNA-compiled graph on CPU but substitutes parameters and calculations from low precision to floating point (``FP32``).
|
||||
============================ ==============================================================================================================================================
|
||||
|
||||
The plugin supports the configuration parameters listed below.
|
||||
The parameters are passed as `std::map<std::string, std::string>` on `InferenceEngine::Core::LoadNetwork` or `InferenceEngine::SetConfig`.
|
||||
.. tab:: Python
|
||||
|
||||
You can change the `KEY_GNA_DEVICE_MODE` parameter at run time using `InferenceEngine::ExecutableNetwork::SetConfig`, which works for any value excluding `GNA_SW_FP32`. This enables you to switch the
|
||||
execution between software emulation mode and hardware emulation mode after the model is loaded.
|
||||
======================== ==============================================================================================================================================
|
||||
Mode Description
|
||||
======================== ==============================================================================================================================================
|
||||
``GNA_AUTO`` Uses Intel® GNA if available, otherwise uses software execution mode on CPU.
|
||||
``GNA_HW`` Uses Intel® GNA if available, otherwise raises an error.
|
||||
``GNA_SW`` *Deprecated*. Executes the GNA-compiled graph on CPU performing calculations in the same precision as the Intel® GNA, but not in the bit-exact mode.
|
||||
``GNA_SW_EXACT`` Executes the GNA-compiled graph on CPU performing calculations in the same precision as the Intel® GNA in the bit-exact mode.
|
||||
``GNA_HW_WITH_SW_FBACK`` Uses Intel® GNA if available, otherwise raises an error. If the hardware queue is not empty, automatically falls back to CPU in the bit-exact mode.
|
||||
``GNA_SW_FP32`` Executes the GNA-compiled graph on CPU but substitutes parameters and calculations from low precision to floating point (``FP32``).
|
||||
======================== ==============================================================================================================================================
|
||||
|
||||
The parameter names below correspond to their usage through API keys, such as `GNAConfigParams::KEY_GNA_DEVICE_MODE` or `PluginConfigParams::KEY_PERF_COUNT`.
|
||||
When specifying key values as raw strings, that is, when using Python API, omit the `KEY_` prefix.
|
||||
@endsphinxdirective
|
||||
|
||||
| Parameter Name | Parameter Values | Default Value | Description |
|
||||
| :---------------------------------| :---------------------------------------------------------| :-----------| :------------------------------------------------------------------------|
|
||||
| `KEY_GNA_COMPACT_MODE` | `YES`/`NO` | `NO` | Enables I/O buffers reuse to save space. Makes debugging harder. |
|
||||
| `KEY_GNA_SCALE_FACTOR` | `FP32` number | 1.0 | Sets the scale factor to use for input quantization. |
|
||||
| `KEY_GNA_DEVICE_MODE` | `GNA_AUTO`/`GNA_HW`/`GNA_SW_EXACT`/`GNA_SW_FP32` | `GNA_AUTO` | One of the modes described in <a href="#execution-modes">Execution Modes</a> |
|
||||
| `KEY_GNA_FIRMWARE_MODEL_IMAGE` | `std::string` | `""` | Sets the name for the embedded model binary dump file. |
|
||||
| `KEY_GNA_PRECISION` | `I16`/`I8` | `I16` | Sets the preferred integer weight resolution for quantization. |
|
||||
| `KEY_PERF_COUNT` | `YES`/`NO` | `NO` | Turns on performance counters reporting. |
|
||||
| `KEY_GNA_LIB_N_THREADS` | 1-127 integer number | 1 | Sets the number of GNA accelerator library worker threads used for inference computation in software modes.
|
||||
## <a name="supported-configuration-parameters">Supported Configuration Parameters</a>
|
||||
|
||||
The plugin supports the configuration parameters listed below. The parameter names correspond to their usage through API keys, such as ``GNAConfigParams::KEY_GNA_DEVICE_MODE`` or ``PluginConfigParams::KEY_PERF_COUNT`` in C++ and ``GNA_DEVICE_MODE`` or ``PERF_COUNT`` in Python.
|
||||
|
||||
@sphinxdirective
|
||||
.. tab:: C++
|
||||
|
||||
+----------------------------------+-------------------------+---------------+-----------------------------------------------------------------+
|
||||
| Parameter Name | Values | Default Value | Description |
|
||||
+==================================+=========================+===============+=================================================================+
|
||||
| ``KEY_GNA_EXEC_TARGET`` | ``TARGET_2_0``, | *see below* | Defines the execution target. |
|
||||
| | ``TARGET_3_0`` | | |
|
||||
+----------------------------------+-------------------------+---------------+-----------------------------------------------------------------+
|
||||
| ``KEY_GNA_COMPILE_TARGET`` | ``TARGET_2_0``, | *see below* | Defines the compilation target. |
|
||||
| | ``TARGET_3_0`` | | |
|
||||
+----------------------------------+-------------------------+---------------+-----------------------------------------------------------------+
|
||||
| ``KEY_GNA_COMPACT_MODE`` | ``YES``, ``NO`` | ``NO`` | Enables I/O buffers reuse to save space. |
|
||||
| | | | Makes debugging harder. |
|
||||
+----------------------------------+-------------------------+---------------+-----------------------------------------------------------------+
|
||||
| ``KEY_GNA_SCALE_FACTOR`` | FP32 number | 1.0 | Sets the scale factor to use for input quantization. |
|
||||
+----------------------------------+-------------------------+---------------+-----------------------------------------------------------------+
|
||||
| ``KEY_GNA_DEVICE_MODE`` | ``GNA_AUTO``, | ``GNA_AUTO`` | One of the modes described |
|
||||
| | ``GNA_HW``, | | in `Execution Modes <#execution-modes>`_. |
|
||||
| | ``GNA_HW_WITH_SW_FBACK``| | |
|
||||
| | ``GNA_SW_EXACT``, | | |
|
||||
| | ``GNA_SW_FP32`` | | |
|
||||
+----------------------------------+-------------------------+---------------+-----------------------------------------------------------------+
|
||||
| ``KEY_GNA_FIRMWARE_MODEL_IMAGE`` | ``std::string`` | ``""`` | Sets the name for the embedded model binary dump file. |
|
||||
+----------------------------------+-------------------------+---------------+-----------------------------------------------------------------+
|
||||
| ``KEY_GNA_PRECISION`` | ``I16``, ``I8`` | ``I16`` | Sets the preferred integer weight resolution for quantization |
|
||||
| | | | (ignored for models produced using POT). |
|
||||
+----------------------------------+-------------------------+---------------+-----------------------------------------------------------------+
|
||||
| ``KEY_PERF_COUNT`` | ``YES``, ``NO`` | ``NO`` | Turns on performance counters reporting. |
|
||||
+----------------------------------+-------------------------+---------------+-----------------------------------------------------------------+
|
||||
| ``KEY_GNA_LIB_N_THREADS`` | 1-127 integer number | 1 | Sets the number of GNA accelerator library worker threads used |
|
||||
| | | | for inference computation in software modes. |
|
||||
+----------------------------------+-------------------------+---------------+-----------------------------------------------------------------+
|
||||
|
||||
The parameters are passed as ``std::map<std::string, std::string>`` on ``InferenceEngine::Core::LoadNetwork`` or ``InferenceEngine::SetConfig``.
|
||||
|
||||
Normally, you do not need to select the execution target (``KEY_GNA_EXEC_TARGET``) and compilation target (``KEY_GNA_COMPILE_TARGET``). The default value for the execution target corresponds to available hardware, or latest hardware version supported by the plugin (i.e., GNA 3.0) if there is no GNA HW in the system. The compilation target is the same as the execution target by default. However, you may want to change the targets, for example, if you want to check how a model compiled for one generation would behave on the other generation (using the software emulation mode), or if you are willing to export a model for a specific version of GNA HW.
|
||||
|
||||
You can change the ``KEY_GNA_DEVICE_MODE`` parameter at run time using ``InferenceEngine::ExecutableNetwork::SetConfig``, which works for any value excluding ``GNA_SW_FP32``. This enables you to switch the execution between software emulation mode and hardware execution mode after the model is loaded.
|
||||
|
||||
.. tab:: Python
|
||||
|
||||
+----------------------------------+-------------------------+---------------+-----------------------------------------------------------------+
|
||||
| Parameter Name | Values | Default Value | Description |
|
||||
+==================================+=========================+===============+=================================================================+
|
||||
| ``GNA_EXEC_TARGET`` | ``TARGET_2_0``, | _see below_ | Defines the execution target. |
|
||||
| | ``TARGET_3_0`` | | |
|
||||
+----------------------------------+-------------------------+---------------+-----------------------------------------------------------------+
|
||||
| ``GNA_COMPILE_TARGET`` | ``TARGET_2_0``, | _see below_ | Defines the compilation target. |
|
||||
| | ``TARGET_3_0`` | | |
|
||||
+----------------------------------+-------------------------+---------------+-----------------------------------------------------------------+
|
||||
| ``GNA_COMPACT_MODE`` | ``YES``, ``NO`` | ``NO`` | Enables I/O buffers reuse to save space. |
|
||||
| | | | Makes debugging harder. |
|
||||
+----------------------------------+-------------------------+---------------+-----------------------------------------------------------------+
|
||||
| ``GNA_SCALE_FACTOR`` | FP32 number | 1.0 | Sets the scale factor to use for input quantization. |
|
||||
+----------------------------------+-------------------------+---------------+-----------------------------------------------------------------+
|
||||
| ``KEY_GNA_DEVICE_MODE`` | ``GNA_AUTO``, | ``GNA_AUTO`` | One of the modes described |
|
||||
| | ``GNA_HW``, | | in `Execution Modes <#execution-modes>`_. |
|
||||
| | ``GNA_HW_WITH_SW_FBACK``| | |
|
||||
| | ``GNA_SW_EXACT``, | | |
|
||||
| | ``GNA_SW_FP32`` | | |
|
||||
+----------------------------------+-------------------------+---------------+-----------------------------------------------------------------+
|
||||
| ``GNA_FIRMWARE_MODEL_IMAGE`` | ``string`` | ``""`` | Sets the name for the embedded model binary dump file. |
|
||||
+----------------------------------+-------------------------+---------------+-----------------------------------------------------------------+
|
||||
| ``GNA_PRECISION`` | ``I16``, ``I8`` | ``I16`` | Sets the preferred integer weight resolution for quantization |
|
||||
| | | | (ignored for models produced using POT). |
|
||||
+----------------------------------+-------------------------+---------------+-----------------------------------------------------------------+
|
||||
| ``PERF_COUNT`` | ``YES``, ``NO`` | ``NO`` | Turns on performance counters reporting. |
|
||||
+----------------------------------+-------------------------+---------------+-----------------------------------------------------------------+
|
||||
| ``GNA_LIB_N_THREADS`` | 1-127 integer number | 1 | Sets the number of GNA accelerator library worker threads used |
|
||||
| | | | for inference computation in software modes. |
|
||||
+----------------------------------+-------------------------+---------------+-----------------------------------------------------------------+
|
||||
|
||||
The parameters are passed as strings to `IECore.load_network <api/ie_python_api/_autosummary/openvino.inference_engine.IECore.html#openvino.inference_engine.IECore.load_network>`_.
|
||||
|
||||
Normally, you do not need to select the execution target (``GNA_EXEC_TARGET``) and compilation target (``GNA_COMPILE_TARGET``). The default value for the execution target corresponds to available hardware, or latest hardware version supported by the plugin (i.e., GNA 3.0) if there is no GNA HW in the system. The compilation target is the same as the execution target by default. However, you may want to change the targets, for example, if you want to check how a model compiled for one generation would behave on the other generation (using the SW emulation mode), or if you are willing to export a model for a specific version of GNA HW.
|
||||
|
||||
You can change the ``GNA_DEVICE_MODE`` parameter at run time by sending a configuration dict to the `IECore.load_network <api/ie_python_api/_autosummary/openvino.inference_engine.IECore.html#openvino.inference_engine.IECore.load_network>`_ call, which works for any value excluding ``GNA_SW_FP32``. This enables you to switch the execution between software emulation mode and hardware execution mode after the model is loaded.
|
||||
|
||||
@endsphinxdirective
|
||||
## How to Interpret Performance Counters
|
||||
|
||||
As a result of collecting performance counters using `InferenceEngine::InferRequest::GetPerformanceCounts`, you can find various performance data about execution on GNA.
|
||||
Returned map stores a counter description as a key, and a counter value in the `realTime_uSec` field of the `InferenceEngineProfileInfo` structure. Current GNA implementation calculates counters for the whole utterance scoring and does not provide per-layer information. The API enables you to retrieve counter units in cycles, you can convert cycles to seconds as follows:
|
||||
With the following methods, you can collect performance counters that provide various performance data about execution on GNA:
|
||||
|
||||
@sphinxdirective
|
||||
.. tab:: C++
|
||||
|
||||
``InferenceEngine::InferRequest::GetPerformanceCounts``
|
||||
|
||||
The returned map stores a counter description as a key, and a counter value in the ``realTime_uSec`` field of the ``InferenceEngineProfileInfo`` structure.
|
||||
|
||||
|
||||
.. tab:: Python
|
||||
|
||||
``openvino.inference_engine.InferRequest.get_perf_counts``
|
||||
|
||||
The returned map stores a counter description as a key, and a counter value in the ``real_time`` field.
|
||||
|
||||
@endsphinxdirective
|
||||
|
||||
The current GNA implementation calculates counters for the whole utterance scoring and does not provide per-layer information. The API enables you to retrieve counter units in cycles; you can convert cycles to seconds as follows:
|
||||
|
||||
```
|
||||
seconds = cycles / frequency
|
||||
```
|
||||
|
||||
Refer to the table below to learn about the frequency of Intel® GNA inside a particular processor.
|
||||
Refer to the table below to learn about the frequency of Intel® GNA inside a particular processor:
|
||||
Processor | Frequency of Intel® GNA
|
||||
---|---
|
||||
Intel® Ice Lake processors| 400MHz
|
||||
Intel® Core™ i3-8121U processor| 400MHz
|
||||
Intel® Gemini Lake processors | 200MHz
|
||||
Intel® Core™ processors| 400MHz
|
||||
Intel® processors formerly codenamed Elkhart Lake | 200MHz
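For example, a C++ sketch of the conversion, assuming a processor with a 400 MHz Intel® GNA from the table above:

```cpp
#include <inference_engine.hpp>
#include <iostream>

void printGnaCounters(InferenceEngine::InferRequest& request) {
    const double gnaFrequencyHz = 400e6;  // assumed 400 MHz GNA, see the table above
    // For GNA, the counter value stored in realTime_uSec is a number of cycles.
    for (const auto& counter : request.GetPerformanceCounts()) {
        double seconds = static_cast<double>(counter.second.realTime_uSec) / gnaFrequencyHz;
        std::cout << counter.first << ": " << seconds << " s" << std::endl;
    }
}
```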
|
||||
|
||||
Performance counters provided for the time being:
|
||||
|
||||
@@ -142,16 +396,38 @@ Performance counters provided for the time being:
|
||||
|
||||
The GNA plugin supports the following configuration parameters for multithreading management:
|
||||
|
||||
* `KEY_GNA_LIB_N_THREADS`
|
||||
@sphinxdirective
|
||||
.. tab:: C++
|
||||
|
||||
By default, the GNA plugin uses one worker thread for inference computations. This parameter allows you to create up to 127 threads for software modes.
|
||||
``KEY_GNA_LIB_N_THREADS``
|
||||
|
||||
> **NOTE:** Multithreading mode does not guarantee the same computation order as the order of issuing. Additionally, in this case, software modes do not implement any serializations.
|
||||
.. tab:: Python
|
||||
|
||||
``GNA_LIB_N_THREADS``
|
||||
|
||||
@endsphinxdirective
|
||||
|
||||
By default, the GNA plugin uses one worker thread for inference computations. This parameter allows you to create up to 127 threads for software modes.
|
||||
|
||||
> **NOTE**: Multithreading mode does not guarantee the same computation order as the order of issuing. Additionally, in this case, software modes do not implement any serializations.
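A short sketch of requesting several worker threads in a software mode; the raw key strings follow the tables above, and the model path and thread count are illustrative:

```cpp
#include <inference_engine.hpp>
#include <map>
#include <string>

int main() {
    InferenceEngine::Core ie;
    auto network = ie.ReadNetwork("model.xml");  // placeholder path
    // Software emulation mode with four worker threads; both values are examples.
    std::map<std::string, std::string> config = {
        {"GNA_DEVICE_MODE", "GNA_SW_EXACT"},
        {"GNA_LIB_N_THREADS", "4"}
    };
    auto executableNetwork = ie.LoadNetwork(network, "GNA", config);
    return 0;
}
```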
|
||||
|
||||
## Network Batch Size
|
||||
|
||||
Intel® GNA plugin supports the processing of context-windowed speech frames in batches of 1-8 frames in one
|
||||
input blob using `InferenceEngine::ICNNNetwork::setBatchSize`. Increasing batch size only improves efficiency of `Fully Connected` layers.
|
||||
input blob using the following methods:
|
||||
|
||||
@sphinxdirective
|
||||
.. tab:: C++
|
||||
|
||||
``InferenceEngine::ICNNNetwork::setBatchSize``
|
||||
|
||||
.. tab:: Python
|
||||
|
||||
`IENetwork.batch_size <api/ie_python_api/_autosummary/openvino.inference_engine.IENetwork.html#openvino.inference_engine.IENetwork.batch_size>`_
|
||||
|
||||
@endsphinxdirective
|
||||
|
||||
Increasing batch size only improves efficiency of `Fully Connected` layers.
|
||||
|
||||
> **NOTE**: For networks with `Convolutional`, `LSTM`, or `Memory` layers, the only supported batch size is 1.
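For example, a minimal C++ sketch of batching eight frames (the model path is a placeholder):

```cpp
#include <inference_engine.hpp>

int main() {
    InferenceEngine::Core ie;
    auto network = ie.ReadNetwork("model.xml");  // placeholder path
    // Process up to eight context-windowed speech frames per inference.
    network.setBatchSize(8);
    auto executableNetwork = ie.LoadNetwork(network, "GNA");
    return 0;
}
```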
|
||||
|
||||
@@ -159,7 +435,7 @@ input blob using `InferenceEngine::ICNNNetwork::setBatchSize`. Increasing batch
|
||||
|
||||
The Heterogeneous plugin was tested with Intel® GNA as the primary device and CPU as the secondary device. To run inference of networks with layers unsupported by the GNA plugin, such as Softmax, use the Heterogeneous plugin with the `HETERO:GNA,CPU` configuration.
|
||||
|
||||
> **NOTE:** Due to limitation of the Intel® GNA backend library, heterogenous support is limited to cases where in the resulted sliced graph, only one subgraph is scheduled to run on GNA\_HW or GNA\_SW devices.
|
||||
> **NOTE**: Due to a limitation of the Intel® GNA backend library, heterogeneous support is limited to cases where, in the resulting sliced graph, only one subgraph is scheduled to run on GNA\_HW or GNA\_SW devices.
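For illustration, a minimal sketch of loading a network with the `HETERO:GNA,CPU` configuration (the model path is a placeholder):

```cpp
#include <inference_engine.hpp>

int main() {
    InferenceEngine::Core ie;
    auto network = ie.ReadNetwork("model.xml");  // placeholder path
    // GNA is the primary device; layers it cannot run, such as Softmax, fall back to CPU.
    auto executableNetwork = ie.LoadNetwork(network, "HETERO:GNA,CPU");
    return 0;
}
```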
|
||||
|
||||
## Recovery from Interruption by High-Priority Windows Audio Processes\*
|
||||
|
||||
@@ -170,23 +446,65 @@ For such workloads, processing should be time constrained, otherwise extra delay
|
||||
the schedule, thereby causing long running GNA tasks to terminate early.
|
||||
|
||||
Applications should be prepared for this situation.
|
||||
If an inference in the `GNA_HW` mode cannot be executed because of such an interruption, then `InferRequest::Wait()` returns status code
|
||||
`StatusCode::INFER_NOT_STARTED`. In future releases, it will be changed to a more meaningful status code.
|
||||
|
||||
If an inference in the `GNA_HW` mode cannot be executed because of such an interruption, then the `wait` method returns the following status code:
|
||||
|
||||
@sphinxdirective
|
||||
.. tab:: C++
|
||||
|
||||
``InferRequest::Wait()`` returns status code ``StatusCode::INFER_NOT_STARTED``.
|
||||
|
||||
.. tab:: Python
|
||||
|
||||
`InferRequest.wait <api/ie_python_api/_autosummary/openvino.inference_engine.InferRequest.html#openvino.inference_engine.InferRequest.wait>`_ returns status code `INFER_NOT_STARTED`.
|
||||
|
||||
@endsphinxdirective
|
||||
|
||||
In future releases, it will be changed to a more meaningful status code.
|
||||
|
||||
Any application working with GNA must properly react to this code.
|
||||
One of the strategies to adapt an application:
|
||||
|
||||
1. Immediately switch to the GNA_SW emulation mode:
|
||||
```cpp
|
||||
std::map<std::string, Parameter> newConfig;
|
||||
newConfig[GNAConfigParams::KEY_GNA_DEVICE_MODE] = Parameter("GNA_SW_EXACT");
|
||||
executableNet.SetConfig(newConfig);
|
||||
@sphinxdirective
|
||||
.. tab:: C++
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
std::map<std::string, Parameter> newConfig;
|
||||
newConfig[GNAConfigParams::KEY_GNA_DEVICE_MODE] = Parameter("GNA_SW_EXACT");
|
||||
executableNet.SetConfig(newConfig);
|
||||
|
||||
.. tab:: Python
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from openvino.inference_engine import IECore
|
||||
|
||||
ie = IECore()
|
||||
new_cfg = {'GNA_DEVICE_MODE' : 'GNA_SW_EXACT'}
|
||||
net = ie.read_network(model=path_to_model)
|
||||
exec_net = ie.load_network(network=net, device_name="GNA", config=new_cfg)
|
||||
|
||||
@endsphinxdirective
|
||||
|
||||
```
|
||||
2. Resubmit and switch back to GNA_HW expecting that the competing application has finished.
|
||||
|
||||
> **NOTE**: This method is deprecated because a new automatic QoS mode was introduced in the 2021.4.1 release of OpenVINO™ (see below).
|
||||
|
||||
## GNA3 Automatic QoS Feature on Windows*
|
||||
|
||||
Starting with the 2021.4.1 release of OpenVINO and the 03.00.00.1363 version of the Windows* GNA driver, a new execution mode, `GNA_HW_WITH_SW_FBACK`, is introduced
|
||||
to ensure that workloads meet real-time execution requirements. In this mode, the GNA driver automatically falls back on CPU for a particular infer request
|
||||
if the HW queue is not empty, so there is no need for explicitly switching between GNA and CPU.
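A sketch of selecting this mode at load time (raw key strings, placeholder model path):

```cpp
#include <inference_engine.hpp>
#include <map>
#include <string>

int main() {
    InferenceEngine::Core ie;
    auto network = ie.ReadNetwork("model.xml");  // placeholder path
    // Prefer GNA hardware with automatic per-request fallback to CPU under load.
    std::map<std::string, std::string> config = {
        {"GNA_DEVICE_MODE", "GNA_HW_WITH_SW_FBACK"}
    };
    auto executableNetwork = ie.LoadNetwork(network, "GNA", config);
    return 0;
}
```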
|
||||
|
||||
> **NOTE**: Due to the "first come - first served" nature of GNA driver and the QoS feature, this mode may lead to increased CPU consumption
|
||||
if there are several clients using GNA simultaneously.
|
||||
Even a lightweight competing infer request which has not been cleared at the time when the user's GNA client process makes its request,
|
||||
can cause the user's request to be executed on CPU, thereby unnecessarily increasing CPU utilization and power.
|
||||
|
||||
## See Also
|
||||
|
||||
* [Supported Devices](Supported_Devices.md)
|
||||
* [Converting Model](../../MO_DG/prepare_model/convert_model/Converting_Model.md)
|
||||
* [Convert model from Kaldi](../../MO_DG/prepare_model/convert_model/Convert_Model_From_Kaldi.md)
|
||||
* [Convert model from Kaldi](../../MO_DG/prepare_model/convert_model/Convert_Model_From_Kaldi.md)
|
||||
@@ -1,6 +1,17 @@
|
||||
GPU Plugin {#openvino_docs_IE_DG_supported_plugins_GPU}
|
||||
=======
|
||||
|
||||
@sphinxdirective
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:hidden:
|
||||
|
||||
openvino_docs_IE_DG_supported_plugins_GPU_RemoteBlob_API
|
||||
|
||||
|
||||
@endsphinxdirective
|
||||
|
||||
The GPU plugin uses the Intel® Compute Library for Deep Neural Networks (clDNN) to infer deep neural networks.
|
||||
clDNN is an open source performance library for Deep Learning (DL) applications intended for acceleration of Deep Learning Inference on Intel® Processor Graphics including Intel® HD Graphics, Intel® Iris® Graphics, Intel® Iris® Xe Graphics, and Intel® Iris® Xe MAX graphics.
|
||||
For an in-depth description of clDNN, see [Inference Engine source files](https://github.com/openvinotoolkit/openvino/tree/master/inference-engine/src/cldnn_engine) and [Accelerate Deep Learning Inference with Intel® Processor Graphics](https://software.intel.com/en-us/articles/accelerating-deep-learning-inference-with-intel-processor-graphics).
|
||||
@@ -104,18 +115,18 @@ When specifying key values as raw strings (that is, when using Python API), omit
|
||||
| `KEY_CACHE_DIR` | `"<cache_dir>"` | `""` | Specifies a directory where compiled OCL binaries can be cached. First model loading generates the cache, and all subsequent LoadNetwork calls use precompiled kernels which significantly improves load time. If empty - caching is disabled |
|
||||
| `KEY_PERF_COUNT` | `YES` / `NO` | `NO` | Collect performance counters during inference |
|
||||
| `KEY_CONFIG_FILE` | `"<file1> [<file2> ...]"` | `""` | Load custom layer configuration files |
|
||||
| `KEY_GPU_PLUGIN_PRIORITY` | `<0-3>` | `0` | OpenCL queue priority (before usage, make sure your OpenCL driver supports appropriate extension)<br> Higher value means higher priority for OpenCL queue. 0 disables the setting. |
|
||||
| `KEY_GPU_PLUGIN_THROTTLE` | `<0-3>` | `0` | OpenCL queue throttling (before usage, make sure your OpenCL driver supports appropriate extension)<br> Lower value means lower driver thread priority and longer sleep time for it. 0 disables the setting. |
|
||||
| `KEY_CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS` | `YES` / `NO` | `YES` | Allows using FP16+INT8 mixed precision mode, so non-quantized parts of a model will be executed in FP16 precision for FP16 IR. Does not affect quantized FP32 IRs |
|
||||
| `KEY_GPU_NV12_TWO_INPUTS` | `YES` / `NO` | `NO` | Controls preprocessing logic for nv12 input. If it's set to YES, then device graph will expect that user will set biplanar nv12 blob as input wich will be directly passed to device execution graph. Otherwise, preprocessing via GAPI is used to convert NV12->BGR, thus GPU graph have to expect single input |
|
||||
| `KEY_GPU_THROUGHPUT_STREAMS` | `KEY_GPU_THROUGHPUT_AUTO`, or positive integer| 1 | Specifies a number of GPU "execution" streams for the throughput mode (upper bound for a number of inference requests that can be executed simultaneously).<br>This option is can be used to decrease GPU stall time by providing more effective load from several streams. Increasing the number of streams usually is more effective for smaller topologies or smaller input sizes. Note that your application should provide enough parallel slack (e.g. running many inference requests) to leverage full GPU bandwidth. Additional streams consume several times more GPU memory, so make sure the system has enough memory available to suit parallel stream execution. Multiple streams might also put additional load on CPU. If CPU load increases, it can be regulated by setting an appropriate `KEY_GPU_PLUGIN_THROTTLE` option value (see above). If your target system has relatively weak CPU, keep throttling low. <br>The default value is 1, which implies latency-oriented behavior.<br>`KEY_GPU_THROUGHPUT_AUTO` creates bare minimum of streams to improve the performance; this is the most portable option if you are not sure how many resources your target machine has (and what would be the optimal number of streams). <br> A positive integer value creates the requested number of streams. |
|
||||
| `KEY_EXCLUSIVE_ASYNC_REQUESTS` | `YES` / `NO` | `NO` | Forces async requests (also from different executable networks) to execute serially.|
|
||||
| `KEY_GPU_MAX_NUM_THREADS` | `integer value` | `maximum # of HW threads available in host environment` | Specifies the number of CPU threads that can be used for GPU engine, e.g, JIT compilation of GPU kernels or cpu kernel processing within GPU plugin. The default value is set as the number of maximum available threads in host environment to minimize the time for LoadNetwork, where the GPU kernel build time occupies a large portion. Note that if the specified value is larger than the maximum available # of threads or less than zero, it is set as maximum available # of threads. It can be specified with a smaller number than the available HW threads according to the usage scenario, e.g., when the user wants to assign more CPU threads while GPU plugin is running. Note that setting this value with lower number will affect not only the network loading time but also the cpu layers of GPU networks that are optimized with multi-threading. |
|
||||
| `KEY_GPU_ENABLE_LOOP_UNROLLING` | `YES` / `NO` | `YES` | Enables recurrent layers such as TensorIterator or Loop with fixed iteration count to be unrolled. It is turned on by default. Turning this key on will achieve better inference performance for loops with not too many iteration counts (less than 16, as a rule of thumb). Turning this key off will achieve better performance for both graph loading time and inference time with many iteration counts (greater than 16). Note that turning this key on will increase the graph loading time in proportion to the iteration counts. Thus, this key should be turned off if graph loading time is considered to be most important target to optimize. |
|
||||
| `KEY_CLDNN_PLUGIN_PRIORITY` | `<0-3>` | `0` | OpenCL queue priority (before usage, make sure your OpenCL driver supports appropriate extension)<br> Higher value means higher priority for OpenCL queue. 0 disables the setting. **Deprecated**. Please use KEY_GPU_PLUGIN_PRIORITY |
|
||||
| `KEY_CLDNN_PLUGIN_THROTTLE` | `<0-3>` | `0` | OpenCL queue throttling (before usage, make sure your OpenCL driver supports appropriate extension)<br> Lower value means lower driver thread priority and longer sleep time for it. 0 disables the setting. **Deprecated**. Please use KEY_GPU_PLUGIN_THROTTLE |
|
||||
| `KEY_CLDNN_GRAPH_DUMPS_DIR` | `"<dump_dir>"` | `""` | clDNN graph optimizer stages dump output directory (in GraphViz format) **Deprecated**. Will be removed in the next release |
|
||||
| `KEY_CLDNN_SOURCES_DUMPS_DIR` | `"<dump_dir>"` | `""` | Final optimized clDNN OpenCL sources dump output directory. **Deprecated**. Will be removed in the next release |
|
||||
| `KEY_GPU_PLUGIN_`<br>`PRIORITY` | `<0-3>` | `0` | OpenCL queue priority (before usage, make sure your OpenCL driver supports appropriate extension)<br> Higher value means higher priority for OpenCL queue. 0 disables the setting. |
|
||||
| `KEY_GPU_PLUGIN_`<br>`THROTTLE` | `<0-3>` | `0` | OpenCL queue throttling (before usage, make sure your OpenCL driver supports appropriate extension)<br> Lower value means lower driver thread priority and longer sleep time for it. 0 disables the setting. |
|
||||
| `KEY_CLDNN_ENABLE_`<br>`FP16_FOR_QUANTIZED_`<br>`MODELS` | `YES` / `NO` | `YES` | Allows using FP16+INT8 mixed precision mode, so non-quantized parts of a model will be executed in FP16 precision for FP16 IR. Does not affect quantized FP32 IRs |
|
||||
| `KEY_GPU_NV12_`<br>`TWO_INPUTS` | `YES` / `NO` | `NO` | Controls preprocessing logic for NV12 input. If set to YES, the device graph expects the user to set a biplanar NV12 blob as input, which is passed directly to the device execution graph. Otherwise, preprocessing via G-API is used to convert NV12->BGR, so the GPU graph has to expect a single input |
|
||||
| `KEY_GPU_THROUGHPUT_`<br>`STREAMS` | `KEY_GPU_THROUGHPUT_AUTO`, or positive integer| 1 | Specifies a number of GPU "execution" streams for the throughput mode (upper bound for a number of inference requests that can be executed simultaneously).<br>This option can be used to decrease GPU stall time by providing a more effective load from several streams. Increasing the number of streams is usually more effective for smaller topologies or smaller input sizes. Note that your application should provide enough parallel slack (e.g. running many inference requests) to leverage full GPU bandwidth. Additional streams consume several times more GPU memory, so make sure the system has enough memory available to suit parallel stream execution. Multiple streams might also put additional load on CPU. If CPU load increases, it can be regulated by setting an appropriate `KEY_GPU_PLUGIN_THROTTLE` option value (see above). If your target system has a relatively weak CPU, keep throttling low. <br>The default value is 1, which implies latency-oriented behavior.<br>`KEY_GPU_THROUGHPUT_AUTO` creates the bare minimum of streams to improve the performance; this is the most portable option if you are not sure how many resources your target machine has (and what would be the optimal number of streams). <br> A positive integer value creates the requested number of streams. |
|
||||
| `KEY_EXCLUSIVE_ASYNC_`<br>`REQUESTS` | `YES` / `NO` | `NO` | Forces async requests (also from different executable networks) to execute serially.|
|
||||
| `KEY_GPU_MAX_NUM_`<br>`THREADS` | `integer value` | `maximum # of HW threads available in host environment` | Specifies the number of CPU threads that can be used for the GPU engine, e.g., JIT compilation of GPU kernels or CPU kernel processing within the GPU plugin. The default value is set to the maximum number of threads available in the host environment to minimize the time for LoadNetwork, where the GPU kernel build time occupies a large portion. Note that if the specified value is larger than the maximum available # of threads or less than zero, it is set to the maximum available # of threads. It can be set to a smaller number than the available HW threads according to the usage scenario, e.g., when the user wants to assign more CPU threads while the GPU plugin is running. Note that setting this value to a lower number will affect not only the network loading time but also the CPU layers of GPU networks that are optimized with multi-threading. |
|
||||
| `KEY_GPU_ENABLE_`<br>`LOOP_UNROLLING` | `YES` / `NO` | `YES` | Enables recurrent layers such as TensorIterator or Loop with fixed iteration count to be unrolled. It is turned on by default. Turning this key on will achieve better inference performance for loops with not too many iteration counts (less than 16, as a rule of thumb). Turning this key off will achieve better performance for both graph loading time and inference time with many iteration counts (greater than 16). Note that turning this key on will increase the graph loading time in proportion to the iteration counts. Thus, this key should be turned off if graph loading time is considered the most important target to optimize. |
|
||||
| `KEY_CLDNN_PLUGIN_`<br>`PRIORITY` | `<0-3>` | `0` | OpenCL queue priority (before usage, make sure your OpenCL driver supports appropriate extension)<br> Higher value means higher priority for OpenCL queue. 0 disables the setting. **Deprecated**. Please use KEY_GPU_PLUGIN_PRIORITY |
|
||||
| `KEY_CLDNN_PLUGIN_`<br>`THROTTLE` | `<0-3>` | `0` | OpenCL queue throttling (before usage, make sure your OpenCL driver supports appropriate extension)<br> Lower value means lower driver thread priority and longer sleep time for it. 0 disables the setting. **Deprecated**. Please use KEY_GPU_PLUGIN_THROTTLE |
|
||||
| `KEY_CLDNN_GRAPH_`<br>`DUMPS_DIR` | `"<dump_dir>"` | `""` | clDNN graph optimizer stages dump output directory (in GraphViz format) **Deprecated**. Will be removed in the next release |
|
||||
| `KEY_CLDNN_SOURCES_`<br>`DUMPS_DIR` | `"<dump_dir>"` | `""` | Final optimized clDNN OpenCL sources dump output directory. **Deprecated**. Will be removed in the next release |
|
||||
| `KEY_DUMP_KERNELS` | `YES` / `NO` | `NO` | Dump the final kernels used for custom layers. **Deprecated**. Will be removed in the next release |
|
||||
| `KEY_TUNING_MODE` | `TUNING_DISABLED` <br /> `TUNING_CREATE` <br /> `TUNING_USE_EXISTING` | `TUNING_DISABLED` | Disable inference kernel tuning <br /> Create tuning file (expect much longer runtime) <br /> Use an existing tuning file. **Deprecated**. Will be removed in the next release |
|
||||
| `KEY_TUNING_FILE` | `"<filename>"` | `""` | Tuning file to create / use. **Deprecated**. Will be removed in the next release |
|
||||
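For convenience, here is a minimal Python sketch of setting a few of the GPU options above at runtime. It is illustrative only: the raw string forms of the keys (with the `KEY_` prefix dropped, as is the convention for string-based configuration) and `path_to_model` are assumptions.

```python
from openvino.inference_engine import IECore

ie = IECore()
# Raw string keys drop the KEY_ prefix used in the C++ headers (assumed string forms).
ie.set_config(config={"GPU_THROUGHPUT_STREAMS": "2",
                      "GPU_PLUGIN_THROTTLE": "1"},
              device_name="GPU")

net = ie.read_network(model=path_to_model)   # path_to_model is a placeholder
exec_net = ie.load_network(network=net, device_name="GPU")
```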
|
||||
@@ -4,13 +4,13 @@ Remote Blob API of GPU Plugin {#openvino_docs_IE_DG_supported_plugins_GPU_Remote
|
||||
The GPU plugin implementation of the `RemoteContext` and `RemoteBlob` interfaces supports GPU
|
||||
pipeline developers who need video memory sharing and interoperability with existing native APIs
|
||||
such as OpenCL\*, Microsoft DirectX\*, or VAAPI\*.
|
||||
Using these interfaces allows to avoid any memory copy overhead when plugging the OpenVINO™ inference
|
||||
Using these interfaces allows you to avoid any memory copy overhead when plugging the OpenVINO™ inference
|
||||
into an existing GPU pipeline. It also enables OpenCL kernels participating in the pipeline to become
|
||||
native buffer consumers or producers of the OpenVINO™ inference.
|
||||
Since the GPU plugin works on top of the clDNN library, the functionality above is also implemented
|
||||
using OpenCL and its sharing extensions provided by Intel®.
|
||||
|
||||
There are two interoperability scenarios that are supported for the Remote Blob API:
|
||||
There are two interoperability scenarios supported by the Remote Blob API:
|
||||
|
||||
* GPU plugin context and memory objects can be constructed from low-level device, display, or memory
|
||||
handles and used to create the OpenVINO™ `ExecutableNetwork` or `Blob` class.
|
||||
|
||||
@@ -6,27 +6,27 @@ The Inference Engine HDDL plugin was developed for inference with neural network
|
||||
|
||||
## Configuring the HDDL Plugin
|
||||
|
||||
To configure your Intel® Vision Accelerator Design With Intel® Movidius™ on supported OSs, refer to the Steps for Intel® Vision Accelerator Design with Intel® Movidius™ VPUs section in the installation guides for [Linux](../../install_guides/installing-openvino-linux.md) or [Windows](../../install_guides/installing-openvino-windows.md).
|
||||
To configure your Intel® Vision Accelerator Design With Intel® Movidius™ on supported operating systems, refer to the Steps for Intel® Vision Accelerator Design with Intel® Movidius™ VPUs section in the installation guides for [Linux](../../install_guides/installing-openvino-linux.md) or [Windows](../../install_guides/installing-openvino-windows.md).
|
||||
|
||||
## Supported networks
|
||||
|
||||
For the "Supported Networks", please reference to [MYRIAD Plugin](MYRIAD.md)
|
||||
To see the list of supported networks for the HDDL plugin, refer to the list on the [MYRIAD Plugin page](MYRIAD.md).
|
||||
|
||||
## Supported Configuration Parameters
|
||||
|
||||
See VPU common configuration parameters for the [VPU Plugins](VPU.md).
|
||||
When specifying key values as raw strings (that is, when using Python API), omit the `KEY_` prefix.
|
||||
See VPU common configuration parameters for [VPU Plugins](VPU.md).
|
||||
When specifying key values as raw strings (that is, when using the Python API), omit the `KEY_` prefix.
|
||||
|
||||
In addition to common parameters for MYRIAD plugin and HDDL plugin, HDDL plugin accepts the following options:
|
||||
In addition to common parameters for both VPU plugins, the HDDL plugin accepts the following options:
|
||||
|
||||
| Parameter Name | Parameter Values | Default | Description |
|
||||
| :--- | :--- | :--- | :--- |
|
||||
| KEY_PERF_COUNT | YES/NO | NO | Enable performance counter option. |
|
||||
| KEY_PERF_COUNT | YES/NO | NO | Enable performance counter option. |
|
||||
| KEY_VPU_HDDL_GRAPH_TAG | string | empty string | Allows executing a network on a specified number of devices. |
|
||||
| KEY_VPU_HDDL_STREAM_ID | string | empty string | Allows executing inference on a specified device. |
|
||||
| KEY_VPU_HDDL_DEVICE_TAG | string | empty string | Allows allocating/deallocating networks on specified devices. |
|
||||
| KEY_VPU_HDDL_BIND_DEVICE | YES/NO | NO | Whether the network should bind to a device. Refer to vpu_plugin_config.hpp. |
|
||||
| KEY_VPU_HDDL_RUNTIME_PRIORITY | signed int | 0 | Specify the runtime priority of a device among all devices that running a same network Refer to vpu_plugin_config.hpp. |
|
||||
| KEY_VPU_HDDL_RUNTIME_PRIORITY | signed int | 0 | Specify the runtime priority of a device among all devices running the same network. Refer to vpu_plugin_config.hpp. |
|
||||
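As noted above, the `KEY_` prefix is omitted when the options are passed as raw strings through the Python API. A minimal sketch follows; the exact raw key strings, the tag value, and `path_to_model` are assumptions for illustration.

```python
from openvino.inference_engine import IECore

ie = IECore()
# Raw string keys omit the KEY_ prefix (assumed string forms of the options above).
ie.set_config(config={"PERF_COUNT": "YES",
                      "VPU_HDDL_GRAPH_TAG": "tagA"},   # "tagA" is a placeholder tag name
              device_name="HDDL")

net = ie.read_network(model=path_to_model)   # path_to_model is a placeholder
exec_net = ie.load_network(network=net, device_name="HDDL")
```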
|
||||
## See Also
|
||||
|
||||
|
||||
@@ -1,37 +1,57 @@
|
||||
Heterogeneous Plugin {#openvino_docs_IE_DG_supported_plugins_HETERO}
|
||||
=======
|
||||
# Heterogeneous Plugin {#openvino_docs_IE_DG_supported_plugins_HETERO}
|
||||
|
||||
## Introducing the Heterogeneous Plugin
|
||||
## Introducing the Heterogeneous Plugin (C++)
|
||||
|
||||
The heterogeneous plugin enables computing for inference on one network on several devices.
|
||||
The purposes of executing networks in heterogeneous mode:
|
||||
* Utilize the power of accelerators to calculate heaviest parts of the network and execute unsupported layers on fallback devices like the CPU
|
||||
@sphinxdirective
|
||||
.. raw:: html
|
||||
|
||||
<div id="switcher-cpp" class="switcher-anchor">C++</div>
|
||||
@endsphinxdirective
|
||||
|
||||
The heterogeneous plugin enables computing the inference of one network on several devices. The purposes of executing networks in heterogeneous mode are to:
|
||||
|
||||
* Utilize the power of accelerators to process the heaviest parts of the network and to execute unsupported layers on fallback devices like the CPU
|
||||
* Utilize all available hardware more efficiently during one inference
|
||||
|
||||
The execution through heterogeneous plugin can be divided to two independent steps:
|
||||
* Setting of affinity to layers
|
||||
* Loading a network to the Heterogeneous plugin, splitting the network to parts, and executing them through the plugin
|
||||
The execution through heterogeneous plugin can be divided into two independent steps:
|
||||
|
||||
These steps are decoupled. The setting of affinity can be done automatically using fallback policy or in manual mode.
|
||||
1. Setting of hardware affinity to layers
|
||||
2. Loading a network to the Heterogeneous plugin, splitting the network to parts, and executing them through the plugin
|
||||
|
||||
The fallback automatic policy causes "greedy" behavior and assigns all layers that can be executed on certain device according to the priorities you specify (for example, `HETERO:GPU,CPU`).
|
||||
Automatic policy does not take into account plugin peculiarities such as the inability to infer some layers without other special layers placed before or after that layer. The plugin is responsible for solving such cases. If the device plugin does not support the subgraph topology constructed by the Hetero plugin, then you should set affinity manually.
|
||||
These steps are decoupled. The setting of affinity can be done automatically using the fallback policy or in manual mode.
|
||||
|
||||
Some of the topologies are not friendly to heterogeneous execution on some devices or cannot be executed in such mode at all.
|
||||
Examples of such networks are networks having activation layers which are not supported on primary device.
|
||||
If transmitting data from one part of a network to another part in heterogeneous mode takes more time than in normal mode, it may not make sense to execute them in heterogeneous mode.
|
||||
In this case, you can define heaviest part manually and set the affinity to avoid sending data back and forth many times during one inference.
|
||||
The fallback automatic policy causes "greedy" behavior and assigns all layers that can be executed on certain device according to the priorities you specify (for example, HETERO:GPU,CPU).
|
||||
Automatic policy does not take into account plugin peculiarities such as the inability to infer some layers without other special layers placed before or after that layer. The plugin is responsible for solving such cases. If the device plugin does not support the subgraph topology constructed by the HETERO plugin, then you should set affinity manually.
|
||||
|
||||
## Annotation of Layers per Device and Default Fallback Policy
|
||||
Default fallback policy decides which layer goes to which device automatically according to the support in dedicated plugins (FPGA, GPU, CPU, MYRIAD).
|
||||
### Details of Splitting Network and Execution
|
||||
During loading of the network to the Heterogeneous plugin, the network is divided into separate parts and loaded to dedicated plugins.
|
||||
Intermediate blobs between these subgraphs are allocated automatically in the most efficient way.
|
||||
|
||||
Another way to annotate a network is to set affinity manually using <code>ngraph::Node::get_rt_info</code> with key `"affinity"`:
|
||||
### Sample Usage
|
||||
|
||||
Inference Engine sample programs can use the Heterogeneous plugin used with the `-d` option:
|
||||
|
||||
```sh
|
||||
./object_detection_sample_ssd -m <path_to_model>/ModelSSD.xml -i <path_to_pictures>/picture.jpg -d HETERO:GPU,CPU
|
||||
```
|
||||
where:
|
||||
- `HETERO` stands for the Heterogeneous plugin
|
||||
- `GPU,CPU` points to fallback policy with priority on GPU and fallback to CPU
|
||||
|
||||
You can point more than two devices: `-d HETERO:MYRIAD,GPU,CPU`
|
||||
|
||||
|
||||
### Annotation of Layers per Device and Default Fallback Policy
|
||||
|
||||
Default fallback policy decides which layer goes to which device automatically according to the support in dedicated plugins (GPU, CPU, MYRIAD).
|
||||
|
||||
Another way to annotate a network is to set affinity manually using `ngraph::Node::get_rt_info` with key `affinity`:
|
||||
|
||||
@snippet snippets/HETERO0.cpp part0
|
||||
|
||||
The fallback policy does not work if even one layer has an initialized affinity. The sequence should be calling of automating affinity settings and then fix manually.
|
||||
The fallback policy does not work if even one layer has an initialized affinity. The sequence should be to apply the automatic affinity settings first and then fix them manually.
|
||||
|
||||
> **NOTE**: If you set affinity manually, be careful at the current moment Inference Engine plugins don't support constant (`Constant`->`Result`) and empty (`Parameter`->`Result`) networks. Please avoid such subgraphs when you set affinity manually.
|
||||
> **NOTE**: If you set affinity manually, be careful because currently Inference Engine plugins don't support constant (`Constant`->`Result`) and empty (`Parameter`->`Result`) networks. Please avoid such subgraphs when you set affinity manually.
|
||||
|
||||
@snippet snippets/HETERO1.cpp part1
|
||||
|
||||
@@ -39,60 +59,203 @@ If you rely on the default affinity distribution, you can avoid calling <code>In
|
||||
|
||||
@snippet snippets/HETERO2.cpp part2
|
||||
|
||||
> **NOTE**: `InferenceEngine::Core::QueryNetwork` does not depend on affinities set by a user, but queries for layer support based on device capabilities.
|
||||
> **NOTE**: `InferenceEngine::Core::QueryNetwork` does not depend on affinities set by a user. Instead, it queries for layer support based on device capabilities.
|
||||
|
||||
### Handling Difficult Topologies
|
||||
|
||||
Some topologies are not friendly to heterogeneous execution on some devices or cannot be executed at all with this plugin.
|
||||
Examples are networks having activation layers that are not supported on the primary device.
|
||||
If transmitting data from one part of a network to another part in heterogeneous mode takes more time than in normal mode, it may not make sense to execute them in heterogeneous mode.
|
||||
In this case, you can define the heaviest part manually and set the affinity to avoid sending data back and forth many times during one inference.
|
||||
|
||||
|
||||
## Details of Splitting Network and Execution
|
||||
During loading of the network to heterogeneous plugin, network is divided to separate parts and loaded to dedicated plugins.
|
||||
Intermediate blobs between these sub graphs are allocated automatically in the most efficient way.
|
||||
|
||||
## Execution Precision
|
||||
### Execution Precision
|
||||
Precision for inference in heterogeneous plugin is defined by
|
||||
* Precision of IR.
|
||||
* Ability of final plugins to execute in precision defined in IR
|
||||
|
||||
Examples:
|
||||
Example:
|
||||
* If you want to execute GPU with CPU fallback with FP16 on GPU, you need to use only FP16 IR.
|
||||
* If you want to execute on FPGA with CPU fallback, you can use any precision for IR. The execution on FPGA is defined by bitstream, the execution on CPU happens in FP32.
|
||||
|
||||
Samples can be used with the following command:
|
||||
### Analyzing Performance Heterogeneous Execution
|
||||
After enabling the <code>KEY_HETERO_DUMP_GRAPH_DOT</code> config key (shown in code snippet below), you can dump GraphViz* `.dot` files with annotations of devices per layer.
|
||||
|
||||
```sh
|
||||
./object_detection_sample_ssd -m <path_to_model>/ModelSSD.xml -i <path_to_pictures>/picture.jpg -d HETERO:FPGA,CPU
|
||||
```
|
||||
where:
|
||||
- `HETERO` stands for heterogeneous plugin
|
||||
- `FPGA,CPU` points to fallback policy with priority on FPGA and fallback to CPU
|
||||
The Heterogeneous plugin can generate two files:
|
||||
|
||||
You can point more than two devices: `-d HETERO:FPGA,GPU,CPU`
|
||||
|
||||
## Analyzing Heterogeneous Execution
|
||||
After enabling of <code>KEY_HETERO_DUMP_GRAPH_DOT</code> config key, you can dump GraphViz* `.dot` files with annotations of devices per layer.
|
||||
|
||||
Heterogeneous plugin can generate two files:
|
||||
* `hetero_affinity_<network name>.dot` - annotation of affinities per layer. This file is written to the disk only if default fallback policy was executed
|
||||
* `hetero_subgraphs_<network name>.dot` - annotation of affinities per graph. This file is written to the disk during execution of <code>ICNNNetwork::LoadNetwork()</code> for heterogeneous plugin
|
||||
* `hetero_subgraphs_<network name>.dot` - annotation of affinities per graph. This file is written to the disk during execution of `ICNNNetwork::LoadNetwork()` for the Heterogeneous plugin
|
||||
|
||||
@snippet snippets/HETERO3.cpp part3
|
||||
|
||||
You can use GraphViz* utility or converters to `.png` formats. On Ubuntu* operating system, you can use the following utilities:
|
||||
You can use the GraphViz* utility or a file converter to view the images. On the Ubuntu* operating system, you can use xdot:
|
||||
|
||||
* `sudo apt-get install xdot`
|
||||
* `xdot hetero_subgraphs.dot`
|
||||
|
||||
You can use performance data (in sample applications, it is the option `-pc`) to get the performance data on each subgraph.
|
||||
|
||||
You can use performance data (in samples, it is an option `-pc`) to get performance data on each subgraph.
|
||||
Here is an example of the output for Googlenet v1 running on HDDL with fallback to CPU:
|
||||
|
||||
Here is an example of the output: for Googlenet v1 running on FPGA with fallback to CPU:
|
||||
```cpp
|
||||
subgraph1: 1. input preprocessing (mean data/FPGA):EXECUTED layerType: realTime: 129 cpu: 129 execType:
|
||||
subgraph1: 2. input transfer to DDR:EXECUTED layerType: realTime: 201 cpu: 0 execType:
|
||||
subgraph1: 3. FPGA execute time:EXECUTED layerType: realTime: 3808 cpu: 0 execType:
|
||||
subgraph1: 4. output transfer from DDR:EXECUTED layerType: realTime: 55 cpu: 0 execType:
|
||||
subgraph1: 5. FPGA output postprocessing:EXECUTED layerType: realTime: 7 cpu: 7 execType:
|
||||
subgraph1: 6. copy to IE blob:EXECUTED layerType: realTime: 2 cpu: 2 execType:
|
||||
subgraph2: out_prob: NOT_RUN layerType: Output realTime: 0 cpu: 0 execType: unknown
|
||||
subgraph2: prob: EXECUTED layerType: SoftMax realTime: 10 cpu: 10 execType: ref
|
||||
Total time: 4212 microseconds
|
||||
```
|
||||
## See Also
|
||||
* [Supported Devices](Supported_Devices.md)
|
||||
subgraph1: 1. input preprocessing (mean data/HDDL):EXECUTED layerType: realTime: 129 cpu: 129 execType:
|
||||
subgraph1: 2. input transfer to DDR:EXECUTED layerType: realTime: 201 cpu: 0 execType:
|
||||
subgraph1: 3. HDDL execute time:EXECUTED layerType: realTime: 3808 cpu: 0 execType:
|
||||
subgraph1: 4. output transfer from DDR:EXECUTED layerType: realTime: 55 cpu: 0 execType:
|
||||
subgraph1: 5. HDDL output postprocessing:EXECUTED layerType: realTime: 7 cpu: 7 execType:
|
||||
subgraph1: 6. copy to IE blob:EXECUTED layerType: realTime: 2 cpu: 2 execType:
|
||||
subgraph2: out_prob: NOT_RUN layerType: Output realTime: 0 cpu: 0 execType: unknown
|
||||
subgraph2: prob: EXECUTED layerType: SoftMax realTime: 10 cpu: 10 execType: ref
|
||||
Total time: 4212 microseconds
|
||||
```
|
||||
### See Also
|
||||
[Supported Devices](Supported_Devices.md)
|
||||
|
||||
## Introducing the Heterogeneous Plugin (Python)
|
||||
|
||||
@sphinxdirective
|
||||
.. raw:: html
|
||||
|
||||
<div id="switcher-python" class="switcher-anchor">Python</div>
|
||||
@endsphinxdirective
|
||||
|
||||
The heterogeneous plugin enables computing the inference of one network on several devices. The purposes of executing networks in heterogeneous mode are to:
|
||||
|
||||
* Utilize the power of accelerators to process the heaviest parts of the network and to execute unsupported layers on fallback devices like the CPU
|
||||
* Utilize all available hardware more efficiently during one inference
|
||||
|
||||
The execution through heterogeneous plugin can be divided into two independent steps:
|
||||
|
||||
1. Setting of hardware affinity to layers
|
||||
2. Loading a network to the Heterogeneous plugin, splitting the network to parts, and executing them through the plugin
|
||||
|
||||
These steps are decoupled. The setting of affinity can be done automatically using the fallback policy or in manual mode.
|
||||
|
||||
The fallback automatic policy causes "greedy" behavior and assigns all layers that can be executed on a certain device according to the priorities you specify (for example, HETERO:GPU,CPU).
|
||||
Automatic policy does not take into account plugin peculiarities such as the inability to infer some layers without other special layers placed before or after that layer. The plugin is responsible for solving such cases. If the device plugin does not support the subgraph topology constructed by the HETERO plugin, then you should set affinity manually.
|
||||
|
||||
Some of the topologies are not well-supported for heterogeneous execution on some devices or cannot be executed in this mode at all. Examples of such networks are those having activation layers which are not supported on the primary device. If transmitting data from one part of a network to another part in heterogeneous mode takes more time than in normal mode, it may not make sense to execute them in heterogeneous mode. In this case, you can define the most compute-intensive part manually and set the affinity to avoid sending data back and forth many times during one inference.
|
||||
|
||||
### Use Default Layer Affinities
|
||||
|
||||
To use the default affinities, call `load_network` with the "HETERO" device, with an optional list of devices to consider.
|
||||
|
||||
```python
|
||||
from openvino.inference_engine import IECore
|
||||
|
||||
ie = IECore()
|
||||
net = ie.read_network(model=path_to_model)
|
||||
exec_net = ie.load_network(network=net, device_name='HETERO:GPU,CPU')
|
||||
```
|
||||
|
||||
|
||||
### Annotation of Layers per Device and Default Fallback Policy
|
||||
|
||||
Default fallback policy decides which layer goes to which device automatically according to the support in dedicated plugins (GPU, CPU, MYRIAD).
|
||||
|
||||
Another way to annotate a network is to set affinity manually using code.
|
||||
|
||||
### Set Affinity of All Layers to CPU
|
||||
```python
|
||||
import ngraph as ng
|
||||
from openvino.inference_engine import IECore
|
||||
|
||||
ie = IECore()
|
||||
# Read a network in IR or ONNX format
|
||||
net = ie.read_network(path_to_model)
|
||||
# Create an Ngraph (graph) function from the network
|
||||
ng_func = ng.function_from_cnn(net)
|
||||
for node in ng_func.get_ordered_ops():
|
||||
rt_info = node.get_rt_info()
|
||||
rt_info["affinity"] = "CPU"
|
||||
```
|
||||
|
||||
|
||||
The fallback policy does not work if even one layer has an initialized affinity. The sequence should be to apply the default affinity settings first and then set the layers manually.
|
||||
|
||||
> **NOTE**: If you set affinity manually, be aware that currently Inference Engine plugins do not support constant (*Constant -> Result*) and empty (*Parameter -> Result*) networks. Please avoid these subgraphs when you set affinity manually.
|
||||
|
||||
### Example - Manually Setting Layer Affinities
|
||||
|
||||
```python
|
||||
import ngraph as ng
|
||||
from openvino.inference_engine import IECore
|
||||
|
||||
ie = IECore()
|
||||
# Read a network in IR or ONNX format
|
||||
net = ie.read_network(path_to_model)
|
||||
ng_func = ng.function_from_cnn(net)
|
||||
|
||||
for node in ng_func.get_ordered_ops():
|
||||
rt_info = node.get_rt_info()
|
||||
rt_info["affinity"] = "CPU"
|
||||
|
||||
# Load the network on the target device
|
||||
exec_net = ie.load_network(network=net, device_name='HETERO:FPGA,CPU')
|
||||
```
|
||||
|
||||
> **NOTE**: `ie.query_network` does not depend on affinities set by a user, but queries for layer support based on device capabilities.
|
||||
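To illustrate the recommended sequence (apply the automatic affinities first, then fix individual layers manually), here is a sketch. It assumes `ie.query_network` returns a layer-name-to-device mapping that lines up with the nGraph node friendly names, and it reuses the `path_to_model` placeholder.

```python
import ngraph as ng
from openvino.inference_engine import IECore

ie = IECore()
net = ie.read_network(path_to_model)   # path_to_model is a placeholder
ng_func = ng.function_from_cnn(net)

# Query the automatic (default) affinities first...
default_affinities = ie.query_network(network=net, device_name="HETERO:GPU,CPU")

# ...then apply them, overriding individual layers manually where needed.
for node in ng_func.get_ordered_ops():
    rt_info = node.get_rt_info()
    rt_info["affinity"] = default_affinities.get(node.get_friendly_name(), "CPU")

exec_net = ie.load_network(network=net, device_name="HETERO:GPU,CPU")
```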
|
||||
### Details of Splitting Network and Execution
|
||||
|
||||
During the loading of the network to the heterogeneous plugin, the network is divided into separate parts and loaded to dedicated plugins. Intermediate blobs between these sub graphs are allocated automatically in the most efficient way.
|
||||
|
||||
### Execution Precision
|
||||
|
||||
The precision for inference in the heterogeneous plugin is defined by:
|
||||
|
||||
* Precision of IR
|
||||
* Ability of final plugins to execute in precision defined in IR
|
||||
|
||||
Example:
|
||||
|
||||
* If you want to execute on GPU with CPU fallback and run FP16 on the GPU, you need to use an FP16 IR.
|
||||
|
||||
OpenVINO samples can be used with the following command:
|
||||
```sh
|
||||
./object_detection_sample_ssd -m <path_to_model>/ModelSSD.xml -i <path_to_pictures>/picture.jpg -d HETERO:MYRIAD,CPU
|
||||
```
|
||||
|
||||
where HETERO stands for the heterogeneous plugin
|
||||
|
||||
You can point to more than two devices, for example: `-d HETERO:MYRIAD,GPU,CPU`
|
||||
|
||||
### Analyzing Heterogeneous Execution
|
||||
|
||||
After enabling the KEY_HETERO_DUMP_GRAPH_DOT config key, you can dump GraphViz* .dot files with annotations of devices per layer.
|
||||
|
||||
The heterogeneous plugin can generate two files:
|
||||
|
||||
* `hetero_affinity_<network name>.dot` - annotation of affinities per layer. This file is written to the disk only if the default fallback policy was executed
|
||||
* `hetero_subgraphs_<network name>.dot` - annotation of affinities per graph. This file is written to the disk during execution of `ICNNNetwork::LoadNetwork()` for the heterogeneous plugin
|
||||
|
||||
#### To Generate the .dot Files
|
||||
|
||||
```python
|
||||
ie = IECore()
|
||||
ie.set_config( config={'HETERO_DUMP_GRAPH_DOT' : 'YES'}, device_name='HETERO')
|
||||
```
|
||||
|
||||
You can use the GraphViz* utility or a file converter to view the images. On the Ubuntu* operating system, you can use xdot:
|
||||
|
||||
* `sudo apt-get install xdot`
|
||||
* `xdot hetero_subgraphs.dot`
|
||||
|
||||
You can use performance data (in sample applications, it is the option `-pc`) to get the performance data on each subgraph.
|
||||
|
||||
Here is an example of the output for Googlenet v1 running on HDDL with fallback to CPU:
|
||||
|
||||
```
|
||||
subgraph1: 1. input preprocessing (mean data/HDDL):EXECUTED layerType: realTime: 129 cpu: 129 execType:
|
||||
subgraph1: 2. input transfer to DDR:EXECUTED layerType: realTime: 201 cpu: 0 execType:
|
||||
subgraph1: 3. HDDL execute time:EXECUTED layerType: realTime: 3808 cpu: 0 execType:
|
||||
subgraph1: 4. output transfer from DDR:EXECUTED layerType: realTime: 55 cpu: 0 execType:
|
||||
subgraph1: 5. HDDL output postprocessing:EXECUTED layerType: realTime: 7 cpu: 7 execType:
|
||||
subgraph1: 6. copy to IE blob:EXECUTED layerType: realTime: 2 cpu: 2 execType:
|
||||
subgraph2: out_prob: NOT_RUN layerType: Output realTime: 0 cpu: 0 execType: unknown
|
||||
subgraph2: prob: EXECUTED layerType: SoftMax realTime: 10 cpu: 10 execType: ref
|
||||
Total time: 4212 microseconds
|
||||
```
|
||||
|
||||
|
||||
### See Also
|
||||
[Supported Devices](Supported_Devices.md)
|
||||
|
||||
@@ -1,31 +1,37 @@
|
||||
# Multi-Device Plugin {#openvino_docs_IE_DG_supported_plugins_MULTI}
|
||||
|
||||
## Introducing the Multi-Device Plugin
|
||||
## Introducing the Multi-Device Plugin (C++)
|
||||
|
||||
The Multi-Device plugin automatically assigns inference requests to available computational devices to execute the requests in parallel. Potential gains are as follows:
|
||||
* Improved throughput that multiple devices can deliver (compared to single-device execution)
|
||||
* More consistent performance, since the devices can now share the inference burden
|
||||
(so that if one device is becoming too busy, another device can take more of the load)
|
||||
@sphinxdirective
|
||||
.. raw:: html
|
||||
|
||||
Notice that with multi-device the application logic is left unchanged, so you don't need to explicitly load the network to every device, create and balance the inference requests and so on. From the application point of view, this is just another device that handles the actual machinery.
|
||||
The only thing that is required to leverage performance is to provide the multi-device (and hence the underlying devices) with enough inference requests to crunch.
|
||||
For example, if you were processing 4 cameras on the CPU (with 4 inference requests), you may now want to process more cameras (with more requests in flight) to keep CPU+GPU busy via multi-device.
|
||||
<div id="switcher-cpp" class="switcher-anchor">C++</div>
|
||||
@endsphinxdirective
|
||||
|
||||
The "setup" of Multi-Device can be described in three major steps:
|
||||
* First is configuration of each device as usual (e.g. via conventional SetConfig method)
|
||||
* Second is loading of a network to the Multi-Device plugin created on top of (prioritized) list of the configured devices. This is the only change that you need in your application.
|
||||
* Finally, just like with any other ExecutableNetwork (resulted from LoadNetwork) you just create as many requests as needed to saturate the devices.
|
||||
These steps are covered below in details.
|
||||
The Multi-Device plugin automatically assigns inference requests to available computational devices to execute the requests in parallel. By contrast, the Heterogeneous plugin can run different layers on different devices but not in parallel. The potential gains with the Multi-Device plugin are:
|
||||
|
||||
## Defining and Configuring the Multi-Device plugin
|
||||
Following the OpenVINO notions of "devices", the Multi-Device has a "MULTI" name.
|
||||
The only configuration option for the Multi-Device plugin is a prioritized list of devices to use:
|
||||
* Improved throughput from using multiple devices (compared to single-device execution)
|
||||
* More consistent performance, since the devices share the inference burden (if one device is too busy, another can take more of the load)
|
||||
|
||||
| Parameter name | Parameter values | Default | Description |
|
||||
| :--- | :--- | :--- | :----------------------------------------------------------------------------------------------------------------------------|
|
||||
| "MULTI_DEVICE_PRIORITIES" | comma-separated device names <span style="color:red">with no spaces</span>| N/A | Prioritized list of devices |
|
||||
Note that with Multi-Device the application logic is left unchanged, so you don't need to explicitly load the network to every device, create and balance the inference requests and so on. From the application point of view, this is just another device that handles the actual machinery. The only thing that is required to leverage performance is to provide the multi-device (and hence the underlying devices) with enough inference requests to process. For example, if you were processing 4 cameras on the CPU (with 4 inference requests), it might be desirable to process more cameras (with more requests in flight) to keep CPU and GPU busy via Multi-Device.
|
||||
|
||||
You can use name of the configuration directly as a string, or use `MultiDeviceConfigParams::KEY_MULTI_DEVICE_PRIORITIES from the multi/multi_device_config.hpp`, which defines the same string.
|
||||
The setup of Multi-Device can be described in three major steps:
|
||||
|
||||
1. Configure each device as usual.
|
||||
2. Load the network to the Multi-Device plugin created on top of a (prioritized) list of the configured devices. This is the only change needed in the application.
|
||||
3. As with any other ExecutableNetwork call (resulting from `InferenceEngine::Core::LoadNetwork`), you create as many requests as needed to saturate the devices.
|
||||
|
||||
These steps are covered below in detail.
|
||||
|
||||
### Defining and Configuring the Multi-Device Plugin
|
||||
|
||||
Following the OpenVINO™ convention of labeling devices, the Multi-Device plugin uses the name "MULTI". The only configuration option for the Multi-Device plugin is a prioritized list of devices to use:
|
||||
|
||||
| Parameter name | Parameter values | Default | Description |
|
||||
| -------------- | ---------------- | --- | --- |
|
||||
| "MULTI_DEVICE_PRIORITIES" | comma-separated device names with no spaces | N/A | Prioritized list of devices |
|
||||
|
||||
You can set the configuration directly as a string, or use the configuration key `MultiDeviceConfigParams::KEY_MULTI_DEVICE_PRIORITIES` from the `multi/multi_device_config.hpp` file, which defines the same string.
|
||||
|
||||
Basically, there are three ways to specify the devices to be used by the "MULTI":
|
||||
|
||||
@@ -35,73 +41,252 @@ Notice that the priorities of the devices can be changed in real time for the ex
|
||||
|
||||
@snippet snippets/MULTI1.cpp part1
|
||||
|
||||
Finally, there is a way to specify number of requests that the multi-device will internally keep for each device. Suppose your original app was running 4 cameras with 4 inference requests. You would probably want to share these 4 requests between 2 devices used in the MULTI. The easiest way is to specify a number of requests for each device using parentheses: "MULTI:CPU(2),GPU(2)" and use the same 4 requests in your app. However, such an explicit configuration is not performance-portable and hence not recommended. Instead, the better way is to configure the individual devices and query the resulting number of requests to be used at the application level (see [Configuring the Individual Devices and Creating the Multi-Device On Top](#configuring-the-individual-devices-and-creating-the-multi-device-on-top)).
|
||||
Finally, there is a way to specify number of requests that the Multi-Device will internally keep for each device. Suppose your original app was running 4 cameras with 4 inference requests. You would probably want to share these 4 requests between 2 devices used in MULTI. The easiest way is to specify a number of requests for each device using parentheses: "MULTI:CPU(2),GPU(2)" and use the same 4 requests in your app. However, such an explicit configuration is not performance-portable and hence not recommended. Instead, the better way is to configure the individual devices and query the resulting number of requests to be used at the application level (see [Configuring the Individual Devices and Creating the Multi-Device On Top](#configuring-the-individual-devices-and-creating-the-multi-device-on-top)).
|
||||
|
||||
## Enumerating Available Devices
|
||||
Inference Engine now features a dedicated API to enumerate devices and their capabilities. See [Hello Query Device C++ Sample](../../../inference-engine/samples/hello_query_device/README.md). This is example output from the sample (truncated to the devices' names only):
|
||||
### Enumerating Available Devices
|
||||
The Inference Engine features a dedicated API to enumerate devices and their capabilities. See the [Hello Query Device C++ Sample](../../../inference-engine/samples/hello_query_device/README.md). This is example output from the sample (truncated to device names only):
|
||||
|
||||
```sh
|
||||
./hello_query_device
|
||||
Available devices:
|
||||
Device: CPU
|
||||
...
|
||||
Device: GPU.0
|
||||
...
|
||||
Device: GPU.1
|
||||
...
|
||||
Device: HDDL
|
||||
./hello_query_device
|
||||
Available devices:
|
||||
Device: CPU
|
||||
...
|
||||
Device: GPU.0
|
||||
...
|
||||
Device: GPU.1
|
||||
...
|
||||
Device: HDDL
|
||||
```
|
||||
|
||||
A simple programmatic way to enumerate the devices and use them with the Multi-Device plugin is as follows:
|
||||
|
||||
@snippet snippets/MULTI2.cpp part2
|
||||
|
||||
Beyond the trivial "CPU", "GPU", "HDDL" and so on, when multiple instances of a device are available the names are more qualified.
|
||||
For example, this is how two Intel® Movidius™ Myriad™ X sticks are listed with the hello_query_sample:
|
||||
Beyond the trivial "CPU", "GPU", "HDDL" and so on, when multiple instances of a device are available the names are more qualified. For example, this is how two Intel® Movidius™ Myriad™ X sticks are listed with the hello_query_sample:
|
||||
```
|
||||
...
|
||||
Device: MYRIAD.1.2-ma2480
|
||||
...
|
||||
Device: MYRIAD.1.4-ma2480
|
||||
```
|
||||
So the explicit configuration to use both would be "MULTI:MYRIAD.1.2-ma2480,MYRIAD.1.4-ma2480".
|
||||
Accordingly, the code that loops over all available devices of "MYRIAD" type only is below:
|
||||
|
||||
So the explicit configuration to use both would be "MULTI:MYRIAD.1.2-ma2480,MYRIAD.1.4-ma2480". Accordingly, the code that loops over all available devices of "MYRIAD" type only is below:
|
||||
|
||||
@snippet snippets/MULTI3.cpp part3
|
||||
|
||||
|
||||
## Configuring the Individual Devices and Creating the Multi-Device On Top
|
||||
### Configuring the Individual Devices and Creating the Multi-Device On Top
|
||||
As discussed in the first section, you should configure each individual device as usual and then just create the "MULTI" device on top:
|
||||
|
||||
@snippet snippets/MULTI4.cpp part4
|
||||
|
||||
Alternatively, you can combine all the individual device settings into single config and load that, allowing the Multi-Device plugin to parse and apply that to the right devices. See code example in the next section.
|
||||
An alternative is to combine all the individual device settings into a single config file and load that, allowing the Multi-Device plugin to parse and apply settings to the right devices. See the code example in the next section.
|
||||
|
||||
Notice that while the performance of accelerators combines really well with multi-device, the CPU+GPU execution poses some performance caveats, as these devices share the power, bandwidth and other resources. For example it is recommended to enable the GPU throttling hint (which save another CPU thread for the CPU inference).
|
||||
See section of the [Using the multi-device with OpenVINO samples and benchmarking the performance](#using-the-multi-device-with-openvino-samples-and-benchmarking-the-performance) below.
|
||||
Note that while the performance of accelerators combines really well with Multi-Device, the CPU+GPU execution poses some performance caveats, as these devices share the power, bandwidth and other resources. For example, it is recommended to enable the GPU throttling hint (which saves another CPU thread for the CPU inference).
|
||||
See the [Using the Multi-Device with OpenVINO samples and benchmarking the performance](#using-the-multi-device-with-openvino-samples-and-benchmarking-the-performance) section below.
|
||||
|
||||
## Querying the Optimal Number of Inference Requests
|
||||
Notice that until R2 you had to calculate number of requests in your application for any device, e.g. you had to know that Intel® Vision Accelerator Design with Intel® Movidius™ VPUs required at least 32 inference requests to perform well. Now you can use the new GetMetric API to query the optimal number of requests. Similarly, when using the multi-device you don't need to sum over included devices yourself, you can query metric directly:
|
||||
### Querying the Optimal Number of Inference Requests
|
||||
You can use the new GetMetric API to query the optimal number of requests. Similarly, when using the Multi-Device you don't need to sum over included devices yourself; you can query the metric directly:
|
||||
|
||||
@snippet snippets/MULTI5.cpp part5
|
||||
|
||||
## Using the Multi-Device with OpenVINO Samples and Benchmarking the Performance
|
||||
Notice that every OpenVINO sample that supports "-d" (which stands for "device") command-line option transparently accepts the multi-device.
|
||||
The [Benchmark Application](../../../inference-engine/samples/benchmark_app/README.md) is the best reference to the optimal usage of the multi-device. As discussed multiple times earlier, you don't need to setup number of requests, CPU streams or threads as the application provides optimal out of the box performance.
|
||||
Below is example command-line to evaluate HDDL+GPU performance with that:
|
||||
### Using the Multi-Device with OpenVINO Samples and Benchmarking the Performance
|
||||
|
||||
Every OpenVINO sample that supports the `-d` (which stands for "device") command-line option transparently accepts Multi-Device. The [Benchmark Application](../../../inference-engine/samples/benchmark_app/README.md) is the best reference for the optimal usage of Multi-Device. As discussed earlier, you do not need to set up the number of requests, CPU streams or threads because the application provides optimal performance out of the box. Below is an example command to evaluate HDDL+GPU performance with that:
|
||||
|
||||
```sh
|
||||
./benchmark_app –d MULTI:HDDL,GPU –m <model> -i <input> -niter 1000
|
||||
```
|
||||
Notice that you can use the FP16 IR to work with multi-device (as CPU automatically upconverts it to the fp32) and rest of devices support it naturally.
|
||||
Also notice that no demos are (yet) fully optimized for the multi-device, by means of supporting the OPTIMAL_NUMBER_OF_INFER_REQUESTS metric, using the GPU streams/throttling, and so on.
|
||||
|
||||
## Video: MULTI Plugin
|
||||
[](https://www.youtube.com/watch?v=xbORYFEmrqU)
|
||||
\htmlonly
|
||||
<iframe width="560" height="315" src="https://www.youtube.com/embed/xbORYFEmrqU" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
|
||||
\endhtmlonly
|
||||
The Multi-Device plugin supports FP16 IR files. The CPU plugin automatically upconverts it to FP32 and the other devices support it natively. Note that no demos are (yet) fully optimized for Multi-Device, by means of supporting the OPTIMAL_NUMBER_OF_INFER_REQUESTS metric, using the GPU streams/throttling, and so on.
|
||||
|
||||
## See Also
|
||||
* [Supported Devices](Supported_Devices.md)
|
||||
### Video: MULTI Plugin
|
||||
|
||||
@sphinxdirective
|
||||
.. raw:: html
|
||||
|
||||
<iframe width="560" height="315" src="https://www.youtube.com/embed/xbORYFEmrqU" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
|
||||
|
||||
@endsphinxdirective
|
||||
|
||||
### See Also
|
||||
[Supported Devices](Supported_Devices.md)
|
||||
|
||||
## Introducing the Multi-Device Plugin (Python)
|
||||
|
||||
@sphinxdirective
|
||||
.. raw:: html
|
||||
|
||||
<div id="switcher-python" class="switcher-anchor">Python</div>
|
||||
@endsphinxdirective
|
||||
|
||||
The Multi-Device plugin automatically assigns inference requests to available computational devices to execute the requests in parallel. By contrast, the Heterogeneous plugin can run different layers on different devices but not in parallel. The potential gains with the Multi-Device plugin are:
|
||||
|
||||
* Improved throughput from using multiple devices (compared to single-device execution)
|
||||
* More consistent performance, since the devices share the inference burden (if one device is too busy, another can take more of the load)
|
||||
|
||||
Note that with Multi-Device the application logic is left unchanged, so you don't need to explicitly load the network to every device, create and balance the inference requests and so on. From the application point of view, this is just another device that handles the actual machinery. The only thing that is required to leverage performance is to provide the multi-device (and hence the underlying devices) with enough inference requests to process. For example, if you were processing 4 cameras on the CPU (with 4 inference requests), it might be desirable to process more cameras (with more requests in flight) to keep CPU and GPU busy via Multi-Device.
|
||||
|
||||
The setup of Multi-Device can be described in three major steps:
|
||||
|
||||
1. Configure each device as usual (using the conventional [ie_api.IECore.set_config](api/ie_python_api/_autosummary/openvino.inference_engine.IECore.html#openvino.inference_engine.IECore.set_config) method).
|
||||
2. Load the network to the Multi-Device plugin created on top of a (prioritized) list of the configured devices. This is the only change needed in the application.
|
||||
3. As with any other ExecutableNetwork call (resulting from `load_network`), you create as many requests as needed to saturate the devices.
|
||||
|
||||
These steps are covered below in detail.
|
||||
|
||||
### Defining and Configuring the Multi-Device Plugin
|
||||
|
||||
Following the OpenVINO™ convention of labeling devices, the Multi-Device plugin uses the name "MULTI". The only configuration option for the Multi-Device plugin is a prioritized list of devices to use:
|
||||
|
||||
| Parameter name | Parameter values | Default | Description |
|
||||
| -------------- | ---------------- | --- | --- |
|
||||
| "MULTI_DEVICE_PRIORITIES" | comma-separated device names with no spaces | N/A | Prioritized list of devices |
|
||||
|
||||
You can set the configuration directly as a string, or use the configuration key `MULTI_DEVICE_PRIORITIES` from the `multi/multi_device_config.hpp` file, which defines the same string.
|
||||
|
||||
#### The Three Ways to Specify Devices Targets for the MULTI plugin
|
||||
|
||||
* Option 1 - Pass a Prioritized List as a Parameter in ie.load_network()
|
||||
```python
|
||||
from openvino.inference_engine import IECore
|
||||
|
||||
ie = IECore()
|
||||
# Read a network in IR or ONNX format
|
||||
net = ie.read_network(model=path_to_model)
|
||||
exec_net = ie.load_network(network=net, device_name="MULTI:CPU,GPU")
|
||||
```
|
||||
|
||||
* Option 2 - Pass a List as a Parameter, and Dynamically Change Priorities during Execution
|
||||
Notice that the priorities of the devices can be changed in real time for the executable network:
|
||||
```python
|
||||
from openvino.inference_engine import IECore
|
||||
|
||||
# Init the Inference Engine Core
|
||||
ie = IECore()
|
||||
|
||||
# Read a network in IR or ONNX format
|
||||
net = ie.read_network(model=path_to_model)
|
||||
|
||||
ie.set_config( config={"MULTI_DEVICE_PRIORITIES":"HDDL,GPU"}, device_name="MULTI")
|
||||
|
||||
# Change priorities
|
||||
ie.set_config( config={"MULTI_DEVICE_PRIORITIES":"GPU,HDDL"}, device_name="MULTI")
|
||||
ie.set_config( config={"MULTI_DEVICE_PRIORITIES":"GPU"}, device_name="MULTI")
|
||||
ie.set_config( config={"MULTI_DEVICE_PRIORITIES":"HDDL,GPU"}, device_name="MULTI")
|
||||
ie.set_config( config={"MULTI_DEVICE_PRIORITIES":"CPU,HDDL,GPU"}, device_name="MULTI")
|
||||
```
|
||||
|
||||
* Option 3 - Use Explicit Hints for Controlling Request Numbers Executed by Devices
|
||||
There is a way to specify the number of requests that Multi-Device will internally keep for each device. If the original app was running 4 cameras with 4 inference requests, it might be best to share these 4 requests between 2 devices used in the MULTI. The easiest way is to specify a number of requests for each device using parentheses: "MULTI:CPU(2),GPU(2)" and use the same 4 requests in the app, as shown in the sketch below. However, such an explicit configuration is not performance-portable and not recommended. The better way is to configure the individual devices and query the resulting number of requests to be used at the application level. See [Configuring the Individual Devices and Creating the Multi-Device On Top](#configuring-the-individual-devices-and-creating-the-multi-device-on-top).
|
||||
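For illustration only (this explicit form is not performance-portable, as noted above), here is a minimal sketch of the parenthesized request counts; `path_to_model` is a placeholder.

```python
from openvino.inference_engine import IECore

ie = IECore()
net = ie.read_network(model=path_to_model)   # path_to_model is a placeholder
# Explicit hint: keep 2 requests for CPU and 2 for GPU inside MULTI.
exec_net = ie.load_network(network=net, device_name="MULTI:CPU(2),GPU(2)")
```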
|
||||
|
||||
### Enumerating Available Devices
|
||||
The Inference Engine features a dedicated API to enumerate devices and their capabilities. See the [Hello Query Device Python Sample](../../../inference-engine/ie_bridges/python/sample/hello_query_device/README.md). This is example output from the sample (truncated to device names only):
|
||||
|
||||
```sh
|
||||
./hello_query_device
|
||||
Available devices:
|
||||
Device: CPU
|
||||
...
|
||||
Device: GPU.0
|
||||
...
|
||||
Device: GPU.1
|
||||
...
|
||||
Device: HDDL
|
||||
```
|
||||
|
||||
A simple programmatic way to enumerate the devices and use them with the Multi-Device plugin is as follows:
|
||||
|
||||
```python
|
||||
|
||||
from openvino.inference_engine import IECore
|
||||
|
||||
all_devices = "MULTI:"
|
||||
ie = IECore()
|
||||
net = ie.read_network(model=path_to_model)
|
||||
all_devices += ",".join(ie.available_devices)
|
||||
exec_net = ie.load_network(network=net, device_name=all_devices)
|
||||
```
|
||||
|
||||
Beyond the trivial "CPU", "GPU", "HDDL" and so on, when multiple instances of a device are available the names are more qualified. For example, this is how two Intel® Movidius™ Myriad™ X sticks are listed with the hello_query_sample:
|
||||
|
||||
```bash
|
||||
...
|
||||
Device: MYRIAD.1.2-ma2480
|
||||
...
|
||||
Device: MYRIAD.1.4-ma2480
|
||||
```
|
||||
|
||||
So the explicit configuration to use both would be "MULTI:MYRIAD.1.2-ma2480,MYRIAD.1.4-ma2480". Accordingly, the code that loops over all available devices of "MYRIAD" type only is below:
|
||||
|
||||
```python
|
||||
from openvino.inference_engine import IECore
|
||||
|
||||
ie = IECore()
|
||||
match_list = []
|
||||
all_devices = "MULTI:"
|
||||
dev_match_str = "MYRIAD"
|
||||
net = ie.read_network(model=path_to_model)
|
||||
|
||||
for d in ie.available_devices:
|
||||
if dev_match_str in d:
|
||||
match_list.append(d)
|
||||
|
||||
all_devices += ",".join(match_list)
|
||||
exec_net = ie.load_network(network=net, device_name=all_devices)
|
||||
```
|
||||
|
||||
### Configuring the Individual Devices and Creating the Multi-Device On Top
|
||||
|
||||
It is possible to configure each individual device as usual and then create the "MULTI" device on top:
|
||||
|
||||
```python
|
||||
from openvino.inference_engine import IECore
|
||||
|
||||
ie = IECore()
|
||||
net = ie.read_network(model=path_to_model)
|
||||
|
||||
cpu_config = {}
|
||||
gpu_config = {}
|
||||
|
||||
ie.set_config(config=cpu_config, device_name="CPU")
|
||||
ie.set_config(config=gpu_config, device_name="GPU")
|
||||
|
||||
# Load the network to the multi-device, specifying the priorities
|
||||
exec_net = ie.load_network(
|
||||
network=net, device_name="MULTI", config={"MULTI_DEVICE_PRIORITIES": "CPU,GPU"}
|
||||
)
|
||||
# Query the optimal number of requests
|
||||
nireq = exec_net.get_metric("OPTIMAL_NUMBER_OF_INFER_REQUESTS")
|
||||
```
|
||||
|
||||
An alternative is to combine all the individual device settings into a single config file and load that, allowing the Multi-Device plugin to parse and apply settings to the right devices. See the code example in the next section.
|
||||
|
||||
Note that while the performance of accelerators works well with Multi-Device, the CPU+GPU execution poses some performance caveats, as these devices share power, bandwidth and other resources. For example it is recommended to enable the GPU throttling hint (which saves another CPU thread for CPU inferencing). See the section below titled Using the Multi-Device with OpenVINO Samples and Benchmarking the Performance.
|
||||
|
||||
|
||||
### Using the Multi-Device with OpenVINO Samples and Benchmarking the Performance
|
||||
|
||||
Every OpenVINO sample that supports the `-d` (which stands for "device") command-line option transparently accepts Multi-Device. The [Benchmark application](../../../inference-engine/tools/benchmark_tool/README.md) is the best reference for the optimal usage of Multi-Device. As discussed earlier, you do not need to set up the number of requests, CPU streams or threads because the application provides optimal performance out of the box. Below is an example command to evaluate CPU+GPU performance with the Benchmark application:
|
||||
|
||||
```sh
|
||||
./benchmark_app.py –d MULTI:CPU,GPU –m <model>
|
||||
```
|
||||
|
||||
> **NOTE**: If you installed OpenVINO with pip, use `benchmark_app -d MULTI:CPU,GPU -m <model>`
|
||||
|
||||
The Multi-Device plugin supports FP16 IR files. The CPU plugin automatically upconverts it to FP32 and the other devices support it natively. Note that no demos are (yet) fully optimized for Multi-Device, by means of supporting the OPTIMAL_NUMBER_OF_INFER_REQUESTS metric, using the GPU streams/throttling, and so on.
|
||||
|
||||
### Video: MULTI Plugin
|
||||
> **NOTE**: This video is currently available only for C++, but many of the same concepts apply to Python.
|
||||
|
||||
@sphinxdirective
|
||||
.. raw:: html
|
||||
|
||||
<iframe width="560" height="315" src="https://www.youtube.com/embed/xbORYFEmrqU" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
|
||||
|
||||
@endsphinxdirective
|
||||
|
||||
### See Also
|
||||
[Supported Devices](Supported_Devices.md)
|
||||
@@ -4,63 +4,14 @@
|
||||
|
||||
The Inference Engine MYRIAD plugin has been developed for inference of neural networks on Intel® Neural Compute Stick 2.
|
||||
|
||||
## Installation on Linux* OS
|
||||
## Configuring the MYRIAD Plugin
|
||||
|
||||
For installation instructions, refer to the [Installation Guide for Linux*](../../install_guides/installing-openvino-linux.md).
|
||||
|
||||
|
||||
## Installation on Windows* OS
|
||||
|
||||
For installation instructions, refer to the [Installation Guide for Windows*](../../install_guides/installing-openvino-windows.md).
|
||||
|
||||
## Supported networks
|
||||
|
||||
The Inference Engine MYRIAD plugin supports the following networks:
|
||||
|
||||
**Caffe\***:
|
||||
* AlexNet
|
||||
* CaffeNet
|
||||
* GoogleNet (Inception) v1, v2, v4
|
||||
* VGG family (VGG16, VGG19)
|
||||
* SqueezeNet v1.0, v1.1
|
||||
* ResNet v1 family (18\*\*\*, 50, 101, 152)
|
||||
* MobileNet (mobilenet-v1-1.0-224, mobilenet-v2)
|
||||
* Inception ResNet v2
|
||||
* DenseNet family (121,161,169,201)
|
||||
* SSD-300, SSD-512, SSD-MobileNet, SSD-GoogleNet, SSD-SqueezeNet
|
||||
|
||||
**TensorFlow\***:
|
||||
* AlexNet
|
||||
* Inception v1, v2, v3, v4
|
||||
* Inception ResNet v2
|
||||
* MobileNet v1, v2
|
||||
* ResNet v1 family (50, 101, 152)
|
||||
* ResNet v2 family (50, 101, 152)
|
||||
* SqueezeNet v1.0, v1.1
|
||||
* VGG family (VGG16, VGG19)
|
||||
* Yolo family (yolo-v2, yolo-v3, tiny-yolo-v1, tiny-yolo-v2, tiny-yolo-v3)
|
||||
* faster_rcnn_inception_v2, faster_rcnn_resnet101
|
||||
* ssd_mobilenet_v1
|
||||
* DeepLab-v3+
|
||||
|
||||
**MXNet\***:
|
||||
* AlexNet and CaffeNet
|
||||
* DenseNet family (121,161,169,201)
|
||||
* SqueezeNet v1.1
|
||||
* MobileNet v1, v2
|
||||
* NiN
|
||||
* ResNet v1 (101, 152)
|
||||
* ResNet v2 (101)
|
||||
* SqueezeNet v1.1
|
||||
* VGG family (VGG16, VGG19)
|
||||
* SSD-Inception-v3, SSD-MobileNet, SSD-ResNet-50, SSD-300
|
||||
|
||||
\*\*\* Network is tested on Intel® Neural Compute Stick 2 with BatchNormalization fusion optimization disabled during Model Optimizer import
|
||||
To configure your Intel® Vision Accelerator Design With Intel® Movidius™ on supported operating systems, refer to the Steps for Intel® Vision Accelerator Design with Intel® Movidius™ VPUs section in the installation guides for [Linux](../../install_guides/installing-openvino-linux.md) or [Windows](../../install_guides/installing-openvino-windows.md).
|
||||
|
||||
## Supported Configuration Parameters
|
||||
|
||||
See VPU common configuration parameters for the [VPU Plugins](VPU.md).
|
||||
When specifying key values as raw strings (that is, when using Python API), omit the `KEY_` prefix.
|
||||
When specifying key values as raw strings (that is, when using the Python API), omit the `KEY_` prefix.
|
||||
|
||||
In addition to common parameters, the MYRIAD plugin accepts the following options:
|
||||
|
||||
@@ -75,9 +26,9 @@ In addition to common parameters, the MYRIAD plugin accepts the following option
|
||||
## Device allocation <a name="MYRIAD_DEVICE_ALLOC"> </a>
|
||||
|
||||
Each `IExecutableNetwork` instance tries to allocate a new device on `InferenceEngine::Core::LoadNetwork`, but if all available devices are already allocated, it will use the one with the minimal number of uploaded networks.
|
||||
The maximum number of networks single device can handle depends on device memory capacity and the size of the networks.
|
||||
The maximum number of networks a single device can handle depends on device memory capacity and the size of the networks.
|
||||
|
||||
If `KEY_VPU_MYRIAD_FORCE_RESET` option is set to `YES` the plugin will reset all VPU devices in the system.
|
||||
If the `KEY_VPU_MYRIAD_FORCE_RESET` option is set to `YES`, the plugin will reset all VPU devices in the system.
|
||||
|
||||
Single device cannot be shared across multiple processes.
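
The force-reset behavior fits into the usual `LoadNetwork` flow. Below is a minimal C++ sketch (not taken from this guide); the model path is a placeholder, and the raw-string key name is an assumption based on the `KEY_` naming convention described above.

```cpp
#include <inference_engine.hpp>
#include <map>
#include <string>

int main() {
    InferenceEngine::Core core;
    auto network = core.ReadNetwork("model.xml");  // placeholder model path

    // Raw-string form of KEY_VPU_MYRIAD_FORCE_RESET (KEY_ prefix omitted) - assumed key name.
    std::map<std::string, std::string> config = {{"VPU_MYRIAD_FORCE_RESET", "YES"}};

    // Each LoadNetwork call allocates a free device; with the option above,
    // all VPU devices in the system are reset before allocation.
    auto executableNetwork = core.LoadNetwork(network, "MYRIAD", config);
    return 0;
}
```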

@@ -1,7 +1,7 @@
Supported Devices {#openvino_docs_IE_DG_supported_plugins_Supported_Devices}
==================

The Inference Engine can infer models in different formats with various input and output formats. This section provides supported and optimal configurations per device.
The Inference Engine can infer models in different formats with various input and output formats. This section provides supported and optimal configurations per device. In OpenVINO™ documentation, "device" refers to an Intel® processor used for inference, which can be a supported CPU, GPU, VPU (vision processing unit), or GNA (Gaussian neural accelerator coprocessor), or a combination of those devices.

> **NOTE**: With the OpenVINO™ 2020.4 release, Intel® Movidius™ Neural Compute Stick is no longer supported.

@@ -13,7 +13,8 @@ The Inference Engine provides unique capabilities to infer deep learning models
|[CPU plugin](CPU.md) |Intel® Xeon® with Intel® Advanced Vector Extensions 2 (Intel® AVX2), Intel® Advanced Vector Extensions 512 (Intel® AVX-512), and AVX512_BF16, Intel® Core™ Processors with Intel® AVX2, Intel® Atom® Processors with Intel® Streaming SIMD Extensions (Intel® SSE) |
|[VPU plugins](VPU.md) (available in the Intel® Distribution of OpenVINO™ toolkit) |Intel® Neural Compute Stick 2 powered by the Intel® Movidius™ Myriad™ X, Intel® Vision Accelerator Design with Intel® Movidius™ VPUs |
|[GNA plugin](GNA.md) (available in the Intel® Distribution of OpenVINO™ toolkit) |Intel® Speech Enabling Developer Kit, Amazon Alexa* Premium Far-Field Developer Kit, Intel® Pentium® Silver J5005 Processor, Intel® Pentium® Silver N5000 Processor, Intel® Celeron® J4005 Processor, Intel® Celeron® J4105 Processor, Intel® Celeron® Processor N4100, Intel® Celeron® Processor N4000, Intel® Core™ i3-8121U Processor, Intel® Core™ i7-1065G7 Processor, Intel® Core™ i7-1060G7 Processor, Intel® Core™ i5-1035G4 Processor, Intel® Core™ i5-1035G7 Processor, Intel® Core™ i5-1035G1 Processor, Intel® Core™ i5-1030G7 Processor, Intel® Core™ i5-1030G4 Processor, Intel® Core™ i3-1005G1 Processor, Intel® Core™ i3-1000G1 Processor, Intel® Core™ i3-1000G4 Processor|
|[Multi-Device plugin](MULTI.md) |Multi-Device plugin enables simultaneous inference of the same network on several Intel® devices in parallel |
|[Multi-Device plugin](MULTI.md) |Multi-Device plugin enables simultaneous inference of the same network on several Intel® devices in parallel |
|[Auto-Device plugin](AUTO.md) |Auto-Device plugin enables selecting Intel® device for inference automatically |
|[Heterogeneous plugin](HETERO.md) |Heterogeneous plugin enables automatic inference splitting between several Intel® devices (for example if a device doesn't [support certain layers](#supported-layers)). |
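
For reference, the plugin names in the table above map onto the device strings accepted by `InferenceEngine::Core::LoadNetwork`. The sketch below is illustrative only (the model path is a placeholder) and is not part of the original page.

```cpp
InferenceEngine::Core core;
auto network = core.ReadNetwork("model.xml");  // placeholder path

// Single devices are addressed by plugin name.
auto cpuExec = core.LoadNetwork(network, "CPU");
auto vpuExec = core.LoadNetwork(network, "MYRIAD");

// Multi-Device and Heterogeneous plugins take a prioritized device list.
auto multiExec  = core.LoadNetwork(network, "MULTI:MYRIAD,CPU");
auto heteroExec = core.LoadNetwork(network, "HETERO:MYRIAD,CPU");
```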

Devices similar to the ones we have used for benchmarking can be accessed using [Intel® DevCloud for the Edge](https://devcloud.intel.com/edge/), a remote development environment with access to Intel® hardware and the latest versions of the Intel® Distribution of the OpenVINO™ Toolkit. [Learn more](https://devcloud.intel.com/edge/get_started/devcloud/) or [Register here](https://inteliot.force.com/DevcloudForEdge/s/).
@@ -27,7 +28,6 @@ This page shows supported and optimal configurations for each plugin.

| Acronym/Term | Description |
| :-----------------| :---------------------------------------------|
| DL | Deep Learning |
| FP32 format | Single-precision floating-point format |
| BF16 format | Brain floating-point format |
| FP16 format | Half-precision floating-point format |
@@ -36,7 +36,7 @@ This page shows supported and optimal configurations for each plugin.
| U16 format | 2-byte unsigned integer format |
| U8 format | 1-byte unsigned integer format |

NHWC, NCHW, and NCDHW refer to the representation of batches of images.
NHWC, NCHW, and NCDHW refer to the data ordering in batches of images:
* NHWC and NCHW refer to image data layout.
* NCDHW refers to image sequence data layout.

@@ -1,20 +1,64 @@
# VPU Plugins {#openvino_docs_IE_DG_supported_plugins_VPU}

@sphinxdirective

.. toctree::
   :maxdepth: 1
   :hidden:

   openvino_docs_IE_DG_supported_plugins_MYRIAD
   openvino_docs_IE_DG_supported_plugins_HDDL

@endsphinxdirective

This chapter provides information on the Inference Engine plugins that enable inference of deep learning models on the supported VPU devices:

* Intel® Neural Compute Stick 2 powered by the Intel® Movidius™ Myriad™ X — Supported by the [MYRIAD Plugin](MYRIAD.md)
* Intel® Vision Accelerator Design with Intel® Movidius™ VPUs — Supported by the [HDDL Plugin](HDDL.md)

> **NOTE**: With OpenVINO™ 2020.4 release, Intel® Movidius™ Neural Compute Stick powered by the Intel® Movidius™ Myriad™ 2 is no longer supported.
> **NOTE**: With the OpenVINO™ 2020.4 release, Intel® Movidius™ Neural Compute Stick powered by the Intel® Movidius™ Myriad™ 2 is no longer supported.

## Known Layers Limitations
## Supported Networks

* `ScaleShift` layer is supported for zero value of `broadcast` attribute only.
* `CTCGreedyDecoder` layer works with `ctc_merge_repeated` attribute equal 1.
* `DetectionOutput` layer works with zero values of `interpolate_orientation` and `num_orient_classes` parameters only.
* `MVN` layer uses fixed value for `eps` parameters (1e-9).
* `Normalize` layer uses fixed value for `eps` parameters (1e-9) and is supported for zero value of `across_spatial` only.
* `Pad` layer works only with 4D tensors.
**Caffe\***:
* AlexNet
* CaffeNet
* GoogleNet (Inception) v1, v2, v4
* VGG family (VGG16, VGG19)
* SqueezeNet v1.0, v1.1
* ResNet v1 family (18\*\*\*, 50, 101, 152)
* MobileNet (mobilenet-v1-1.0-224, mobilenet-v2)
* Inception ResNet v2
* DenseNet family (121,161,169,201)
* SSD-300, SSD-512, SSD-MobileNet, SSD-GoogleNet, SSD-SqueezeNet

**TensorFlow\***:
* AlexNet
* Inception v1, v2, v3, v4
* Inception ResNet v2
* MobileNet v1, v2
* ResNet v1 family (50, 101, 152)
* ResNet v2 family (50, 101, 152)
* SqueezeNet v1.0, v1.1
* VGG family (VGG16, VGG19)
* Yolo family (yolo-v2, yolo-v3, tiny-yolo-v1, tiny-yolo-v2, tiny-yolo-v3)
* faster_rcnn_inception_v2, faster_rcnn_resnet101
* ssd_mobilenet_v1
* DeepLab-v3+

**MXNet\***:
* AlexNet and CaffeNet
* DenseNet family (121,161,169,201)
* SqueezeNet v1.1
* MobileNet v1, v2
* NiN
* ResNet v1 (101, 152)
* ResNet v2 (101)
* SqueezeNet v1.1
* VGG family (VGG16, VGG19)
* SSD-Inception-v3, SSD-MobileNet, SSD-ResNet-50, SSD-300

\*\*\* Network is tested on Intel® Neural Compute Stick 2 with BatchNormalization fusion optimization disabled during Model Optimizer import

## Optimizations

@@ -24,7 +68,7 @@ VPU plugins support layer fusion and decomposition.

#### Fusing Rules

Certain layers can be merged into Convolution, ReLU, and Eltwise layers according to the patterns below:
Certain layers can be merged into convolution, ReLU, and Eltwise layers according to the patterns below:

- Convolution
  - Convolution + ReLU → Convolution
@@ -46,6 +90,7 @@ Certain layers can be merged into Convolution, ReLU, and Eltwise layers accordin
> **NOTE**: Application of these rules depends on tensor sizes and resources available.

Layers can be joined only when the two conditions below are met:

- Layers are located on topologically independent branches.
- Layers can be executed simultaneously on the same hardware units.

@@ -63,10 +108,9 @@ Layers can be joined only when the two conditions below are met:

> **NOTE**: VPU plugins can add extra layers like Copy.

## VPU Common Configuration Parameters

The VPU plugins supports the configuration parameters listed below.
VPU plugins support the configuration parameters listed below.
The parameters are passed as `std::map<std::string, std::string>` on `InferenceEngine::Core::LoadNetwork`
or `InferenceEngine::Core::SetConfig`.
When specifying key values as raw strings (that is, when using Python API), omit the `KEY_` prefix.
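
As a rough illustration of how such a map is passed (the key names shown are assumptions based on the raw-string convention above, not an exhaustive or verified list):

```cpp
InferenceEngine::Core core;

// Apply a configuration globally for a VPU device...
core.SetConfig({{"LOG_LEVEL", "LOG_INFO"}}, "MYRIAD");

// ...or pass it per network on LoadNetwork.
auto network = core.ReadNetwork("model.xml");  // placeholder path
auto exec = core.LoadNetwork(network, "MYRIAD",
                             {{"VPU_HW_STAGES_OPTIMIZATION", "YES"}});
```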

@@ -83,20 +127,28 @@ When specifying key values as raw strings (that is, when using Python API), omit
## Data Transfer Pipelining <a name="VPU_DATA_TRANSFER_PIPELINING"> </a>

The MYRIAD plugin tries to pipeline data transfer to/from the device with computations.
While one infer request is executed the data for next infer request can be uploaded to device in parallel.
Same applicable for result downloading.
While one infer request is executed, the data for the next infer request can be uploaded to the device in parallel.
The same applies to result downloading.

The `KEY_VPU_PRINT_RECEIVE_TENSOR_TIME` configuration parameter can be used to check the efficiency of the current pipelining.
The new record in performance counters will show the time that the device spent waiting for input before starting the inference.
In perfect pipeline this time should be near to zero, which means that the data was already transferred when new inference started.
In a perfect pipeline, this time should be near zero, which means that the data was already transferred when the new inference started.
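
A minimal sketch of how an application can exploit this pipelining with two asynchronous requests; the raw-string key name is an assumption derived from the `KEY_` convention, and `core`/`network` are assumed to be set up as in the earlier examples.

```cpp
// Enable the receive-tensor timing record in performance counters (assumed raw-string key).
auto exec = core.LoadNetwork(network, "MYRIAD",
                             {{"VPU_PRINT_RECEIVE_TENSOR_TIME", "YES"}});

// Two requests are enough to overlap data transfer with computation:
// while request0 runs, inputs for request1 can be uploaded to the device.
auto request0 = exec.CreateInferRequest();
auto request1 = exec.CreateInferRequest();

request0.StartAsync();
request1.StartAsync();
request0.Wait(InferenceEngine::IInferRequest::WaitMode::RESULT_READY);
request1.Wait(InferenceEngine::IInferRequest::WaitMode::RESULT_READY);
```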

## Troubleshooting

**Get the following message when running inference with the VPU plugin: "[VPU] Cannot convert layer <layer_name> due to unsupported layer type <layer_type>"**

This means that your topology has a layer that is unsupported by your target VPU plugin. To resolve this issue, you can implement the custom layer for the target device using the [Inference Engine Extensibility mechanism](../Extensibility_DG/Intro.md). Or, to quickly get a working prototype, you can use the heterogeneous scenario with the default fallback policy (see the [HETERO Plugin](HETERO.md) section). Use the HETERO plugin with a fallback device that supports this layer, for example, CPU: `HETERO:MYRIAD,CPU`.
For a list of VPU supported layers, see the Supported Layers section of the [Supported Devices](Supported_Devices.md) topic.
For a list of VPU-supported layers, see the Supported Layers section of the [Supported Devices](Supported_Devices.md) page.
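
For example, the fallback described above boils down to a one-line change when loading the network (a sketch with a placeholder model path):

```cpp
InferenceEngine::Core core;
auto network = core.ReadNetwork("model.xml");  // placeholder path

// Layers the MYRIAD plugin cannot convert fall back to the CPU plugin.
auto exec = core.LoadNetwork(network, "HETERO:MYRIAD,CPU");
```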

## Known Layers Limitations

* `ScaleShift` layer is supported for zero value of `broadcast` attribute only.
* `CTCGreedyDecoder` layer works with the `ctc_merge_repeated` attribute equal to 1.
* `DetectionOutput` layer works with zero values of `interpolate_orientation` and `num_orient_classes` parameters only.
* `MVN` layer uses fixed value for `eps` parameters (1e-9).
* `Normalize` layer uses fixed value for `eps` parameters (1e-9) and is supported for zero value of `across_spatial` only.
* `Pad` layer works only with 4D tensors.

## See Also

@@ -1,4 +1,4 @@
# Asynchronous Inference Request {#async_infer_request}
# Asynchronous Inference Request {#openvino_docs_ie_plugin_dg_async_infer_request}

Asynchronous Inference Request runs an inference pipeline asynchronously in one or several task executors depending on a device pipeline structure.
Inference Engine Plugin API provides the base InferenceEngine::AsyncInferRequestThreadSafeDefault class:
@@ -16,7 +16,7 @@ Inference Engine Plugin API provides the base InferenceEngine::AsyncInferRequest

#### Class Fields

- `_inferRequest` - a reference to the [synchronous inference request](@ref infer_request) implementation. Its methods are reused in the `AsyncInferRequest` constructor to define a device pipeline.
- `_inferRequest` - a reference to the [synchronous inference request](@ref openvino_docs_ie_plugin_dg_infer_request) implementation. Its methods are reused in the `AsyncInferRequest` constructor to define a device pipeline.
- `_waitExecutor` - a task executor that waits for a response from a device about device tasks completion

> **NOTE**: If a plugin can work with several instances of a device, `_waitExecutor` must be device-specific. Otherwise, having a single task executor for several devices does not allow them to work in parallel.
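
To make the role of the two fields concrete, here is a schematic constructor showing how a device pipeline could be split across the executors. This is a hedged sketch, not the actual `Template` plugin source; the stage method names (`preprocessAndStart`, `waitAndPostprocess`) are hypothetical.

```cpp
// Schematic sketch: stage method names are hypothetical.
AsyncInferRequest::AsyncInferRequest(const InferRequest::Ptr&  inferRequest,
                                     const ITaskExecutor::Ptr& taskExecutor,
                                     const ITaskExecutor::Ptr& waitExecutor,
                                     const ITaskExecutor::Ptr& callbackExecutor)
    : AsyncInferRequestThreadSafeDefault(inferRequest, taskExecutor, callbackExecutor),
      _inferRequest(inferRequest),
      _waitExecutor(waitExecutor) {
    // Reuse the synchronous request's stage methods to build a two-stage pipeline:
    // stage 1 submits work on the default task executor, stage 2 waits for the
    // device on the dedicated _waitExecutor.
    _pipeline = {
        {taskExecutor,  [this] { _inferRequest->preprocessAndStart(); }},
        {_waitExecutor, [this] { _inferRequest->waitAndPostprocess(); }}
    };
}
```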

@@ -1,4 +1,4 @@
# Build Plugin Using CMake* {#plugin_build}
# Build Plugin Using CMake* {#openvino_docs_ie_plugin_dg_plugin_build}

Inference Engine build infrastructure provides the Inference Engine Developer Package for plugin development.

@@ -35,7 +35,7 @@ Once the commands above are executed, the Inference Engine Developer Package is
* `IE::ngraphFunctions` - static library with the set of `ngraph::Function` builders
* `IE::funcSharedTests` - static library with common functional tests

> **Note:** it's enough just to run `cmake --build . --target ie_dev_targets` command to build only targets from the
> **NOTE**: it's enough just to run `cmake --build . --target ie_dev_targets` command to build only targets from the
> Inference Engine Developer package.

Build Plugin using Inference Engine Developer Package

@@ -1,4 +1,4 @@
# Executable Network {#executable_network}
# Executable Network {#openvino_docs_ie_plugin_dg_executable_network}

`ExecutableNetwork` class functionality:
- Compile an InferenceEngine::ICNNNetwork instance to a backend specific graph representation
@@ -63,8 +63,8 @@ The implementation of the method should write all data to the `model` stream, wh

The method creates an asynchronous inference request and returns it. While the public Inference Engine API has a single interface for inference request, which can be executed in synchronous and asynchronous modes, a plugin library implementation has two separate classes:

- [Synchronous inference request](@ref infer_request), which defines pipeline stages and runs them synchronously in the `Infer` method.
- [Asynchronous inference request](@ref async_infer_request), which is a wrapper for a synchronous inference request and can run a pipeline asynchronously. Depending on a device pipeline structure, it can have one or several stages:
- [Synchronous inference request](@ref openvino_docs_ie_plugin_dg_infer_request), which defines pipeline stages and runs them synchronously in the `Infer` method.
- [Asynchronous inference request](@ref openvino_docs_ie_plugin_dg_async_infer_request), which is a wrapper for a synchronous inference request and can run a pipeline asynchronously. Depending on a device pipeline structure, it can have one or several stages:
  - For single-stage pipelines, there is no need to define this method and create a class derived from InferenceEngine::AsyncInferRequestThreadSafeDefault. For single stage pipelines, a default implementation of this method creates InferenceEngine::AsyncInferRequestThreadSafeDefault wrapping a synchronous inference request and runs it asynchronously in the `_taskExecutor` executor.
  - For pipelines with multiple stages, such as performing some preprocessing on host, uploading input data to a device, running inference on a device, or downloading and postprocessing output data, schedule stages on several task executors to achieve better device use and performance. You can do it by creating a sufficient number of inference requests running in parallel. In this case, device stages of different inference requests are overlapped with preprocessing and postprocessing stages, giving better performance.

> **IMPORTANT**: It is up to you to decide how many task executors you need to optimally execute a device pipeline.
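
As a rough sketch of the wrapping step (the exact return type and executor members depend on the Inference Engine version, so treat this as illustrative pseudocode rather than the Template plugin's actual `CreateInferRequest`):

```cpp
// Illustrative only: signatures and member names are assumptions.
InferenceEngine::IInferRequestInternal::Ptr ExecutableNetwork::CreateInferRequest() {
    // Create the synchronous request first...
    auto syncRequest = CreateInferRequestImpl(_networkInputs, _networkOutputs);
    // ...then wrap it so its stages can be scheduled on the task executors.
    return std::make_shared<AsyncInferRequest>(std::static_pointer_cast<InferRequest>(syncRequest),
                                               _taskExecutor, _waitExecutor, _callbackExecutor);
}
```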

@@ -73,7 +73,7 @@ The method creates an asynchronous inference request and returns it. While the p

### `CreateInferRequestImpl()`

This is a helper method used by `CreateInferRequest` to create a [synchronous inference request](@ref infer_request), which is later wrapped with the asynchronous inference request class:
This is a helper method used by `CreateInferRequest` to create a [synchronous inference request](@ref openvino_docs_ie_plugin_dg_infer_request), which is later wrapped with the asynchronous inference request class:

@snippet src/template_executable_network.cpp executable_network:create_infer_request_impl

@@ -97,4 +97,4 @@ Returns a current value for a configuration key with the name `name`. The method

This function is the only way to get configuration values when a network is imported and compiled by other developers and tools (for example, the [Compile tool](../_inference_engine_tools_compile_tool_README.html)).
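
From the application side, this looks roughly as follows (a sketch: the blob path and the configuration key are hypothetical):

```cpp
InferenceEngine::Core core;

// Import a graph previously compiled, e.g., by the Compile tool.
auto imported = core.ImportNetwork("model.blob", "MYRIAD");  // placeholder blob path

// GetConfig is the only way to inspect the configuration of an imported network.
auto value = imported.GetConfig("SOME_DEVICE_SPECIFIC_KEY").as<std::string>();  // hypothetical key
```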

The next step in plugin library implementation is the [Synchronous Inference Request](@ref infer_request) class.
The next step in plugin library implementation is the [Synchronous Inference Request](@ref openvino_docs_ie_plugin_dg_infer_request) class.

@@ -1,8 +1,8 @@
# Synchronous Inference Request {#infer_request}
# Synchronous Inference Request {#openvino_docs_ie_plugin_dg_infer_request}

`InferRequest` class functionality:
- Allocate input and output blobs needed for a backend-dependent network inference.
- Define functions for inference process stages (for example, `preprocess`, `upload`, `infer`, `download`, `postprocess`). These functions can later be used to define an execution pipeline during [Asynchronous Inference Request](@ref async_infer_request) implementation.
- Define functions for inference process stages (for example, `preprocess`, `upload`, `infer`, `download`, `postprocess`). These functions can later be used to define an execution pipeline during [Asynchronous Inference Request](@ref openvino_docs_ie_plugin_dg_async_infer_request) implementation.
- Call inference stages one by one synchronously.
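
A schematic of the last point, using the hypothetical stage functions named above (not the actual `Template` plugin implementation):

```cpp
// Schematic only: stage methods correspond to the example names listed above.
void InferRequest::InferImpl() {
    preprocess();   // convert input blobs to the layout/precision the backend expects
    upload();       // copy inputs to the device
    infer();        // execute the backend-specific graph
    download();     // copy results back to host blobs
    postprocess();  // convert outputs to the user-visible precision/layout
}
```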

`InferRequest` Class
@@ -80,4 +80,4 @@ The method sets performance counters which were measured during pipeline stages

@snippet src/template_infer_request.cpp infer_request:get_performance_counts

The next step in the plugin library implementation is the [Asynchronous Inference Request](@ref async_infer_request) class.
The next step in the plugin library implementation is the [Asynchronous Inference Request](@ref openvino_docs_ie_plugin_dg_async_infer_request) class.

@@ -1,4 +1,18 @@
@mainpage Overview of Inference Engine Plugin Library
# Overview of Inference Engine Plugin Library {#openvino_docs_ie_plugin_dg_overview}

@sphinxdirective

.. toctree::
   :maxdepth: 1
   :caption: Converting and Preparing Models
   :hidden:

   Implement Plugin Functionality <openvino_docs_ie_plugin_dg_plugin>
   Implement Executable Network Functionality <openvino_docs_ie_plugin_dg_executable_network>
   Implement Synchronous Inference Request <openvino_docs_ie_plugin_dg_infer_request>
   Implement Asynchronous Inference Request <openvino_docs_ie_plugin_dg_async_infer_request>

@endsphinxdirective

The plugin architecture of the Inference Engine allows developers to create and plug in independent inference
solutions dedicated to different devices. Physically, a plugin is represented as a dynamic library
@@ -9,23 +23,23 @@ Inference Engine Plugin Library

Inference Engine plugin dynamic library consists of several main components:

1. [Plugin class](@ref plugin):
1. [Plugin class](@ref openvino_docs_ie_plugin_dg_plugin):
   - Provides information about devices of a specific type.
   - Can create an [executable network](@ref executable_network) instance which represents a Neural
   - Can create an [executable network](@ref openvino_docs_ie_plugin_dg_executable_network) instance which represents a Neural
     Network backend specific graph structure for a particular device, as opposed to the InferenceEngine::ICNNNetwork
     interface which is backend-independent.
   - Can import an already compiled graph structure from an input stream to an
     [executable network](@ref executable_network) object.
2. [Executable Network class](@ref executable_network):
     [executable network](@ref openvino_docs_ie_plugin_dg_executable_network) object.
2. [Executable Network class](@ref openvino_docs_ie_plugin_dg_executable_network):
   - Is an execution configuration compiled for a particular device and takes into account its capabilities.
   - Holds a reference to a particular device and a task executor for this device.
   - Can create several instances of [Inference Request](@ref infer_request).
   - Can create several instances of [Inference Request](@ref openvino_docs_ie_plugin_dg_infer_request).
   - Can export an internal backend specific graph structure to an output stream.
3. [Inference Request class](@ref infer_request):
3. [Inference Request class](@ref openvino_docs_ie_plugin_dg_infer_request):
   - Runs an inference pipeline serially.
   - Can extract performance counters for an inference pipeline execution profiling.
4. [Asynchronous Inference Request class](@ref async_infer_request):
   - Wraps the [Inference Request](@ref infer_request) class and runs pipeline stages in parallel
4. [Asynchronous Inference Request class](@ref openvino_docs_ie_plugin_dg_async_infer_request):
   - Wraps the [Inference Request](@ref openvino_docs_ie_plugin_dg_infer_request) class and runs pipeline stages in parallel
     on several task executors based on a device-specific pipeline structure.

> **NOTE**: This documentation is written based on the `Template` plugin, which demonstrates plugin
@@ -35,13 +49,13 @@ at `<dldt source dir>/docs/template_plugin`.
Detailed guides
-----------------------

* [Build](@ref plugin_build) a plugin library using CMake\*
* Plugin and its components [testing](@ref plugin_testing)
* [Quantized networks](@ref quantized_networks)
* [Writing ngraph transformations](@ref ngraph_transformation) guide
* [Build](@ref openvino_docs_ie_plugin_dg_plugin_build) a plugin library using CMake\*
* Plugin and its components [testing](@ref openvino_docs_ie_plugin_dg_plugin_testing)
* [Quantized networks](@ref openvino_docs_ie_plugin_dg_quantized_networks)
* [Writing nGraph transformations](@ref ngraph_transformation) guide

API References
-----------------------

* [Inference Engine Plugin API](group__ie__dev__api.html)
* [Inference Engine Transformation API](group__ie__transformation__api.html)
* [Inference Engine Plugin API](groupie_dev_api.html)
* [Inference Engine Transformation API](groupie_transformation_api.html)

@@ -1,4 +1,4 @@
# Representation of low-precision models {#lp_representation}
# Representation of low-precision models {#openvino_docs_ie_plugin_dg_lp_representation}
The goal of this document is to describe how optimized models are represented in OpenVINO Intermediate Representation (IR) and provide guidance on interpretation rules for such models at runtime.
Currently, there are two groups of optimization methods that can influence the IR after applying them to the full-precision model:
- **Sparsity**. It is represented by zeros inside the weights, and it is up to the hardware plugin how to interpret these zeros (use weights as is or apply special compression algorithms and sparse arithmetic). No additional mask is provided with the model.

@@ -1,4 +1,4 @@
# Plugin {#plugin}
# Plugin {#openvino_docs_ie_plugin_dg_plugin}

Inference Engine Plugin usually represents a wrapper around a backend. Backends can be:
- OpenCL-like backend (e.g. clDNN library) for GPU devices.
@@ -8,7 +8,7 @@ Inference Engine Plugin usually represents a wrapper around a backend. Backends
The responsibility of Inference Engine Plugin:
- Initializes a backend and throws an exception in the `Engine` constructor if the backend cannot be initialized.
- Provides information about devices enabled by a particular backend, e.g. how many devices, their properties and so on.
- Loads or imports [executable network](@ref executable_network) objects.
- Loads or imports [executable network](@ref openvino_docs_ie_plugin_dg_executable_network) objects.

In addition to the Inference Engine Public API, the Inference Engine provides the Plugin API, which is a set of functions and helper classes that simplify new plugin development:

@@ -16,7 +16,7 @@ In addition to the Inference Engine Public API, the Inference Engine provides th
- implementations in the `inference_engine/src/inference_engine` directory
- symbols in the Inference Engine Core shared library

To build an Inference Engine plugin with the Plugin API, see the [Inference Engine Plugin Building](@ref plugin_build) guide.
To build an Inference Engine plugin with the Plugin API, see the [Inference Engine Plugin Building](@ref openvino_docs_ie_plugin_dg_plugin_build) guide.

Plugin Class
------------------------
@@ -39,7 +39,7 @@ The provided plugin class also has several fields:
As an example, a plugin configuration has three value parameters:

- `deviceId` - particular device ID to work with. Applicable if a plugin supports more than one `Template` device. In this case, some plugin methods, like `SetConfig`, `QueryNetwork`, and `LoadNetwork`, must support the CONFIG_KEY(KEY_DEVICE_ID) parameter.
- `perfCounts` - boolean value to identify whether to collect performance counters during [Inference Request](@ref infer_request) execution.
- `perfCounts` - boolean value to identify whether to collect performance counters during [Inference Request](@ref openvino_docs_ie_plugin_dg_infer_request) execution.
- `_streamsExecutorConfig` - configuration of `InferenceEngine::IStreamsExecutor` to handle settings of multi-threaded context.
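
A schematic of how these three parameters might be parsed from the configuration map; the `Configuration` helper struct is an assumption for illustration, not part of the Plugin API.

```cpp
// Illustrative only: the Configuration struct is an assumed helper, not a Plugin API type.
struct Configuration {
    std::string deviceId = "0";
    bool perfCounts = false;
    InferenceEngine::IStreamsExecutor::Config streamsExecutorConfig;
};

Configuration ParseConfig(const std::map<std::string, std::string>& config) {
    Configuration parsed;
    for (const auto& item : config) {
        if (item.first == CONFIG_KEY(DEVICE_ID)) {
            parsed.deviceId = item.second;
        } else if (item.first == CONFIG_KEY(PERF_COUNT)) {
            parsed.perfCounts = (item.second == CONFIG_VALUE(YES));
        }
        // Threading-related keys would typically be forwarded to streamsExecutorConfig.
    }
    return parsed;
}
```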

### Engine Constructor
@@ -69,7 +69,7 @@ InferenceEngine::ICNNNetwork object is supported by a device. In the example abo

An important step before creating an `ExecutableNetwork` instance is to call the `TransformNetwork` method, which applies nGraph transformation passes.

Actual graph compilation is done in the `ExecutableNetwork` constructor. Refer to the [ExecutableNetwork Implementation Guide](@ref executable_network) for details.
Actual graph compilation is done in the `ExecutableNetwork` constructor. Refer to the [ExecutableNetwork Implementation Guide](@ref openvino_docs_ie_plugin_dg_executable_network) for details.

> **NOTE**: Actual configuration map used in `ExecutableNetwork` is constructed as a base plugin
> configuration set via `Plugin::SetConfig`, where some values are overwritten with `config` passed to `Plugin::LoadExeNetworkImpl`.
@@ -82,7 +82,7 @@ The function accepts a const shared pointer to `ngraph::Function` object and per
1. Deep copies a const object to a local object, which can later be modified.
2. Applies common and plugin-specific transformations on a copied graph to make the graph more friendly to hardware operations. For details on how to write custom plugin-specific transformations, refer to the [Writing ngraph transformations](@ref ngraph_transformation) guide. See detailed topics about network representation:
   * [Intermediate Representation and Operation Sets](../_docs_MO_DG_IR_and_opsets.html)
   * [Quantized networks](@ref quantized_networks).
   * [Quantized networks](@ref openvino_docs_ie_plugin_dg_quantized_networks).

@snippet src/template_plugin.cpp plugin:transform_network

@@ -162,7 +162,7 @@ The snippet below provides an example of the implementation for `GetMetric`:
### `ImportNetwork()`

The importing network mechanism allows importing a previously exported backend specific graph and wrapping it
using an [ExecutableNetwork](@ref executable_network) object. This functionality is useful if
using an [ExecutableNetwork](@ref openvino_docs_ie_plugin_dg_executable_network) object. This functionality is useful if
backend specific graph compilation takes significant time and/or cannot be done on a target host
device due to other reasons.

@@ -187,4 +187,4 @@ Inference Engine plugin library must export only one function creating a plugin

@snippet src/template_plugin.cpp plugin:create_plugin_engine

Next step in a plugin library implementation is the [ExecutableNetwork](@ref executable_network) class.
Next step in a plugin library implementation is the [ExecutableNetwork](@ref openvino_docs_ie_plugin_dg_executable_network) class.