diff --git a/.ci/azure/linux_ngraph_onnx.yml b/.ci/azure/linux_ngraph_onnx.yml index 1e13710f2c2..28326c89053 100644 --- a/.ci/azure/linux_ngraph_onnx.yml +++ b/.ci/azure/linux_ngraph_onnx.yml @@ -1,5 +1,20 @@ jobs: -- job: nGraph_ONNX_Lin +- job: OpenVINO_ONNX_CI + strategy: + matrix: + Release: + BUILD_TYPE: 'Release' + PROTOBUF_LITE: 'OFF' + TOX_COMMAND: 'tox && tox -e zoo_models' + Debug: + BUILD_TYPE: 'Debug' + PROTOBUF_LITE: 'OFF' + TOX_COMMAND: 'tox' + Protobuf_lite: + BUILD_TYPE: 'Release' + PROTOBUF_LITE: 'ON' + TOX_COMMAND: 'tox && tox -e zoo_models' + maxParallel: 3 # About 300% of total time timeoutInMinutes: 90 @@ -12,7 +27,6 @@ jobs: VSTS_HTTP_RETRY: 5 VSTS_HTTP_TIMEOUT: 200 WORKERS_NUMBER: 8 - BUILD_TYPE: Release REPO_DIR: $(Build.Repository.LocalPath) WORK_DIR: $(Pipeline.Workspace)/_w MODELS_DIR: /mount/cinfsshare/onnxtestdata @@ -54,31 +68,16 @@ jobs: submodules: recursive path: openvino - - script: docker build --tag=openvino-onnx-ci-image --file=.ci/openvino-onnx/Dockerfile . - displayName: 'Docker build' + - script: docker build --tag=openvino-onnx-ci-image --file=.ci/openvino-onnx/Dockerfile --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg PROTOBUF_LITE=$(PROTOBUF_LITE) . 
+ displayName: 'Docker build $(BUILD_TYPE) protobuf-lite: $(PROTOBUF_LITE)' - script: ngraph/python/tests/test_onnx/model_zoo_preprocess.sh -d $(TMP_DIR) -o -s "$(ONNX_MODEL_ZOO_SHA)" displayName: 'Get models' - - - script: | - ##wget -O "$(TMP_DIR)/msft.zip" https://onnxruntimetestdata.blob.core.windows.net/models/20191107.zip - ##unzip "$(TMP_DIR)/msft.zip" -d "$(MODELS_DIR)/msft" - #unzip "/mnt/onnxtestdata/models/20191107.zip" -d "$(MODELS_DIR)/msft" - #mv $(MODELS_DIR)/msft/opset9/LSTM_Seq_lens_unpacked/seq_lens_sorted $(MODELS_DIR)/msft/opset9/LSTM_Seq_lens_unpacked/test_data_set_0 - #mv $(MODELS_DIR)/msft/opset9/LSTM_Seq_lens_unpacked/seq_lens_unsorted $(MODELS_DIR)/msft/opset9/LSTM_Seq_lens_unpacked/test_data_set_1 - displayName: 'Get MSFT models' - enabled: false - - - script: | - ls -alR $(MODELS_DIR) - ls -alR $(TMP_DIR) - displayName: 'List models' - enabled: false + condition: ne(variables['BUILD_TYPE'], 'Debug') - script: sudo fallocate -l 48G /swapfile ; sudo mkswap /swapfile ; sudo swapon /swapfile ; df ; free -h displayName: 'Create swap' - script: | - docker run --name openvino-onnx-ci-container --volume $(TMP_DIR)/model_zoo/onnx_model_zoo_$(ONNX_MODEL_ZOO_SHA):/root/.onnx/model_zoo/onnx_model_zoo --volume $(MODELS_DIR)/msft:/root/.onnx/model_zoo/MSFT openvino-onnx-ci-image /bin/bash -c "tox && tox -e zoo_models" - displayName: 'Docker run' - + docker run --name openvino-onnx-ci-container --volume $(TMP_DIR)/model_zoo/onnx_model_zoo_$(ONNX_MODEL_ZOO_SHA):/root/.onnx/model_zoo/onnx_model_zoo --volume $(MODELS_DIR)/msft:/root/.onnx/model_zoo/MSFT openvino-onnx-ci-image /bin/bash -c "$(TOX_COMMAND)" + displayName: 'Docker run $(BUILD_TYPE) protobuf-lite: $(PROTOBUF_LITE)' diff --git a/.gitmodules b/.gitmodules index 5c1d9956c6a..d3f72b54c4f 100644 --- a/.gitmodules +++ b/.gitmodules @@ -37,4 +37,16 @@ [submodule "thirdparty/ocl/clhpp_headers"] path = thirdparty/ocl/clhpp_headers url = https://github.com/KhronosGroup/OpenCL-CLHPP.git - ignore = dirty 
\ No newline at end of file + ignore = dirty +[submodule "thirdparty/onnx"] + path = thirdparty/onnx/onnx + url = https://github.com/openvinotoolkit/onnx.git +[submodule "thirdparty/protobuf"] + path = thirdparty/protobuf/protobuf + url = https://github.com/protocolbuffers/protobuf.git +[submodule "ngraph/python/pybind11"] + path = ngraph/python/pybind11 + url = https://github.com/pybind/pybind11.git +[submodule "thirdparty/ittapi/ittapi"] + path = thirdparty/ittapi/ittapi + url = https://github.com/intel/ittapi.git diff --git a/CMakeLists.txt b/CMakeLists.txt index b796977515b..6019fe63d93 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -63,12 +63,6 @@ function(build_ngraph) ngraph_set(NGRAPH_PDPD_FRONTEND_ENABLE OFF) endif() - if(ENABLE_PYTHON) - ngraph_set(NGRAPH_PYTHON_BUILD_ENABLE ON) - else() - ngraph_set(NGRAPH_PYTHON_BUILD_ENABLE OFF) - endif() - if(CMAKE_CXX_COMPILER_ID MATCHES "^(Apple)?Clang$") ie_add_compiler_flags(-Wno-error=uninitialized -Wno-error=literal-conversion) elseif(UNIX) diff --git a/cmake/dependencies.cmake b/cmake/dependencies.cmake index 498f6171fdd..e84a7cdc718 100644 --- a/cmake/dependencies.cmake +++ b/cmake/dependencies.cmake @@ -32,12 +32,12 @@ if(COMMAND get_linux_name) endif() if(CMAKE_CROSSCOMPILING AND CMAKE_HOST_SYSTEM_NAME MATCHES Linux AND CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") - set(protoc_version "3.7.1") + set(protoc_version "3.9.2") RESOLVE_DEPENDENCY(SYSTEM_PROTOC_ROOT ARCHIVE_LIN "protoc-${protoc_version}-linux-x86_64.tar.gz" TARGET_PATH "${TEMP}/protoc-${protoc_version}-linux-x86_64" - SHA256 "a1bedd5c05ca51e49f8f254faa3d7331e05b3a806c151fb111d582f154d0fee8" + SHA256 "1d6da1d97d0cbfcd333558afe24533eb3cb48dc1e0ab5e971aa1e50ede8bcf45" ) debug_message(STATUS "host protoc-${protoc_version} root path = " ${SYSTEM_PROTOC_ROOT}) diff --git a/cmake/developer_package/IEDevScriptsConfig.cmake b/cmake/developer_package/IEDevScriptsConfig.cmake index 17d96a69920..46423aa61c7 100644 --- 
a/cmake/developer_package/IEDevScriptsConfig.cmake +++ b/cmake/developer_package/IEDevScriptsConfig.cmake @@ -249,6 +249,25 @@ function(ie_mark_target_as_cc TARGET_NAME) set_source_files_properties(${sources} PROPERTIES OBJECT_DEPENDS ${GENERATED_HEADER}) endfunction() +# check python package + +function(ie_check_pip_package name message_type) + find_package(PythonInterp 3 REQUIRED) + + execute_process( + COMMAND ${PYTHON_EXECUTABLE} -m pip show ${name} + RESULT_VARIABLE PIP_EXIT_CODE + OUTPUT_QUIET + ) + + if(NOT PIP_EXIT_CODE EQUAL 0) + set(${name}_FOUND OFF PARENT_SCOPE) + message(${message_type} "${name} package is not installed. Please use \"${PYTHON_EXECUTABLE} -m pip install ${name}\".") + else() + set(${name}_FOUND ON PARENT_SCOPE) + endif() +endfunction() + # Code style utils include(cpplint/cpplint) diff --git a/cmake/developer_package/api_validator/api_validator.cmake b/cmake/developer_package/api_validator/api_validator.cmake index c12b4108f4b..204289dc83c 100644 --- a/cmake/developer_package/api_validator/api_validator.cmake +++ b/cmake/developer_package/api_validator/api_validator.cmake @@ -88,9 +88,12 @@ function(_ie_add_api_validator_post_build_step) macro(api_validator_get_target_name) get_target_property(IS_IMPORTED ${target} IMPORTED) + get_target_property(orig_target ${target} ALIASED_TARGET) if(IS_IMPORTED) get_target_property(target_location ${target} LOCATION) get_filename_component(target_name "${target_location}" NAME_WE) + elseif(TARGET "${orig_target}") + set(target_name ${orig_target}) else() set(target_name ${target}) endif() diff --git a/cmake/developer_package/compile_flags/sanitizer.cmake b/cmake/developer_package/compile_flags/sanitizer.cmake index 3e73ef28076..dbf35196507 100644 --- a/cmake/developer_package/compile_flags/sanitizer.cmake +++ b/cmake/developer_package/compile_flags/sanitizer.cmake @@ -5,13 +5,36 @@ include(CheckCXXCompilerFlag) if (ENABLE_SANITIZER) - set(SANITIZER_COMPILER_FLAGS "-g -fsanitize=address 
-fno-omit-frame-pointer") - CHECK_CXX_COMPILER_FLAG("-fsanitize-recover=address" SANITIZE_RECOVER_SUPPORTED) - if (SANITIZE_RECOVER_SUPPORTED) + set(SANITIZER_COMPILER_FLAGS "${SANITIZER_COMPILER_FLAGS} -fsanitize=address") + CHECK_CXX_COMPILER_FLAG("-fsanitize-recover=address" SANITIZE_RECOVER_ADDRESS_SUPPORTED) + if (SANITIZE_RECOVER_ADDRESS_SUPPORTED) set(SANITIZER_COMPILER_FLAGS "${SANITIZER_COMPILER_FLAGS} -fsanitize-recover=address") endif() - set(SANITIZER_LINKER_FLAGS "-fsanitize=address") + set(SANITIZER_LINKER_FLAGS "${SANITIZER_LINKER_FLAGS} -fsanitize=address") +endif() + +if (ENABLE_UB_SANITIZER) + # TODO: Remove -fno-sanitize=null as thirdparty/ocl/clhpp_headers UBSAN compatibility resolved: + # https://github.com/KhronosGroup/OpenCL-CLHPP/issues/17 + set(SANITIZER_COMPILER_FLAGS "${SANITIZER_COMPILER_FLAGS} -fsanitize=undefined -fno-sanitize=null") + CHECK_CXX_COMPILER_FLAG("-fsanitize-recover=undefined" SANITIZE_RECOVER_UNDEFINED_SUPPORTED) + if (SANITIZE_RECOVER_UNDEFINED_SUPPORTED) + set(SANITIZER_COMPILER_FLAGS "${SANITIZER_COMPILER_FLAGS} -fsanitize-recover=undefined") + endif() + + set(SANITIZER_LINKER_FLAGS "${SANITIZER_LINKER_FLAGS} -fsanitize=undefined") +endif() + +if (ENABLE_THREAD_SANITIZER) + set(SANITIZER_COMPILER_FLAGS "${SANITIZER_COMPILER_FLAGS} -fsanitize=thread") + set(SANITIZER_LINKER_FLAGS "${SANITIZER_LINKER_FLAGS} -fsanitize=thread") +endif() + +# common sanitizer options +if (DEFINED SANITIZER_COMPILER_FLAGS) + # ensure sumbols are present + set(SANITIZER_COMPILER_FLAGS "-g -fno-omit-frame-pointer") # prevent unloading libraries at runtime, so sanitizer can resolve their symbols set(SANITIZER_LINKER_FLAGS "${SANITIZER_LINKER_FLAGS} -Wl,-z,nodelete") @@ -28,23 +51,4 @@ if (ENABLE_SANITIZER) set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${SANITIZER_LINKER_FLAGS}") set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} ${SANITIZER_LINKER_FLAGS}") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} 
${SANITIZER_LINKER_FLAGS}") -endif() - -if (ENABLE_THREAD_SANITIZER) - set(SANITIZER_COMPILER_FLAGS "-g -fsanitize=thread -fno-omit-frame-pointer") - set(SANITIZER_LINKER_FLAGS "-fsanitize=thread") - set(SANITIZER_LINKER_FLAGS "${SANITIZER_LINKER_FLAGS} -Wl,-z,nodelete") - - if(CMAKE_CXX_COMPILER_ID MATCHES "^(Apple)?Clang$" AND NOT WIN32) - if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.0) - set(SANITIZER_LINKER_FLAGS "${SANITIZER_LINKER_FLAGS} -fuse-ld=lld") - else() - set(SANITIZER_LINKER_FLAGS "${SANITIZER_LINKER_FLAGS} -static-libsan") - endif() - endif() - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SANITIZER_COMPILER_FLAGS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SANITIZER_COMPILER_FLAGS}") - set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${SANITIZER_LINKER_FLAGS}") - set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} ${SANITIZER_LINKER_FLAGS}") - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${SANITIZER_LINKER_FLAGS}") -endif() +endif() \ No newline at end of file diff --git a/cmake/developer_package/features.cmake b/cmake/developer_package/features.cmake index 813b58b7dc0..487dea8c7e3 100644 --- a/cmake/developer_package/features.cmake +++ b/cmake/developer_package/features.cmake @@ -20,10 +20,12 @@ endif() # FIXME: ARM cross-compiler generates several "false positive" warnings regarding __builtin_memcpy buffer overflow ie_dependent_option (TREAT_WARNING_AS_ERROR "Treat build warnings as errors" ON "X86 OR X86_64" OFF) -ie_option (ENABLE_INTEGRITYCHECK "build DLLs with /INTEGRITYCHECK flag" OFF) +ie_dependent_option (ENABLE_INTEGRITYCHECK "build DLLs with /INTEGRITYCHECK flag" OFF "CMAKE_CXX_COMPILER_ID STREQUAL MSVC" OFF) ie_option (ENABLE_SANITIZER "enable checking memory errors via AddressSanitizer" OFF) +ie_option (ENABLE_UB_SANITIZER "enable UndefinedBahavior sanitizer" OFF) + ie_option (ENABLE_THREAD_SANITIZER "enable checking data races via ThreadSanitizer" OFF) ie_dependent_option (ENABLE_COVERAGE "enable code 
coverage" OFF "CMAKE_CXX_COMPILER_ID STREQUAL GNU" OFF) diff --git a/cmake/templates/InferenceEngineDeveloperPackageConfig.cmake.in b/cmake/templates/InferenceEngineDeveloperPackageConfig.cmake.in index 319fd765c6a..4aca14b72bd 100644 --- a/cmake/templates/InferenceEngineDeveloperPackageConfig.cmake.in +++ b/cmake/templates/InferenceEngineDeveloperPackageConfig.cmake.in @@ -13,7 +13,8 @@ set_and_check(IE_MAIN_SOURCE_DIR "@IE_MAIN_SOURCE_DIR@") # HDDL # Variables to export in plugin's projects -set(ie_options "@IE_OPTIONS@;CMAKE_BUILD_TYPE;CMAKE_SKIP_RPATH") +set(ie_options "@IE_OPTIONS@;CMAKE_BUILD_TYPE;CMAKE_SKIP_RPATH;") +list(APPEND ie_options CMAKE_CXX_COMPILER_LAUNCHER CMAKE_C_COMPILER_LAUNCHER) file(TO_CMAKE_PATH "${CMAKE_CURRENT_LIST_DIR}" cache_path) message(STATUS "The following CMake options are exported from Inference Engine Developer package") diff --git a/cmake/toolchains/oecore.arm64.toolchain.cmake b/cmake/toolchains/oecore.arm64.toolchain.cmake new file mode 100644 index 00000000000..ef32d5990ec --- /dev/null +++ b/cmake/toolchains/oecore.arm64.toolchain.cmake @@ -0,0 +1,50 @@ +# +# Copyright 2020 Intel Corporation. +# +# LEGAL NOTICE: Your use of this software and any required dependent software +# (the "Software Package") is subject to the terms and conditions of +# the Intel(R) OpenVINO(TM) Distribution License for the Software Package, +# which may also include notices, disclaimers, or license terms for +# third party or open source software included in or with the Software Package, +# and your use indicates your acceptance of all such terms. Please refer +# to the "third-party-programs.txt" or other similarly-named text file +# included with the Software Package for additional details. 
+# + +if(DEFINED OECORE_BASE_DIR) + # OECORE_BASE_DIR was passed via CMake command line, nothing to do +elseif(DEFINED ENV{OECORE_BASE_DIR}) + # User sets OECORE_BASE_DIR environment variable + set(OECORE_BASE_DIR $ENV{OECORE_BASE_DIR}) +elseif(DEFINED ENV{OECORE_NATIVE_SYSROOT}) + # OECORE_NATIVE_SYSROOT is a default environment variable for the OECore toolchain + set(OECORE_BASE_DIR "$ENV{OECORE_NATIVE_SYSROOT}/../..") +else() + # Use default value + set(OECORE_BASE_DIR "/usr/local/oecore-x86_64") +endif() + +set(OECORE_TARGET_NAME "aarch64-ese-linux") +set(OECORE_TARGET_SYSROOT "${OECORE_BASE_DIR}/sysroots/${OECORE_TARGET_NAME}") +set(OECORE_HOST_SYSROOT "${OECORE_BASE_DIR}/sysroots/x86_64-esesdk-linux") +set(OECORE_HOST_COMPILER_BIN_DIR "${OECORE_HOST_SYSROOT}/usr/bin/${OECORE_TARGET_NAME}") + +set(CMAKE_SYSTEM_NAME "Linux") +set(CMAKE_SYSTEM_PROCESSOR "aarch64") + +set(CMAKE_SYSROOT "${OECORE_TARGET_SYSROOT}") + +set(CMAKE_C_COMPILER "${OECORE_HOST_COMPILER_BIN_DIR}/aarch64-ese-linux-gcc") +set(CMAKE_CXX_COMPILER "${OECORE_HOST_COMPILER_BIN_DIR}/aarch64-ese-linux-g++") + +set(CMAKE_C_FLAGS_INIT "-mcpu=cortex-a53 -mtune=cortex-a53 --sysroot=${OECORE_TARGET_SYSROOT}") +set(CMAKE_CXX_FLAGS_INIT "-mcpu=cortex-a53 -mtune=cortex-a53 --sysroot=${OECORE_TARGET_SYSROOT}") + +set(CMAKE_EXE_LINKER_FLAGS_INIT "-Wl,-O1 -Wl,--hash-style=gnu -Wl,--as-needed --sysroot=${OECORE_TARGET_SYSROOT}") +set(CMAKE_SHARED_LINKER_FLAGS_INIT "-Wl,-O1 -Wl,--hash-style=gnu -Wl,--as-needed --sysroot=${OECORE_TARGET_SYSROOT}") +set(CMAKE_MODULE_LINKER_FLAGS_INIT "-Wl,-O1 -Wl,--hash-style=gnu -Wl,--as-needed --sysroot=${OECORE_TARGET_SYSROOT}") + +set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) +set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt index 9b3859a29c4..c9859464ee1 100644 --- a/docs/CMakeLists.txt +++ b/docs/CMakeLists.txt @@ -48,7 +48,6 @@ if(NOT 
ENABLE_DOCKER) LIBRARY DESTINATION ${IE_CPACK_RUNTIME_PATH} COMPONENT tests EXCLUDE_FROM_ALL) endif() -set(LINKCHECKER_PY "" CACHE FILEPATH "Path to linkchecker.py for documentation check") set(OMZ_DOCS_DIR "" CACHE PATH "Path to open_model_zoo documentation") set(WORKBENCH_DOCS_DIR "" CACHE PATH "Path to workbench documentation") set(POT_DOCS_DIR "" CACHE PATH "Path to post-training-compression-tool documentation") @@ -56,18 +55,14 @@ set(GST_DOCS_DIR "" CACHE PATH "Path to gst-video-analytics documentation") function(build_docs) find_package(Doxygen REQUIRED dot) - find_package(PythonInterp 3 REQUIRED) find_package(LATEX REQUIRED) - execute_process( - COMMAND ${PYTHON_EXECUTABLE} -m pip show lxml - RESULT_VARIABLE PIP_EXIT_CODE - OUTPUT_QUIET - ) + ie_check_pip_package(lxml FATAL_ERROR) + ie_check_pip_package(LinkChecker WARNING) - if (NOT ${PIP_EXIT_CODE} EQUAL 0) - message(FATAL_ERROR "lxml package is not installed. Please use \"pip install lxml\".") - endif() + find_host_program(LINKCHECKER_PY + NAMES linkchecker + DOC "linkchecker tools for documentation check") set(DOCS_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}") set(DOXYGEN_DIR "${OpenVINO_SOURCE_DIR}/docs/doxygen") @@ -357,7 +352,7 @@ function(build_docs) if(EXISTS "${LINKCHECKER_PY}") add_custom_target(docs_check - COMMAND ${PYTHON_EXECUTABLE} "${LINKCHECKER_PY}" -v "${DOCS_BUILD_DIR}/html/" + COMMAND "${LINKCHECKER_PY}" -v "${DOCS_BUILD_DIR}/html/" COMMENT "Check links in generated documentation" WORKING_DIRECTORY "${DOCS_BUILD_DIR}" VERBATIM) diff --git a/docs/IE_DG/Integrate_with_customer_application_new_API.md b/docs/IE_DG/Integrate_with_customer_application_new_API.md index 9e35f483717..93482a90938 100644 --- a/docs/IE_DG/Integrate_with_customer_application_new_API.md +++ b/docs/IE_DG/Integrate_with_customer_application_new_API.md @@ -210,11 +210,6 @@ It's allowed to specify additional build options (e.g. 
to build CMake project on ### Run Your Application -> **NOTE**: Before running, make sure you completed **Set the Environment Variables** section in [OpenVINO Installation](../../inference-engine/samples/hello_nv12_input_classification/README.md) document so that the application can find the libraries. - -To run compiled applications on Microsoft* Windows* OS, make sure that Microsoft* Visual C++ 2017 -Redistributable and IntelĀ® C++ Compiler 2017 Redistributable packages are installed and -`/bin/intel64/Release/*.dll` files are placed to the -application folder or accessible via `%PATH%` environment variable. +Before running, make sure you completed **Set the Environment Variables** section in [OpenVINO Installation](../../inference-engine/samples/hello_nv12_input_classification/README.md) document so that the application can find the libraries. [integration_process]: img/integration_process.png diff --git a/docs/doxygen/doxygen-ignore.txt b/docs/doxygen/doxygen-ignore.txt index 7f963ac63e7..b1f27a4972c 100644 --- a/docs/doxygen/doxygen-ignore.txt +++ b/docs/doxygen/doxygen-ignore.txt @@ -1,6 +1,5 @@ openvino/inference-engine/samples/hello_reshape_ssd/README.md openvino/docs/index.md -inference-engine/include/ie_icnn_network.hpp openvino/docs/get_started/get_started_dl_workbench.md openvino/docs/get_started/get_started_linux.md openvino/docs/get_started/get_started_raspbian.md @@ -11,25 +10,14 @@ openvino/docs/install_guides/deployment-manager-tool.md openvino/docs/MO_DG/prepare_model/customize_model_optimizer/Customize_Model_Optimizer.md openvino/docs/ovsa/ovsa_get_started.md openvino/inference-engine/ie_bridges/c/docs/api_overview.md -inference-engine/include/cpp/ie_infer_request.hpp -inference-engine/include/ie_parallel.hpp -inference-engine/include/gpu/gpu_context_api_ocl.hpp -inference-engine/include/gpu/gpu_context_api_va.hpp -inference-engine/include/ie_plugin_config.hpp -inference-engine/include/ie_unicode.hpp 
-inference-engine/include/vpu/myriad_config.hpp -inference-engine/include/vpu/vpu_config.hpp -inference-engine/include/vpu/vpu_plugin_config.hpp openvino/docs/benchmarks/performance_int8_vs_fp32.md openvino/docs/get_started/get_started_macos.md openvino/docs/optimization_guide/dldt_optimization_guide.md openvino/docs/IE_DG/ShapeInference.md -inference-engine/include/details/ie_so_pointer.hpp -inference-engine/include/ie_compound_blob.h -inference-engine/include/ie_data.h -inference-engine/include/ie_blob.h -inference-engine/include/ie_precision.hpp -inference-engine/include/ie_remote_context.hpp -inference-engine/include/gpu/gpu_context_api_dx.hpp build/docs/openvino_docs.xml -openvino/docs/install_guides/installing-openvino-linux-ivad-vpu.md \ No newline at end of file +openvino/docs/install_guides/installing-openvino-linux-ivad-vpu.md +inference-engine/include/ie_parallel.hpp +inference-engine/include/ie_plugin_config.hpp +inference-engine/include/vpu/myriad_config.hpp +inference-engine/include/vpu/vpu_config.hpp +inference-engine/include/vpu/vpu_plugin_config.hpp \ No newline at end of file diff --git a/docs/doxygen/ie_docs.config b/docs/doxygen/ie_docs.config index db424ce7adc..bbd203c931c 100644 --- a/docs/doxygen/ie_docs.config +++ b/docs/doxygen/ie_docs.config @@ -913,12 +913,14 @@ EXCLUDE_SYMBOLS = InferenceEngine::details \ DECLARE_*METRIC_KEY \ DECLARE_*METRIC_VALUE \ DECLARE_*CONFIG_KEY \ + DECLARE_VPU_CONFIG \ + VPU_CONFIG_KEY \ + VPU_CONFIG_VALUE \ + VPU_METRIC \ DECLARE_*CONFIG_VALUE \ DECLARE_PARAM_KEY_IMPL \ TBB_PREVIEW_LOCAL_OBSERVER \ PARTITIONING \ - CALL_STATUS_FNC* \ - CALL_FNC* \ __PRETTY_FUNCTION__ \ PRINT_COLOR_FORMAT \ PRINT_LAYOUT \ @@ -943,6 +945,8 @@ EXCLUDE_SYMBOLS = InferenceEngine::details \ InferenceEngine::parallel_* \ NOMINMAX \ TBB_PREVIEW_NUMA_SUPPORT \ + TBB_PREVIEW_TASK_ARENA_CONSTRAINTS_EXTENSION \ + _TBB_REDUCE_FUNC \ IE_THREAD_* # The EXAMPLE_PATH tag can be used to specify one or more files or directories diff --git 
a/docs/ops/arithmetic/Atan_1.md b/docs/ops/arithmetic/Atan_1.md index dc4c3b6d824..7fc9525bf66 100644 --- a/docs/ops/arithmetic/Atan_1.md +++ b/docs/ops/arithmetic/Atan_1.md @@ -6,31 +6,27 @@ **Short description**: *Atan* performs element-wise inverse tangent (arctangent) operation with given tensor. -**Attributes**: - - No attributes available. - -**Inputs** - -* **1**: An tensor of type *T*. **Required.** - -**Outputs** - -* **1**: The result of element-wise atan operation. A tensor of type *T*. - -**Types** - -* *T*: any numeric type. - -*atan* does the following with the input tensor *a*: +**Detailed description**: Operation takes one input tensor and performs the element-wise inverse tangent function on a given input tensor, based on the following mathematical formula: \f[ a_{i} = atan(a_{i}) \f] -**Examples** +**Attributes**: *Atan* operation has no attributes. -*Example 1* +**Inputs** + +* **1**: A tensor of type *T* and arbitrary shape. **Required.** + +**Outputs** + +* **1**: The result of element-wise *Atan* applied to the input tensor. A tensor of type *T* and same shape as the input tensor. + +**Types** + +* *T*: any supported numeric type. + +**Examples** ```xml diff --git a/docs/ops/arithmetic/Sinh_1.md b/docs/ops/arithmetic/Sinh_1.md index 94a724acfe9..0f0c83b63e1 100644 --- a/docs/ops/arithmetic/Sinh_1.md +++ b/docs/ops/arithmetic/Sinh_1.md @@ -4,11 +4,15 @@ **Category**: Arithmetic unary operation -**Short description**: *Sinh* performs element-wise hyperbolic sine (sinh) operation with given tensor. +**Short description**: *Sinh* performs element-wise hyperbolic sine (sinh) operation on a given input tensor -**Attributes**: +**Detailed description**: *Sinh* performs element-wise hyperbolic sine (sinh) operation on a given input tensor, based on the following mathematical formula: - No attributes available. +\f[ +a_{i} = sinh(a_{i}) +\f] + +**Attributes**: *Sinh* operation has no attributes. 
**Inputs** @@ -16,21 +20,13 @@ **Outputs** -* **1**: The result of element-wise sinh operation. A tensor of type *T*. +* **1**: The result of element-wise *Sinh* operation applied to the input tensor. A tensor of type *T* and the same shape as input tensor. **Types** -* *T*: any numeric type. +* *T*: any supported numeric type. -*sinh* does the following with the input tensor *a*: - -\f[ -a_{i} = sinh(a_{i}) -\f] - -**Examples** - -*Example 1* +**Example** ```xml diff --git a/docs/ops/pooling/AdaptiveMaxPool_8.md b/docs/ops/pooling/AdaptiveMaxPool_8.md index d7ad9a42412..286e1ab78a2 100644 --- a/docs/ops/pooling/AdaptiveMaxPool_8.md +++ b/docs/ops/pooling/AdaptiveMaxPool_8.md @@ -44,7 +44,7 @@ Output(i,j,k) = max(Input[d_{start}:d_{end}, h_{start}:h_{end}, w_{start}:w_{end **Outputs**: * **1**: Output of type *T* and shape `[N, C, H_out]`, `[N, C, H_out, W_out]` or `[N, C, D_out, H_out, W_out]`. -* **2**: Output of type specified by *index_element_type* and same shape as the first output containing indices of elements in the first output. The values of indices are computed as if input was flatten 1-D tensor, so the values are in the range `[0, N * C * H * W * D)`. +* **2**: Output of type specified by *index_element_type* and same shape as the first output containing indices of elements in the first output. The values of indices are computed as if input spatial dimensions were flatten, so the values are in the range `[0, H * W * D)`. 
**Types** diff --git a/inference-engine/ie_bridges/c/src/ie_c_api.cpp b/inference-engine/ie_bridges/c/src/ie_c_api.cpp index 4346047c1b5..3b581db0abb 100644 --- a/inference-engine/ie_bridges/c/src/ie_c_api.cpp +++ b/inference-engine/ie_bridges/c/src/ie_c_api.cpp @@ -235,9 +235,8 @@ IEStatusCode ie_core_create(const char *xml_config_file, ie_core_t **core) { IEStatusCode status = IEStatusCode::OK; try { - std::unique_ptr tmp(new ie_core_t); - tmp->object = IE::Core(xml_config_file); - *core = tmp.release(); + auto object = IE::Core(xml_config_file); + *core = new ie_core_t { std::move(object) }; } CATCH_IE_EXCEPTIONS return status; diff --git a/inference-engine/ie_bridges/python/CMakeLists.txt b/inference-engine/ie_bridges/python/CMakeLists.txt index 89cdb5cc17f..7b93a4291a2 100644 --- a/inference-engine/ie_bridges/python/CMakeLists.txt +++ b/inference-engine/ie_bridges/python/CMakeLists.txt @@ -68,7 +68,7 @@ if(ENABLE_WHEEL) add_subdirectory(wheel) endif() -if (NGRAPH_PYTHON_BUILD_ENABLE) +if(TARGET _pyngraph) add_dependencies(ie_api _pyngraph) endif() diff --git a/inference-engine/ie_bridges/python/README.md b/inference-engine/ie_bridges/python/README.md index 0b50a0e2a6a..b40a6ded9b0 100644 --- a/inference-engine/ie_bridges/python/README.md +++ b/inference-engine/ie_bridges/python/README.md @@ -61,11 +61,6 @@ sudo apt install patchelf -DENABLE_PYTHON=ON -DENABLE_WHEEL=ON ``` -If you need to include other components to the package you need to enable them too. 
-For example, to include ngraph python API: -```shellscript --NGRAPH_PYTHON_BUILD_ENABLE=ON -``` ## Running sample diff --git a/inference-engine/include/cpp/ie_executable_network.hpp b/inference-engine/include/cpp/ie_executable_network.hpp index 81d5b10e7dd..870f6c2a6fc 100644 --- a/inference-engine/include/cpp/ie_executable_network.hpp +++ b/inference-engine/include/cpp/ie_executable_network.hpp @@ -66,7 +66,6 @@ public: * This method need to be called to find out input names for using them later * when calling InferenceEngine::InferRequest::SetBlob * - * @param inputs Reference to InferenceEngine::ConstInputsDataMap object. * @return A collection that contains string as key, and const InputInfo smart pointer as value */ ConstInputsDataMap GetInputsInfo() const; diff --git a/inference-engine/include/cpp/ie_infer_request.hpp b/inference-engine/include/cpp/ie_infer_request.hpp index c5d52ec6fc0..94393cea062 100644 --- a/inference-engine/include/cpp/ie_infer_request.hpp +++ b/inference-engine/include/cpp/ie_infer_request.hpp @@ -235,6 +235,9 @@ public: bool operator==(const InferRequest&) const noexcept; }; +/** + * @private + */ template<> struct InferRequest::SetCallback> { void operator()(std::function f) { @@ -245,6 +248,9 @@ struct InferRequest::SetCallback> IE_SUPPRESS_DEPRECATED_START +/** + * @private + */ template<> struct InferRequest::SetCallback { void operator()(IInferRequest::CompletionCallback f) { diff --git a/inference-engine/include/gpu/gpu_context_api_dx.hpp b/inference-engine/include/gpu/gpu_context_api_dx.hpp index 5f94db0595d..1a529e56c78 100644 --- a/inference-engine/include/gpu/gpu_context_api_dx.hpp +++ b/inference-engine/include/gpu/gpu_context_api_dx.hpp @@ -106,8 +106,8 @@ public: } /** - * @brief Returns plane ID of underlying video decoder surface, - * or 0 if no video surface was shared. + * @brief Returns plane ID of underlying video decoder surface, or 0 if no video surface was shared. 
+ * @return Plane ID */ uint32_t plane() { return _ObjFromParams(getParams(), diff --git a/inference-engine/include/gpu/gpu_context_api_ocl.hpp b/inference-engine/include/gpu/gpu_context_api_ocl.hpp index a39446b0368..357b58d163b 100644 --- a/inference-engine/include/gpu/gpu_context_api_ocl.hpp +++ b/inference-engine/include/gpu/gpu_context_api_ocl.hpp @@ -39,6 +39,7 @@ public: /** * @brief Returns the underlying OpenCL context handle. + * @return `cl_context` */ cl_context get() { return _ObjFromParams(getParams(), GPU_PARAM_KEY(OCL_CONTEXT), @@ -47,7 +48,7 @@ public: /** * @brief OpenCL context handle conversion operator for the ClContext object. - * @return Underlying OpenCL context handle + * @return `cl_context` */ operator cl_context() { return get(); @@ -55,7 +56,7 @@ public: /** * @brief Standard Khronos cl::Context wrapper conversion operator for the ClContext object. - * @return cl::Context object + * @return `cl::Context` object */ operator cl::Context() { return cl::Context(get(), true); @@ -101,6 +102,7 @@ public: /** * @brief Returns the underlying OpenCL memory object handle. + * @return underlying OpenCL memory object handle */ cl_mem get() { return _ObjFromParams(getParams(), GPU_PARAM_KEY(MEM_HANDLE), @@ -109,6 +111,7 @@ public: /** * @brief OpenCL memory handle conversion operator. + * @return `cl_mem` */ operator cl_mem() { return get(); @@ -116,7 +119,7 @@ public: /** * @brief Standard Khronos cl::Buffer wrapper conversion operator. - * @return cl::Buffer object + * @return `cl::Buffer` object */ operator cl::Buffer() { return cl::Buffer(get(), true); @@ -144,6 +147,7 @@ public: /** * @brief Returns the underlying OpenCL memory object handle. + * @return `cl_mem` */ cl_mem get() { return _ObjFromParams(getParams(), GPU_PARAM_KEY(MEM_HANDLE), @@ -152,6 +156,7 @@ public: /** * @brief OpenCL memory handle conversion operator. 
+ * @return `cl_mem` */ operator cl_mem() { return get(); @@ -159,7 +164,7 @@ public: /** * @brief Standard Khronos cl::Image2D wrapper conversion operator for the ClContext object. - * @return cl::Image2D object + * @return `cl::Image2D` object */ operator cl::Image2D() { return cl::Image2D(get(), true); @@ -269,7 +274,7 @@ static inline Blob::Ptr make_shared_blob(const TensorDesc& desc, RemoteContext:: * @brief This function is used to obtain remote blob object from user-supplied cl::Image2D wrapper object * @param desc A tensor descriptor object representing remote blob configuration * @param ctx A remote context used to create remote blob - * @param buffer A cl::Image2D object wrapped by a remote blob + * @param image A cl::Image2D object wrapped by a remote blob * @return A remote blob instance */ static inline Blob::Ptr make_shared_blob(const TensorDesc& desc, RemoteContext::Ptr ctx, cl::Image2D& image) { diff --git a/inference-engine/include/gpu/gpu_context_api_va.hpp b/inference-engine/include/gpu/gpu_context_api_va.hpp index 6e2bedacf62..93ba0fd7388 100644 --- a/inference-engine/include/gpu/gpu_context_api_va.hpp +++ b/inference-engine/include/gpu/gpu_context_api_va.hpp @@ -36,8 +36,8 @@ public: using Ptr = std::shared_ptr; /** - * @brief VADisplay conversion operator for the VAContext object. - * @return Underlying VADisplay object handle + * @brief `VADisplay` conversion operator for the VAContext object. + * @return Underlying `VADisplay` object handle */ operator VADisplay() { return _ObjFromParams(getParams(), @@ -67,7 +67,7 @@ public: /** * @brief VASurfaceID conversion operator for the VASurfaceBlob object. 
- * @return VA surface handle + * @return `VASurfaceID` handle */ operator VASurfaceID() { return _ObjFromParams(getParams(), @@ -77,6 +77,7 @@ public: /** * @brief Returns plane ID of underlying video decoder surface + * @return Plane ID */ uint32_t plane() { return _ObjFromParams(getParams(), @@ -86,11 +87,16 @@ public: }; /** -* @brief This function is used to obtain a NV12 compound blob object from NV12 VA decoder output. -* The resulting compound contains two remote blobs for Y and UV planes of the surface. -*/ + * @brief This function is used to obtain a NV12 compound blob object from NV12 VA decoder output. + * The resulting compound contains two remote blobs for Y and UV planes of the surface. + * @param height A height of Y plane + * @param width A width of Y plane + * @param ctx A remote context instance + * @param nv12_surf NV12 `VASurfaceID` to create NV12 from + * @return A remote NV12 blob wrapping `VASurfaceID` + */ static inline Blob::Ptr make_shared_blob_nv12(size_t height, size_t width, RemoteContext::Ptr ctx, VASurfaceID nv12_surf) { - // despite of layout, blob dimensions always follow in N,C,H,W order + // despite of layout, blob dimensions always follow in N, C, H, W order TensorDesc ydesc(Precision::U8, { 1, 1, height, width }, Layout::NHWC); ParamMap blobParams = { { GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(VA_SURFACE) }, @@ -107,8 +113,12 @@ static inline Blob::Ptr make_shared_blob_nv12(size_t height, size_t width, Remot } /** -* @brief This function is used to obtain remote context object from VA display handle -*/ + * @brief This function is used to obtain remote context object from VA display handle + * @param core Inference Engine Core object + * @param deviceName A device name to create a remote context for + * @param device A `VADisplay` to create remote context from + * @return A remote context wrapping `VADisplay` + */ static inline VAContext::Ptr make_shared_context(Core& core, std::string deviceName, VADisplay device) { 
ParamMap contextParams = { { GPU_PARAM_KEY(CONTEXT_TYPE), GPU_PARAM_VALUE(VA_SHARED) }, @@ -118,8 +128,13 @@ static inline VAContext::Ptr make_shared_context(Core& core, std::string deviceN } /** -* @brief This function is used to obtain remote blob object from VA surface handle -*/ + * @brief This function is used to obtain remote blob object from VA surface handle + * @param desc Tensor descriptor + * @param ctx A remote context instance + * @param surface A `VASurfaceID` to create remote blob from + * @param plane An index of a plane inside `VASurfaceID` to create blob from + * @return A remote blob wrapping `VASurfaceID` + */ static inline VASurfaceBlob::Ptr make_shared_blob(const TensorDesc& desc, RemoteContext::Ptr ctx, VASurfaceID surface, uint32_t plane = 0) { auto casted = std::dynamic_pointer_cast(ctx); if (nullptr == casted) { diff --git a/inference-engine/include/ie_blob.h b/inference-engine/include/ie_blob.h index 3463ec18218..4e1d2cd873a 100644 --- a/inference-engine/include/ie_blob.h +++ b/inference-engine/include/ie_blob.h @@ -304,6 +304,7 @@ public: /** * @brief Returns the tensor description + * @return A tensor description */ const TensorDesc& getTensorDesc() const noexcept override { return tensorDesc; @@ -311,6 +312,7 @@ public: /** * @brief Returns the tensor description + * @return A tensor description */ TensorDesc& getTensorDesc() noexcept override { return tensorDesc; @@ -395,7 +397,7 @@ public: * * @return A LockedMemory object */ - virtual LockedMemory rwmap()noexcept = 0; + virtual LockedMemory rwmap() noexcept = 0; /** * @brief Gets read only access to the memory in virtual space of the process. @@ -419,7 +421,7 @@ public: * * @return A LockedMemory object */ - virtual LockedMemory rmap()const noexcept = 0; + virtual LockedMemory rmap() const noexcept = 0; /** * @brief Gets "write only direction" access to the memory in virtual space of the process. 
@@ -446,7 +448,7 @@ public: * * @return A LockedMemory object */ - virtual LockedMemory wmap()noexcept = 0; + virtual LockedMemory wmap() noexcept = 0; protected: /** @@ -567,11 +569,6 @@ public: */ virtual ~TBlob(); - /** - * @brief Gets the size of the given type. - * - * @return Size of the type - */ size_t element_size() const noexcept override { return sizeof(T); } @@ -594,9 +591,6 @@ public: return std::move(lockme()); } - /** - * @brief Allocates or reallocates memory - */ void allocate() noexcept override { const auto allocator = getAllocator(); const auto rawHandle = allocator->alloc(byteSize()); @@ -612,27 +606,14 @@ public: }); } - /** - * @brief Frees all allocated data - */ bool deallocate() noexcept override { return free(); } - /** - * @brief Creates a new LockedMemory instance holding void pointer. - * - * @return LockedMemory instance holding void pointer - */ LockedMemory buffer() noexcept override { return std::move(lockme()); } - /** - * @brief Creates a new LockedMemory instance holding constant void pointer. - * - * @return LockedMemory instance holding constant void pointer - */ LockedMemory cbuffer() const noexcept override { return std::move(lockme()); } @@ -734,6 +715,7 @@ protected: /** * @brief Frees handler and cleans up the stored data. + * @return `true` if memory was freed */ virtual bool free() { bool bCanRelease = _handle != nullptr; @@ -753,11 +735,6 @@ protected: // getTensorDesc().getBlockingDesc().getOffsetPadding()); } - /** - * @brief Gets an allocator or creates a default one. - * - * @return IAllocator instance - */ const std::shared_ptr& getAllocator() const noexcept override { // in case when constructor without allocator was used if (!_allocator) { @@ -767,9 +744,6 @@ protected: return _allocator; } - /** - * @brief Returns handle to the stored data. 
- */ void* getHandle() const noexcept override { return _handle.get(); } diff --git a/inference-engine/include/ie_compound_blob.h b/inference-engine/include/ie_compound_blob.h index 70889b9937d..8a0aae67c23 100644 --- a/inference-engine/include/ie_compound_blob.h +++ b/inference-engine/include/ie_compound_blob.h @@ -73,16 +73,19 @@ public: /** * @brief Always returns an empty LockedMemory object + * @return Empty locked memory */ LockedMemory buffer() noexcept override; /** * @brief Always returns an empty LockedMemory object + * @return Empty locked memory */ LockedMemory cbuffer() const noexcept override; /** * @brief Returns the number of underlying blobs in the compound blob + * @return A number of underlying blobs */ size_t size() const noexcept override; @@ -109,9 +112,6 @@ protected: */ std::vector _blobs; - /** - * @brief Returns nullptr as CompoundBlob is not allocator-based - */ const std::shared_ptr& getAllocator() const noexcept override; }; @@ -148,21 +148,25 @@ public: /** * @brief Returns a shared pointer to Y plane + * @return Y plane */ virtual Blob::Ptr& y() noexcept; /** * @brief Returns a shared pointer to Y plane + * @return Y plane */ virtual const Blob::Ptr& y() const noexcept; /** * @brief Returns a shared pointer to UV plane + * @return UV plane */ virtual Blob::Ptr& uv() noexcept; /** * @brief Returns a shared pointer to UV plane + * @return UV plane */ virtual const Blob::Ptr& uv() const noexcept; diff --git a/inference-engine/include/ie_data.h b/inference-engine/include/ie_data.h index 618c1f86d0a..f3c83720aea 100644 --- a/inference-engine/include/ie_data.h +++ b/inference-engine/include/ie_data.h @@ -93,6 +93,7 @@ public: /** * @brief Gets the layout value for this Data instance + * @return Layout */ Layout getLayout() const; diff --git a/inference-engine/include/ie_plugin_config.hpp b/inference-engine/include/ie_plugin_config.hpp index 6cbd1e29ae9..9c2dd68e46f 100644 --- a/inference-engine/include/ie_plugin_config.hpp +++ 
b/inference-engine/include/ie_plugin_config.hpp @@ -264,9 +264,9 @@ DECLARE_CONFIG_VALUE(HYBRID_AWARE); * (and what is the optimal number of streams) * - finally, specifying the positive integer value creates the requested number of streams */ +DECLARE_CONFIG_KEY(CPU_THROUGHPUT_STREAMS); DECLARE_CONFIG_VALUE(CPU_THROUGHPUT_NUMA); DECLARE_CONFIG_VALUE(CPU_THROUGHPUT_AUTO); -DECLARE_CONFIG_KEY(CPU_THROUGHPUT_STREAMS); /** * @brief The name for setting performance counters option. diff --git a/inference-engine/include/ie_precision.hpp b/inference-engine/include/ie_precision.hpp index e72a8248f00..e632d177235 100644 --- a/inference-engine/include/ie_precision.hpp +++ b/inference-engine/include/ie_precision.hpp @@ -91,13 +91,21 @@ public: precisionInfo.value = CUSTOM; } - /** @brief Creates custom precision with specific underlined type */ + /** + * @brief Creates custom precision with specific underlined type + * @param typeName A string name of precision + * @return Precision converted from string name + */ template static Precision fromType(const char* typeName = nullptr) { return Precision(8 * sizeof(T), typeName == nullptr ? typeid(T).name() : typeName); } - /** @brief checks whether given storage class T can be used to store objects of current precision */ + /** + * @brief checks whether given storage class T can be used to store objects of current precision + * @param typeName A string name of precision + * @return `true` if `typeName` has underlaying storage type + */ template bool hasStorageType(const char* typeName = nullptr) const noexcept { try { diff --git a/inference-engine/include/ie_remote_context.hpp b/inference-engine/include/ie_remote_context.hpp index 376459b4a9d..960452b12c1 100644 --- a/inference-engine/include/ie_remote_context.hpp +++ b/inference-engine/include/ie_remote_context.hpp @@ -46,9 +46,6 @@ public: */ explicit RemoteBlob(const TensorDesc& tensorDesc): MemoryBlob(tensorDesc) {} - /** - * @brief Returns the number of bytes per element. 
- */ size_t element_size() const noexcept override { return tensorDesc.getPrecision().size(); } diff --git a/inference-engine/samples/common/format_reader/CMakeLists.txt b/inference-engine/samples/common/format_reader/CMakeLists.txt index eb6c7cbe7e0..f4a3dfd0c7d 100644 --- a/inference-engine/samples/common/format_reader/CMakeLists.txt +++ b/inference-engine/samples/common/format_reader/CMakeLists.txt @@ -13,7 +13,7 @@ source_group("src" FILES ${LIBRARY_SRC}) source_group("include" FILES ${LIBRARY_HEADERS}) # Create library file from sources. -add_library(${TARGET_NAME} SHARED EXCLUDE_FROM_ALL ${MAIN_SRC} ${LIBRARY_HEADERS}) +add_library(${TARGET_NAME} SHARED ${MAIN_SRC} ${LIBRARY_HEADERS}) # Find OpenCV components if exist find_package(OpenCV COMPONENTS core imgproc imgcodecs QUIET) @@ -39,4 +39,4 @@ set_target_properties(${TARGET_NAME} PROPERTIES COMPILE_PDB_NAME ${TARGET_NAME} if(COMMAND add_clang_format_target) add_clang_format_target(${TARGET_NAME}_clang FOR_TARGETS ${TARGET_NAME}) -endif() \ No newline at end of file +endif() diff --git a/inference-engine/src/auto_plugin/auto_infer_request.cpp b/inference-engine/src/auto_plugin/auto_infer_request.cpp index 9ed24a49ed8..1497ee3557b 100644 --- a/inference-engine/src/auto_plugin/auto_infer_request.cpp +++ b/inference-engine/src/auto_plugin/auto_infer_request.cpp @@ -78,7 +78,6 @@ void AutoInferRequest::HotSwapRequests() { InferenceEngine::SoExecutableNetworkInternal tempSoExecNetwork; if (_autoExecutableNetwork->TryGetActualNetwork(tempSoExecNetwork)) { _alreadyActualNetwork = true; - std::cout << "!!! DEBUG: HotSwapRequests !!!" 
<< std::endl; _inferRequest = {tempSoExecNetwork, tempSoExecNetwork->CreateInferRequest()}; _inferRequest->SetCallback(_callback); } diff --git a/inference-engine/src/auto_plugin/auto_plugin.cpp b/inference-engine/src/auto_plugin/auto_plugin.cpp index 1b5bb70f8f6..94b6a8a8b71 100644 --- a/inference-engine/src/auto_plugin/auto_plugin.cpp +++ b/inference-engine/src/auto_plugin/auto_plugin.cpp @@ -84,14 +84,11 @@ std::shared_ptr AutoInferencePlugin::LoadNetworkImpl(cons [core, modelPath, network](const std::string& device) -> IE::SoExecutableNetworkInternal { IE::SoExecutableNetworkInternal executableNetwork; - std::cout << "!!! DEBUG: Starting Async loading to the " << device << " !!!" << std::endl; - std::cout << "!!! DEBUG: device full name: " << core->GetMetric(device, METRIC_KEY(FULL_DEVICE_NAME)).as() << std::endl; if (!modelPath.empty()) { executableNetwork = core->LoadNetwork(modelPath, device, {}); } else { executableNetwork = core->LoadNetwork(network, device, {}); } - std::cout << "!!! DEBUG: " << device << " was loaded !!!" 
<< std::endl; return executableNetwork; }; diff --git a/inference-engine/src/cldnn_engine/cldnn_engine.cpp b/inference-engine/src/cldnn_engine/cldnn_engine.cpp index 5b3b90eb832..7849101cc2a 100644 --- a/inference-engine/src/cldnn_engine/cldnn_engine.cpp +++ b/inference-engine/src/cldnn_engine/cldnn_engine.cpp @@ -61,6 +61,7 @@ #include #include #include +#include #include #include #include @@ -191,6 +192,7 @@ InferenceEngine::CNNNetwork clDNNEngine::CloneAndTransformNetwork(const Inferenc manager.register_pass(); manager.register_pass(); manager.register_pass(); + manager.register_pass(); static const precisions_array convert_precision_list { {ngraph::element::i64, ngraph::element::i32}, diff --git a/inference-engine/src/cldnn_engine/cldnn_primitives_list.hpp b/inference-engine/src/cldnn_engine/cldnn_primitives_list.hpp index c0445c65072..e562447189b 100644 --- a/inference-engine/src/cldnn_engine/cldnn_primitives_list.hpp +++ b/inference-engine/src/cldnn_engine/cldnn_primitives_list.hpp @@ -208,5 +208,8 @@ REGISTER_FACTORY(v6, MVN); // ------------------------------ Supported v7 ops ------------------------------ // REGISTER_FACTORY(v7, Gather); +// ------------------------------ Supported v8 ops ------------------------------ // +REGISTER_FACTORY(v8, Gather); + // --------------------------- Supported internal ops --------------------------- // REGISTER_FACTORY(internal, NonMaxSuppressionIEInternal); diff --git a/inference-engine/src/cldnn_engine/ops/gather.cpp b/inference-engine/src/cldnn_engine/ops/gather.cpp index 362854cc32a..d22258e0673 100644 --- a/inference-engine/src/cldnn_engine/ops/gather.cpp +++ b/inference-engine/src/cldnn_engine/ops/gather.cpp @@ -57,51 +57,8 @@ static cldnn::gather::gather_axis GetGatherAxis(int32_t axis, cldnn::format inpu } } -void CreateGatherOp(Program& p, const std::shared_ptr& op) { - p.ValidateInputs(op, {2, 3}); - auto inputPrimitives = p.GetInputPrimitiveIDs(op); - std::string layerName = layer_type_name_ID(op); - - int32_t 
axis = static_cast(op->get_axis()); - - std::vector reorderedInputs; - reorderedInputs.resize(inputPrimitives.size()); - - for (size_t portIndex = 0; portIndex < inputPrimitives.size(); portIndex++) { - auto inputDataType = DataTypeFromPrecision(op->get_input_element_type(portIndex)); - if (inputDataType == cldnn::data_types::i64) { - // clDNN primitive does not support i64 inputs, - // so we need additional reorders to convert them to i32 - auto reorderPrimName = inputPrimitives[portIndex] + "_" + op->get_friendly_name() + Program::m_preProcessTag; - auto targetFormat = DefaultFormatForDims(op->get_input_shape(portIndex).size()); - auto preprocessPrim = cldnn::reorder(reorderPrimName, - inputPrimitives[portIndex], - targetFormat, - cldnn::data_types::i32); - p.AddPrimitive(preprocessPrim); - p.AddInnerPrimitiveToProfiler(reorderPrimName, layerName, op); - reorderedInputs[portIndex] = reorderPrimName; - } else { - reorderedInputs[portIndex] = inputPrimitives[portIndex]; - } - } - - auto outLayout = DefaultFormatForDims(op->get_output_shape(0).size()); - auto gatherPrim = cldnn::gather(layerName, - reorderedInputs[0], - reorderedInputs[1], - GetGatherAxis(axis, DefaultFormatForDims(op->get_input_shape(0).size())), - outLayout, - CldnnTensorFromIEDims(op->get_output_shape(0))); - - p.AddPrimitive(gatherPrim); - p.AddPrimitiveToProfiler(op); -} - -REGISTER_FACTORY_IMPL(v1, Gather); - -void CreateGatherOp(Program& p, const std::shared_ptr& op) { - p.ValidateInputs(op, {2, 3, 4}); +template +void CreateGatherOpBase(Program& p, const std::shared_ptr& op, const int64_t batch_dim = 0, bool support_neg_ind = false) { auto inputPrimitives = p.GetInputPrimitiveIDs(op); std::string layerName = layer_type_name_ID(op); @@ -136,11 +93,32 @@ void CreateGatherOp(Program& p, const std::shared_ptr& o GetGatherAxis(axis, DefaultFormatForDims(op->get_input_shape(0).size())), outLayout, CldnnTensorFromIEDims(op->get_output_shape(0)), - op->get_batch_dims()); + batch_dim, + 
support_neg_ind); p.AddPrimitive(gatherPrim); p.AddPrimitiveToProfiler(op); } +void CreateGatherOp(Program& p, const std::shared_ptr& op) { + p.ValidateInputs(op, {2, 3}); + CreateGatherOpBase(p, op); +} + +REGISTER_FACTORY_IMPL(v1, Gather); + +void CreateGatherOp(Program& p, const std::shared_ptr& op) { + p.ValidateInputs(op, {2, 3, 4}); + CreateGatherOpBase(p, op, op->get_batch_dims()); +} + REGISTER_FACTORY_IMPL(v7, Gather); + +void CreateGatherOp(Program& p, const std::shared_ptr& op) { + p.ValidateInputs(op, {2, 3, 4}); + CreateGatherOpBase(p, op, op->get_batch_dims(), true); +} + +REGISTER_FACTORY_IMPL(v8, Gather); + } // namespace CLDNNPlugin diff --git a/inference-engine/src/gna_plugin/backend/am_intel_dnn.cpp b/inference-engine/src/gna_plugin/backend/am_intel_dnn.cpp index 6b7774ce402..a3a64c1605b 100644 --- a/inference-engine/src/gna_plugin/backend/am_intel_dnn.cpp +++ b/inference-engine/src/gna_plugin/backend/am_intel_dnn.cpp @@ -25,6 +25,7 @@ #include "dnn_types.h" #include "gna_types.h" #include "gna_limitations.hpp" +#include "layers/gna_convolution_layer.hpp" #if GNA_LIB_VER == 2 #include @@ -50,6 +51,9 @@ using namespace GNAPluginNS::backend; +using GNAPluginNS::GNAConvolutionLayer::outputFromConv; +using GNAPluginNS::GNAConvolutionLayer::outputFromPooling; +using GNAPluginNS::GNAConvolutionLayer::outputFromPoolingLegacy; void GNAPluginNS::backend::AMIntelDNN::BeginNewWrite(uint32_t index) { dump_write_index = index; @@ -152,8 +156,7 @@ void GNAPluginNS::backend::AMIntelDNN::InitConvolutional1DComponentPrivate(intel uint32_t num_bytes_per_bias, uint32_t num_filters, uint32_t num_filter_coefficients, - uint32_t num_feature_map_rows, - uint32_t num_feature_map_columns, + const uint32_t convStride, float weight_scale_factor, float output_scale_factor, void *&ptr_inputs, @@ -177,8 +180,7 @@ void GNAPluginNS::backend::AMIntelDNN::InitConvolutional1DComponentPrivate(intel comp.op.conv1D.num_bytes_per_bias = num_bytes_per_bias; comp.op.conv1D.num_filters = 
num_filters; comp.op.conv1D.num_filter_coefficients = num_filter_coefficients; - comp.op.conv1D.num_feature_map_rows = num_feature_map_rows; - comp.op.conv1D.num_feature_map_columns = num_feature_map_columns; + comp.op.conv1D.convStride = convStride; comp.op.conv1D.weight_scale_factor = weight_scale_factor; comp.output_scale_factor = output_scale_factor; comp.input_scale_factor = output_scale_factor / weight_scale_factor; @@ -195,18 +197,17 @@ void GNAPluginNS::backend::AMIntelDNN::InitConvolutional1DComponentPrivate(intel ptr_outputs = &comp.ptr_outputs; } - if (comp.num_columns_in % 8 != 0) { - THROW_GNA_EXCEPTION << "Number of inputs to Convolutional1DComponent (" << comp.num_columns_in << + if (num_columns_in % 8 != 0) { + THROW_GNA_EXCEPTION << "Number of inputs to Convolutional1DComponent (" << num_columns_in << ") is not a multiply by 8"; } - if (comp.op.conv1D.num_filters < GNALimitations::convMinFiltersNum || - comp.op.conv1D.num_filters > GNALimitations::convMaxFiltersNum || - comp.op.conv1D.num_filters % GNALimitations::convFiltersNumDivider != 0) { - THROW_GNA_EXCEPTION << "Unsupported number of filters in Convolutional1DComponent: " << comp.op.conv1D.num_filters; + if (num_filters < GNALimitations::convMinFiltersNum || + num_filters > GNALimitations::convMaxFiltersNum || + num_filters % GNALimitations::convFiltersNumDivider != 0) { + THROW_GNA_EXCEPTION << "Unsupported number of filters in Convolutional1DComponent: " << num_filters; } - auto filter_stride_size = comp.op.conv1D.num_feature_map_columns; - auto max_number_of_out_elements = (comp.num_columns_in - comp.op.conv1D.num_filter_coefficients) / filter_stride_size + 1; - if (comp.num_columns_out / max_number_of_out_elements != comp.op.conv1D.num_filters) { + auto max_number_of_out_elements = outputFromConv(num_columns_in, num_filter_coefficients, convStride); + if (num_columns_out / max_number_of_out_elements != num_filters) { THROW_GNA_EXCEPTION << "Number of outputs or feature map config is 
incorrect in Convolutional1DComponent"; } } @@ -538,8 +539,7 @@ void GNAPluginNS::backend::AMIntelDNN::WriteGraphWizModel(const char *filename) auto &conv = components[k].op.conv1D; graph << " num_filters" << conv.num_filters<< "\n"; graph << " num_filter_coefficients" << conv.num_filter_coefficients<< "\n"; - graph << " num_feature_map_rows" << conv.num_feature_map_rows<< "\n"; - graph << " num_feature_map_columns" << conv.num_feature_map_columns<< "\n"; + graph << " conv_stride" << conv.convStride<< "\n"; graph << " wscale" << conv.weight_scale_factor<< "\n"; graph << " wbit" << conv.num_bytes_per_weight<< "\n"; graph << " bbit" << conv.num_bytes_per_bias<< "\n"; @@ -936,16 +936,14 @@ void GNAPluginNS::backend::AMIntelDNN::WriteDnnText(const char *filename, intel_ case kDnnConvolutional1dOp: { uint32_t num_filters = component[i].op.conv1D.num_filters; uint32_t num_filter_coefficients = component[i].op.conv1D.num_filter_coefficients; - uint32_t num_feature_map_rows = component[i].op.conv1D.num_feature_map_rows; - uint32_t num_feature_map_columns = component[i].op.conv1D.num_feature_map_columns; + const auto convStride = component[i].op.conv1D.convStride; uint32_t num_bytes_per_weight = component[i].op.conv1D.num_bytes_per_weight; uint32_t num_bytes_per_bias = component[i].op.conv1D.num_bytes_per_bias; float weight_scale_factor = component[i].op.conv1D.weight_scale_factor; float output_scale_factor = component[i].output_scale_factor; out_file << " " << std::dec << num_filters << "\n"; out_file << " " << std::dec << num_filter_coefficients << "\n"; - out_file << " " << std::dec << num_feature_map_rows << "\n"; - out_file << " " << std::dec << num_feature_map_columns << "\n"; + out_file << " " << std::dec << convStride << "\n"; if ((compute_precision_ == kDnnInt) && (logging_precision == kDnnFloat)) { out_file << " " << std::dec << 4 << "\n"; out_file << " " << std::dec << 4 << "\n"; @@ -1362,35 +1360,6 @@ uint32_t GNAPluginNS::backend::AMIntelDNN::CountLayers() { 
return n; } -namespace { -uint32_t outputFromConv(const uint32_t in, const uint32_t flt, const uint32_t stride) { - // floor[(in - flt)/stride] + 1, GNA Spec 1.24 - if (flt > in || flt == 0 || stride == 0) { - THROW_GNA_EXCEPTION << "Invalid (input, filter, stride) = (" << in << "," << flt << "," << stride << ")"; - } - return (in - flt) / stride + 1; -} - -uint32_t outputFromPooling(const uint32_t in, const uint32_t window, const uint32_t stride) { - // ceil[(in - window)/stride] + 1, GNA Spec 1.24 - if (window > in || window == 0 || stride == 0) { - THROW_GNA_EXCEPTION << "Invalid (input, window, stride) = (" << in << "," << window << "," << stride << ")"; - } - if (window == in) return 1; - - return (in - window - 1) / stride + 2; -} - -uint32_t outputFromPoolingLegacy(const uint32_t in, const uint32_t stride) { - // floor[(in - 1)/stride] + 1, GNA 1.0/2.0 HW Spec - if (in == 0 || stride == 0) { - THROW_GNA_EXCEPTION << "Invalid (input, stride) = (" << in << "," << stride << ")"; - } - return (in - 1) / stride + 1; -} - -} // namespace - #if GNA_LIB_VER == 2 void GNAPluginNS::backend::AMIntelDNN::InitGNAStruct(Gna2Model *gnaModel) { Gna2Operation * gnaOperation; @@ -1593,7 +1562,7 @@ void GNAPluginNS::backend::AMIntelDNN::InitGNAStruct(intel_nnet_type_t *ptr_nnet comp.op.conv1D.ptr_biases), nullptr, create_shape1D_parameter( - comp.op.conv1D.num_feature_map_columns), + comp.op.conv1D.convStride), nullptr, nullptr); @@ -1619,11 +1588,11 @@ void GNAPluginNS::backend::AMIntelDNN::InitGNAStruct(intel_nnet_type_t *ptr_nnet pConvolutionalLayer->nBytesBias = component[i].op.conv1D.num_bytes_per_bias; pConvolutionalLayer->nBytesFilterCoefficient = component[i].op.conv1D.num_bytes_per_weight; pConvolutionalLayer->nFilters = component[i].op.conv1D.num_filters; - pConvolutionalLayer->nFilterRows = comp.op.conv1D.num_filter_coefficients / comp.op.conv1D.num_feature_map_columns; + pConvolutionalLayer->nFilterRows = comp.op.conv1D.num_filter_coefficients / 
comp.op.conv1D.convStride; pConvolutionalLayer->nFilterCoefficients = component[i].op.conv1D.num_filter_coefficients; pConvolutionalLayer->nFeatureMaps = 1; - pConvolutionalLayer->nFeatureMapRows = component[i].op.conv1D.num_feature_map_rows; - pConvolutionalLayer->nFeatureMapColumns = component[i].op.conv1D.num_feature_map_columns; + pConvolutionalLayer->nFeatureMapColumns = component[i].op.conv1D.convStride; + pConvolutionalLayer->nFeatureMapRows = pLayer->nInputColumns / pConvolutionalLayer->nFeatureMapColumns; pConvolutionalLayer->poolType = INTEL_NO_POOLING; // will be overwritten pConvolutionalLayer->nPoolSize = 0; // will be overwritten pConvolutionalLayer->nPoolStride = 0; // will be overwritten @@ -1750,8 +1719,7 @@ void GNAPluginNS::backend::AMIntelDNN::InitGNAStruct(intel_nnet_type_t *ptr_nnet auto fltStrideSz = pConvolutionalLayer->nFeatureMaps * pConvolutionalLayer->nFeatureMapColumns; // always move 1 "row" auto outFromConv = outputFromConv(pLayer->nInputColumns, nFltSize, fltStrideSz); // FLAT input matrix, pooled outputs per filter - // TODO: Issue 50386 check why (outFromConv - 1) an not (outFromConv - nPoolSize) - pLayer->nOutputColumns = pConvolutionalLayer->nFilters * ((outFromConv - 1) / pConvolutionalLayer->nPoolStride + 1); + pLayer->nOutputColumns = pConvolutionalLayer->nFilters * outputFromPoolingLegacy(outFromConv, pConvolutionalLayer->nPoolStride); } #endif } else { diff --git a/inference-engine/src/gna_plugin/backend/am_intel_dnn.hpp b/inference-engine/src/gna_plugin/backend/am_intel_dnn.hpp index c4020a2bec9..7dcab94a685 100644 --- a/inference-engine/src/gna_plugin/backend/am_intel_dnn.hpp +++ b/inference-engine/src/gna_plugin/backend/am_intel_dnn.hpp @@ -97,8 +97,7 @@ public: uint32_t num_bytes_per_bias, uint32_t num_filters, uint32_t num_filter_coefficients, - uint32_t num_feature_map_rows, - uint32_t num_feature_map_columns, + uint32_t convStride, float weight_scale_factor, float output_scale_factor, A *&ptr_inputs, @@ -114,8 +113,7 
@@ public: num_bytes_per_bias, num_filters, num_filter_coefficients, - num_feature_map_rows, - num_feature_map_columns, + convStride, weight_scale_factor, output_scale_factor, (void *&) ptr_inputs, @@ -428,8 +426,7 @@ private: uint32_t num_bytes_per_bias, uint32_t num_filters, uint32_t num_filter_coefficients, - uint32_t num_feature_map_rows, - uint32_t num_feature_map_columns, + uint32_t convStride, float weight_scale_factor, float output_scale_factor, void *&ptr_inputs, diff --git a/inference-engine/src/gna_plugin/backend/dnn_types.h b/inference-engine/src/gna_plugin/backend/dnn_types.h index fe1dbdf7839..d08d9346d35 100644 --- a/inference-engine/src/gna_plugin/backend/dnn_types.h +++ b/inference-engine/src/gna_plugin/backend/dnn_types.h @@ -146,8 +146,7 @@ typedef struct { uint32_t num_bytes_per_bias; uint32_t num_filters; uint32_t num_filter_coefficients; - uint32_t num_feature_map_rows; - uint32_t num_feature_map_columns; + uint32_t convStride; float weight_scale_factor; void *ptr_filters; // filters stored one after the other void *ptr_biases; diff --git a/inference-engine/src/gna_plugin/backend/gna_limitations.hpp b/inference-engine/src/gna_plugin/backend/gna_limitations.hpp index 59dd0478cfa..90af0451929 100644 --- a/inference-engine/src/gna_plugin/backend/gna_limitations.hpp +++ b/inference-engine/src/gna_plugin/backend/gna_limitations.hpp @@ -16,6 +16,7 @@ constexpr uint32_t bufferMaxSize = 65528; constexpr uint32_t convMinFiltersNum = 4; constexpr uint32_t convMaxFiltersNum = 65532; constexpr uint32_t convFiltersNumDivider = 4; +constexpr uint32_t convFilterSizeDivider = 8; constexpr uint32_t convFilterMaxSize = 768; constexpr uint32_t convEachKernelByteAlignment = 16; constexpr uint32_t noOfInputsDivisor = 8; diff --git a/inference-engine/src/gna_plugin/frontend/scale_factor_calc.hpp b/inference-engine/src/gna_plugin/frontend/scale_factor_calc.hpp index 11f13a7a9ac..7fe08a571f8 100644 --- a/inference-engine/src/gna_plugin/frontend/scale_factor_calc.hpp 
+++ b/inference-engine/src/gna_plugin/frontend/scale_factor_calc.hpp @@ -1138,7 +1138,7 @@ class ScaleFactorPerLayer { double weights_reducer = 1.0; auto conv = dynamic_cast(wl); - if (conv) { + if (conv && !LayerInfo(conv).isConvolutionFilter()) { const auto inDepth = GetDataDimSize(conv->insData.front().lock(), InferenceEngine::DataDimName::C); weights_reducer = GNAConvolutionLayer::getWeightsReducer(*conv); weights_reducer *= MAX_VAL_2B_FEAT * scaleRange * inDepth / std::numeric_limits::max(); diff --git a/inference-engine/src/gna_plugin/gna2_model_debug_log.cpp b/inference-engine/src/gna_plugin/gna2_model_debug_log.cpp index 4e2de69a351..7fe0e4d9b0b 100644 --- a/inference-engine/src/gna_plugin/gna2_model_debug_log.cpp +++ b/inference-engine/src/gna_plugin/gna2_model_debug_log.cpp @@ -390,6 +390,7 @@ void DumpGna2Model(const Gna2Model& gnaModel, const std::string dumpFolderNameGN dumpFile << "\tOperand " << j << " (" << GetOperandName(operation.Type, j) << ")" << " type: " << GetOperandType(operand.Type) << " shape: " << GetSimpleString(operand.Shape) << + " data: " << operand.Data << " layout: "; DumpCharArray(dumpFile, operand.Layout, GNA2_SHAPE_MAXIMUM_NUMBER_OF_DIMENSIONS); diff --git a/inference-engine/src/gna_plugin/gna_graph_compiler.cpp b/inference-engine/src/gna_plugin/gna_graph_compiler.cpp index 53d3c98a432..51a429d9a33 100644 --- a/inference-engine/src/gna_plugin/gna_graph_compiler.cpp +++ b/inference-engine/src/gna_plugin/gna_graph_compiler.cpp @@ -162,7 +162,7 @@ void GNAGraphCompiler::fillSplitConnections(InferenceEngine::CNNLayerPtr layer) InferenceEngine::details::product(begin(dataOutput->getDims()), end(dataOutput->getDims())) * dataOutput->getPrecision().size(); - if (LayerInfo(outFunctionalLayer.first).isAffineFilter()) { + if (LayerInfo(outFunctionalLayer.first).isConvolutionFilter()) { size_t aligned64_offset = outFunctionalLayer.first->GetParamAsInt("offset"); layerInfoItem.splitOutputLayers.emplace_back( outFunctionalLayer.first, @@ 
-351,37 +351,33 @@ void GNAGraphCompiler::finalizeConvolution1DPrimitive(InferenceEngine::CNNLayerP } // have to pad input to let last kernel meets it's corresponding input - uint32_t num_inputs = in_width * in_channels; + const auto num_inputs = in_width * in_channels; uint32_t num_input_padding = ALIGN(num_inputs, 8) - num_inputs; // convert to 2D and set GNA input feature map size - uint32_t effectiveStride = convolution._stride_x * convolution._stride_y; + auto convStride = convolution._stride_x * convolution._stride_y; if (convolution._stride_y != 1) { - effectiveStride = convolution._stride_x; + convStride = convolution._stride_x; } else if (in_width == 1 && convolution._stride_x != 1) { - effectiveStride = convolution._stride_y; + convStride = convolution._stride_y; } - uint32_t num_feature_map_columns = in_channels * effectiveStride; - - uint32_t num_feature_map_rows = (in_channels * in_width) / num_feature_map_columns; + const auto effectiveStride = in_channels * convStride; uint32_t num_filters = convolution._out_depth; uint32_t num_filter_coefficients = single_conv_kernel_size + num_conv_kernel_padding; uint32_t num_columns_in = num_inputs + num_input_padding; - uint32_t num_columns_out = (((num_inputs - num_filter_coefficients) / num_feature_map_columns) + 1) * convolution._out_depth; - uint32_t num_columns_out_unpadded = (((num_inputs - single_conv_kernel_size) / num_feature_map_columns) + 1) * convolution._out_depth; + uint32_t num_columns_out = (((num_inputs - num_filter_coefficients) / effectiveStride) + 1) * convolution._out_depth; + uint32_t num_columns_out_unpadded = (((num_inputs - single_conv_kernel_size) / effectiveStride) + 1) * convolution._out_depth; - uint32_t original_num_feature_map_rows = num_feature_map_rows; uint32_t original_input_padding = num_input_padding; uint32_t additional_padding = 0; // if kernel padding to multiple of 8 will cause missed outputs, need to pad further while (num_columns_out < out_batch * out_channels * 
out_width) { num_input_padding = original_input_padding + additional_padding; - num_feature_map_rows = original_num_feature_map_rows + (num_input_padding) / num_feature_map_columns; num_columns_in = num_inputs + num_input_padding; - num_columns_out = (((num_inputs + num_input_padding - num_filter_coefficients) / num_feature_map_columns) + 1) * convolution._out_depth; + num_columns_out = (((num_inputs + num_input_padding - num_filter_coefficients) / effectiveStride) + 1) * convolution._out_depth; dnn->new_num_conv_columns = num_columns_out; additional_padding += 8; } @@ -427,8 +423,7 @@ void GNAGraphCompiler::finalizeConvolution1DPrimitive(InferenceEngine::CNNLayerP num_bytes_per_bias, num_filters, num_filter_coefficients, - num_feature_map_rows, - num_feature_map_columns, + effectiveStride, weight_scale_factor, output_scale_factor, ptr_inputs, @@ -457,8 +452,8 @@ void GNAGraphCompiler::finalizeConvolution1DPrimitive(InferenceEngine::CNNLayerP if (inputs->getLayout() != Layout::NHWC && LayerInfo(connectedInputLayer).isInput()) { // Kaldi features are opposite orientation dnn->do_rotate_input = true; - dnn->num_rotate_rows = num_feature_map_columns; - dnn->num_rotate_columns = original_num_feature_map_rows; + dnn->num_rotate_rows = effectiveStride; + dnn->num_rotate_columns = num_inputs / effectiveStride; } else { dnn->do_rotate_input = false; } @@ -559,20 +554,10 @@ void GNAGraphCompiler::finalizeConvolution2DPrimitive(InferenceEngine::CNNLayerP const auto outputs = convolution.outData.front(); // have to pad input to let last kernel meets it's corresponding input - uint32_t num_inputs = in_width * in_height * in_channels; + const auto num_inputs = in_width * in_height * in_channels; uint32_t num_input_padding = ALIGN(num_inputs, 8) - num_inputs; - // convert to 2D and set GNA input feature map size - uint32_t num_feature_map_columns = in_channels * convolution._stride_x * convolution._stride_y; - if (in_height == 1 && convolution._stride_y != 1) { - 
num_feature_map_columns = in_channels * convolution._stride_x; - } else if (in_width == 1 && convolution._stride_x != 1) { - num_feature_map_columns = in_channels * convolution._stride_y; - } - uint32_t num_feature_map_rows = (in_channels * in_height * in_width) / num_feature_map_columns; - const uint32_t filter_n = convolution._out_depth; - uint32_t original_num_feature_map_rows = num_feature_map_rows; // if kernel padding to multiple of 8 will cause missed outputs, need to pad further if (num_input_padding == 0) { @@ -638,15 +623,17 @@ void GNAGraphCompiler::finalizeConvolution2DPrimitive(InferenceEngine::CNNLayerP auto connectedInputLayer = connectInput(layer, ptr_inputs, num_data_bytes_in).input; // TODO: convolution might be not the first layer in sorted order but connected via split for example - dont know how kaldi will handle that - if (!dnn->do_rotate_input) { - if (inputs->getLayout() != Layout::NHWC && LayerInfo(connectedInputLayer).isInput()) { - // Kaldi features are opposite orientation - dnn->do_rotate_input = true; - dnn->num_rotate_rows = num_feature_map_columns; - dnn->num_rotate_columns = original_num_feature_map_rows; - } else { - dnn->do_rotate_input = false; + if (!dnn->do_rotate_input && inputs->getLayout() != Layout::NHWC && LayerInfo(connectedInputLayer).isInput()) { + // Kaldi features are opposite orientation + dnn->do_rotate_input = true; + dnn->num_rotate_rows = in_channels; + if (in_height != 1) { + dnn->num_rotate_rows *= convolution._stride_y; } + if (in_width != 1) { + dnn->num_rotate_rows *= convolution._stride_x; + } + dnn->num_rotate_columns = num_inputs / dnn->num_rotate_rows; } connectOutput(layer, ptr_outputs, num_data_bytes_out); @@ -654,7 +641,7 @@ void GNAGraphCompiler::finalizeConvolution2DPrimitive(InferenceEngine::CNNLayerP const auto kernelHW = convolution._kernel_y * convolution._kernel_x; std::vector transposedWeights; - const auto singleKernelSize = in_channels* kernelHW* convolution.precision.size(); + const auto 
singleKernelSize = in_channels* kernelHW * convolution.precision.size(); const auto kernelPad = Gna2RoundUp(singleKernelSize, 16) - singleKernelSize; for (uint32_t k = 0; k < convolution._out_depth; k++) { uint8_t* ptr_filt_current @@ -1728,8 +1715,8 @@ void GNAGraphCompiler::ConcatAlignFilterPrimitive(InferenceEngine::CNNLayerPtr l } } -void GNAGraphCompiler::AffineFilterPrimitive(InferenceEngine::CNNLayerPtr layer) { - auto filterLayer = dynamic_cast (layer.get()); +void GNAGraphCompiler::ConvolutionFilterPrimitive(InferenceEngine::CNNLayerPtr layer) { + auto filterLayer = dynamic_cast (layer.get()); if (filterLayer == nullptr) { return; @@ -1752,62 +1739,57 @@ void GNAGraphCompiler::AffineFilterPrimitive(InferenceEngine::CNNLayerPtr layer) auto outputs = *layer->outData.begin(); auto inputs = layer->insData.begin()->lock(); - const uint32_t noOfInputsDivisor = gnaFlags->input_low_precision ? + const auto noOfInputsDivisor = gnaFlags->input_low_precision ? GNALimitations::noOfInputsLowPrecDivisor : GNALimitations::noOfInputsDivisor; - uint32_t num_columns_in = GetDataDimSize(inputs, 2); - uint32_t num_rows_out = GetDataDimSize(outputs, 1); - uint32_t num_rows_in = filterLayer->_weights->size() / num_rows_out; + const uint32_t orginalInputSize = GetDataDimSize(inputs, 1); + const uint32_t orginalOutputSize = GetDataDimSize(outputs, 1); + if (orginalInputSize != orginalOutputSize) { + THROW_GNA_LAYER_EXCEPTION(filterLayer) << "Number in inputs (" << orginalInputSize << + ") should be equal to number of outputs (" << orginalOutputSize << ")!"; + } + const auto numberOfFilters = filterLayer->_out_depth; + const auto convolutionStride = numberOfFilters; + const auto filterWidth = filterLayer->_kernel_x; + const auto minOutputsPerFilter = ALIGN(orginalOutputSize, numberOfFilters) / numberOfFilters; + const auto minInputsNeeded = (minOutputsPerFilter - 1) * convolutionStride + filterWidth; + const auto numInputsFullyPadedAndAligned = ALIGN(minInputsNeeded, 
noOfInputsDivisor); - uint32_t num_padding = ALIGN(num_rows_in, noOfInputsDivisor) - num_rows_in; - auto biasPrecision = filterLayer->_biases ? filterLayer->_biases->getTensorDesc().getPrecision() : outputs->getPrecision(); + auto numOutputs = GNAConvolutionLayer::outputFromConv(numInputsFullyPadedAndAligned, filterWidth, convolutionStride); + numOutputs *= numberOfFilters; + const auto& biasPrecision = filterLayer->_biases ? filterLayer->_biases->getTensorDesc().getPrecision() : outputs->getPrecision(); auto& currentComponent = dnnComponents.addComponent(layer->name, "affine"); - dnn->InitAffineComponent(currentComponent, - num_rows_in + num_padding, - num_columns_in, - num_rows_out, + layer->params["num_rows_for_pwl"] = std::to_string(numOutputs); + dnn->InitConvolutional1DComponent(currentComponent, + numInputsFullyPadedAndAligned, + numOutputs, inputs->getPrecision().size(), outputs->getPrecision().size(), filterLayer->_weights->getTensorDesc().getPrecision().size(), biasPrecision.size(), + numberOfFilters, + filterWidth, + convolutionStride, quantized == nullptr ? 1 : quantized->_weights_quant.GetScale(), quantized == nullptr ? 
1 : quantized->_dst_quant.GetScale(), ptr_inputs, ptr_outputs, ptr_weights, - ptr_biases, - false); + ptr_biases); size_t num_data_bytes_out = InferenceEngine::details::product( begin(outputs->getDims()), end(outputs->getDims())) * 4; - size_t num_data_bytes_in = num_columns_in * - ALIGN(num_rows_in, noOfInputsDivisor) * inputs->getPrecision().size(); + size_t num_data_bytes_in = numInputsFullyPadedAndAligned * inputs->getPrecision().size(); connectInput(layer, ptr_inputs, num_data_bytes_in, 0, 0); connectOutput(layer, ptr_outputs, num_data_bytes_out); - if (num_padding == 0) { - gnamem->readonly().push_ptr(ptr_weights, - filterLayer->_weights->cbuffer().as(), - filterLayer->_weights->byteSize(), - 64); - } else { - auto elementsIn = (num_rows_in + num_padding) * num_columns_in; - auto paddedWeights = elementsIn * num_rows_out; - auto paddedWeightsSize = paddedWeights * filterLayer->precision.size(); - - gnamem->readonly().push_initializer(ptr_weights, paddedWeightsSize, [=](void* data, size_t size) { - size_t offset = 0; - for (uint32_t i = 0; i < num_rows_out && size >= offset; i++) { - ie_memcpy(reinterpret_cast(data) + offset, size - offset, - filterLayer->_weights->cbuffer().as() + num_rows_in * i * filterLayer->precision.size(), - num_rows_in* filterLayer->precision.size()); - offset += (num_rows_in + num_padding) * filterLayer->precision.size(); - } - }, 64); - } + gnamem->readonly().push_ptr(ptr_weights, + filterLayer->_weights->cbuffer().as(), + filterLayer->_weights->byteSize(), + 64); if (filterLayer->_biases) { gnamem->readonly().push_ptr(ptr_biases, @@ -1815,7 +1797,7 @@ void GNAGraphCompiler::AffineFilterPrimitive(InferenceEngine::CNNLayerPtr layer) filterLayer->_biases->byteSize(), 64); } else { - gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64); + gnamem->readonly().push_value(ptr_biases, 0.0f, numberOfFilters, 64); } } @@ -1878,13 +1860,18 @@ void GNAGraphCompiler::PWLPrimitive(InferenceEngine::CNNLayerPtr layer) { } // TODO: solve 
this by layer level transformations - auto concatAlignFilter = CNNNetPrevLayer(layer, 0); - if (LayerInfo(concatAlignFilter).isConcatAlignFilter()) { - auto rowsCopiedOffset = concatAlignFilter->GetParamAsInt("rows_copied_offset"); + auto prevLayer = CNNNetPrevLayer(layer, 0); + if (LayerInfo(prevLayer).isConcatAlignFilter()) { + auto rowsCopiedOffset = prevLayer->GetParamAsInt("rows_copied_offset"); if (rowsCopiedOffset != 0) { num_rows -= rowsCopiedOffset / outputs->getPrecision().size(); layer->params["output_offset"] = std::to_string(rowsCopiedOffset); } + } else if (LayerInfo(prevLayer).isConvolutionFilter()) { + const auto num_rows_for_pwl = prevLayer->GetParamAsInt("num_rows_for_pwl", 0); + if (num_rows_for_pwl != 0) { + num_rows = num_rows_for_pwl; + } } size_t num_data_bytes_out = num_columns * num_rows * outputs->getPrecision().size(); size_t num_data_bytes_in = num_columns * num_rows * inputs->getPrecision().size(); @@ -2135,7 +2122,7 @@ void GNAGraphCompiler::CreateLayerPrimitive(CNNLayerPtr layer) { {{"FullyConnected", "InnerProduct"}, CREATE(AffinePrimitive)}, {{"Gemm"}, CREATE(GemmPrimitive)}, {{"ScaleShift"}, CREATE(DiagonalPrimitive)}, - {{"AffineFilter"}, CREATE(AffineFilterPrimitive)}, + {{"ConvolutionFilter"}, CREATE(ConvolutionFilterPrimitive)}, {{"ConcatAlignFilter"}, CREATE(ConcatAlignFilterPrimitive)}, {{"Const"}, CREATE(ConstPrimitive)}, {{"Eltwise"}, CREATE(EltwisePrimitive)}, // same as diagonal while weights are not taken from network, rather than from another output diff --git a/inference-engine/src/gna_plugin/gna_graph_compiler.hpp b/inference-engine/src/gna_plugin/gna_graph_compiler.hpp index d761d917392..5aebc3aa158 100644 --- a/inference-engine/src/gna_plugin/gna_graph_compiler.hpp +++ b/inference-engine/src/gna_plugin/gna_graph_compiler.hpp @@ -108,7 +108,7 @@ public: void CreateLayerPrimitive(InferenceEngine::CNNLayerPtr); void AffinePrimitive(InferenceEngine::CNNLayerPtr, bool isDiag = false); - void 
AffineFilterPrimitive(InferenceEngine::CNNLayerPtr); + void ConvolutionFilterPrimitive(InferenceEngine::CNNLayerPtr); void ConcatAlignFilterPrimitive(InferenceEngine::CNNLayerPtr); void DiagonalPrimitive(InferenceEngine::CNNLayerPtr); void ConstPrimitive(InferenceEngine::CNNLayerPtr); diff --git a/inference-engine/src/gna_plugin/layers/gna_convolution_layer.cpp b/inference-engine/src/gna_plugin/layers/gna_convolution_layer.cpp new file mode 100644 index 00000000000..f226138251b --- /dev/null +++ b/inference-engine/src/gna_plugin/layers/gna_convolution_layer.cpp @@ -0,0 +1,79 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "gna_convolution_layer.hpp" + +#include +#include +#include +#include +#include + +#include +#include "gna_graph_tools.hpp" +#include "gna_plugin_log.hpp" + +namespace GNAPluginNS { +namespace GNAConvolutionLayer { +bool isMappableFrom2DTo1D(const uint32_t inHeight, const uint32_t inWidth, const uint32_t kernelWidth, const uint32_t strideWidth) { + return inHeight > 1 && inWidth > 1 && inWidth == kernelWidth && strideWidth == 1; +} + +// 3D input or 2D kernel +bool isConv2D(const uint32_t inHeight, const uint32_t inWidth, const uint32_t inDepth, + const uint32_t kernelHeight, const uint32_t kernelWidth) { + return (kernelHeight > 1 && kernelWidth > 1) || (inHeight > 1 && inWidth > 1 && inDepth > 1); +} + +double getWeightsReducer(InferenceEngine::ConvolutionLayer& conv) { + using KRT = std::pair; + // Empirically determined weights reducers for 2D Convolution + // i.e.: + // for kernelSize >= 9 -> 1.3 + // for kernelSize in {7, 8} -> 1.2 + const std::vector< KRT > reducers{ {9, 1.3}, {7, 1.2} }; + auto reducer = 1.0; + const auto inDepth = GetDataDimSize(conv.insData.front().lock(), InferenceEngine::DataDimName::C); + const auto inHeight = GetDataDimSize(conv.insData.front().lock(), InferenceEngine::DataDimName::H); + const auto inWidth = GetDataDimSize(conv.insData.front().lock(), 
InferenceEngine::DataDimName::W); + if (isConv2D(inHeight, inWidth, inDepth, conv._kernel_y, conv._kernel_x) && + !isMappableFrom2DTo1D(inHeight, inWidth, conv._kernel_x, conv._stride_x)) { + const auto kernelSize = conv._kernel_x * conv._kernel_y; + auto r = std::lower_bound(reducers.begin(), reducers.end(), kernelSize, + [](const KRT& l, const KRT::first_type& r) {return l.first > r; }); + if (r != reducers.end()) + reducer = r->second; + } + return reducer; +} + +uint32_t outputFromConv(const uint32_t in, const uint32_t flt, const uint32_t stride) { + // floor[(in - flt)/stride] + 1, GNA Spec 1.24 + if (flt > in || flt == 0 || stride == 0) { + THROW_GNA_EXCEPTION << "Invalid (input, filter, stride) = (" << in << "," << flt << "," << stride << ")"; + } + return (in - flt) / stride + 1; +} + +uint32_t outputFromPooling(const uint32_t in, const uint32_t window, const uint32_t stride) { + // ceil[(in - window)/stride] + 1, GNA Spec 1.24 + if (window > in || window == 0 || stride == 0) { + THROW_GNA_EXCEPTION << "Invalid (input, window, stride) = (" << in << "," << window << "," << stride << ")"; + } + if (window == in) return 1; + + return (in - window - 1) / stride + 2; +} + +uint32_t outputFromPoolingLegacy(const uint32_t in, const uint32_t stride) { + // floor[(in - 1)/stride] + 1, GNA 1.0/2.0 HW Spec + // See issue 50386 for details + if (in == 0 || stride == 0) { + THROW_GNA_EXCEPTION << "Invalid (input, stride) = (" << in << "," << stride << ")"; + } + return (in - 1) / stride + 1; +} + +} // namespace GNAConvolutionLayer +} // namespace GNAPluginNS diff --git a/inference-engine/src/gna_plugin/layers/gna_convolution_layer.hpp b/inference-engine/src/gna_plugin/layers/gna_convolution_layer.hpp index e83d9b6c535..1ed7125b633 100644 --- a/inference-engine/src/gna_plugin/layers/gna_convolution_layer.hpp +++ b/inference-engine/src/gna_plugin/layers/gna_convolution_layer.hpp @@ -4,46 +4,25 @@ #pragma once -#include -#include -#include -#include +#include #include 
-#include "../gna_graph_tools.hpp" namespace GNAPluginNS { -struct GNAConvolutionLayer { - static bool isMappableFrom2DTo1D(const uint32_t inHeight, const uint32_t inWidth, const uint32_t kernelWidth, const uint32_t strideWidth) { - return inHeight > 1 && inWidth > 1 && inWidth == kernelWidth && strideWidth == 1; - } +namespace GNAConvolutionLayer { +bool isMappableFrom2DTo1D(const uint32_t inHeight, const uint32_t inWidth, const uint32_t kernelWidth, const uint32_t strideWidth); - // 3D input or 2D kernel - static bool isConv2D(const uint32_t inHeight, const uint32_t inWidth, const uint32_t inDepth, - const uint32_t kernelHeight, const uint32_t kernelWidth) { - return (kernelHeight > 1 && kernelWidth > 1) || (inHeight > 1 && inWidth > 1 && inDepth > 1); - } +// 3D input or 2D kernel +bool isConv2D(const uint32_t inHeight, const uint32_t inWidth, const uint32_t inDepth, + const uint32_t kernelHeight, const uint32_t kernelWidth); - static double getWeightsReducer(InferenceEngine::ConvolutionLayer& conv) { - using KRT = std::pair; - // Empirically determined weights reducers for 2D Convolution - // i.e.: - // for kernelSize >= 9 -> 1.3 - // for kernelSize in {7, 8} -> 1.2 - const std::vector< KRT > reducers{ {9, 1.3}, {7, 1.2} }; - auto reducer = 1.0; - const auto inDepth = GetDataDimSize(conv.insData.front().lock(), InferenceEngine::DataDimName::C); - const auto inHeight = GetDataDimSize(conv.insData.front().lock(), InferenceEngine::DataDimName::H); - const auto inWidth = GetDataDimSize(conv.insData.front().lock(), InferenceEngine::DataDimName::W); - if (isConv2D(inHeight, inWidth, inDepth, conv._kernel_y, conv._kernel_x) && - !isMappableFrom2DTo1D(inHeight, inWidth, conv._kernel_x, conv._stride_x)) { - const auto kernelSize = conv._kernel_x * conv._kernel_y; - auto r = std::lower_bound(reducers.begin(), reducers.end(), kernelSize, - [](const KRT& l, const KRT::first_type& r) {return l.first > r; }); - if (r != reducers.end()) - reducer = r->second; - } - return 
reducer; - } -}; -} // namespace GNAPluginNS +double getWeightsReducer(InferenceEngine::ConvolutionLayer& conv); + +uint32_t outputFromConv(const uint32_t in, const uint32_t flt, const uint32_t stride); + +uint32_t outputFromPooling(const uint32_t in, const uint32_t window, const uint32_t stride); + +uint32_t outputFromPoolingLegacy(const uint32_t in, const uint32_t stride); + +} // namespace GNAConvolutionLayer +} // namespace GNAPluginNS diff --git a/inference-engine/src/gna_plugin/layers/gna_layer_info.hpp b/inference-engine/src/gna_plugin/layers/gna_layer_info.hpp index 93fb4417dc7..53362a2d702 100644 --- a/inference-engine/src/gna_plugin/layers/gna_layer_info.hpp +++ b/inference-engine/src/gna_plugin/layers/gna_layer_info.hpp @@ -70,6 +70,7 @@ class LayerInfo { [this]() { return isFullyConnected(); }, [this]() { return isAffineFilter(); }, [this]() { return isConcatAlignFilter(); }, + [this]() { return isConvolutionFilter(); }, [this]() { return isEltwise(); }, [this]() { return isScaleShift(); }, [this]() { return isConvolution(); }, @@ -157,6 +158,9 @@ class LayerInfo { bool isAffineFilter() const noexcept { return isOfType("AffineFilter"); } + bool isConvolutionFilter() const noexcept { + return isOfType("ConvolutionFilter"); + } bool isRelu() const noexcept { return isOfType("relu"); } diff --git a/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp b/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp index f16645ae6cb..ae731465025 100644 --- a/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp +++ b/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp @@ -41,6 +41,7 @@ #include "gna_data_types.hpp" #include "gna_tensor_tools.hpp" #include "gna_itt.hpp" +#include "backend/gna_limitations.hpp" using namespace InferenceEngine; using namespace InferenceEngine::details; @@ -1277,35 +1278,49 @@ void InsertSplitAligningFilterPass::run() { gnalog() << std::endl; #endif auto filterLayer = - 
std::make_shared(LayerParams({filterName, "AffineFilter", Precision::FP32})); + std::make_shared(LayerParams({filterName, "ConvolutionFilter", Precision::FP32})); auto inputData = splitOutput; size_t aligned64_offset = std::max(0, static_cast(ALIGN64(currentOffset) - 64)); - size_t - newOutputSize = (currentOffset + ALIGN(outputSize, 8) * bytesPerSplitElement - aligned64_offset) - / bytesPerSplitElement; IE_ASSERT(filterLayer != nullptr); // encodes offset to beginning of split layer input filterLayer->params["offset"] = std::to_string(aligned64_offset / bytesPerSplitElement); - auto dims = splitOutput->getTensorDesc().getDims(); if (dims.size() > 3) { THROW_GNA_EXCEPTION << "unsupported split layer dims size: " << dims.size(); } - auto num_rows_out = dims[1] * (dims.size() != 2 ? dims[2] : 1); - std::vector filterWeights(newOutputSize * num_rows_out, 0.f); + const auto offsetOfUnalignment = (currentOffset - aligned64_offset) / bytesPerSplitElement; + // TODO consider using a different number of filters to decrease the number of trailing zeros (additionalPaddingOfFilter) + const auto numberOfFilters = GNALimitations::convMinFiltersNum; + const auto filterSize = ALIGN(offsetOfUnalignment + numberOfFilters, GNALimitations::convFilterSizeDivider); - auto offset = (currentOffset - aligned64_offset) / bytesPerSplitElement; - - for (int i = 0; i != outputSize; i++) { - filterWeights[offset] = 1.0f; - offset += newOutputSize + 1; + // filterWeights: numberOfFilters X (offsetOfUnalignment + additionalPaddingOfFilter + numberOfFilters) + // offsetOfUnalignment - the leading zeros in the filter + // | + // | additionalPaddingOfFilter = filterSize - offsetOfUnalignment - numberOfFilters + // ____|___ ___|___ + // | | | | + // 0 0 ... 0 1 0 0 0 0 ... 0 + // 0 0 ... 0 0 1 0 0 0 ... 0 + // 0 0 ... 0 0 0 1 0 0 ... 0 + // 0 0 ... 0 0 0 0 1 0 ... 
0 + std::vector filterWeights(filterSize * 4, 0.f); + for (auto f = 0u; f < numberOfFilters; f++) { + filterWeights[f * filterSize + f + offsetOfUnalignment] = 1; } + filterLayer->_out_depth = numberOfFilters; + filterLayer->_stride_x = numberOfFilters; + filterLayer->_stride_y = 1; + filterLayer->_kernel_x = filterSize; + filterLayer->_kernel_y = 1; + filterLayer->_padding_x = 0; + filterLayer->_padding_y = 0; + filterLayer->_weights = make_shared_blob(TensorDesc( inputData->getTensorDesc().getPrecision(), SizeVector({filterWeights.size()}), @@ -1313,6 +1328,15 @@ void InsertSplitAligningFilterPass::run() { filterLayer->_weights->allocate(); CopyVectorToBlob(filterLayer->_weights, filterWeights); + std::vector biasWeights(numberOfFilters, 0.f); + + filterLayer->_biases = make_shared_blob(TensorDesc( + inputData->getTensorDesc().getPrecision(), + SizeVector({ biasWeights.size() }), + Layout::C)); + filterLayer->_biases->allocate(); + CopyVectorToBlob(filterLayer->_biases, biasWeights); + auto outData = std::make_shared(filterName, TensorDesc(splitOutput->getTensorDesc().getPrecision(), splitOutput->getTensorDesc().getDims(), diff --git a/inference-engine/src/gna_plugin/runtime/cnn.cpp b/inference-engine/src/gna_plugin/runtime/cnn.cpp index e4824b84b06..2e0071040f9 100644 --- a/inference-engine/src/gna_plugin/runtime/cnn.cpp +++ b/inference-engine/src/gna_plugin/runtime/cnn.cpp @@ -12,7 +12,9 @@ #include "backend/dnn_types.h" #include "backend/gna_limitations.hpp" #include "gna_lib_ver_selector.hpp" +#include "layers/gna_convolution_layer.hpp" +using namespace GNAPluginNS::GNAConvolutionLayer; void CNNFilter32(intel_dnn_component_t *component) { auto filters = reinterpret_cast(component->op.conv1D.ptr_filters); @@ -20,11 +22,10 @@ void CNNFilter32(intel_dnn_component_t *component) { auto input = reinterpret_cast(component->ptr_inputs); auto output = reinterpret_cast(component->ptr_outputs); - const auto convolutionStride = 
component->op.conv1D.num_feature_map_columns; + const auto convolutionStride = component->op.conv1D.convStride; const auto filterSize = component->op.conv1D.num_filter_coefficients; const auto numberOfInputs = component->num_columns_in; - // TODO: reuse outputFromConv() from backend\am_intel_dnn.cpp - const auto numberOfOutputsPerFilter = (numberOfInputs - filterSize) / convolutionStride + 1; + const auto numberOfOutputsPerFilter = outputFromConv(numberOfInputs, filterSize, convolutionStride); const auto numberOfFilters = component->op.conv1D.num_filters; std::string layer_name; diff --git a/inference-engine/src/inference_engine/ie_core.cpp b/inference-engine/src/inference_engine/ie_core.cpp index 9582c12f941..3c101ea4271 100644 --- a/inference-engine/src/inference_engine/ie_core.cpp +++ b/inference-engine/src/inference_engine/ie_core.cpp @@ -775,7 +775,7 @@ public: } /** - * @brief Porvides a list of plugin names in registry; physically such plugins may not be created + * @brief Provides a list of plugin names in registry; physically such plugins may not be created * @return A list of plugin names */ std::vector GetListOfDevicesInRegistry() const { diff --git a/inference-engine/src/inference_engine/os/win/win_shared_object_loader.cpp b/inference-engine/src/inference_engine/os/win/win_shared_object_loader.cpp index ec0d5c1526a..31c148ec111 100644 --- a/inference-engine/src/inference_engine/os/win/win_shared_object_loader.cpp +++ b/inference-engine/src/inference_engine/os/win/win_shared_object_loader.cpp @@ -98,7 +98,7 @@ class SharedObjectLoader::Impl { // Exclude current directory from DLL search path process wise. // If application specific path was configured before then // current directory is already excluded. - // GetDLLDirectory does not distinguish if aplication specific + // GetDLLDirectory does not distinguish if application specific // path was set to "" or NULL so reset it to "" to keep // application safe. 
void ExcludeCurrentDirectoryA() { diff --git a/inference-engine/src/offline_transformations/src/pruning/init_const_mask.cpp b/inference-engine/src/offline_transformations/src/pruning/init_const_mask.cpp index 01e9520082b..b27c28e772d 100644 --- a/inference-engine/src/offline_transformations/src/pruning/init_const_mask.cpp +++ b/inference-engine/src/offline_transformations/src/pruning/init_const_mask.cpp @@ -40,6 +40,7 @@ ngraph::pass::InitConstMask::InitConstMask(const ngraph::AxisSet & dims, end[dim] = value + 1; bool skip_dim_value = false; + NGRAPH_SUPPRESS_DEPRECATED_START CoordinateTransform iter(shape, begin, end); for (const Coordinate & coord : iter) { if (!condition(values.at(iter.index(coord)))) { @@ -47,6 +48,7 @@ ngraph::pass::InitConstMask::InitConstMask(const ngraph::AxisSet & dims, break; } } + NGRAPH_SUPPRESS_DEPRECATED_END if (!skip_dim_value) { mask->at(dim).insert(value); } diff --git a/inference-engine/src/readers/ir_reader_v7/CMakeLists.txt b/inference-engine/src/readers/ir_reader_v7/CMakeLists.txt index b79f630ed9d..4ee39f31358 100644 --- a/inference-engine/src/readers/ir_reader_v7/CMakeLists.txt +++ b/inference-engine/src/readers/ir_reader_v7/CMakeLists.txt @@ -16,7 +16,7 @@ source_group("src" FILES ${LIBRARY_SRC}) # Create module library -add_library(${TARGET_NAME} MODULE EXCLUDE_FROM_ALL ${LIBRARY_SRC}) +add_library(${TARGET_NAME} MODULE ${LIBRARY_SRC}) ie_faster_build(${TARGET_NAME} UNITY diff --git a/inference-engine/src/transformations/include/transformations/common_optimizations/eliminate_unsqueeze_gather.hpp b/inference-engine/src/transformations/include/transformations/common_optimizations/eliminate_unsqueeze_gather.hpp index 141ef7d774d..012d78a4774 100644 --- a/inference-engine/src/transformations/include/transformations/common_optimizations/eliminate_unsqueeze_gather.hpp +++ b/inference-engine/src/transformations/include/transformations/common_optimizations/eliminate_unsqueeze_gather.hpp @@ -14,6 +14,7 @@ namespace ngraph { 
namespace pass { class TRANSFORMATIONS_API EliminateUnsqueezeGather; +class TRANSFORMATIONS_API EliminateGatherUnsqueeze; } // namespace pass } // namespace ngraph @@ -29,3 +30,15 @@ public: NGRAPH_RTTI_DECLARATION; EliminateUnsqueezeGather(); }; + +/** + * @ingroup ie_transformation_common_api + * @brief Remove Gather -> Unsqueeze pair, if Gather takes a scalar and + * Unsqueeze makes it a 1D tensor + */ + +class ngraph::pass::EliminateGatherUnsqueeze : public ngraph::pass::MatcherPass { +public: + NGRAPH_RTTI_DECLARATION; + EliminateGatherUnsqueeze(); +}; diff --git a/inference-engine/src/transformations/include/transformations/common_optimizations/simplify_shape_of_sub_graph.hpp b/inference-engine/src/transformations/include/transformations/common_optimizations/simplify_shape_of_sub_graph.hpp new file mode 100644 index 00000000000..85d8aa662da --- /dev/null +++ b/inference-engine/src/transformations/include/transformations/common_optimizations/simplify_shape_of_sub_graph.hpp @@ -0,0 +1,60 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +#include + +#include +#include +#include +#include + +namespace ngraph { +namespace pass { + +class TRANSFORMATIONS_API SimplifyShapeOfSubGraph; +class TRANSFORMATIONS_API SharedShapeOf; +class TRANSFORMATIONS_API GroupedGatherElimination; + +} // namespace pass +} // namespace ngraph + + +/** + * @ingroup ie_transformation_common_api + * @brief SharedShapeOf transformation replaces group of ShapeOf + * operations with the first ShapeOf in this group. All ShapeOfs in this group + * must be equal and consume the same output port. 
+ */ +class ngraph::pass::SharedShapeOf: public ngraph::pass::FunctionPass { +public: + NGRAPH_RTTI_DECLARATION; + bool run_on_function(std::shared_ptr f) override; +}; + +/** + * @ingroup ie_transformation_common_api + * @brief GroupedGatherElimination transformation replaces group of Gather + * operations with the first Gather in this group and updated indices input + * in case all Gathers in the group are consumed by the same Concat in incremental order. + */ +class ngraph::pass::GroupedGatherElimination: public ngraph::pass::MatcherPass { +public: + NGRAPH_RTTI_DECLARATION; + GroupedGatherElimination(); +}; + +/** + * @ingroup ie_transformation_common_api + * @brief SimplifyShapeOfSubGraph transformation runs specific optimizations of shape sub-graphs + */ +class ngraph::pass::SimplifyShapeOfSubGraph: public ngraph::pass::FunctionPass { +public: + NGRAPH_RTTI_DECLARATION; + bool run_on_function(std::shared_ptr f) override; +}; diff --git a/inference-engine/src/transformations/include/transformations/op_conversions/convert_deformable_conv_v8_to_v1.hpp b/inference-engine/src/transformations/include/transformations/op_conversions/convert_deformable_conv_v8_to_v1.hpp new file mode 100644 index 00000000000..f33f6b9a83a --- /dev/null +++ b/inference-engine/src/transformations/include/transformations/op_conversions/convert_deformable_conv_v8_to_v1.hpp @@ -0,0 +1,27 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +#include + +namespace ngraph { +namespace pass { + +class TRANSFORMATIONS_API ConvertDeformableConv8To1; + +} // namespace pass +} // namespace ngraph + +/** + * @ingroup ie_transformation_common_api + * @brief ConvertDeformableConv8To1 converts v8::DeformableConvolution into v1::DeformableConvolution. 
+ */ +class ngraph::pass::ConvertDeformableConv8To1 : public ngraph::pass::MatcherPass { +public: + NGRAPH_RTTI_DECLARATION; + ConvertDeformableConv8To1(); +}; diff --git a/inference-engine/src/transformations/src/transformations/common_optimizations/common_optimizations.cpp b/inference-engine/src/transformations/src/transformations/common_optimizations/common_optimizations.cpp index ffe80ec9639..956904c9a34 100644 --- a/inference-engine/src/transformations/src/transformations/common_optimizations/common_optimizations.cpp +++ b/inference-engine/src/transformations/src/transformations/common_optimizations/common_optimizations.cpp @@ -76,6 +76,7 @@ #include #include #include +#include NGRAPH_RTTI_DEFINITION(ngraph::pass::CommonOptimizations, "CommonOptimizations", 0); @@ -85,6 +86,7 @@ bool ngraph::pass::CommonOptimizations::run_on_function(std::shared_ptr(); + manager.register_pass(); manager.register_pass(); manager.register_pass(); // Resolves dynamism (replaces NonZero), CF needed diff --git a/inference-engine/src/transformations/src/transformations/common_optimizations/eliminate_unsqueeze_gather.cpp b/inference-engine/src/transformations/src/transformations/common_optimizations/eliminate_unsqueeze_gather.cpp index fae3b71ac1e..ec3fafdea39 100644 --- a/inference-engine/src/transformations/src/transformations/common_optimizations/eliminate_unsqueeze_gather.cpp +++ b/inference-engine/src/transformations/src/transformations/common_optimizations/eliminate_unsqueeze_gather.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include "itt.hpp" NGRAPH_RTTI_DEFINITION(ngraph::pass::EliminateUnsqueezeGather, "EliminateUnsqueezeGather", 0); @@ -58,3 +59,36 @@ ngraph::pass::EliminateUnsqueezeGather::EliminateUnsqueezeGather() { auto m = std::make_shared(gather, "EliminateUnsqueezeGather"); register_matcher(m, callback); } + +NGRAPH_RTTI_DEFINITION(ngraph::pass::EliminateGatherUnsqueeze, "EliminateGatherUnsqueeze", 0); + 
+ngraph::pass::EliminateGatherUnsqueeze::EliminateGatherUnsqueeze() { + MATCHER_SCOPE(EliminateGatherUnsqueeze); + + const auto gather_indices_label = ngraph::pattern::wrap_type(pattern::rank_equals(0)); + const auto gather_axis_label = ngraph::pattern::wrap_type(); + const auto gather_label = ngraph::pattern::wrap_type( + {ngraph::pattern::any_input(), gather_indices_label, gather_axis_label}, pattern::rank_equals(0)); + + const auto unsqueeze_label = ngraph::pattern::wrap_type( + {gather_label, ngraph::pattern::any_input()}, pattern::rank_equals(1)); + + ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher& m) { + auto pattern_nodes = m.get_pattern_map(); + + auto& gather_indices = pattern_nodes.at(gather_indices_label); + auto& gather = pattern_nodes.at(gather_label); + auto& unsqueeze = pattern_nodes.at(unsqueeze_label); + + auto new_indices = ngraph::op::util::make_try_fold(gather_indices, opset6::Constant::create(element::i32, {1}, {1}), false); + auto new_gather = gather->clone_with_new_inputs({gather->input_value(0), new_indices, gather->input_value(2)}); + + new_gather->set_friendly_name(gather->get_friendly_name()); + ngraph::copy_runtime_info({unsqueeze, gather}, {new_gather, new_indices}); + ngraph::replace_node(unsqueeze, new_gather); + return true; + }; + + auto m = std::make_shared(unsqueeze_label, "EliminateGatherUnsqueeze"); + register_matcher(m, callback); +} diff --git a/inference-engine/src/transformations/src/transformations/common_optimizations/simplify_shape_of_sub_graph.cpp b/inference-engine/src/transformations/src/transformations/common_optimizations/simplify_shape_of_sub_graph.cpp new file mode 100644 index 00000000000..4aeae1d8f14 --- /dev/null +++ b/inference-engine/src/transformations/src/transformations/common_optimizations/simplify_shape_of_sub_graph.cpp @@ -0,0 +1,101 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include + +#include "itt.hpp" +#include 
+#include +#include +#include +#include +#include +#include +#include + +NGRAPH_RTTI_DEFINITION(ngraph::pass::SharedShapeOf, "SharedShapeOf", 0); + +bool ngraph::pass::SharedShapeOf::run_on_function(std::shared_ptr f) { + RUN_ON_FUNCTION_SCOPE(SharedShapeOf); + bool graph_rewritten = false; + + std::map, std::vector>> source_to_shape_of; + for (const auto & node : f->get_ordered_ops()) { + // Recursively apply transformation for sub-graph based operations + if (auto sub_graph_node = std::dynamic_pointer_cast(node)) + if (auto sub_graph = sub_graph_node->get_function()) + graph_rewritten |= run_on_function(sub_graph); + + if (is_type(node) || is_type(node)) + source_to_shape_of[node->input_value(0)].push_back(node); + } + + for (const auto& pair : source_to_shape_of) { + if (pair.second.size() < 2) + continue; + const auto& root_ss = pair.second[0]; + for (const auto& child_ss : pair.second) + if (root_ss->get_instance_id() != child_ss->get_instance_id() && root_ss->get_output_element_type(0) == child_ss->get_output_element_type(0)) + graph_rewritten |= replace_output_update_name(child_ss->output(0), root_ss->output(0)); + } + return graph_rewritten; +} + +NGRAPH_RTTI_DEFINITION(ngraph::pass::GroupedGatherElimination, "GroupedGatherElimination", 0); + +ngraph::pass::GroupedGatherElimination::GroupedGatherElimination() { + MATCHER_SCOPE(GroupedGatherElimination); + auto concat_label = ngraph::pattern::wrap_type(pattern::rank_equals(1)); + + ngraph::matcher_pass_callback callback = [](pattern::Matcher& m) { + auto concat = m.get_match_root(); + OutputVector inputs = concat->input_values(); + NodeVector new_ops; + size_t i = 0, original_inputs_size = inputs.size(); + while (inputs.size() > i + 1) { + auto curr = inputs[i].get_node_shared_ptr(), next = inputs[i + 1].get_node_shared_ptr(); + if (curr->get_type_info() != next->get_type_info() || + (!is_type(curr) && !is_type(curr)) || + (curr->input_value(0) != next->input_value(0))) { + ++i; + continue; + } // curr and 
next are the same type of gather which takes data from the same source + auto joint_indices = ngraph::op::util::make_try_fold(OutputVector{curr->input_value(1), next->input_value(1)}, 0); + auto new_gather = curr->clone_with_new_inputs( + {curr->input_value(0), joint_indices, ngraph::opset1::Constant::create(element::i64, {}, {0})}); + new_ops.push_back(joint_indices); + new_ops.push_back(new_gather); + inputs.erase(inputs.begin() + i); + inputs[i] = new_gather->output(0); + } + if (original_inputs_size > inputs.size()) { + auto new_concat = std::make_shared(inputs, 0); + new_ops.push_back(new_concat); + new_concat->set_friendly_name(concat->get_friendly_name()); + ngraph::copy_runtime_info(concat, new_ops); + ngraph::replace_node(concat, new_concat); + return true; + } + return false; + }; + + auto m = std::make_shared(concat_label, matcher_name); + this->register_matcher(m, callback); +} + + +NGRAPH_RTTI_DEFINITION(ngraph::pass::SimplifyShapeOfSubGraph, "SimplifyShapeOfSubGraph", 0); + +bool ngraph::pass::SimplifyShapeOfSubGraph::run_on_function(std::shared_ptr f) { + RUN_ON_FUNCTION_SCOPE(GroupedGatherElimination); + ngraph::pass::Manager manager; + manager.set_per_pass_validation(false); + manager.register_pass(); + manager.register_pass(); + manager.register_pass(); + manager.register_pass(); + manager.run_passes(f); + return false; +} diff --git a/inference-engine/src/transformations/src/transformations/op_conversions/convert_deformable_conv_v8_to_v1.cpp b/inference-engine/src/transformations/src/transformations/op_conversions/convert_deformable_conv_v8_to_v1.cpp new file mode 100644 index 00000000000..a5d2f8e123a --- /dev/null +++ b/inference-engine/src/transformations/src/transformations/op_conversions/convert_deformable_conv_v8_to_v1.cpp @@ -0,0 +1,52 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "transformations/op_conversions/convert_deformable_conv_v8_to_v1.hpp" +#include +#include +#include 
+#include + +#include "itt.hpp" + +NGRAPH_RTTI_DEFINITION(ngraph::pass::ConvertDeformableConv8To1, "ConvertDeformableConv8To1", 0); + +ngraph::pass::ConvertDeformableConv8To1::ConvertDeformableConv8To1() { + MATCHER_SCOPE(ConvertDeformableConv8To1); + + auto deformable_conv_v8 = pattern::wrap_type(); + + ngraph::matcher_pass_callback callback = [=](pattern::Matcher& m) { + auto deformable_conv_v8_node = std::dynamic_pointer_cast(m.get_match_root()); + if (!deformable_conv_v8_node) + return false; + + if (deformable_conv_v8_node->get_input_size() != 3 + || deformable_conv_v8_node->get_bilinear_interpolation_pad()) + return false; + + auto arg = deformable_conv_v8_node->input_value(0); + auto offsets = deformable_conv_v8_node->input_value(1); + auto filters = deformable_conv_v8_node->input_value(2); + + auto deformable_conv_v1 = + std::make_shared(arg, + offsets, + filters, + deformable_conv_v8_node->get_strides(), + deformable_conv_v8_node->get_pads_begin(), + deformable_conv_v8_node->get_pads_end(), + deformable_conv_v8_node->get_dilations(), + deformable_conv_v8_node->get_auto_pad(), + deformable_conv_v8_node->get_group(), + deformable_conv_v8_node->get_deformable_group()); + deformable_conv_v1->set_friendly_name(deformable_conv_v8_node->get_friendly_name()); + ngraph::copy_runtime_info(deformable_conv_v8_node, deformable_conv_v1); + ngraph::replace_node(deformable_conv_v8_node, deformable_conv_v1); + return true; + }; + + auto m = std::make_shared(deformable_conv_v8, matcher_name); + register_matcher(m, callback); +} diff --git a/inference-engine/src/vpu/graph_transformer/src/frontend/ie_parsed_network.cpp b/inference-engine/src/vpu/graph_transformer/src/frontend/ie_parsed_network.cpp index 49ee5f1caae..a8c0bc0a8f5 100644 --- a/inference-engine/src/vpu/graph_transformer/src/frontend/ie_parsed_network.cpp +++ b/inference-engine/src/vpu/graph_transformer/src/frontend/ie_parsed_network.cpp @@ -27,7 +27,6 @@ IeParsedNetwork parseNetwork(const ie::CNNNetwork& network) 
{ out.networkOutputs = network.getOutputsInfo(); env.log->trace("Got %d inputs and %d outputs", out.networkInputs.size(), out.networkOutputs.size()); - IE_ASSERT(!out.networkInputs.empty()); IE_ASSERT(!out.networkOutputs.empty()); env.log->trace("Perform topological sort"); diff --git a/inference-engine/src/vpu/myriad_plugin/myriad_executable_network.cpp b/inference-engine/src/vpu/myriad_plugin/myriad_executable_network.cpp index f2cd14bbb15..3ddcc2fcc45 100644 --- a/inference-engine/src/vpu/myriad_plugin/myriad_executable_network.cpp +++ b/inference-engine/src/vpu/myriad_plugin/myriad_executable_network.cpp @@ -7,6 +7,7 @@ #include #include +#include #include "exec_graph_info.hpp" #include #include @@ -25,7 +26,6 @@ namespace MyriadPlugin { ExecutableNetwork::ExecutableNetwork( std::shared_ptr mvnc, - std::vector& devicePool, const MyriadConfiguration& config, const std::shared_ptr core) : _config(config), @@ -40,10 +40,6 @@ ExecutableNetwork::ExecutableNetwork( defaultOutput(_config.pluginLogFilePath())); _executor = std::make_shared(_config.forceReset(), std::move(mvnc), logLevel, _log); - _device = _executor->openDevice(devicePool, _config); - - const auto& revision = _device->revision(); - _actualNumExecutors = config.compileConfig().numExecutors != -1 ? config.compileConfig().numExecutors : DefaultAllocation::numStreams(revision, config); _supportedMetrics = { METRIC_KEY(NETWORK_NAME), @@ -54,13 +50,19 @@ ExecutableNetwork::ExecutableNetwork( }; } +void ExecutableNetwork::openDevice(std::vector& devicePool) { + _device = _executor->openDevice(devicePool, _config); + const auto& revision = _device->revision(); + _actualNumExecutors = _config.compileConfig().numExecutors != -1 ? 
_config.compileConfig().numExecutors : DefaultAllocation::numStreams(revision, _config); +} + ExecutableNetwork::ExecutableNetwork( const ie::CNNNetwork& network, std::shared_ptr mvnc, std::vector& devicePool, const MyriadConfiguration& config, const std::shared_ptr core) : - ExecutableNetwork(std::move(mvnc), devicePool, config, core) { + ExecutableNetwork(std::move(mvnc), config, core) { VPU_PROFILE(ExecutableNetwork); const auto compilerLog = std::make_shared( @@ -68,11 +70,9 @@ ExecutableNetwork::ExecutableNetwork( _config.get(), defaultOutput(_config.compilerLogFilePath())); - if (_device == nullptr) - IE_THROW() << "No device was detected"; auto compiledGraph = compileNetwork( network, - _device->_platform, + NC_MYRIAD_X, _config, compilerLog, _core); @@ -84,12 +84,7 @@ ExecutableNetwork::ExecutableNetwork( _inputInfo = std::move(compiledGraph->inputInfo); _outputInfo = std::move(compiledGraph->outputInfo); - if (!_device->isBooted()) { - return; - } - const auto& networkName = network.getName(); - _executor->allocateGraph(_device, _graphDesc, _graphBlob, compiledGraph->blobHeader, compiledGraph->numActiveStages, networkName, _actualNumExecutors); if (_config.exclusiveAsyncRequests()) { ExecutorManager *executorManager = ExecutorManager::getInstance(); _taskExecutor = executorManager->getExecutor("MYRIAD"); @@ -100,6 +95,21 @@ ExecutableNetwork::ExecutableNetwork( idStream << networkName << "_TaskExecutorGetResult" << i; _taskExecutorGetResultIds.emplace(idStream.str()); } + if (_inputInfo.totalSize == 0) { + _isNetworkConstant = true; + const auto& nGraphFunc = network.getFunction(); + const auto& sortedLayers = nGraphFunc->get_ordered_ops(); + for (const auto& layer : sortedLayers) { + if (strcmp(layer->get_type_info().name, "Constant") == 0) { + const auto& constOp = std::dynamic_pointer_cast(layer); + auto name = constOp->get_friendly_name(); + _constDatas[name] = ie::details::shareWeights(constOp); + } + } + return; + } + openDevice(devicePool); + 
_executor->allocateGraph(_device, _graphDesc, _graphBlob, compiledGraph->blobHeader, compiledGraph->numActiveStages, networkName, _actualNumExecutors); } void ExecutableNetwork::Import(std::istream& strm, std::vector &devicePool, const MyriadConfiguration& configuration) { @@ -110,10 +120,6 @@ void ExecutableNetwork::Import(std::istream& strm, std::vector &devic strm.seekg(currentPos, strm.beg); strm.read(&_graphBlob[0], blobSize); - if (!_device->isBooted()) { - return; - } - std::string networkName = importedNetworkName; BlobReader blobReader; @@ -126,9 +132,8 @@ void ExecutableNetwork::Import(std::istream& strm, std::vector &devic _inputInfo = blobReader.getInputInfo(); _outputInfo = blobReader.getOutputInfo(); - + openDevice(devicePool); _executor->allocateGraph(_device, _graphDesc, _graphBlob, blobHeader, numStages, networkName, _actualNumExecutors); - _graphMetaData.stagesMeta.resize(numStages); for (auto &meta : _graphMetaData.stagesMeta) { meta.stageName = meta.stageType = meta.layerName = meta.layerType = "UNKNOWN"; @@ -147,9 +152,12 @@ void ExecutableNetwork::Import(std::istream& strm, std::vector &devic } } -ExecutableNetwork::ExecutableNetwork(std::istream& strm, std::shared_ptr mvnc, std::vector &devicePool, - const MyriadConfiguration& config, const std::shared_ptr core) : - ExecutableNetwork(std::move(mvnc), devicePool, config, core) { +ExecutableNetwork::ExecutableNetwork(std::istream& strm, + std::shared_ptr mvnc, + std::vector &devicePool, + const MyriadConfiguration& config, + const std::shared_ptr core) : + ExecutableNetwork(std::move(mvnc), config, core) { VPU_PROFILE(ExecutableNetwork); Import(strm, devicePool, config); } @@ -160,7 +168,7 @@ ExecutableNetwork::ExecutableNetwork( std::vector& devicePool, const MyriadConfiguration& config, const std::shared_ptr core) : - ExecutableNetwork(std::move(mvnc), devicePool, config, core) { + ExecutableNetwork(std::move(mvnc), config, core) { VPU_PROFILE(ExecutableNetwork); std::ifstream 
blobFile{blobFilename, std::ios::binary}; Import(blobFile, devicePool, config); diff --git a/inference-engine/src/vpu/myriad_plugin/myriad_executable_network.h b/inference-engine/src/vpu/myriad_plugin/myriad_executable_network.h index 51f9d10ad56..94ed3e76b11 100644 --- a/inference-engine/src/vpu/myriad_plugin/myriad_executable_network.h +++ b/inference-engine/src/vpu/myriad_plugin/myriad_executable_network.h @@ -44,7 +44,9 @@ public: virtual ~ExecutableNetwork() { try { - _executor->deallocateGraph(_device, _graphDesc); + if (_device != nullptr) { + _executor->deallocateGraph(_device, _graphDesc); + } } catch (...) { std::cerr << "ERROR ~ExecutableNetwork():\n" @@ -54,18 +56,19 @@ public: ie::IInferRequestInternal::Ptr CreateInferRequestImpl(ie::InputsDataMap networkInputs, ie::OutputsDataMap networkOutputs) override { - if (_device == nullptr || !_device->isBooted()) { + if (!_isNetworkConstant && (_device == nullptr || !_device->isBooted())) { IE_THROW() << "Can not create infer request: there is no available devices with platform " << _device->_platform; } return std::make_shared(_graphDesc, networkInputs, networkOutputs, _inputInfo, _outputInfo, - _graphMetaData.stagesMeta, _config, _log, _executor); + _graphMetaData.stagesMeta, _config, _log, _executor, + _constDatas, _isNetworkConstant); } ie::IInferRequestInternal::Ptr CreateInferRequest() override { - if (_device == nullptr || !_device->isBooted()) { + if (!_isNetworkConstant && (_device == nullptr || !_device->isBooted())) { IE_THROW() << "Can not create infer request: there is no available devices with platform " << _device->_platform; } @@ -73,7 +76,7 @@ public: auto syncRequestImpl = std::make_shared(_graphDesc, _networkInputs, _networkOutputs, _inputInfo, _outputInfo, _graphMetaData.stagesMeta, _config, _log, - _executor); + _executor, _constDatas, _isNetworkConstant); syncRequestImpl->setPointerToExecutableNetworkInternal(shared_from_this()); auto taskExecutorGetResult = getNextTaskExecutor(); return 
std::make_shared( @@ -84,6 +87,16 @@ public: model.write(_graphBlob.data(), _graphBlob.size()); } + void Export(const std::string &modelFileName) override { + std::ofstream modelFile(modelFileName, std::ios::out | std::ios::binary); + + if (modelFile.is_open()) { + Export(modelFile); + } else { + IE_THROW() << "The " << modelFileName << " file can not be opened for export"; + } + } + ie::Parameter GetMetric(const std::string &name) const override; ie::CNNNetwork GetExecGraphInfo() override; @@ -98,9 +111,11 @@ private: DevicePtr _device; GraphMetaInfo _graphMetaData; MyriadConfiguration _config; + bool _isNetworkConstant = false; const std::shared_ptr _core = nullptr; int _actualNumExecutors = 0; std::vector _supportedMetrics; + std::map _constDatas; DataInfo _inputInfo; DataInfo _outputInfo; @@ -109,9 +124,8 @@ private: std::queue _taskExecutorGetResultIds; ExecutableNetwork(std::shared_ptr mvnc, - std::vector &devicePool, - const MyriadConfiguration& config, - const std::shared_ptr core); + const MyriadConfiguration& config, + const std::shared_ptr core); ie::ITaskExecutor::Ptr getNextTaskExecutor() { std::string id = _taskExecutorGetResultIds.front(); @@ -124,6 +138,8 @@ private: return taskExecutor; } + + void openDevice(std::vector& devicePool); }; } // namespace MyriadPlugin diff --git a/inference-engine/src/vpu/myriad_plugin/myriad_infer_request.cpp b/inference-engine/src/vpu/myriad_plugin/myriad_infer_request.cpp index 020a3d9e580..a3662543a0f 100644 --- a/inference-engine/src/vpu/myriad_plugin/myriad_infer_request.cpp +++ b/inference-engine/src/vpu/myriad_plugin/myriad_infer_request.cpp @@ -33,11 +33,13 @@ MyriadInferRequest::MyriadInferRequest(GraphDesc &graphDesc, const std::vector &blobMetaData, const MyriadConfig& myriadConfig, const Logger::Ptr &log, - const MyriadExecutorPtr &executor) : + const MyriadExecutorPtr &executor, + std::map constDatas, + bool isNetworkConstant = true) : IInferRequestInternal(networkInputs, networkOutputs), 
_executor(executor), _log(log), _stagesMetaData(blobMetaData), _config(myriadConfig), _inputInfo(compilerInputsInfo), _outputInfo(compilerOutputsInfo), - _graphDesc(graphDesc) { + _graphDesc(graphDesc), _constDatas(constDatas), _isNetworkConstant(isNetworkConstant) { VPU_PROFILE(MyriadInferRequest); const auto& ioStrides = _config.compileConfig().ioStrides; @@ -83,7 +85,7 @@ MyriadInferRequest::MyriadInferRequest(GraphDesc &graphDesc, resultBuffer.resize(compilerOutputsInfo.totalSize); VPU_THROW_UNLESS( - !_networkOutputs.empty() && !_networkInputs.empty(), + !_networkOutputs.empty() && !(_networkInputs.empty() && !_isNetworkConstant), "No information about network's output/input"); } @@ -93,6 +95,9 @@ void MyriadInferRequest::InferImpl() { } void MyriadInferRequest::InferAsync() { + if (_isNetworkConstant) { + return; + } VPU_PROFILE(InferAsync); // execute input pre-processing @@ -104,7 +109,7 @@ void MyriadInferRequest::InferAsync() { auto getOffset = [&inputInfo] (const std::string& name) { const auto offsetIt = inputInfo.offset.find(name); IE_ASSERT(offsetIt != inputInfo.offset.end()) << "MyriadInferRequest::InferAsync()\n" - << "Input offset [" << name << "] is not provided."; + << "Input offset [" << name << "] is not provided."; return offsetIt->second; }; @@ -123,9 +128,9 @@ void MyriadInferRequest::InferAsync() { const auto byteSize = blob->byteSize(); const auto requiredSize = vpu::checked_cast(offset) + byteSize; IE_ASSERT(requiredSize <= inputBuffer.size()) << "MyriadInferRequest::InferAsync()\n" - << "Input offset is too big. " - << "Required size: " << requiredSize - << ", Input buffer size: " << inputBuffer.size(); + << "Input offset is too big. 
" + << "Required size: " << requiredSize + << ", Input buffer size: " << inputBuffer.size(); const auto foundBlob = getNetInputInfo(name); const auto vpuLayout = foundBlob->second->getTensorDesc().getLayout(); @@ -139,9 +144,8 @@ void MyriadInferRequest::InferAsync() { } _executor->queueInference(_graphDesc, inputBuffer.data(), - _inputInfo.totalSize, nullptr, 0); + _inputInfo.totalSize, nullptr, 0); } - static void copyBlobAccordingUpperBound( const Blob::Ptr& in, const Blob::Ptr& out) { @@ -199,10 +203,22 @@ void MyriadInferRequest::GetResult() { const auto getVpuLayout = [&networkOutputs] (const std::string& name){ const auto foundBlob = networkOutputs.find(name); IE_ASSERT(foundBlob != networkOutputs.end()) << "MyriadInferRequest::InferAsync()\n" - << "Output [" << name << "] is not provided."; + << "Output [" << name << "] is not provided."; return foundBlob->second->getTensorDesc().getLayout(); }; - + if (_isNetworkConstant) { + for (const auto& output : _outputs) { + const auto& ieBlobName = output.first; + const auto& ieBlob = output.second; + IE_ASSERT(_constDatas.find(ieBlobName) != _constDatas.end()) << + "Input [" << ieBlobName << "] is not provided."; + std::copy_n( + _constDatas[ieBlobName]->cbuffer().as(), + _constDatas[ieBlobName]->byteSize(), + ieBlob->buffer().as()); + } + return; + } // For networks with only one output if (_outputInfo.offset.size() == 1) { const auto& it = _outputs.begin(); @@ -224,12 +240,12 @@ void MyriadInferRequest::GetResult() { const auto resultOffset = [&](const std::string& name) { const auto offset_it = _outputInfo.offset.find(name); IE_ASSERT(offset_it != _outputInfo.offset.end()) << "MyriadInferRequest::InferAsync()\n" - << "Output offset [" << name << "] error."; + << "Output offset [" << name << "] error."; const auto offset = vpu::checked_cast(offset_it->second); IE_ASSERT(offset <= resultBuffer.size()) << "MyriadInferRequest::InferAsync()\n" - << "Input offset is too big." 
- << "Required offset: " << offset - << "Result buffer size: " << resultBuffer.size(); + << "Input offset is too big." + << "Required offset: " << offset + << "Result buffer size: " << resultBuffer.size(); return offset; }; diff --git a/inference-engine/src/vpu/myriad_plugin/myriad_infer_request.h b/inference-engine/src/vpu/myriad_plugin/myriad_infer_request.h index 9373f771fb9..194ce5fc990 100644 --- a/inference-engine/src/vpu/myriad_plugin/myriad_infer_request.h +++ b/inference-engine/src/vpu/myriad_plugin/myriad_infer_request.h @@ -34,6 +34,8 @@ class MyriadInferRequest : public InferenceEngine::IInferRequestInternal { GraphDesc _graphDesc; std::vector resultBuffer; std::vector inputBuffer; + std::map _constDatas; + bool _isNetworkConstant; public: typedef std::shared_ptr Ptr; @@ -46,7 +48,9 @@ public: const std::vector &blobMetaData, const MyriadConfig &myriadConfig, const Logger::Ptr &log, - const MyriadExecutorPtr &executor); + const MyriadExecutorPtr &executor, + std::map constDatas, + bool isNetworkConstant); void InferImpl() override; void InferAsync(); diff --git a/inference-engine/tests/functional/inference_engine/serialization/single_layer/prior_box.cpp b/inference-engine/tests/functional/inference_engine/serialization/single_layer/prior_box.cpp new file mode 100644 index 00000000000..91def7d74c8 --- /dev/null +++ b/inference-engine/tests/functional/inference_engine/serialization/single_layer/prior_box.cpp @@ -0,0 +1,98 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "shared_test_classes/single_layer/prior_box.hpp" +#include "common_test_utils/test_constants.hpp" + +using namespace LayerTestDefinitions; + +namespace { + TEST_P(PriorBoxLayerTest, Serialize) { + Serialize(); + } + + const std::vector netPrecisions = { + InferenceEngine::Precision::I32, + InferenceEngine::Precision::U16 + }; + const std::vector> min_sizes = { + {16.f, 32.f} + }; + + const std::vector> max_sizes = { + 
{256.f, 512.f} + }; + + const std::vector> aspect_ratios = { + {0.66f, 1.56f} + }; + + const std::vector> densities = { + {0.55f} + }; + + const std::vector> fixed_ratios = { + {0.88f} + }; + + const std::vector> fixed_sizes = { + {1.25f} + }; + + const std::vector clips = { + true, false + }; + + const std::vector flips = { + true, false + }; + + const std::vector steps = { + 1.0f, 2.0f + }; + + const std::vector offsets = { + 0.0f, 0.5f + }; + + const std::vector> variances = { + {2.22f, 3.14f} + }; + + const std::vector scale_all_sizes = { + true, false + }; + + const std::vector inputShape = {128, 128}; + const std::vector imageShape = {50, 50}; + + const auto layerSpecificParams = ::testing::Combine( + ::testing::ValuesIn(min_sizes), + ::testing::ValuesIn(max_sizes), + ::testing::ValuesIn(aspect_ratios), + ::testing::ValuesIn(densities), + ::testing::ValuesIn(fixed_ratios), + ::testing::ValuesIn(fixed_sizes), + ::testing::ValuesIn(clips), + ::testing::ValuesIn(flips), + ::testing::ValuesIn(steps), + ::testing::ValuesIn(offsets), + ::testing::ValuesIn(variances), + ::testing::ValuesIn(scale_all_sizes)); + + INSTANTIATE_TEST_SUITE_P(smoke_PriorBox_Basic, PriorBoxLayerTest, + ::testing::Combine( + layerSpecificParams, + ::testing::ValuesIn(netPrecisions), + ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), + ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), + ::testing::Values(InferenceEngine::Layout::ANY), + ::testing::Values(InferenceEngine::Layout::ANY), + ::testing::Values(inputShape), + ::testing::Values(imageShape), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + PriorBoxLayerTest::getTestCaseName); +} // namespace diff --git a/inference-engine/tests/functional/inference_engine/serialization/single_layer/space_to_depth.cpp b/inference-engine/tests/functional/inference_engine/serialization/single_layer/space_to_depth.cpp new file mode 100644 index 00000000000..64cc72d06ca --- /dev/null +++ 
b/inference-engine/tests/functional/inference_engine/serialization/single_layer/space_to_depth.cpp @@ -0,0 +1,45 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "shared_test_classes/single_layer/space_to_depth.hpp" + +#include + +#include "common_test_utils/test_constants.hpp" + +using namespace LayerTestsDefinitions; +using namespace ngraph::opset3; + +namespace { +TEST_P(SpaceToDepthLayerTest, Serialize) { + Serialize(); +} +const std::vector inputPrecisions = { + InferenceEngine::Precision::FP32, + InferenceEngine::Precision::U8, + InferenceEngine::Precision::I16, +}; + +const std::vector modes = { + SpaceToDepth::SpaceToDepthMode::BLOCKS_FIRST, + SpaceToDepth::SpaceToDepthMode::DEPTH_FIRST}; + +const std::vector> inputShapesBS2 = { + {1, 1, 2, 2}, {1, 1, 4, 4}, {1, 1, 6, 6}, {2, 8, 6, 6}, + {2, 4, 10, 8}, {1, 1, 2, 2, 2}, {1, 1, 4, 4, 4}, {1, 1, 6, 6, 6}, + {2, 8, 6, 6, 6}, {2, 4, 10, 8, 12}}; + +const auto SpaceToDepthBS2 = ::testing::Combine( + ::testing::ValuesIn(inputShapesBS2), ::testing::ValuesIn(inputPrecisions), + ::testing::ValuesIn(modes), ::testing::Values(1, 2), + ::testing::Values(CommonTestUtils::DEVICE_CPU)); + +INSTANTIATE_TEST_CASE_P( + smoke_SpaceToDepthSerialization, SpaceToDepthLayerTest, + ::testing::Combine(::testing::ValuesIn(inputShapesBS2), + ::testing::ValuesIn(inputPrecisions), + ::testing::ValuesIn(modes), ::testing::Values(1, 2), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + SpaceToDepthLayerTest::getTestCaseName); +} // namespace diff --git a/inference-engine/tests/functional/inference_engine/transformations/convert_deformable_conv_v8_to_v1_test.cpp b/inference-engine/tests/functional/inference_engine/transformations/convert_deformable_conv_v8_to_v1_test.cpp new file mode 100644 index 00000000000..b80c9ce6dbf --- /dev/null +++ b/inference-engine/tests/functional/inference_engine/transformations/convert_deformable_conv_v8_to_v1_test.cpp @@ -0,0 +1,160 @@ +// Copyright 
(C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "common_test_utils/ngraph_test_utils.hpp" + +using namespace testing; +using namespace ngraph; + +TEST(TransformationTests, ConvertDeformableConv8to1) { + std::shared_ptr f(nullptr), f_ref(nullptr); + { + const Strides strides{1, 1}; + const CoordinateDiff padding{0, 0}; + const Strides dilations{1, 1}; + + const Shape input_shape{1, 1, 4, 4}; + const Shape filter_shape{1, 1, 2, 2}; + const Shape offsets_shape{1, 8, 3, 3}; + + auto data = std::make_shared(element::f32, input_shape); + auto filter = std::make_shared(element::f32, filter_shape); + auto offsets = std::make_shared(element::f32, offsets_shape); + + auto deformable_conv = std::make_shared(data, + offsets, + filter, + strides, + padding, + padding, + dilations); + + f = std::make_shared(NodeVector{deformable_conv}, ParameterVector{data, filter, offsets}); + + pass::Manager manager; + manager.register_pass(); + manager.register_pass(); + manager.run_passes(f); + ASSERT_NO_THROW(check_rt_info(f)); + } + + { + const Strides strides{1, 1}; + const CoordinateDiff padding{0, 0}; + const Strides dilations{1, 1}; + + const Shape input_shape{1, 1, 4, 4}; + const Shape filter_shape{1, 1, 2, 2}; + const Shape offsets_shape{1, 8, 3, 3}; + + auto data = std::make_shared(element::f32, input_shape); + auto filter = std::make_shared(element::f32, filter_shape); + auto offsets = std::make_shared(element::f32, offsets_shape); + + auto deformable_conv = std::make_shared(data, + offsets, + filter, + strides, + padding, + padding, + dilations); + + f_ref = std::make_shared(NodeVector{deformable_conv}, ParameterVector{data, filter, offsets}); + } + + auto res = compare_functions(f, f_ref); + ASSERT_TRUE(res.first) << res.second; +} + +TEST(TransformationTests, ConvertDeformableConv8to1_mask) { + std::shared_ptr f(nullptr), f_ref(nullptr); + { + 
const Strides strides{1, 1}; + const CoordinateDiff padding{0, 0}; + const Strides dilations{1, 1}; + + const Shape input_shape{1, 1, 4, 4}; + const Shape filter_shape{1, 1, 2, 2}; + const Shape offsets_shape{1, 8, 3, 3}; + const Shape mask_shape{1, 4, 3, 3}; + + auto data = std::make_shared(element::f32, input_shape); + auto filter = std::make_shared(element::f32, filter_shape); + auto offsets = std::make_shared(element::f32, offsets_shape); + auto mask = std::make_shared(element::f32, mask_shape); + + auto deformable_conv = std::make_shared(data, + offsets, + filter, + mask, + strides, + padding, + padding, + dilations); + + f = std::make_shared(NodeVector{deformable_conv}, ParameterVector{data, filter, + mask, offsets}); + + pass::Manager manager; + manager.register_pass(); + manager.register_pass(); + manager.run_passes(f); + ASSERT_NO_THROW(check_rt_info(f)); + } + // mask input is provided, DeformableConvolution-8 must remain + ASSERT_EQ(count_ops_of_type(f), 0); + ASSERT_EQ(count_ops_of_type(f), 1); +} + +TEST(TransformationTests, ConvertDeformableConv8to1_bilinear_interpolation_padding) { + std::shared_ptr f(nullptr), f_ref(nullptr); + { + const Strides strides{1, 1}; + const CoordinateDiff padding{0, 0}; + const Strides dilations{1, 1}; + + const Shape input_shape{1, 1, 4, 4}; + const Shape filter_shape{1, 1, 2, 2}; + const Shape offsets_shape{1, 8, 3, 3}; + + auto data = std::make_shared(element::f32, input_shape); + auto filter = std::make_shared(element::f32, filter_shape); + auto offsets = std::make_shared(element::f32, offsets_shape); + + auto deformable_conv = std::make_shared(data, + offsets, + filter, + strides, + padding, + padding, + dilations, + op::PadType::EXPLICIT, + 1, + 1, + true); + + f = std::make_shared(NodeVector{deformable_conv}, ParameterVector{data, filter, offsets}); + + pass::Manager manager; + manager.register_pass(); + manager.register_pass(); + manager.run_passes(f); + ASSERT_NO_THROW(check_rt_info(f)); + } + // 
use_bilinear_interpolation_padding is true, DeformableConvolution-8 must remain + ASSERT_EQ(count_ops_of_type(f), 0); + ASSERT_EQ(count_ops_of_type(f), 1); +} diff --git a/inference-engine/tests/functional/inference_engine/transformations/pruning_test.cpp b/inference-engine/tests/functional/inference_engine/transformations/pruning_test.cpp index 82c1fa6c9f2..d87a9271f8e 100644 --- a/inference-engine/tests/functional/inference_engine/transformations/pruning_test.cpp +++ b/inference-engine/tests/functional/inference_engine/transformations/pruning_test.cpp @@ -35,10 +35,12 @@ Output create_constant_with_zeros(const Shape & shape, const Mask & mask) Coordinate coord_end(shape); coord_end[dim] = dim_value + 1; + NGRAPH_SUPPRESS_DEPRECATED_START CoordinateTransform iter(shape, coord_begin, coord_end); for (const Coordinate & coord : iter) { values[iter.index(coord)] = 0; } + NGRAPH_SUPPRESS_DEPRECATED_END } } return std::make_shared(element::f32, shape, values); @@ -57,10 +59,12 @@ TEST(TransformationTests, InitMasksOutputChannel) { Shape weights_shape{6, 3, 3, 3}; std::vector values(shape_size(weights_shape), 1); + NGRAPH_SUPPRESS_DEPRECATED_START CoordinateTransform iter(weights_shape, {0, 1, 0, 0}, {6, 2, 3, 3}); for (const Coordinate & coord : iter) { values[iter.index(coord)] = 0; } + NGRAPH_SUPPRESS_DEPRECATED_END auto weights = std::make_shared(element::f32, weights_shape, values); pass::InitConstMask({1}).apply(weights); diff --git a/inference-engine/tests/functional/inference_engine/transformations/simplify_shape_of_sub_graph.cpp b/inference-engine/tests/functional/inference_engine/transformations/simplify_shape_of_sub_graph.cpp new file mode 100644 index 00000000000..d4b0e166573 --- /dev/null +++ b/inference-engine/tests/functional/inference_engine/transformations/simplify_shape_of_sub_graph.cpp @@ -0,0 +1,81 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include +#include +#include + +#include 
+#include +#include +#include +#include + +#include "common_test_utils/ngraph_test_utils.hpp" + + +using namespace testing; +using namespace ngraph; + +auto gather = [](const std::shared_ptr input, std::vector indices, bool scalar = false) -> Output { + std::shared_ptr indices_node; + if (scalar) + indices_node = opset7::Constant::create(element::i64, {}, indices); + else + indices_node = opset7::Constant::create(element::i64, {indices.size()}, indices); + return std::make_shared( + input, indices_node, opset7::Constant::create(element::i64, {}, {0})); +}; + +TEST(TransformationTests, ShapeSubGraphTest) { + std::shared_ptr f(nullptr), f_ref(nullptr); + + Shape data_shape{1, 2, 3, 4}; + { + auto data = std::make_shared(element::f32, data_shape); + + auto shape_op_1 = std::make_shared(data); + auto gather_1 = gather(shape_op_1, {1}, true); + auto unsqueeze_1 = std::make_shared( + gather_1, opset7::Constant::create(element::i64, {1}, {0})); + + auto shape_op_2 = std::make_shared(data); + auto gather_2 = gather(shape_op_2, {2}, true); + auto unsqueeze_2 = std::make_shared( + gather_2, opset7::Constant::create(element::i64, {1}, {0})); + + auto const_1 = opset7::Constant::create(element::i64, Shape{1}, {2}); + auto const_2 = opset7::Constant::create(element::i64, Shape{1}, {2}); + + auto concat = std::make_shared(OutputVector{unsqueeze_1, unsqueeze_2, const_1, const_2}, 0); + + auto reshape = std::make_shared(data, concat, false); + f = std::make_shared(NodeVector{reshape}, ParameterVector{data}); + pass::Manager m; + m.register_pass(); + m.register_pass(); + m.run_passes(f); + ASSERT_NO_THROW(check_rt_info(f)); + ASSERT_EQ(reshape->get_output_partial_shape(0), PartialShape({2, 3, 2, 2})); + } + { + auto data = std::make_shared(element::f32, data_shape); + + auto shape_op_1 = std::make_shared(data); + auto gather_1 = gather(shape_op_1, {1, 2}); + + auto const_1 = opset7::Constant::create(element::i64, Shape{1}, {2}); + auto const_2 = 
opset7::Constant::create(element::i64, Shape{1}, {2}); + + auto concat = std::make_shared(OutputVector{gather_1, const_1, const_2}, 0); + + auto reshape = std::make_shared(data, concat, false); + f_ref = std::make_shared(NodeVector{reshape}, ParameterVector{data}); + } + + auto res = compare_functions(f, f_ref, true); + ASSERT_TRUE(res.first) << res.second; +} diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/activation.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/activation.cpp index 11a2c41a68e..0c6a8763844 100644 --- a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/activation.cpp +++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/activation.cpp @@ -65,9 +65,11 @@ const std::map>> activationTypes // List of operations that should be tested also with integer precision const std::map>> intActivationTypes = { + {Atan, {}}, {Negative, {}}, {Ceiling, {}}, {Cos, {}}, + {Sinh, {}}, {Sqrt, {}}, {Tanh, {}}, }; diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/deformable_convolution.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/deformable_convolution.cpp index 7bf74fb3a82..f83225145da 100644 --- a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/deformable_convolution.cpp +++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/deformable_convolution.cpp @@ -89,4 +89,30 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(std::vector({1, 4, 224, 224})), ::testing::Values(CommonTestUtils::DEVICE_CPU)), DeformableConvolutionLayerTest::getTestCaseName); + +/* ============= Single Test Case ============= */ +const std::vector> single_deform_vals = {{1, 54, 28, 28}}; +const std::vector> single_kernel = {{1, 3, 3, 3}}; +const std::vector 
single_deform_groups = {3}; + +const auto deformableConv2DParams_SingleTestCase = ::testing::Combine( + ::testing::ValuesIn(single_deform_vals), + ::testing::ValuesIn(single_kernel), ::testing::ValuesIn(strides), + ::testing::ValuesIn(padBegins), ::testing::ValuesIn(padEnds), + ::testing::ValuesIn(dilations), ::testing::ValuesIn(groups), + ::testing::ValuesIn(single_deform_groups), ::testing::ValuesIn(numOutChannels), + ::testing::Values(ngraph::op::PadType::EXPLICIT)); + +INSTANTIATE_TEST_SUITE_P( + smoke_DeformableConvolution2D_SingleTestCase, DeformableConvolutionLayerTest, + ::testing::Combine( + deformableConv2DParams_SingleTestCase, ::testing::ValuesIn(netPrecisions), + ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), + ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), + ::testing::Values(InferenceEngine::Layout::ANY), + ::testing::Values(InferenceEngine::Layout::ANY), + ::testing::Values(std::vector({1, 3, 30, 30})), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + DeformableConvolutionLayerTest::getTestCaseName); + } // namespace diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/prior_box.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/prior_box.cpp new file mode 100644 index 00000000000..d4c56aa0078 --- /dev/null +++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/prior_box.cpp @@ -0,0 +1,81 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "single_layer_tests/prior_box.hpp" +#include "common_test_utils/test_constants.hpp" + +using namespace LayerTestDefinitions; + +const std::vector netPrecisions = { + InferenceEngine::Precision::I32, + InferenceEngine::Precision::U16}; +const std::vector> min_sizes = { + {256.0f}}; + +const std::vector> max_sizes = { + {315.0f}}; + +const std::vector> aspect_ratios = { + {2.0f}}; + +const 
std::vector> densities = { + {1.0f}}; + +const std::vector> fixed_ratios = { + {}}; + +const std::vector> fixed_sizes = { + {}}; + +const std::vector clips = { + false, true}; + +const std::vector flips = { + false, true}; + +const std::vector steps = { + 1.0f, +}; + +const std::vector offsets = { + 0.0f, +}; + +const std::vector> variances = { + {}}; + +const std::vector scale_all_sizes = { + false, true}; + +const std::vector inputShape = {300, 300}; +const std::vector imageShape = {32, 32}; + +const auto layerSpecificParams = ::testing::Combine( + ::testing::ValuesIn(min_sizes), + ::testing::ValuesIn(max_sizes), + ::testing::ValuesIn(aspect_ratios), + ::testing::ValuesIn(densities), + ::testing::ValuesIn(fixed_ratios), + ::testing::ValuesIn(fixed_sizes), + ::testing::ValuesIn(clips), + ::testing::ValuesIn(flips), + ::testing::ValuesIn(steps), + ::testing::ValuesIn(offsets), + ::testing::ValuesIn(variances), + ::testing::ValuesIn(scale_all_sizes)); + +INSTANTIATE_TEST_SUITE_P(smoke_PriorBox_Basic, PriorBoxLayerTest, + ::testing::Combine( + layerSpecificParams, + ::testing::ValuesIn(netPrecisions), + ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), + ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), + ::testing::Values(InferenceEngine::Layout::ANY), + ::testing::Values(InferenceEngine::Layout::ANY), + ::testing::Values(inputShape), + ::testing::Values(imageShape), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + PriorBoxLayerTest::getTestCaseName); diff --git a/inference-engine/tests/functional/plugin/cpu/single_layer_tests/activation.cpp b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/activation.cpp index f2e5a76e33c..c7342e95517 100644 --- a/inference-engine/tests/functional/plugin/cpu/single_layer_tests/activation.cpp +++ b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/activation.cpp @@ -156,4 +156,4 @@ const auto basicCases5D = ::testing::Combine( 
INSTANTIATE_TEST_SUITE_P(smoke_Activation5D_Eltwise_CPU_BF16, ActivationLayerCPUTest, basicCases5D, ActivationLayerCPUTest::getTestCaseName); } // namespace -} // namespace CPULayerTestsDefinitions \ No newline at end of file +} // namespace CPULayerTestsDefinitions diff --git a/inference-engine/tests/functional/plugin/gna/shared_tests_instances/skip_tests_config.cpp b/inference-engine/tests/functional/plugin/gna/shared_tests_instances/skip_tests_config.cpp index a59ad83eaed..589ec41c83d 100644 --- a/inference-engine/tests/functional/plugin/gna/shared_tests_instances/skip_tests_config.cpp +++ b/inference-engine/tests/functional/plugin/gna/shared_tests_instances/skip_tests_config.cpp @@ -9,28 +9,12 @@ std::vector disabledTestPatterns() { return { - ".*TensorNamesTest\\.CheckAddOutput.*", // TODO: FIX BUG 31661 // TODO: support InferRequest in GNAPlugin ".*InferRequestTests\\.canRun3AsyncRequestsConsistentlyFromThreadsWithoutWait.*", - // TODO: FIX BUG 23740 - ".*InferRequestTests\\.CanCreateTwoExeNetworks.*", - // TODO: FIX BUG 26702 - ".*InferRequestTests\\.FailedAsyncInferWithNegativeTimeForWait.*", // TODO: FIX BUG 23741 ".*InferRequestTests\\.canRun3SyncRequestsConsistentlyFromThreads.*", - // TODO: FIX BUG 23742 - ".*InferRequestTests\\.canWaitWithotStartAsync.*", - // TODO: FIX BUG 23743 - ".*InferRequestTests\\.returnDeviceBusyOnSetBlobAfterAsyncInfer.*", - ".*InferRequestTests\\.returnDeviceBusyOnGetBlobAfterAsyncInfer.*", - ".*InferRequestTests\\.returnDeviceBusyOnGetPerformanceCountAfterAsyncInfer.*", - ".*InferRequestTests\\.returnDeviceBusyOnStartInferAfterAsyncInfer.*", - ".*InferRequestTests\\.returnDeviceBusyOnGetUserDataAfterAsyncInfer.*", - ".*InferRequestTests\\.returnDeviceBusyOnSetUserDataAfterAsyncInfer.*", - // TODO: FIX BUG 31661 - ".*InferRequestTests\\.canStartSeveralAsyncInsideCompletionCallbackNoSafeDtorWithoutWait.*", - // TODO: FIX BUG 31661 + // TODO: FIX BUG 59041 ".*Behavior.*CallbackThrowException.*", // TODO: FIX BUG 32210 
R"(.*ActivationLayerTest.CompareWithRefs/(Sigmoid|Tanh|Exp|Log).*)", diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/gather.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/gather.cpp index 6af03cf397d..7cb4e1427e6 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/gather.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/gather.cpp @@ -154,6 +154,34 @@ INSTANTIATE_TEST_SUITE_P( Gather7LayerTest::getTestCaseName ); +INSTANTIATE_TEST_SUITE_P( + smoke_Gather7Axes4i4b1, + Gather8LayerTest, + GatherAxes4i4b1, + Gather8LayerTest::getTestCaseName +); + +INSTANTIATE_TEST_SUITE_P( + smoke_Gather7Axes4i4b2, + Gather8LayerTest, + GatherAxes4i4b1, + Gather8LayerTest::getTestCaseName +); + +INSTANTIATE_TEST_SUITE_P( + smoke_Gather7Axes4i8b1, + Gather8LayerTest, + GatherAxes4i8b1, + Gather8LayerTest::getTestCaseName +); + +INSTANTIATE_TEST_SUITE_P( + smoke_Gather7Axes4i8b2, + Gather8LayerTest, + GatherAxes4i8b2, + Gather8LayerTest::getTestCaseName +); + const std::vector> indices = { std::vector{0, 3, 2, 1}, }; diff --git a/inference-engine/tests/functional/plugin/myriad/ngraph/transformations/dynamic_to_static_shape_broadcast.cpp b/inference-engine/tests/functional/plugin/myriad/ngraph/transformations/dynamic_to_static_shape_broadcast.cpp index 78de9260c2d..c58f283c6f7 100644 --- a/inference-engine/tests/functional/plugin/myriad/ngraph/transformations/dynamic_to_static_shape_broadcast.cpp +++ b/inference-engine/tests/functional/plugin/myriad/ngraph/transformations/dynamic_to_static_shape_broadcast.cpp @@ -74,7 +74,6 @@ protected: const auto tensorWithTargetShapeParam = std::make_shared(tensorType, targetShape); const auto shapeOfNode = std::make_shared(tensorWithTargetShapeParam, shapeType); - shapeOfNode->set_is_foldable(false); ngraph::ParameterVector params{tensorParam, 
tensorWithTargetShapeParam}; @@ -197,7 +196,6 @@ protected: const auto tensorWithTargetShapeParam = std::make_shared(shapeType, targetShape); const auto shapeOfNode = std::make_shared(tensorWithTargetShapeParam, shapeType); - shapeOfNode->set_is_foldable(false); ngraph::ParameterVector params{tensorParam, tensorWithTargetShapeParam}; diff --git a/inference-engine/tests/functional/plugin/myriad/shared_tests_instances/skip_tests_config.cpp b/inference-engine/tests/functional/plugin/myriad/shared_tests_instances/skip_tests_config.cpp index bdb17b09505..d67ae3e6270 100644 --- a/inference-engine/tests/functional/plugin/myriad/shared_tests_instances/skip_tests_config.cpp +++ b/inference-engine/tests/functional/plugin/myriad/shared_tests_instances/skip_tests_config.cpp @@ -23,8 +23,6 @@ std::vector disabledTestPatterns() { R"(.*IEClassGetAvailableDevices.*)", // TODO: Issue: 40473 R"(.*TopKLayerTest.*mode=min.*sort=index.*)", - // TODO: Issue: 40961 - R"(.*(ConstantResultSubgraphTest).*)", // TODO: Issue: 42828 R"(.*DSR_NonMaxSuppression.*NBoxes=(5|20|200).*)", // TODO: Issue: 42721 diff --git a/inference-engine/tests/functional/plugin/myriad/shared_tests_instances/subgraph_tests/constant_result.cpp b/inference-engine/tests/functional/plugin/myriad/shared_tests_instances/subgraph_tests/constant_result.cpp index 919fcdef3d0..2b47c8eb466 100644 --- a/inference-engine/tests/functional/plugin/myriad/shared_tests_instances/subgraph_tests/constant_result.cpp +++ b/inference-engine/tests/functional/plugin/myriad/shared_tests_instances/subgraph_tests/constant_result.cpp @@ -23,15 +23,7 @@ const std::vector shapes = { }; const std::vector precisions = { - Precision::U8, - Precision::I8, - Precision::U16, - Precision::I16, - Precision::I32, - Precision::U64, - Precision::I64, - Precision::FP32, - Precision::BOOL + Precision::FP32 }; INSTANTIATE_TEST_SUITE_P(smoke_Check, ConstantResultSubgraphTest, diff --git 
a/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/gather.hpp b/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/gather.hpp index 49191b69553..69278816840 100644 --- a/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/gather.hpp +++ b/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/gather.hpp @@ -16,4 +16,8 @@ TEST_P(Gather7LayerTest, CompareWithRefs) { Run(); }; +TEST_P(Gather8LayerTest, CompareWithRefs) { + Run(); +}; + } // namespace LayerTestsDefinitions diff --git a/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/prior_box.hpp b/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/prior_box.hpp new file mode 100644 index 00000000000..2b1ab4780ab --- /dev/null +++ b/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/prior_box.hpp @@ -0,0 +1,15 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "shared_test_classes/single_layer/prior_box.hpp" + +namespace LayerTestDefinitions { + +TEST_P(PriorBoxLayerTest, CompareWithRefs) { + Run(); +} + +} // namespace LayerTestDefinitions diff --git a/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/gather.hpp b/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/gather.hpp index 5fd19bbacda..abf7c2caa99 100644 --- a/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/gather.hpp +++ b/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/gather.hpp @@ -63,4 +63,13 @@ protected: void SetUp() override; }; +class Gather8LayerTest : public testing::WithParamInterface, + virtual public LayerTestsUtils::LayerTestsCommon { +public: + static std::string getTestCaseName(const testing::TestParamInfo& obj); + +protected: + 
void SetUp() override; +}; + } // namespace LayerTestsDefinitions \ No newline at end of file diff --git a/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/prior_box.hpp b/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/prior_box.hpp new file mode 100644 index 00000000000..15fcb6d1127 --- /dev/null +++ b/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/prior_box.hpp @@ -0,0 +1,80 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "ie_core.hpp" +#include "ie_precision.hpp" + +#include "ngraph/opsets/opset1.hpp" + +#include "functional_test_utils/blob_utils.hpp" +#include "shared_test_classes/base/layer_test_utils.hpp" +#include "common_test_utils/common_utils.hpp" + +#include "ngraph_functions/utils/ngraph_helpers.hpp" +#include "ngraph_functions/builders.hpp" + +namespace LayerTestDefinitions { +using priorBoxSpecificParams = std::tuple< + std::vector, // min_size + std::vector, // max_size + std::vector, // aspect_ratio + std::vector, // density + std::vector, // fixed_ratio + std::vector, // fixed_size + bool, // clip + bool, // flip + float, // step + float, // offset + std::vector, // variance + bool>; // scale_all_sizes + +typedef std::tuple< + priorBoxSpecificParams, + InferenceEngine::Precision, // net precision + InferenceEngine::Precision, // Input precision + InferenceEngine::Precision, // Output precision + InferenceEngine::Layout, // Input layout + InferenceEngine::Layout, // Output layout + InferenceEngine::SizeVector, // input shape + InferenceEngine::SizeVector, // image shape + std::string> priorBoxLayerParams; + +class PriorBoxLayerTest + : public testing::WithParamInterface, + virtual public LayerTestsUtils::LayerTestsCommon { +public: + static std::string 
getTestCaseName(const testing::TestParamInfo& obj); +protected: + InferenceEngine::SizeVector inputShapes; + InferenceEngine::SizeVector imageShapes; + InferenceEngine::Precision netPrecision; + std::vector min_size; + std::vector max_size; + std::vector aspect_ratio; + std::vector density; + std::vector fixed_ratio; + std::vector fixed_size; + std::vector variance; + float step; + float offset; + bool clip; + bool flip; + bool scale_all_sizes; + + void SetUp() override; +}; + +} // namespace LayerTestDefinitions diff --git a/inference-engine/tests/functional/shared_test_classes/src/single_layer/gather.cpp b/inference-engine/tests/functional/shared_test_classes/src/single_layer/gather.cpp index 9f57e1d1be1..84965158e13 100644 --- a/inference-engine/tests/functional/shared_test_classes/src/single_layer/gather.cpp +++ b/inference-engine/tests/functional/shared_test_classes/src/single_layer/gather.cpp @@ -93,4 +93,47 @@ void Gather7LayerTest::SetUp() { function = std::make_shared(results, functionParams, "gather"); } +std::string Gather8LayerTest::getTestCaseName(const testing::TestParamInfo& obj) { + std::tuple axis_batchIdx; + std::vector indices; + std::vector indicesShape, inputShape; + InferenceEngine::Precision netPrecision; + InferenceEngine::Precision inPrc, outPrc; + InferenceEngine::Layout inLayout, outLayout; + std::string targetName; + std::tie(inputShape, indicesShape, axis_batchIdx, netPrecision, inPrc, outPrc, inLayout, outLayout, targetName) = obj.param; + std::ostringstream result; + result << "IS=" << CommonTestUtils::vec2str(inputShape) << "_"; + result << "axis=" << std::get<0>(axis_batchIdx) << "_"; + result << "batchIdx=" << std::get<1>(axis_batchIdx) << "_"; + result << "indicesShape=" << CommonTestUtils::vec2str(indicesShape) << "_"; + result << "netPRC=" << netPrecision.name() << "_"; + result << "inPRC=" << inPrc.name() << "_"; + result << "outPRC=" << outPrc.name() << "_"; + result << "inL=" << inLayout << "_"; + result << "outL=" << 
outLayout << "_"; + result << "trgDev=" << targetName << "_"; + return result.str(); +} + +void Gather8LayerTest::SetUp() { + std::tuple axis_batchIdx; + std::vector indicesShape; + std::vector inputShape; + InferenceEngine::Precision netPrecision; + std::tie(inputShape, indicesShape, axis_batchIdx, netPrecision, inPrc, outPrc, inLayout, outLayout, targetDevice) = GetParam(); + int axis = std::get<0>(axis_batchIdx); + int batchIdx = std::get<1>(axis_batchIdx); + auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision); + auto functionParams = ngraph::builder::makeParams(ngPrc, { inputShape }); + auto paramOuts = ngraph::helpers::convert2OutputVector(ngraph::helpers::castOps2Nodes(functionParams)); + auto indicesNode = ngraph::builder::makeConstant(ngraph::element::i64, indicesShape, {}, true, + inputShape[axis < 0 ? axis + inputShape.size() : axis] - 1, + 1 - static_cast(inputShape[axis < 0 ? axis + inputShape.size() : axis])); + auto axisNode = ngraph::opset8::Constant::create(ngraph::element::i64, ngraph::Shape({}), { axis }); + auto gather = std::make_shared(paramOuts[0], indicesNode, axisNode, batchIdx); + ngraph::ResultVector results{ std::make_shared(gather) }; + function = std::make_shared(results, functionParams, "gather"); +} + } // namespace LayerTestsDefinitions diff --git a/inference-engine/tests/functional/shared_test_classes/src/single_layer/prior_box.cpp b/inference-engine/tests/functional/shared_test_classes/src/single_layer/prior_box.cpp new file mode 100644 index 00000000000..f3b95141ae0 --- /dev/null +++ b/inference-engine/tests/functional/shared_test_classes/src/single_layer/prior_box.cpp @@ -0,0 +1,91 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "shared_test_classes/single_layer/prior_box.hpp" + +namespace LayerTestDefinitions { +std::string PriorBoxLayerTest::getTestCaseName(const testing::TestParamInfo& obj) { + InferenceEngine::Precision netPrecision; + 
InferenceEngine::Precision inPrc, outPrc; + InferenceEngine::Layout inLayout, outLayout; + InferenceEngine::SizeVector inputShapes, imageShapes; + std::string targetDevice; + priorBoxSpecificParams specParams; + std::tie(specParams, + netPrecision, + inPrc, outPrc, inLayout, outLayout, + inputShapes, + imageShapes, + targetDevice) = obj.param; + + std::vector min_size, max_size, aspect_ratio, density, fixed_ratio, fixed_size, variance; + float step, offset; + bool clip, flip, scale_all_sizes; + std::tie(min_size, max_size, aspect_ratio, + density, fixed_ratio, fixed_size, clip, + flip, step, offset, variance, scale_all_sizes) = specParams; + + std::ostringstream result; + const char separator = '_'; + result << "IS=" << CommonTestUtils::vec2str(inputShapes) << separator; + result << "imageS=" << CommonTestUtils::vec2str(imageShapes) << separator; + result << "netPRC=" << netPrecision.name() << separator; + result << "inPRC=" << inPrc.name() << separator; + result << "outPRC=" << outPrc.name() << separator; + result << "inL=" << inLayout << separator; + result << "outL=" << outLayout << separator; + result << "min_s=" << CommonTestUtils::vec2str(min_size) << separator; + result << "max_s=" << CommonTestUtils::vec2str(max_size)<< separator; + result << "asp_r=" << CommonTestUtils::vec2str(aspect_ratio)<< separator; + result << "dens=" << CommonTestUtils::vec2str(density)<< separator; + result << "fix_r=" << CommonTestUtils::vec2str(fixed_ratio)<< separator; + result << "fix_s=" << CommonTestUtils::vec2str(fixed_size)<< separator; + result << "var=" << CommonTestUtils::vec2str(variance)<< separator; + result << "step=" << step << separator; + result << "off=" << offset << separator; + result << "clip=" << clip << separator; + result << "flip=" << flip<< separator; + result << "scale_all=" << scale_all_sizes << separator; + result << "trgDev=" << targetDevice; + + return result.str(); +} + +void PriorBoxLayerTest::SetUp() { + priorBoxSpecificParams specParams; + 
std::tie(specParams, netPrecision, + inPrc, outPrc, inLayout, outLayout, + inputShapes, imageShapes, targetDevice) = GetParam(); + + std::tie(min_size, max_size, aspect_ratio, + density, fixed_ratio, fixed_size, clip, + flip, step, offset, variance, scale_all_sizes) = specParams; + + auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision); + auto params = ngraph::builder::makeParams(ngPrc, {inputShapes, imageShapes}); + + ngraph::op::PriorBoxAttrs attributes; + attributes.min_size = min_size; + attributes.max_size = max_size; + attributes.aspect_ratio = aspect_ratio; + attributes.density = density; + attributes.fixed_ratio = fixed_ratio; + attributes.fixed_size = fixed_size; + attributes.variance = variance; + attributes.step = step; + attributes.offset = offset; + attributes.clip = clip; + attributes.flip = flip; + + auto shape_of_1 = std::make_shared(params[0]); + auto shape_of_2 = std::make_shared(params[1]); + auto priorBox = std::make_shared( + shape_of_1, + shape_of_2, + attributes); + + ngraph::ResultVector results{std::make_shared(priorBox)}; + function = std::make_shared (results, params, "PriorBoxFunction"); +} +} // namespace LayerTestDefinitions diff --git a/inference-engine/tests/ie_test_utils/functional_test_utils/layer_tests_summary/utils/constants.py b/inference-engine/tests/ie_test_utils/functional_test_utils/layer_tests_summary/utils/constants.py index cb281578c88..4c3cf62280b 100644 --- a/inference-engine/tests/ie_test_utils/functional_test_utils/layer_tests_summary/utils/constants.py +++ b/inference-engine/tests/ie_test_utils/functional_test_utils/layer_tests_summary/utils/constants.py @@ -60,9 +60,10 @@ VERIFIED_OP_REFERENCES = [ 'NonMaxSuppression-4', 'NonMaxSuppression-5', 'NonZero-3', - 'PSROIPooling-1', + 'PriorBox-1', 'Proposal-1', 'Proposal-4', + 'PSROIPooling-1', 'RNNSequence-5', 'ROIAlign-3', 'ROIPooling-2', @@ -83,11 +84,13 @@ VERIFIED_OP_REFERENCES = [ 'ReorgYOLO-2', 'Result-1' 'Round-5', + 'SpaceToDepth-1', 
'ScatterNDUpdate-4', 'ShapeOf-1', 'ShapeOf-3', 'Sigmoid-1', 'Sin-1', + 'Sinh-1' 'SoftPlus-4', 'Softmax-1', 'Split-1', diff --git a/inference-engine/tests/ngraph_helpers/ngraph_functions/include/ngraph_functions/builders.hpp b/inference-engine/tests/ngraph_helpers/ngraph_functions/include/ngraph_functions/builders.hpp index 3ed4058808a..d1e94b78f1e 100644 --- a/inference-engine/tests/ngraph_helpers/ngraph_functions/include/ngraph_functions/builders.hpp +++ b/inference-engine/tests/ngraph_helpers/ngraph_functions/include/ngraph_functions/builders.hpp @@ -14,6 +14,7 @@ #include #include #include +#include #include "ngraph_functions/utils/data_utils.hpp" diff --git a/inference-engine/tests/unit/gna/ngraph/transformations/gna_remove_extra_reshapes.cpp b/inference-engine/tests/unit/gna/ngraph/transformations/gna_remove_extra_reshapes.cpp new file mode 100644 index 00000000000..51f6828d44c --- /dev/null +++ b/inference-engine/tests/unit/gna/ngraph/transformations/gna_remove_extra_reshapes.cpp @@ -0,0 +1,89 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "transformations/remove_extra_reshapes.hpp" + +#include "common_test_utils/ngraph_test_utils.hpp" +#include +#include +#include +#include + +namespace testing { + +TEST(TransformationTests, RemoveExtraReshapesTestReshapeNotEqualInputOutput) { + std::shared_ptr func(nullptr), reference_func(nullptr); + const ngraph::Shape data_shape{1, 3, 64, 64}; + + { + auto input_params = std::make_shared(ngraph::element::f32, data_shape); + auto new_shape = ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape{3}, {1, 3, 64 * 64}); + auto reshape_operation = std::make_shared(input_params, new_shape, true); + auto max_pool_operation = std::make_shared(reshape_operation, + ngraph::Strides{1}, + ngraph::Shape{0}, + ngraph::Shape{0}, + ngraph::Shape{3}); + auto result = std::make_shared(max_pool_operation); + func = std::make_shared(ngraph::ResultVector{result}, 
+ ngraph::ParameterVector{input_params}); + + reference_func = ngraph::clone_function(*func); + + ngraph::pass::Manager m; + m.register_pass(); + m.register_pass(); + m.run_passes(func); + ASSERT_NO_THROW(check_rt_info(func)); + } + + const FunctionsComparator func_comparator = FunctionsComparator::with_default().enable(FunctionsComparator::ATTRIBUTES); + const FunctionsComparator::Result result = func_comparator(func, reference_func); + ASSERT_TRUE(result.valid); +} + +TEST(TransformationTests, RemoveExtraReshapesTestReshapeEqualInputOutput) { + std::shared_ptr func(nullptr), reference_func(nullptr); + const ngraph::Shape data_shape{1, 3, 64, 64}; + + { + auto input_params = std::make_shared(ngraph::element::f32, data_shape); + auto new_shape = ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape{4}, {1, 3, 64, 64}); + auto reshape_operation = std::make_shared(input_params, new_shape, true); + auto max_pool_operation = std::make_shared(reshape_operation, + ngraph::Strides{1, 1}, + ngraph::Shape{0, 0}, + ngraph::Shape{0, 0}, + ngraph::Shape{3, 3}); + auto result = std::make_shared(max_pool_operation); + func = std::make_shared(ngraph::ResultVector{result}, + ngraph::ParameterVector{input_params}); + + ngraph::pass::Manager m; + m.register_pass(); + m.register_pass(); + m.run_passes(func); + ASSERT_NO_THROW(check_rt_info(func)); + } + + { + auto input_params = std::make_shared(ngraph::element::f32, data_shape); + auto max_pool_operation = std::make_shared(input_params, + ngraph::Strides{1, 1}, + ngraph::Shape{0, 0}, + ngraph::Shape{1, 1}, + ngraph::Shape{4, 4}); + auto result = std::make_shared(max_pool_operation); + reference_func = std::make_shared(ngraph::ResultVector{result}, + ngraph::ParameterVector{input_params}); + } + + const FunctionsComparator func_comparator = FunctionsComparator::with_default(); + const FunctionsComparator::Result result = func_comparator(func, reference_func); + ASSERT_TRUE(result.valid); +} + +} // namespace testing 
diff --git a/inference-engine/thirdparty/clDNN/api/cldnn/primitives/gather.hpp b/inference-engine/thirdparty/clDNN/api/cldnn/primitives/gather.hpp index 483a1a40b31..18bc947b1e3 100644 --- a/inference-engine/thirdparty/clDNN/api/cldnn/primitives/gather.hpp +++ b/inference-engine/thirdparty/clDNN/api/cldnn/primitives/gather.hpp @@ -35,6 +35,7 @@ struct gather : public primitive_base { /// @param axis Gathering axis. /// @param output_shape Output shape. /// @param batch_dim Batch_dim + /// @param support_neg_ind Support negative indexes gather(const primitive_id& id, const primitive_id& dict, const primitive_id& idx, @@ -42,8 +43,11 @@ struct gather : public primitive_base { const format& output_format, const tensor& output_shape, const int64_t batch_dim = 0, - const padding& output_padding = padding()) - : primitive_base(id, {dict, idx}, output_padding), axis(axis), output_format(output_format), output_shape(output_shape), batch_dim(batch_dim) {} + const bool support_neg_ind = false, + const padding& output_padding = padding() + ) + : primitive_base(id, {dict, idx}, output_padding), axis(axis), output_format(output_format), + output_shape(output_shape), batch_dim(batch_dim), support_neg_ind(support_neg_ind) {} /// @brief Gathering axis gather_axis axis; @@ -53,6 +57,8 @@ struct gather : public primitive_base { tensor output_shape; /// @brief Gathering batch_dim int64_t batch_dim; + /// @brief Support negative indexes + bool support_neg_ind; }; /// @} /// @} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_kernel_ref.cpp index 23d897d32e7..28686e2cb35 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_kernel_ref.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_kernel_ref.cpp @@ -79,6 +79,10 @@ static int64_t 
GetGatherBatchDim(const gather_params& params) { return params.batch_dim; } +static inline std::string GetGatherMaxIndexDim(const gather_params& params) { + return std::to_string(params.inputs[0].GetDims().at(params.inputs[0].GetDims().size() - GetGatherChannelIndex(params) - 1).v); +} + static inline std::string GetOrderString(std::vector& order) { std::string order_str = order[0]; for (size_t i = 1; i < order.size(); i++) @@ -168,6 +172,8 @@ JitConstants GatherKernelRef::GetJitConstants(const gather_params& params) const jit.AddConstant(MakeJitConstant("DICTIONARY_INDEX_ORDER", GetDictionaryIndexOrder(params, GetGatherChannelIndex(params)))); jit.AddConstant(MakeJitConstant("INDICES_INDEX_ORDER", GetIndecesIdxOrder(params, GetGatherChannelIndex(params), GetGatherBatchDim(params)))); + if (params.support_neg_ind) + jit.AddConstant(MakeJitConstant("INDEX_DIM", GetGatherMaxIndexDim(params))); if (!params.fused_ops.empty()) { std::vector idx_order = GetOrder(params.inputs[0].GetDims().size()); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_kernel_ref.h index 38f058e4e67..623912bbdbf 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_kernel_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_kernel_ref.h @@ -11,10 +11,11 @@ namespace kernel_selector { // gather_params //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// struct gather_params : public base_params { - gather_params() : base_params(KernelType::GATHER), axis(GatherAxis::BATCH), batch_dim(0) {} + gather_params() : base_params(KernelType::GATHER), axis(GatherAxis::BATCH), batch_dim(0), support_neg_ind(false) {} GatherAxis axis; int64_t batch_dim; + bool support_neg_ind; virtual ParamsKey GetParamsKey() 
const { return base_params::GetParamsKey(); } }; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/gather_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/gather_ref.cl index 48d97709bd0..978edce3356 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/gather_ref.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/gather_ref.cl @@ -5,7 +5,19 @@ #include "include/data_types.cl" #include "include/fetch_data.cl" -#define INPUT_AXIS_INDEX (uint)indices[indices_idx] +#ifdef INDEX_DIM +inline uint FUNC(get_positive_index)(int in) +{ + if(in < 0) + return in + INDEX_DIM; + else + return in; +} +#define INPUT_AXIS_INDEX (uint)FUNC_CALL(get_positive_index)(indices[indices_idx]) +#else +#define INPUT_AXIS_INDEX (uint)(indices[indices_idx]) +#endif + #define GET_DICTIONARY_INDEX(idx_order) INPUT0_GET_INDEX(idx_order) #define GET_INDICES_INDEX(idx_order) INPUT1_GET_INDEX(idx_order) #define GET_INDEX(prefix, num, idx_order) CAT(CAT(prefix, num), _GET_INDEX)(idx_order) diff --git a/inference-engine/thirdparty/clDNN/runtime/debug_configuration.cpp b/inference-engine/thirdparty/clDNN/runtime/debug_configuration.cpp index 5182fd14e48..cc7d64f71ec 100644 --- a/inference-engine/thirdparty/clDNN/runtime/debug_configuration.cpp +++ b/inference-engine/thirdparty/clDNN/runtime/debug_configuration.cpp @@ -16,23 +16,29 @@ const char *debug_configuration::prefix = "GPU_Debug: "; static void print_option(std::string option_name, std::string option_value) { GPU_DEBUG_COUT << "Config " << option_name << " = " << option_value << std::endl; } + +static void get_int_env(const std::string &var, int &val) { + if (const auto env_var = std::getenv(var.c_str())) { + val = std::stoi(env_var); + print_option(var, std::to_string(val)); + } +} + +static void get_str_env(const std::string &var, std::string &val) { + if (const auto env_var = std::getenv(var.c_str())) { + val = env_var; + print_option(var, 
val); + } +} + #endif debug_configuration::debug_configuration() : verbose(0) , dump_graphs(std::string()) { #ifdef GPU_DEBUG_CONFIG - const std::string OV_GPU_VERBOSE("OV_GPU_Verbose"); - const std::string OV_GPU_DUMP_GRAPHS("OV_GPU_DumpGraphs"); - if (const auto env_var = std::getenv(OV_GPU_VERBOSE.c_str())) { - verbose = std::stoi(env_var); - print_option(OV_GPU_VERBOSE, std::to_string(verbose)); - } - - if (const auto env_var = std::getenv(OV_GPU_DUMP_GRAPHS.c_str())) { - dump_graphs = env_var; - print_option(OV_GPU_DUMP_GRAPHS, dump_graphs); - } + get_int_env("OV_GPU_Verbose", verbose); + get_str_env("OV_GPU_DumpGraphs", dump_graphs); #endif } diff --git a/inference-engine/thirdparty/clDNN/runtime/kernels_cache.cpp b/inference-engine/thirdparty/clDNN/runtime/kernels_cache.cpp index 014486841f0..2e7e1272d06 100644 --- a/inference-engine/thirdparty/clDNN/runtime/kernels_cache.cpp +++ b/inference-engine/thirdparty/clDNN/runtime/kernels_cache.cpp @@ -5,6 +5,7 @@ #include "kernels_factory.hpp" #include "kernels_cache.hpp" #include "ocl/ocl_engine.hpp" +#include "cldnn/runtime/debug_configuration.hpp" #include #include @@ -372,6 +373,10 @@ void kernels_cache::build_batch(const engine& build_engine, const batch_program& dump_file << "*/\n"; } if (!err_log.empty()) { + GPU_DEBUG_GET_INSTANCE(debug_config); + GPU_DEBUG_IF(debug_config->verbose) { + std::cout << err_log << std::endl; + } throw std::runtime_error("Program build failed. 
You may enable OCL source dump to see the error log.\n"); } } diff --git a/inference-engine/thirdparty/clDNN/src/gpu/gather_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/gather_gpu.cpp index e0d51b3f000..d9791de4f2d 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/gather_gpu.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/gather_gpu.cpp @@ -49,6 +49,7 @@ public: gather_params.axis = convert_axis(arg.get_primitive()->axis); gather_params.batch_dim = size_t(arg.get_primitive()->batch_dim); + gather_params.support_neg_ind = arg.get_primitive()->support_neg_ind; gather_params.inputs.push_back(convert_data_tensor(arg.input(1).get_output_layout())); diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/gather_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/gather_gpu_test.cpp index c0387763873..a6471f4b13a 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/gather_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/gather_gpu_test.cpp @@ -12,6 +12,136 @@ using namespace cldnn; using namespace ::tests; +TEST(gather8_gpu_fp16, d323_axisY_bdim_m1) { + // Dictionary : 3x2x3x4x2 + // Indexes : 3x2x3x1 + // Axis : 3 + // batch_dim : -1 + // Output : 3x2x3x3x2 + // Input values in fp16 + + // Indexes: + // 0.f, 0.f, 0.f, 3.f, -3.f, 0.f, 1.f, -3.f, 1.f, -2.f, 0.f, 3.f, -1.f, 1.f, 0.f, 2.f, 0.f, 1.f + // + // Dictionary: + // 1.f 2.f 3.f 4.f 5.f 6.f 7.f 8.f 9.f 10.f 11.f 12.f 13.f 14.f 15.f 16.f 17.f 18.f + // 19.f 20.f 21.f 22.f 23.f 24.f 25.f 26.f 27.f 28.f 29.f 30.f 31.f 32.f 33.f 34.f 35.f 36.f + // 37.f 38.f 39.f 40.f 41.f 42.f 43.f 44.f 45.f 46.f 47.f 48.f 49.f 50.f 51.f 52.f 53.f 54.f + // 55.f 56.f 57.f 58.f 59.f 60.f 61.f 62.f 63.f 64.f 65.f 66.f 67.f 68.f 69.f 70.f 71.f 72.f + // 73.f 74.f 75.f 76.f 77.f 78.f 79.f 80.f 81.f 82.f 83.f 84.f 85.f 86.f 87.f 88.f 89.f 90.f + // 91.f 92.f 93.f 94.f 95.f 96.f 97.f 98.f 99.f 100.f 101.f 102.f 103.f 104.f 105.f 106.f 107.f 108.f + // 109.f 110.f 111.f 
112.f 113.f 114.f 115.f 116.f 117.f 118.f 119.f 120.f 121.f 122.f 123.f 124.f 125.f 126.f + // 127.f 128.f 129.f 130.f 131.f 132.f 133.f 134.f 135.f 136.f 137.f 138.f 139.f 140.f 141.f 142.f 143.f 144.f + // + // Output: + // 1.f 2.f 1.f 2.f 1.f 2.f 9.f 10.f 9.f 10.f 9.f 10.f + // 17.f 18.f 17.f 18.f 17.f 18.f 31.f 32.f 27.f 28.f 25.f 26.f + // 39.f 40.f 35.f 36.f 33.f 34.f 47.f 48.f 43.f 44.f 41.f 42.f + // 51.f 52.f 51.f 52.f 51.f 52.f 59.f 60.f 59.f 60.f 59.f 60.f + // 67.f 68.f 67.f 68.f 67.f 68.f 77.f 78.f 73.f 74.f 79.f 80.f + // 85.f 86.f 81.f 82.f 87.f 88.f 93.f 94.f 89.f 90.f 95.f 96.f + // 103.f 104.f 99.f 100.f 97.f 98.f 111.f 112.f 107.f 108.f 105.f 106.f + // 119.f 120.f 115.f 116.f 113.f 114.f 125.f 126.f 121.f 122.f 123.f 124.f + // 133.f 134.f 129.f 130.f 131.f 132.f 141.f 142.f 137.f 138.f 139.f 140.f + + auto& engine = get_test_engine(); + + auto input1 = engine.allocate_memory({ data_types::f16, format::bfzyx, { 3, 2, 2, 4, 3} }); // Dictionary + auto input2 = engine.allocate_memory({ data_types::f32, format::bfyx, { 3, 2, 1, 3 } }); // Indexes + auto axis = cldnn::gather::gather_axis::along_y; + int64_t batch_dim = -1; + bool negative_indexes = true; + + set_values(input1, { + FLOAT16(1.f), FLOAT16(2.f), FLOAT16(3.f), FLOAT16(4.f), FLOAT16(5.f), FLOAT16(6.f), FLOAT16(7.f), FLOAT16(8.f), + FLOAT16(9.f), FLOAT16(10.f), FLOAT16(11.f), FLOAT16(12.f), FLOAT16(13.f), FLOAT16(14.f), FLOAT16(15.f), FLOAT16(16.f), + FLOAT16(17.f), FLOAT16(18.f), FLOAT16(19.f), FLOAT16(20.f), FLOAT16(21.f), FLOAT16(22.f), FLOAT16(23.f), FLOAT16(24.f), + + FLOAT16(25.f), FLOAT16(26.f), FLOAT16(27.f), FLOAT16(28.f), FLOAT16(29.f), FLOAT16(30.f), FLOAT16(31.f), FLOAT16(32.f), + FLOAT16(33.f), FLOAT16(34.f), FLOAT16(35.f), FLOAT16(36.f), FLOAT16(37.f), FLOAT16(38.f), FLOAT16(39.f), FLOAT16(40.f), + FLOAT16(41.f), FLOAT16(42.f), FLOAT16(43.f), FLOAT16(44.f), FLOAT16(45.f), FLOAT16(46.f), FLOAT16(47.f), FLOAT16(48.f), + + + FLOAT16(49.f), FLOAT16(50.f), FLOAT16(51.f),
FLOAT16(52.f), FLOAT16(53.f), FLOAT16(54.f), FLOAT16(55.f), FLOAT16(56.f), + FLOAT16(57.f), FLOAT16(58.f), FLOAT16(59.f), FLOAT16(60.f), FLOAT16(61.f), FLOAT16(62.f), FLOAT16(63.f), FLOAT16(64.f), + FLOAT16(65.f), FLOAT16(66.f), FLOAT16(67.f), FLOAT16(68.f), FLOAT16(69.f), FLOAT16(70.f), FLOAT16(71.f), FLOAT16(72.f), + + FLOAT16(73.f), FLOAT16(74.f), FLOAT16(75.f), FLOAT16(76.f), FLOAT16(77.f), FLOAT16(78.f), FLOAT16(79.f), FLOAT16(80.f), + FLOAT16(81.f), FLOAT16(82.f), FLOAT16(83.f), FLOAT16(84.f), FLOAT16(85.f), FLOAT16(86.f), FLOAT16(87.f), FLOAT16(88.f), + FLOAT16(89.f), FLOAT16(90.f), FLOAT16(91.f), FLOAT16(92.f), FLOAT16(93.f), FLOAT16(94.f), FLOAT16(95.f), FLOAT16(96.f), + + + FLOAT16(97.f), FLOAT16(98.f), FLOAT16(99.f), FLOAT16(100.f), FLOAT16(101.f), FLOAT16(102.f), FLOAT16(103.f), FLOAT16(104.f), + FLOAT16(105.f), FLOAT16(106.f), FLOAT16(107.f), FLOAT16(108.f), FLOAT16(109.f), FLOAT16(110.f), FLOAT16(111.f), FLOAT16(112.f), + FLOAT16(113.f), FLOAT16(114.f), FLOAT16(115.f), FLOAT16(116.f), FLOAT16(117.f), FLOAT16(118.f), FLOAT16(119.f), FLOAT16(120.f), + + FLOAT16(121.f), FLOAT16(122.f), FLOAT16(123.f), FLOAT16(124.f), FLOAT16(125.f), FLOAT16(126.f), FLOAT16(127.f), FLOAT16(128.f), + FLOAT16(129.f), FLOAT16(130.f), FLOAT16(131.f), FLOAT16(132.f), FLOAT16(133.f), FLOAT16(134.f), FLOAT16(135.f), FLOAT16(136.f), + FLOAT16(137.f), FLOAT16(138.f), FLOAT16(139.f), FLOAT16(140.f), FLOAT16(141.f), FLOAT16(142.f), FLOAT16(143.f), FLOAT16(144.f) + }); + + set_values(input2, { + 0.f, 0.f, 0.f, + 3.f, -3.f, 0.f, + + 1.f, -3.f, 1.f, + -2.f, 0.f, 3.f, + + -1.f, 1.f, 0.f, + 2.f, 0.f, 1.f + }); + + topology topology; + topology.add(input_layout("InputDictionary", input1->get_layout())); + topology.add(input_layout("InputText", input2->get_layout())); + topology.add( + gather("gather", "InputDictionary", "InputText", axis, format::bfzyx, tensor(3, 2, 2, 3, 3), batch_dim, negative_indexes) + ); + + network network(engine, topology); + + 
network.set_input_data("InputDictionary", input1); + network.set_input_data("InputText", input2); + + auto outputs = network.execute(); + + auto output = outputs.at("gather").get_memory(); + cldnn::mem_lock output_ptr(output, get_test_stream()); + + std::vector expected_results = { + 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, + 9.f, 10.f, 9.f, 10.f, 9.f, 10.f, + 17.f, 18.f, 17.f, 18.f, 17.f, 18.f, + + 31.f, 32.f, 27.f, 28.f, 25.f, 26.f, + 39.f, 40.f, 35.f, 36.f, 33.f, 34.f, + 47.f, 48.f, 43.f, 44.f, 41.f, 42.f, + + + 51.f, 52.f, 51.f, 52.f, 51.f, 52.f, + 59.f, 60.f, 59.f, 60.f, 59.f, 60.f, + 67.f, 68.f, 67.f, 68.f, 67.f, 68.f, + + 77.f, 78.f, 73.f, 74.f, 79.f, 80.f, + 85.f, 86.f, 81.f, 82.f, 87.f, 88.f, + 93.f, 94.f, 89.f, 90.f, 95.f, 96.f, + + + 103.f, 104.f, 99.f, 100.f, 97.f, 98.f, + 111.f, 112.f, 107.f, 108.f, 105.f, 106.f, + 119.f, 120.f, 115.f, 116.f, 113.f, 114.f, + + 125.f, 126.f, 121.f, 122.f, 123.f, 124.f, + 133.f, 134.f, 129.f, 130.f, 131.f, 132.f, + 141.f, 142.f, 137.f, 138.f, 139.f, 140.f + }; + + for (size_t i = 0; i < expected_results.size(); ++i) { + EXPECT_EQ(expected_results[i], float16_to_float32(output_ptr[i])); + } +} + + TEST(gather7_gpu_fp16, d222_axisX_bdim_m1) { // Dictionary : 2x2x2x2x2x2 // Indexes : 2x2x2x1 diff --git a/model-optimizer/CMakeLists.txt b/model-optimizer/CMakeLists.txt index 60860d7a7aa..19056e1b60c 100644 --- a/model-optimizer/CMakeLists.txt +++ b/model-optimizer/CMakeLists.txt @@ -1,10 +1,8 @@ # Copyright (C) 2018-2021 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -if (NOT NGRAPH_PYTHON_BUILD_ENABLE) - message(WARNING "Please enable nGraph Python API (_pyngraph) target to enable Model Optimizer target") -elseif(NOT ENABLE_PYTHON) - message(WARNING "Please enable IE Python API (ie_api and offline_transformations_api) targets to enable Model Optimizer target") +if(NOT ENABLE_PYTHON) + message(WARNING "Please enable IE & nGraph Python API (ie_api and offline_transformations_api) targets to enable Model Optimizer target") else() 
add_custom_target(model_optimizer DEPENDS ie_api offline_transformations_api inference_engine_ir_reader) if(ENABLE_TESTS) diff --git a/model-optimizer/extensions/middle/ConvertGroupedStridedSlice.py b/model-optimizer/extensions/middle/ConvertGroupedStridedSlice.py index b6ad3a295e9..68b8ff54e98 100644 --- a/model-optimizer/extensions/middle/ConvertGroupedStridedSlice.py +++ b/model-optimizer/extensions/middle/ConvertGroupedStridedSlice.py @@ -154,7 +154,7 @@ class ConvertGroupedStridedSlice(MiddleReplacementPattern): size_splits.append(l - prev_r) shape[split_channel_dim] = l - prev_r data_node = Op._create_data_node(graph, 'fake_data_'+out_nodes[0].name, {'shape': shape}) - add_opoutput(graph, data_node.id, 0, False) + add_opoutput(graph, data_node.id, 0, False, keep_output_port=True) final_data_nodes_list.append(data_node) prev_r = r @@ -167,7 +167,7 @@ class ConvertGroupedStridedSlice(MiddleReplacementPattern): shape[split_channel_dim] = input_shape[split_channel_dim] - prev_r size_splits.append(input_shape[split_channel_dim] - prev_r) data_node = Op._create_data_node(graph, 'fake_data_'+out_nodes[0].name, {'shape': shape}) - add_opoutput(graph, data_node.id, 0, False) + add_opoutput(graph, data_node.id, 0, False, keep_output_port=True) final_data_nodes_list.append(data_node) for node in out_nodes: diff --git a/model-optimizer/mo/graph/graph.py b/model-optimizer/mo/graph/graph.py index c00b68ff810..aa1bef5e3e9 100644 --- a/model-optimizer/mo/graph/graph.py +++ b/model-optimizer/mo/graph/graph.py @@ -1032,21 +1032,24 @@ def dict_includes(big: dict, sub_dict: dict, skip_attr_names=[]): ) -def add_opoutput(graph: Graph, node_name: str, port: int, cut: bool = True): +def add_opoutput(graph: Graph, node_name: str, port: int, cut: bool = True, keep_output_port: bool = False): """ Creates and connects Result node to node_name port. Cuts existing port if requested. 
:param graph: graph to operate with :param node_name: name of existing node in the graph that we want to add Result to :param port: output port of node to connect Result to :param cut: determines way of operating with edge specified by node_name and port + :param keep_output_port: special attribute determines if this operation is saved in IR or not """ # we import it here because Op imports add_attrs_props and update_ie_fields from this file from mo.ops.result import Result node = Node(graph, node_name) if cut and len(node.out_edges()) != 0: - opoutput_node = Result(graph).create_node_on_port(node, port, {'name': node_name + '/sink_port_' + str(port)}) + opoutput_node = Result(graph).create_node_on_port(node, port, {'name': node_name + '/sink_port_' + str(port), + 'keep_output_port': keep_output_port}) else: - opoutput_node = Result(graph).create_node([(node, port)], {'name': node_name + '/sink_port_' + str(port)}) + opoutput_node = Result(graph).create_node([(node, port)], {'name': node_name + '/sink_port_' + str(port), + 'keep_output_port': keep_output_port}) opoutput_node.in_edge()['data_attrs'] = ['fw_tensor_debug_info'] log.debug('Sink: {} for node {}'.format(opoutput_node.id, node_name)) diff --git a/model-optimizer/unit_tests/extensions/middle/ConvertGroupedStridedSlice_test.py b/model-optimizer/unit_tests/extensions/middle/ConvertGroupedStridedSlice_test.py index 32fe29884b0..71a957aff18 100644 --- a/model-optimizer/unit_tests/extensions/middle/ConvertGroupedStridedSlice_test.py +++ b/model-optimizer/unit_tests/extensions/middle/ConvertGroupedStridedSlice_test.py @@ -60,8 +60,8 @@ nodes_attributes = { 'concat_1_data': {'value': None, 'shape': None, 'kind': 'data'}, 'op_output': {'kind': 'op', 'op': 'Result'}, - 'op_output_1': {'kind': 'op', 'op': 'Result'}, - 'op_output_2': {'kind': 'op', 'op': 'Result'}, + 'op_output_1': {'kind': 'op', 'op': 'Result', 'keep_output_port': True}, + 'op_output_2': {'kind': 'op', 'op': 'Result', 'keep_output_port': True}, # 
Squeeze layers 'sslice_1/Squeeze_shrink': {'type': None, 'value': None, 'kind': 'op', 'op': 'Squeeze'}, diff --git a/ngraph/.gitignore b/ngraph/.gitignore index 967c8b0f6dd..588daa92a64 100644 --- a/ngraph/.gitignore +++ b/ngraph/.gitignore @@ -110,8 +110,6 @@ python/share/* \#* \.#* -python/pybind11/ - # remnants from a failed in-source build CMakeCache.txt CMakeFiles/ diff --git a/ngraph/CMakeLists.txt b/ngraph/CMakeLists.txt index 385fee2f737..20fe4a9af12 100644 --- a/ngraph/CMakeLists.txt +++ b/ngraph/CMakeLists.txt @@ -23,7 +23,6 @@ option(NGRAPH_DEBUG_ENABLE "Enable output for NGRAPH_DEBUG statements" OFF) option(NGRAPH_ONNX_IMPORT_ENABLE "Enable ONNX importer" OFF) option(NGRAPH_ONNX_EDITOR_ENABLE "Enable ONNX Editor" OFF) option(NGRAPH_PDPD_FRONTEND_ENABLE "Enable PaddlePaddle FrontEnd" OFF) -option(NGRAPH_PYTHON_BUILD_ENABLE "Enable build nGraph python package wheel" OFF) option(NGRAPH_USE_PROTOBUF_LITE "Compiles and links with protobuf-lite" OFF) if (NGRAPH_ONNX_IMPORT_ENABLE OR NGRAPH_PDPD_FRONTEND_ENABLE) @@ -37,7 +36,6 @@ message(STATUS "NGRAPH_DEBUG_ENABLE: ${NGRAPH_DEBUG_ENABLE}") message(STATUS "NGRAPH_ONNX_IMPORT_ENABLE: ${NGRAPH_ONNX_IMPORT_ENABLE}") message(STATUS "NGRAPH_ONNX_EDITOR_ENABLE: ${NGRAPH_ONNX_EDITOR_ENABLE}") message(STATUS "NGRAPH_PDPD_FRONTEND_ENABLE: ${NGRAPH_PDPD_FRONTEND_ENABLE}") -message(STATUS "NGRAPH_PYTHON_BUILD_ENABLE: ${NGRAPH_PYTHON_BUILD_ENABLE}") message(STATUS "NGRAPH_USE_PROTOBUF_LITE: ${NGRAPH_USE_PROTOBUF_LITE}") message(STATUS "NGRAPH_UNIT_TEST_ENABLE: ${NGRAPH_UNIT_TEST_ENABLE}") message(STATUS "NGRAPH_UNIT_TEST_BACKENDS_ENABLE: ${NGRAPH_UNIT_TEST_BACKENDS_ENABLE}") @@ -180,14 +178,14 @@ if (NGRAPH_ONNX_IMPORT_ENABLE OR USE_STATIC_PROTOBUF) set(BUILD_STANDALONE_STATIC OFF) if (NOT NGRAPH_USE_SYSTEM_PROTOBUF) - include(cmake/external_protobuf.cmake) + add_subdirectory(${CMAKE_SOURCE_DIR}/thirdparty/protobuf ${CMAKE_BINARY_DIR}/_deps/protobuf) else() - find_package(Protobuf 2.6.1 REQUIRED) + find_package(Protobuf 
REQUIRED) endif() if (NGRAPH_ONNX_IMPORT_ENABLE) # target onnx_proto will be shared lib, onnx static - include(cmake/external_onnx.cmake) + add_subdirectory(${CMAKE_SOURCE_DIR}/thirdparty/onnx ${CMAKE_BINARY_DIR}/_deps/onnx) if (TARGET ext_protobuf) add_dependencies(onnx ext_protobuf) endif() @@ -202,6 +200,6 @@ add_subdirectory(frontend) add_subdirectory(test) -if (NGRAPH_PYTHON_BUILD_ENABLE) +if (ENABLE_PYTHON) add_subdirectory(python) endif() diff --git a/ngraph/cmake/external_protobuf.cmake b/ngraph/cmake/external_protobuf.cmake deleted file mode 100644 index 021382da95a..00000000000 --- a/ngraph/cmake/external_protobuf.cmake +++ /dev/null @@ -1,149 +0,0 @@ -# Copyright (C) 2018-2021 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 -# - -include(FetchContent) - -#------------------------------------------------------------------------------ -# Download and install Google Protobuf ... -#------------------------------------------------------------------------------ - -set(PUSH_CMAKE_INTERPROCEDURAL_OPTIMIZATION_RELEASE "${CMAKE_INTERPROCEDURAL_OPTIMIZATION_RELEASE}") -set(CMAKE_INTERPROCEDURAL_OPTIMIZATION_RELEASE OFF) - -if (MSVC) - set(protobuf_MSVC_STATIC_RUNTIME OFF CACHE BOOL "") -endif() - -# This version of PROTOBUF is required by Microsoft ONNX Runtime. 
-set(NGRAPH_PROTOBUF_GIT_REPO_URL "https://github.com/protocolbuffers/protobuf") - -if(CMAKE_CROSSCOMPILING) - find_program(SYSTEM_PROTOC protoc PATHS ENV PATH) - - if(SYSTEM_PROTOC) - execute_process( - COMMAND ${SYSTEM_PROTOC} --version - OUTPUT_VARIABLE PROTOC_VERSION - OUTPUT_STRIP_TRAILING_WHITESPACE - ) - - string(REPLACE " " ";" PROTOC_VERSION ${PROTOC_VERSION}) - list(GET PROTOC_VERSION -1 PROTOC_VERSION) - - message("Detected system protoc version: ${PROTOC_VERSION}") - - if(${PROTOC_VERSION} VERSION_EQUAL "3.0.0") - message(WARNING "Protobuf 3.0.0 detected switching to 3.0.2 due to bug in gmock url") - set(PROTOC_VERSION "3.0.2") - endif() - else() - message(FATAL_ERROR "System Protobuf is needed while cross-compiling") - endif() - - set(protobuf_BUILD_PROTOC_BINARIES OFF CACHE BOOL "Build libprotoc and protoc compiler" FORCE) -elseif(NGRAPH_USE_PROTOBUF_LITE) - set(PROTOC_VERSION "3.9.2") - if(ENABLE_LTO) - message(WARNING "Protobuf in version 3.8.0+ can throw runtime exceptions if LTO is enabled.") - endif() -else() - set(PROTOC_VERSION "3.7.1") -endif() - -set(NGRAPH_PROTOBUF_GIT_TAG "v${PROTOC_VERSION}") - - -if (CMAKE_GENERATOR STREQUAL "Ninja") - set(MAKE_UTIL make) -else() - set(MAKE_UTIL $(MAKE)) -endif() - -if(PROTOC_VERSION VERSION_LESS "3.9" AND NGRAPH_USE_PROTOBUF_LITE) - message(FATAL_ERROR "Minimum supported version of protobuf-lite library is 3.9.0") -else() - if(PROTOC_VERSION VERSION_GREATER_EQUAL "3.0") - if (NOT BUILD_STANDALONE_STATIC) - FetchContent_Declare( - ext_protobuf - GIT_REPOSITORY ${NGRAPH_PROTOBUF_GIT_REPO_URL} - GIT_TAG ${NGRAPH_PROTOBUF_GIT_TAG} - GIT_SHALLOW TRUE - ) - FetchContent_GetProperties(ext_protobuf) - if(NOT ext_protobuf_POPULATED) - FetchContent_Populate(ext_protobuf) - set(protobuf_BUILD_TESTS OFF CACHE BOOL "Build tests") - set(protobuf_WITH_ZLIB OFF CACHE BOOL "Build with zlib support") - add_subdirectory(${ext_protobuf_SOURCE_DIR}/cmake ${ext_protobuf_BINARY_DIR} EXCLUDE_FROM_ALL) - endif() - endif() - if 
(USE_STATIC_PROTOBUF) - FetchContent_Declare( - ext_protobuf_static - GIT_REPOSITORY ${NGRAPH_PROTOBUF_GIT_REPO_URL} - GIT_TAG ${NGRAPH_PROTOBUF_GIT_TAG} - GIT_SHALLOW TRUE - ) - FetchContent_GetProperties(ext_protobuf_static) - if((NOT ext_protobuf_static_POPULATED) AND BUILD_STANDALONE_STATIC) - FetchContent_Populate(ext_protobuf_static) - set(protobuf_BUILD_TESTS OFF CACHE BOOL "Build tests") - set(protobuf_WITH_ZLIB OFF CACHE BOOL "Build with zlib support") - add_subdirectory(${ext_protobuf_static_SOURCE_DIR}/cmake ${ext_protobuf_static_BINARY_DIR} EXCLUDE_FROM_ALL) - endif() - endif() - else() - message(FATAL_ERROR "Minimum supported version of protobuf library is 3.0.0") - endif() - - if (BUILD_STANDALONE_STATIC) - set(Protobuf_INCLUDE_DIRS ${ext_protobuf_static_SOURCE_DIR}/src) - else() - set(Protobuf_INCLUDE_DIRS ${ext_protobuf_SOURCE_DIR}/src) - endif() - if(NGRAPH_USE_PROTOBUF_LITE) - set(Protobuf_LIBRARIES libprotobuf-lite) - else() - set(Protobuf_LIBRARIES libprotobuf) - endif() - - if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "^(Apple)?Clang$") - set(_proto_libs ${Protobuf_LIBRARIES}) - if(TARGET libprotoc) - list(APPEND _proto_libs libprotoc) - target_compile_options(libprotoc PRIVATE -Wno-all -Wno-unused-variable) - endif() - set_target_properties(${_proto_libs} PROPERTIES - CXX_VISIBILITY_PRESET default - C_VISIBILITY_PRESET default - VISIBILITY_INLINES_HIDDEN OFF) - foreach(target libprotobuf libprotobuf-lite) - target_compile_options(${target} - PRIVATE -Wno-all -Wno-unused-variable -Wno-inconsistent-missing-override - PUBLIC -Wno-undef) - endforeach() - endif() - - if(NGRAPH_USE_PROTOBUF_LITE) - # if only libprotobuf-lite is used, both libprotobuf and libprotobuf-lite are built - # libprotoc target needs symbols from libprotobuf, even in libprotobuf-lite configuration - set_target_properties(libprotobuf PROPERTIES - CXX_VISIBILITY_PRESET default - C_VISIBILITY_PRESET default - VISIBILITY_INLINES_HIDDEN OFF) - endif() -endif() - -# 
Now make sure we restore the original flags -set(CMAKE_INTERPROCEDURAL_OPTIMIZATION_RELEASE "${PUSH_CMAKE_INTERPROCEDURAL_OPTIMIZATION_RELEASE}") - -if (NOT BUILD_STANDALONE_STATIC) - message("NGRAPH_INSTALL_LIB = ${NGRAPH_INSTALL_LIB}") - install(TARGETS ${Protobuf_LIBRARIES} - RUNTIME DESTINATION ${NGRAPH_INSTALL_LIB} COMPONENT ngraph - ARCHIVE DESTINATION ${NGRAPH_INSTALL_LIB} COMPONENT ngraph - LIBRARY DESTINATION ${NGRAPH_INSTALL_LIB} COMPONENT ngraph) - export(TARGETS ${Protobuf_LIBRARIES} NAMESPACE ngraph:: APPEND FILE "${NGRAPH_TARGETS_FILE}") -endif() diff --git a/ngraph/cmake/patches/onnx_patch.diff b/ngraph/cmake/patches/onnx_patch.diff deleted file mode 100644 index 23d7a8d7b41..00000000000 --- a/ngraph/cmake/patches/onnx_patch.diff +++ /dev/null @@ -1,30 +0,0 @@ -diff --git a/CMakeLists.txt b/CMakeLists.txt -index 5254f7ee..e7a0ce2b 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -380,7 +380,8 @@ if(CMAKE_SYSTEM_NAME STREQUAL "AIX") - # So, create a object library - add_library(onnx OBJECT ${ONNX_SRCS}) - else() -- add_library(onnx ${ONNX_SRCS}) -+ # onnx target doesn't export symbols -+ add_library(onnx STATIC ${ONNX_SRCS}) - endif() - - target_include_directories(onnx PUBLIC -diff --git a/onnx/onnx_pb.h b/onnx/onnx_pb.h -index 7dc68dea..c932b806 100644 ---- a/onnx/onnx_pb.h -+++ b/onnx/onnx_pb.h -@@ -40,9 +40,10 @@ - // - // This solution is similar to - // https://github.com/pytorch/pytorch/blob/master/caffe2/core/common.h --#if defined(ONNX_BUILD_SHARED_LIBS) || defined(ONNX_BUILD_MAIN_LIB) -+#if defined(ONNX_BUILD_MAIN_LIB) - #define ONNX_API ONNX_EXPORT - #else -+// OV as user of ONNX imports symbols - #define ONNX_API ONNX_IMPORT - #endif - diff --git a/ngraph/core/include/ngraph/op/atan.hpp b/ngraph/core/include/ngraph/op/atan.hpp index fc388cc228c..c15b4d0b60f 100644 --- a/ngraph/core/include/ngraph/op/atan.hpp +++ b/ngraph/core/include/ngraph/op/atan.hpp @@ -19,8 +19,7 @@ namespace ngraph class NGRAPH_API Atan : public 
util::UnaryElementwiseArithmetic { public: - static constexpr NodeTypeInfo type_info{"Atan", 0}; - const NodeTypeInfo& get_type_info() const override { return type_info; } + NGRAPH_RTTI_DECLARATION; /// \brief Constructs an arctan operation. Atan() = default; diff --git a/ngraph/core/include/ngraph/op/batch_to_space.hpp b/ngraph/core/include/ngraph/op/batch_to_space.hpp index 240ed2ba38e..b6e6afd584a 100644 --- a/ngraph/core/include/ngraph/op/batch_to_space.hpp +++ b/ngraph/core/include/ngraph/op/batch_to_space.hpp @@ -5,7 +5,7 @@ #pragma once #include "ngraph/node.hpp" -#include "ngraph/op/util/fused_op.hpp" +#include "ngraph/op/op.hpp" namespace ngraph { @@ -27,8 +27,7 @@ namespace ngraph class NGRAPH_API BatchToSpace : public Op { public: - static constexpr NodeTypeInfo type_info{"BatchToSpace", 1}; - const NodeTypeInfo& get_type_info() const override { return type_info; } + NGRAPH_RTTI_DECLARATION; BatchToSpace() = default; /// \brief Constructs a BatchToSpace operation. /// diff --git a/ngraph/core/include/ngraph/op/matrix_nms.hpp b/ngraph/core/include/ngraph/op/matrix_nms.hpp new file mode 100644 index 00000000000..ca2800a921a --- /dev/null +++ b/ngraph/core/include/ngraph/op/matrix_nms.hpp @@ -0,0 +1,102 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/op/util/nms_base.hpp" + +namespace ngraph +{ + namespace op + { + namespace v8 + { + /// \brief MatrixNms operation + /// + class NGRAPH_API MatrixNms : public util::NmsBase + { + public: + NGRAPH_RTTI_DECLARATION; + + enum class DecayFunction + { + GAUSSIAN, + LINEAR + }; + + /// \brief Structure that specifies attributes of the operation + struct Attributes + { + // specifies order of output elements + SortResultType sort_result_type = SortResultType::NONE; + // specifies whenever it is necessary to sort selected boxes across batches or + // not + bool sort_result_across_batch = false; + // specifies the output tensor type + 
ngraph::element::Type output_type = ngraph::element::i64; + // specifies minimum score to consider box for the processing + float score_threshold = 0.0f; + // specifies maximum number of boxes to be selected per class, -1 meaning to + // keep all boxes + int nms_top_k = -1; + // specifies maximum number of boxes to be selected per batch element, -1 + // meaning to keep all boxes + int keep_top_k = -1; + // specifies the background class id, -1 meaning to keep all classes + int background_class = -1; + // specifies decay function used to decay scores + DecayFunction decay_function = DecayFunction::LINEAR; + // specifies gaussian_sigma parameter for gaussian decay_function + float gaussian_sigma = 2.0f; + // specifies threshold to filter out boxes with low confidence score after + // decaying + float post_threshold = 0.0f; + // specifies whether boxes are normalized or not + bool normalized = true; + }; + + MatrixNms(); + + /// \brief Constructs a MatrixNms operation + /// + /// \param boxes Node producing the box coordinates + /// \param scores Node producing the box scores + /// \param attrs Attributes of the operation + MatrixNms(const Output& boxes, + const Output& scores, + const Attributes& attrs); + + bool visit_attributes(AttributeVisitor& visitor) override; + + std::shared_ptr + clone_with_new_inputs(const OutputVector& new_args) const override; + + /// \brief Returns attributes of the operation MatrixNms + const Attributes& get_attrs() const { return m_attrs; } + + protected: + Attributes m_attrs; + + void validate() override; + }; + } // namespace v8 + } // namespace op + NGRAPH_API + std::ostream& operator<<(std::ostream& s, const op::v8::MatrixNms::DecayFunction& type); + + template <> + class NGRAPH_API AttributeAdapter + : public EnumAttributeAdapterBase + { + public: + AttributeAdapter(op::v8::MatrixNms::DecayFunction& value) + : EnumAttributeAdapterBase(value) + { + } + + static constexpr DiscreteTypeInfo type_info{ + "AttributeAdapter", 1}; + const 
DiscreteTypeInfo& get_type_info() const override { return type_info; } + }; +} // namespace ngraph diff --git a/ngraph/core/include/ngraph/op/multiclass_nms.hpp b/ngraph/core/include/ngraph/op/multiclass_nms.hpp new file mode 100644 index 00000000000..1b351824ab8 --- /dev/null +++ b/ngraph/core/include/ngraph/op/multiclass_nms.hpp @@ -0,0 +1,75 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/op/util/nms_base.hpp" + +namespace ngraph +{ + namespace op + { + namespace v8 + { + /// \brief MulticlassNms operation + /// + class NGRAPH_API MulticlassNms : public util::NmsBase + { + public: + NGRAPH_RTTI_DECLARATION; + + /// \brief Structure that specifies attributes of the operation + struct Attributes + { + // specifies order of output elements + SortResultType sort_result_type = SortResultType::NONE; + // specifies whenever it is necessary to sort selected boxes across batches or + // not + bool sort_result_across_batch = false; + // specifies the output tensor type + ngraph::element::Type output_type = ngraph::element::i64; + // specifies intersection over union threshold + float iou_threshold = 0.0f; + // specifies minimum score to consider box for the processing + float score_threshold = 0.0f; + // specifies maximum number of boxes to be selected per class, -1 meaning to + // keep all boxes + int nms_top_k = -1; + // specifies maximum number of boxes to be selected per batch element, -1 + // meaning to keep all boxes + int keep_top_k = -1; + // specifies the background class id, -1 meaning to keep all classes + int background_class = -1; + // specifies eta parameter for adaptive NMS, in closed range [0, 1.0] + float nms_eta = 1.0f; + // specifies whether boxes are normalized or not + bool normalized = true; + }; + + MulticlassNms(); + + /// \brief Constructs a MulticlassNms operation + /// + /// \param boxes Node producing the box coordinates + /// \param scores Node producing the box
scores + /// \param attrs Attributes of the operation + MulticlassNms(const Output& boxes, + const Output& scores, + const Attributes& attrs); + + bool visit_attributes(AttributeVisitor& visitor) override; + + std::shared_ptr + clone_with_new_inputs(const OutputVector& new_args) const override; + + /// \brief Returns attributes of the operation MulticlassNms + const Attributes& get_attrs() const { return m_attrs; } + + protected: + Attributes m_attrs; + void validate() override; + }; + } // namespace v8 + } // namespace op +} // namespace ngraph diff --git a/ngraph/core/include/ngraph/op/shape_of.hpp b/ngraph/core/include/ngraph/op/shape_of.hpp index 1fb26548008..932ea4c56e0 100644 --- a/ngraph/core/include/ngraph/op/shape_of.hpp +++ b/ngraph/core/include/ngraph/op/shape_of.hpp @@ -33,14 +33,6 @@ namespace ngraph // Overload collision with method on Node using Node::set_output_type; - // FOR CONSTANT FOLDING INTERNAL USAGE ONLY - // Constant folding for cases with static rank but dynamic shape create a subgraph - // which contains a Shape of. - // In this case we need to prevent constant folding from endless creation of these - // subgraphs. - // These metods should be removed if better solution will be designed. - void set_is_foldable(bool is_foldable) { m_is_foldable = is_foldable; } - bool get_is_foldable() const { return m_is_foldable; } bool evaluate(const HostTensorVector& output_values, const HostTensorVector& input_values) const override; bool has_evaluate() const override; @@ -50,7 +42,6 @@ namespace ngraph const OutputVector& input_values) override; private: - bool m_is_foldable = true; element::Type m_output_type; }; } // namespace v3 @@ -72,14 +63,6 @@ namespace ngraph void validate_and_infer_types() override; - // FOR CONSTANT FOLDING INTERNAL USAGE ONLY - // Constant folding for cases with static rank but dynamic shape create a subgraph - // which contains a Shape of. 
- // In this case we need to prevent constant folding from endless creation of these - // subgraphs. - // These metods should be removed if better solution will be designed. - void set_is_foldable(bool is_foldable) { m_is_foldable = is_foldable; } - bool get_is_foldable() const { return m_is_foldable; } bool evaluate(const HostTensorVector& output_values, const HostTensorVector& input_values) const override; bool has_evaluate() const override; @@ -87,9 +70,6 @@ namespace ngraph bool evaluate_upper(const HostTensorVector& output_values) const override; bool constant_fold(OutputVector& output_values, const OutputVector& input_values) override; - - private: - bool m_is_foldable = true; }; } // namespace v0 using v0::ShapeOf; diff --git a/ngraph/core/include/ngraph/op/sinh.hpp b/ngraph/core/include/ngraph/op/sinh.hpp index 6715e0cf682..1ca6c47ffec 100644 --- a/ngraph/core/include/ngraph/op/sinh.hpp +++ b/ngraph/core/include/ngraph/op/sinh.hpp @@ -16,8 +16,7 @@ namespace ngraph class NGRAPH_API Sinh : public util::UnaryElementwiseArithmetic { public: - static constexpr NodeTypeInfo type_info{"Sinh", 0}; - const NodeTypeInfo& get_type_info() const override { return type_info; } + NGRAPH_RTTI_DECLARATION; /// \brief Constructs a hyperbolic sine operation. /// /// \param arg Node that produces the input tensor. 
diff --git a/ngraph/core/include/ngraph/op/space_to_depth.hpp b/ngraph/core/include/ngraph/op/space_to_depth.hpp index 1cd775a878f..2fe2ccb0eff 100644 --- a/ngraph/core/include/ngraph/op/space_to_depth.hpp +++ b/ngraph/core/include/ngraph/op/space_to_depth.hpp @@ -4,9 +4,7 @@ #pragma once -#include "ngraph/node.hpp" -#include "ngraph/op/util/fused_op.hpp" -#include "ngraph/runtime/host_tensor.hpp" +#include "ngraph/op/op.hpp" namespace ngraph { @@ -63,10 +61,6 @@ namespace ngraph protected: std::size_t m_blocksize; SpaceToDepthMode m_mode; - - private: - bool evaluate_space_to_depth(const HostTensorVector& outputs, - const HostTensorVector& inputs) const; }; } // namespace v0 using v0::SpaceToDepth; diff --git a/ngraph/core/include/ngraph/op/util/nms_base.hpp b/ngraph/core/include/ngraph/op/util/nms_base.hpp new file mode 100644 index 00000000000..8983cbe7804 --- /dev/null +++ b/ngraph/core/include/ngraph/op/util/nms_base.hpp @@ -0,0 +1,93 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/op/op.hpp" + +namespace ngraph +{ + namespace op + { + namespace util + { + /// \brief Base class for operations NmsBase and MatrixNms + /// + class NGRAPH_API NmsBase : public Op + { + public: + NGRAPH_RTTI_DECLARATION; + enum class SortResultType + { + CLASSID, // sort selected boxes by class id (ascending) in each batch element + SCORE, // sort selected boxes by score (descending) in each batch element + NONE // do not guarantee the order in each batch element + }; + + NmsBase() = delete; + + /// \brief Constructs a NmsBase operation + /// + /// \param output_type Specifies the output tensor type + /// \param nms_top_k Specifies maximum number of boxes to be selected per + /// class, -1 meaning to keep all boxes + /// \param keep_top_k Specifies maximum number of boxes to be selected per + /// batch element, -1 meaning to keep all boxes + NmsBase(ngraph::element::Type& output_type, int& nms_top_k, 
int& keep_top_k); + + /// \brief Constructs a NmsBase operation + /// + /// \param boxes Node producing the box coordinates + /// \param scores Node producing the box scores + /// \param output_type Specifies the output tensor type + /// \param nms_top_k Specifies maximum number of boxes to be selected per + /// class, -1 meaning to keep all boxes + /// \param keep_top_k Specifies maximum number of boxes to be selected per + /// batch element, -1 meaning to keep all boxes + NmsBase(const Output& boxes, + const Output& scores, + ngraph::element::Type& output_type, + int& nms_top_k, + int& keep_top_k); + + void validate_and_infer_types() override; + + const element::Type& get_output_type() const { return m_output_type; } + void set_output_type(const element::Type& output_type) + { + m_output_type = output_type; + } + using Node::set_output_type; + + int get_nms_top_k() const { return m_nms_top_k; } + + int get_keep_top_k() const { return m_keep_top_k; } + + protected: + ngraph::element::Type& m_output_type; + int& m_nms_top_k; + int& m_keep_top_k; + virtual void validate(); + }; + } // namespace util + } // namespace op + + NGRAPH_API + std::ostream& operator<<(std::ostream& s, const op::util::NmsBase::SortResultType& type); + + template <> + class NGRAPH_API AttributeAdapter + : public EnumAttributeAdapterBase + { + public: + AttributeAdapter(op::util::NmsBase::SortResultType& value) + : EnumAttributeAdapterBase(value) + { + } + + static constexpr DiscreteTypeInfo type_info{ + "AttributeAdapter", 1}; + const DiscreteTypeInfo& get_type_info() const override { return type_info; } + }; +} // namespace ngraph diff --git a/ngraph/core/include/ngraph/ops.hpp b/ngraph/core/include/ngraph/ops.hpp index 6999dc93c9d..4701a2f733f 100644 --- a/ngraph/core/include/ngraph/ops.hpp +++ b/ngraph/core/include/ngraph/ops.hpp @@ -85,6 +85,7 @@ #include "ngraph/op/lstm_cell.hpp" #include "ngraph/op/lstm_sequence.hpp" #include "ngraph/op/matmul.hpp" +#include "ngraph/op/matrix_nms.hpp" 
#include "ngraph/op/max.hpp" #include "ngraph/op/max_pool.hpp" #include "ngraph/op/maximum.hpp" @@ -92,6 +93,7 @@ #include "ngraph/op/minimum.hpp" #include "ngraph/op/mish.hpp" #include "ngraph/op/mod.hpp" +#include "ngraph/op/multiclass_nms.hpp" #include "ngraph/op/multiply.hpp" #include "ngraph/op/mvn.hpp" #include "ngraph/op/negative.hpp" diff --git a/ngraph/core/include/ngraph/opsets/opset8_tbl.hpp b/ngraph/core/include/ngraph/opsets/opset8_tbl.hpp index ad4d641027d..0004161dc48 100644 --- a/ngraph/core/include/ngraph/opsets/opset8_tbl.hpp +++ b/ngraph/core/include/ngraph/opsets/opset8_tbl.hpp @@ -179,3 +179,5 @@ NGRAPH_OP(Gather, ngraph::op::v8) NGRAPH_OP(AdaptiveAvgPool, ngraph::op::v8) NGRAPH_OP(AdaptiveMaxPool, ngraph::op::v8) NGRAPH_OP(DeformableConvolution, ngraph::op::v8) +NGRAPH_OP(MatrixNms, ngraph::op::v8) +NGRAPH_OP(MulticlassNms, ngraph::op::v8) \ No newline at end of file diff --git a/ngraph/core/include/ngraph/variant.hpp b/ngraph/core/include/ngraph/variant.hpp index 90b87cb5b37..aeb67f79b9e 100644 --- a/ngraph/core/include/ngraph/variant.hpp +++ b/ngraph/core/include/ngraph/variant.hpp @@ -22,6 +22,7 @@ namespace ngraph virtual std::shared_ptr init(const std::shared_ptr& node); virtual std::shared_ptr merge(const ngraph::NodeVector& nodes); + virtual std::string to_string() { return ""; } }; template diff --git a/ngraph/core/reference/include/ngraph/coordinate_transform.hpp b/ngraph/core/reference/include/ngraph/coordinate_transform.hpp index 06320ce8925..86bf13b24be 100644 --- a/ngraph/core/reference/include/ngraph/coordinate_transform.hpp +++ b/ngraph/core/reference/include/ngraph/coordinate_transform.hpp @@ -7,6 +7,7 @@ #include "ngraph/axis_vector.hpp" #include "ngraph/coordinate.hpp" #include "ngraph/coordinate_diff.hpp" +#include "ngraph/deprecated.hpp" #include "ngraph/shape.hpp" #include "ngraph/strides.hpp" @@ -17,6 +18,7 @@ namespace ngraph /// produces the following coordinates: /// {0,0}, {0,1}, {0,2}, /// {1,0}, {1,1}, {2,2} + /// 
\deprecated class CoordinateIterator { /// \brief Coordinates iterator constructor @@ -79,6 +81,8 @@ namespace ngraph /// \brief The tensor element index calculation by given coordinate. /// \param c tensor element coordinate + /// \deprecated + NGRAPH_DEPRECATED("This method is deprecated and will be removed soon.") size_t index(const Coordinate& c) const noexcept; /// \brief Returns an iterator to the first coordinate of the tensor. @@ -94,7 +98,9 @@ namespace ngraph /// \brief Class which allows to calculate item index with given coordinates in tensor /// and helps to iterate over the subset of coordinates. /// Tensor items should be placed in memory in row-major order. - class CoordinateTransform : protected CoordinateTransformBasic + /// \deprecated + class NGRAPH_DEPRECATED("This class is deprecated and will be removed soon.") + CoordinateTransform : protected CoordinateTransformBasic { public: using Iterator = CoordinateIterator; diff --git a/ngraph/core/reference/include/ngraph/runtime/reference/adaptive_avg_pool.hpp b/ngraph/core/reference/include/ngraph/runtime/reference/adaptive_avg_pool.hpp new file mode 100644 index 00000000000..a7ec91b5975 --- /dev/null +++ b/ngraph/core/reference/include/ngraph/runtime/reference/adaptive_avg_pool.hpp @@ -0,0 +1,177 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include + +#include "ngraph/axis_vector.hpp" +#include "ngraph/shape.hpp" + +namespace ngraph +{ + namespace runtime + { + namespace reference + { + namespace adaptive_pool + { + inline size_t window_start(size_t idx, size_t arg_shape, size_t out_shape) + { + // start = floor(idx * arg_shape / out_shape); + return idx * arg_shape / out_shape; + } + inline size_t window_end(size_t idx, size_t arg_shape, size_t out_shape) + { + return ceil(static_cast((idx + 1) * arg_shape) / out_shape); + } + template + T avg_div(const T sum, size_t n) + { + NGRAPH_CHECK(n != 0, 
"AdaptiveAvgPool elements == 0, must be non-zero"); + + if (std::is_same::value || std::is_same::value) + { + return static_cast(std::nearbyint(static_cast(sum) / n)); + } + else + { + return sum / n; + } + } + + template + void adaptive_avg_pool_1d(const T* arg, T* out, size_t h_in, size_t h_out) + { + for (size_t i = 0; i < h_out; i++) + { + size_t h_start = window_start(i, h_in, h_out); + size_t h_end = window_end(i, h_in, h_out); + out[i] = avg_div(std::accumulate(arg + h_start, arg + h_end, T{0}), + h_end - h_start); + } + } + template + void adaptive_avg_pool_2d( + const T* arg, T* out, size_t h_in, size_t h_out, size_t w_in, size_t w_out) + { + for (size_t i = 0; i < h_out; i++) + { + size_t h_start = window_start(i, h_in, h_out); + size_t h_end = window_end(i, h_in, h_out); + for (size_t j = 0; j < w_out; j++) + { + size_t w_start = window_start(j, w_in, w_out); + size_t w_end = window_end(j, w_in, w_out); + T result = 0; + for (size_t n = h_start; n < h_end; n++) + { + result = std::accumulate( + arg + n * w_in + w_start, arg + n * w_in + w_end, result); + } + out[i * w_out + j] = + avg_div(result, (w_end - w_start) * (h_end - h_start)); + } + } + } + template + void adaptive_avg_pool_3d(const T* arg, + T* out, + size_t d_in, + size_t d_out, + size_t h_in, + size_t h_out, + size_t w_in, + size_t w_out) + { + for (size_t i = 0; i < d_out; i++) + { + size_t d_start = window_start(i, d_in, d_out); + size_t d_end = window_end(i, d_in, d_out); + for (size_t j = 0; j < h_out; j++) + { + size_t h_start = window_start(j, h_in, h_out); + size_t h_end = window_end(j, h_in, h_out); + for (size_t k = 0; k < w_out; k++) + { + size_t w_start = window_start(k, w_in, w_out); + size_t w_end = window_end(k, w_in, w_out); + T result = 0; + for (size_t n = d_start; n < d_end; n++) + { + for (size_t m = h_start; m < h_end; m++) + { + auto pos = arg + n * h_in * w_in + m * w_in; + result = + std::accumulate(pos + w_start, pos + w_end, result); + } + } + out[i * h_out * w_out + 
j * w_out + k] = avg_div( + result, + (d_end - d_start) * (w_end - w_start) * (h_end - h_start)); + } + } + } + } + } // namespace adaptive_pool + template + void adaptive_avg_pool(const T* arg, + T* out, + const Shape& arg_shape, + const Shape& out_shape) + { + NGRAPH_CHECK(arg_shape.size() == out_shape.size() && 2 < arg_shape.size() && + arg_shape.size() < 6, + "AdaptiveAvgPool supports only 3D, 4D and 5D input shape"); + size_t channel_size = 1; + for (size_t i = 2; i < arg_shape.size(); i++) + { + channel_size *= arg_shape[i]; + } + size_t batch_size = arg_shape[1] * channel_size; + size_t out_channel_size = 1; + for (size_t i = 2; i < out_shape.size(); i++) + { + out_channel_size *= out_shape[i]; + } + size_t out_batch_size = arg_shape[1] * out_channel_size; + for (size_t b = 0; b < arg_shape[0]; b++) + { + for (size_t c = 0; c < arg_shape[1]; c++) + { + auto arg_pos = arg + b * batch_size + c * channel_size; + auto out_pos = out + b * out_batch_size + c * out_channel_size; + if (arg_shape.size() == 3) + { + adaptive_pool::adaptive_avg_pool_1d( + arg_pos, out_pos, arg_shape[2], out_shape[2]); + } + else if (arg_shape.size() == 4) + { + adaptive_pool::adaptive_avg_pool_2d(arg_pos, + out_pos, + arg_shape[2], + out_shape[2], + arg_shape[3], + out_shape[3]); + } + else if (arg_shape.size() == 5) + { + adaptive_pool::adaptive_avg_pool_3d(arg_pos, + out_pos, + arg_shape[2], + out_shape[2], + arg_shape[3], + out_shape[3], + arg_shape[4], + out_shape[4]); + } + } + } + } + } // namespace reference + } // namespace runtime +} // namespace ngraph diff --git a/ngraph/core/reference/include/ngraph/runtime/reference/adaptive_max_pool.hpp b/ngraph/core/reference/include/ngraph/runtime/reference/adaptive_max_pool.hpp new file mode 100644 index 00000000000..c235a2a4405 --- /dev/null +++ b/ngraph/core/reference/include/ngraph/runtime/reference/adaptive_max_pool.hpp @@ -0,0 +1,170 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + 
+#pragma once + +#include +#include +#include + +#include "ngraph/axis_vector.hpp" +#include "ngraph/shape.hpp" + +namespace ngraph +{ + namespace runtime + { + namespace reference + { + template + void adaptive_max_pool_1d( + const T* arg, T* out, int64_t* indices, size_t h_in, size_t h_out) + { + for (size_t i = 0; i < h_out; i++) + { + auto from = arg + adaptive_pool::window_start(i, h_in, h_out); + auto to = arg + adaptive_pool::window_end(i, h_in, h_out); + NGRAPH_CHECK(to - from != 0, "AdaptiveMaxPool elements == 0, must be non-zero"); + auto it = std::max_element(from, to); + out[i] = *it; + indices[i] = it - arg; + } + } + template + void adaptive_max_pool_2d(const T* arg, + T* out, + int64_t* indices, + size_t h_in, + size_t h_out, + size_t w_in, + size_t w_out) + { + for (size_t i = 0; i < h_out; i++) + { + size_t h_start = adaptive_pool::window_start(i, h_in, h_out); + size_t h_end = adaptive_pool::window_end(i, h_in, h_out); + for (size_t j = 0; j < w_out; j++) + { + size_t w_start = adaptive_pool::window_start(j, w_in, w_out); + size_t w_end = adaptive_pool::window_end(j, w_in, w_out); + NGRAPH_CHECK((w_end - w_start) * (h_end - h_start) != 0, + "AdaptiveMaxPool elements == 0, must be non-zero"); + auto result = arg + h_start * w_in + w_start; + for (size_t n = h_start; n < h_end; n++) + { + auto from = arg + n * w_in + w_start; + auto to = arg + n * w_in + w_end; + auto it = std::max_element(from, to); + result = *it > *result ? 
it : result; + } + out[i * w_out + j] = *result; + indices[i * w_out + j] = result - arg; + } + } + } + template + void adaptive_max_pool_3d(const T* arg, + T* out, + int64_t* indices, + size_t d_in, + size_t d_out, + size_t h_in, + size_t h_out, + size_t w_in, + size_t w_out) + { + for (size_t i = 0; i < d_out; i++) + { + size_t d_start = adaptive_pool::window_start(i, d_in, d_out); + size_t d_end = adaptive_pool::window_end(i, d_in, d_out); + for (size_t j = 0; j < h_out; j++) + { + size_t h_start = adaptive_pool::window_start(j, h_in, h_out); + size_t h_end = adaptive_pool::window_end(j, h_in, h_out); + for (size_t k = 0; k < w_out; k++) + { + size_t w_start = adaptive_pool::window_start(k, w_in, w_out); + size_t w_end = adaptive_pool::window_end(k, w_in, w_out); + NGRAPH_CHECK((w_end - w_start) * (h_end - h_start) != 0, + "AdaptiveMaxPool elements == 0, must be non-zero"); + auto result = arg + d_start * h_in * w_in + h_start * w_in + w_start; + for (size_t n = d_start; n < d_end; n++) + { + for (size_t m = h_start; m < h_end; m++) + { + auto from = arg + n * h_in * w_in + m * w_in + w_start; + auto to = arg + n * h_in * w_in + m * w_in + w_end; + auto it = std::max_element(from, to); + result = *it > *result ? 
it : result; + } + } + out[i * h_out * w_out + j * w_out + k] = *result; + indices[i * h_out * w_out + j * w_out + k] = result - arg; + } + } + } + } + template + void adaptive_max_pool(const T* arg, + T* out, + int64_t* selected_indices, + const Shape& arg_shape, + const Shape& out_shape) + { + NGRAPH_CHECK(arg_shape.size() == out_shape.size() && 2 < arg_shape.size() && + arg_shape.size() < 6, + "AdaptiveAvgPool supports only 3D, 4D and 5D input shape"); + size_t channel_size = 1; + for (size_t i = 2; i < arg_shape.size(); i++) + { + channel_size *= arg_shape[i]; + } + size_t batch_size = arg_shape[1] * channel_size; + size_t out_channel_size = 1; + for (size_t i = 2; i < out_shape.size(); i++) + { + out_channel_size *= out_shape[i]; + } + size_t out_batch_size = arg_shape[1] * out_channel_size; + for (size_t b = 0; b < arg_shape[0]; b++) + { + for (size_t c = 0; c < arg_shape[1]; c++) + { + auto arg_pos = arg + b * batch_size + c * channel_size; + auto out_pos = out + b * out_batch_size + c * out_channel_size; + auto sel_ind_pos = + selected_indices + b * out_batch_size + c * out_channel_size; + if (arg_shape.size() == 3) + { + adaptive_max_pool_1d( + arg_pos, out_pos, sel_ind_pos, arg_shape[2], out_shape[2]); + } + else if (arg_shape.size() == 4) + { + adaptive_max_pool_2d(arg_pos, + out_pos, + sel_ind_pos, + arg_shape[2], + out_shape[2], + arg_shape[3], + out_shape[3]); + } + else if (arg_shape.size() == 5) + { + adaptive_max_pool_3d(arg_pos, + out_pos, + sel_ind_pos, + arg_shape[2], + out_shape[2], + arg_shape[3], + out_shape[3], + arg_shape[4], + out_shape[4]); + } + } + } + } + } // namespace reference + } // namespace runtime +} // namespace ngraph diff --git a/ngraph/core/reference/include/ngraph/runtime/reference/atan.hpp b/ngraph/core/reference/include/ngraph/runtime/reference/atan.hpp index 03dcdf525f2..a5e8f21c662 100644 --- a/ngraph/core/reference/include/ngraph/runtime/reference/atan.hpp +++ 
b/ngraph/core/reference/include/ngraph/runtime/reference/atan.hpp @@ -13,7 +13,8 @@ namespace ngraph { namespace reference { - template + template ::value, bool>::type = true> void atan(const T* arg, T* out, size_t count) { for (size_t i = 0; i < count; i++) @@ -21,6 +22,16 @@ namespace ngraph out[i] = std::atan(arg[i]); } } + + template ::value, bool>::type = true> + void atan(const T* arg, T* out, size_t count) + { + for (size_t i = 0; i < count; i++) + { + out[i] = std::roundl(std::atan(arg[i])); + } + } } // namespace reference } // namespace runtime } // namespace ngraph diff --git a/ngraph/core/reference/include/ngraph/runtime/reference/autobroadcast_binop.hpp b/ngraph/core/reference/include/ngraph/runtime/reference/autobroadcast_binop.hpp index 33ff0672b25..46604bf3865 100644 --- a/ngraph/core/reference/include/ngraph/runtime/reference/autobroadcast_binop.hpp +++ b/ngraph/core/reference/include/ngraph/runtime/reference/autobroadcast_binop.hpp @@ -315,6 +315,7 @@ namespace ngraph } } + NGRAPH_SUPPRESS_DEPRECATED_START CoordinateTransform arg0_transform(arg0_shape); CoordinateTransform arg1_transform(arg1_squeezed_shape); CoordinateTransform output_transform(arg0_shape); @@ -326,6 +327,7 @@ namespace ngraph elementwise_functor(arg0[arg0_transform.index(output_coord)], arg1[arg1_transform.index(arg1_coord)]); } + NGRAPH_SUPPRESS_DEPRECATED_END } } } @@ -437,6 +439,7 @@ namespace ngraph arg1_padded_shape[i]})); } + NGRAPH_SUPPRESS_DEPRECATED_START CoordinateTransform arg0_transform(arg0_squeezed_shape); CoordinateTransform arg1_transform(arg1_squeezed_shape); CoordinateTransform arg2_transform(arg2_squeezed_shape); @@ -452,6 +455,7 @@ namespace ngraph arg1[arg1_transform.index(arg1_coord)], arg2[arg2_transform.index(arg2_coord)]); } + NGRAPH_SUPPRESS_DEPRECATED_END } break; case op::AutoBroadcastType::PDPD: @@ -521,6 +525,7 @@ namespace ngraph } } + NGRAPH_SUPPRESS_DEPRECATED_START CoordinateTransform arg0_transform(arg0_squeezed_shape); CoordinateTransform 
arg1_transform(arg1_shape); CoordinateTransform arg2_transform(arg2_squeezed_shape); @@ -535,6 +540,7 @@ namespace ngraph arg1[arg1_transform.index(output_coord)], arg2[arg2_transform.index(arg2_coord)]); } + NGRAPH_SUPPRESS_DEPRECATED_END } } } diff --git a/ngraph/core/reference/include/ngraph/runtime/reference/avg_pool.hpp b/ngraph/core/reference/include/ngraph/runtime/reference/avg_pool.hpp index e5c1be08788..1f897330117 100644 --- a/ngraph/core/reference/include/ngraph/runtime/reference/avg_pool.hpp +++ b/ngraph/core/reference/include/ngraph/runtime/reference/avg_pool.hpp @@ -31,6 +31,7 @@ namespace ngraph const Shape& padding_above, bool include_padding_in_avg_computation) { + NGRAPH_SUPPRESS_DEPRECATED_START CoordinateTransform out_transform(out_shape); for (const Coordinate& out_coord : out_transform) @@ -107,6 +108,7 @@ namespace ngraph } } } + NGRAPH_SUPPRESS_DEPRECATED_END } template @@ -120,6 +122,7 @@ namespace ngraph const Shape& padding_above, bool include_padding_in_avg_computation) { + NGRAPH_SUPPRESS_DEPRECATED_START auto old_mode = std::fegetround(); std::fesetround(FE_TONEAREST); // At the outermost level we will walk over every output coordinate O. 
@@ -259,6 +262,7 @@ namespace ngraph } std::fesetround(old_mode); } + NGRAPH_SUPPRESS_DEPRECATED_END } } // namespace reference } // namespace runtime diff --git a/ngraph/core/reference/include/ngraph/runtime/reference/batch_norm.hpp b/ngraph/core/reference/include/ngraph/runtime/reference/batch_norm.hpp index 48d93baaf23..e2c67e3b0ea 100644 --- a/ngraph/core/reference/include/ngraph/runtime/reference/batch_norm.hpp +++ b/ngraph/core/reference/include/ngraph/runtime/reference/batch_norm.hpp @@ -32,6 +32,7 @@ namespace ngraph T* out, const Shape& in_shape) { + NGRAPH_SUPPRESS_DEPRECATED_START auto eps_casted = static_cast(eps); size_t in_idx = 0; @@ -48,6 +49,7 @@ namespace ngraph out[in_idx] = normalized * ch_gamma + ch_beta; in_idx++; } + NGRAPH_SUPPRESS_DEPRECATED_END } } // namespace reference } // namespace runtime diff --git a/ngraph/core/reference/include/ngraph/runtime/reference/ctc_greedy_decoder.hpp b/ngraph/core/reference/include/ngraph/runtime/reference/ctc_greedy_decoder.hpp index a9bbb70faf9..2e41850ed16 100644 --- a/ngraph/core/reference/include/ngraph/runtime/reference/ctc_greedy_decoder.hpp +++ b/ngraph/core/reference/include/ngraph/runtime/reference/ctc_greedy_decoder.hpp @@ -23,6 +23,7 @@ namespace ngraph const Shape& out_shape, const bool ctc_merge_repeated) { + NGRAPH_SUPPRESS_DEPRECATED_START const auto max_seq_len = data_shape[0]; const auto batch_size = data_shape[1]; const auto class_count = data_shape[2]; @@ -66,6 +67,7 @@ namespace ngraph } } std::copy(tmp_out.begin(), tmp_out.end(), out); + NGRAPH_SUPPRESS_DEPRECATED_END } } // namespace reference } // namespace runtime diff --git a/ngraph/core/reference/include/ngraph/runtime/reference/cum_sum.hpp b/ngraph/core/reference/include/ngraph/runtime/reference/cum_sum.hpp index bc1f490e3bd..14186f0172c 100644 --- a/ngraph/core/reference/include/ngraph/runtime/reference/cum_sum.hpp +++ b/ngraph/core/reference/include/ngraph/runtime/reference/cum_sum.hpp @@ -27,6 +27,7 @@ namespace ngraph const 
bool exclusive, const bool reverse) { + NGRAPH_SUPPRESS_DEPRECATED_START CoordinateTransform temp_transform(tensor_shape); for (const Coordinate& output_coord : temp_transform) { @@ -126,6 +127,7 @@ namespace ngraph { cum_sum(it.second); } + NGRAPH_SUPPRESS_DEPRECATED_END } } // namespace reference } // namespace runtime diff --git a/ngraph/core/reference/include/ngraph/runtime/reference/deformable_convolution.hpp b/ngraph/core/reference/include/ngraph/runtime/reference/deformable_convolution.hpp index c6ee668fbe0..28f7ddc841e 100644 --- a/ngraph/core/reference/include/ngraph/runtime/reference/deformable_convolution.hpp +++ b/ngraph/core/reference/include/ngraph/runtime/reference/deformable_convolution.hpp @@ -99,14 +99,16 @@ namespace ngraph template void convolve_2D_channels(const ConvolutionParams& p, - const int64_t deformable_groups, const T* batch, const Shape& batch_shape, const T* offsets, const Shape& offset_shape, const T* filter, const Shape& filter_shape, - T* out) + T* out, + size_t group_idx, + int64_t groups, + int64_t deformable_groups) { const int input_size_y = batch_shape[1]; const int input_size_x = batch_shape[2]; @@ -121,7 +123,6 @@ namespace ngraph const int filter_channel_size = shape_size(shape_reduce(filter_shape)); const int offsets_size = shape_size(offset_shape); const int offsets_spatial_size = shape_size(shape_reduce(offset_shape)); - const int offsets_channel_size = 2 * offsets_spatial_size; const int filter_channels_count = filter_shape[0]; int out_idx = 0; @@ -136,42 +137,42 @@ namespace ngraph auto input_channel = batch; auto filter_channel = filter; T sum = 0; - auto group_offsets_channel = offsets; - for (int dg = 0; dg < deformable_groups; dg++) + for (int fc = 0; fc < filter_channels_count; fc++) { - for (int fc = 0; fc < filter_channels_count / deformable_groups; - fc++) + auto deformable_group_idx = + (filter_channels_count * group_idx + fc) / + (filter_channels_count * groups / deformable_groups); + for (int f_y = 0; f_y < 
filter_size_y; ++f_y) { - auto offsets_channel = group_offsets_channel; - for (int f_y = 0; f_y < filter_size_y; ++f_y) + for (int f_x = 0; f_x < filter_size_x; ++f_x) { - for (int f_x = 0; f_x < filter_size_x; ++f_x) - { - T y_offset = offsets_channel[out_idx]; - T x_offset = - offsets_channel[offsets_spatial_size + out_idx]; - T rel_i_y = i_y + (f_y * p.dilation[0]) + y_offset; - T rel_i_x = i_x + (f_x * p.dilation[1]) + x_offset; + T y_offset = offsets[deformable_group_idx * offsets_size + + (f_y * filter_size_x + f_x) * 2 * + offsets_spatial_size + + out_idx]; + T x_offset = offsets[deformable_group_idx * offsets_size + + ((f_y * filter_size_x + f_x) * 2 + 1) * + offsets_spatial_size + + out_idx]; + T rel_i_y = i_y + (f_y * p.dilation[0]) + y_offset; + T rel_i_x = i_x + (f_x * p.dilation[1]) + x_offset; - offsets_channel += offsets_channel_size; - bool padding = !(in_range(rel_i_x, {0, input_size_x}) && - in_range(rel_i_y, {0, input_size_y})); - if (padding) - continue; + bool padding = !(in_range(rel_i_x, {0, input_size_x}) && + in_range(rel_i_y, {0, input_size_y})); + if (padding) + continue; - int f_buf_idx = (f_y * filter_size_x) + f_x; - sum += bilinear_interpolation(input_channel, - rel_i_x, - rel_i_y, - input_size_x, - input_size_y) * - filter_channel[f_buf_idx]; - } + int f_buf_idx = (f_y * filter_size_x) + f_x; + sum += bilinear_interpolation(input_channel, + rel_i_x, + rel_i_y, + input_size_x, + input_size_y) * + filter_channel[f_buf_idx]; } - input_channel += input_channel_size; - filter_channel += filter_channel_size; } - group_offsets_channel += offsets_size / deformable_groups; + input_channel += input_channel_size; + filter_channel += filter_channel_size; } out[out_idx++] = sum; } @@ -218,11 +219,9 @@ namespace ngraph const Shape group_in_shape = shape_scale(shape_reduce(in_shape), groups); const size_t group_in_size = shape_size(group_in_shape); - const Shape group_offset_shape = shape_scale(shape_reduce(o_shape), groups); - const size_t 
group_offset_size = shape_size(group_offset_shape); + const Shape group_offset_shape = + shape_scale(shape_reduce(o_shape), deformable_groups); const size_t group_offset_batch_size = shape_size(shape_reduce(o_shape)); - const size_t deformable_groups_per_group = - std::ceil(static_cast(deformable_groups) / static_cast(groups)); const size_t group_filters_count = f_shape[filter_out_ch_axis] / groups; const Shape group_filter_shape = shape_reduce(f_shape); @@ -239,22 +238,20 @@ namespace ngraph for (size_t f_idx = 0; f_idx < group_filters_count; ++f_idx) { convolve_2D_channels(params, - deformable_groups_per_group, in, group_in_shape, group_offsets, group_offset_shape, group_filters, group_filter_shape, - out); + out, + group_idx, + groups, + deformable_groups); group_filters += group_filter_size; out += out_ch_size; } in += group_in_size; - if (deformable_groups > 1) - { - group_offsets += (deformable_groups_per_group * group_offset_size); - } } offsets += group_offset_batch_size; } diff --git a/ngraph/core/reference/include/ngraph/runtime/reference/interpolate.hpp b/ngraph/core/reference/include/ngraph/runtime/reference/interpolate.hpp index d023aebf53e..4ca1ce76091 100644 --- a/ngraph/core/reference/include/ngraph/runtime/reference/interpolate.hpp +++ b/ngraph/core/reference/include/ngraph/runtime/reference/interpolate.hpp @@ -389,6 +389,7 @@ namespace ngraph { auto info = helper.get_info_for_linear_mode(); + NGRAPH_SUPPRESS_DEPRECATED_START CoordinateTransform output_transform(m_out_shape); CoordinateTransform input_transform(m_input_data_shape); @@ -424,6 +425,7 @@ namespace ngraph out[output_transform.index(output_coord)] = static_cast(summa / wsum); } } + NGRAPH_SUPPRESS_DEPRECATED_END } template @@ -586,6 +588,7 @@ namespace ngraph size_t input_rank = m_input_data_shape.size(); size_t num_of_axes = m_axes.size(); + NGRAPH_SUPPRESS_DEPRECATED_START CoordinateTransform output_transform(m_out_shape); CoordinateTransform input_transform(m_input_data_shape); Shape 
indices_shape{std::vector(num_of_axes, 4)}; @@ -631,11 +634,13 @@ namespace ngraph out[output_transform.index(output_coord)] = static_cast(summa); } + NGRAPH_SUPPRESS_DEPRECATED_END } template void InterpolateEval::nearest_func(const T* input_data, T* out) { + NGRAPH_SUPPRESS_DEPRECATED_START CoordinateTransform output_transform(m_out_shape); CoordinateTransform input_transform(m_input_data_shape); @@ -645,6 +650,7 @@ namespace ngraph out[output_transform.index(output_coord)] = input_data[input_transform.index(input_coord)]; } + NGRAPH_SUPPRESS_DEPRECATED_END } template diff --git a/ngraph/core/reference/include/ngraph/runtime/reference/log_softmax.hpp b/ngraph/core/reference/include/ngraph/runtime/reference/log_softmax.hpp index 2cf445ee54f..d04f0025c13 100644 --- a/ngraph/core/reference/include/ngraph/runtime/reference/log_softmax.hpp +++ b/ngraph/core/reference/include/ngraph/runtime/reference/log_softmax.hpp @@ -19,6 +19,7 @@ namespace ngraph template void log_softmax(const T* arg, T* out, const Shape& shape, const AxisSet& axes) { + NGRAPH_SUPPRESS_DEPRECATED_START auto temp_shape = reduce(shape, axes, true); auto temp_elements = shape_size(temp_shape); auto temp_max = std::vector(temp_elements, 0); @@ -44,6 +45,7 @@ namespace ngraph (arg[transform.index(coord)] - temp_max[temp_transform.index(temp_coord)]) - std::log(temp_sum[temp_transform.index(temp_coord)]); } + NGRAPH_SUPPRESS_DEPRECATED_END } } // namespace reference } // namespace runtime diff --git a/ngraph/core/reference/include/ngraph/runtime/reference/lrn.hpp b/ngraph/core/reference/include/ngraph/runtime/reference/lrn.hpp index f18876fac98..6909c2b0cc5 100644 --- a/ngraph/core/reference/include/ngraph/runtime/reference/lrn.hpp +++ b/ngraph/core/reference/include/ngraph/runtime/reference/lrn.hpp @@ -73,6 +73,7 @@ namespace ngraph double dbias, size_t size) { + NGRAPH_SUPPRESS_DEPRECATED_START T alpha = static_cast(dalpha); T beta = static_cast(dbeta); T bias = static_cast(dbias); @@ -111,6 +112,7 @@ 
namespace ngraph T x = arg[index]; out[index] = x / (std::pow(bias + scale * square_sum, beta)); } + NGRAPH_SUPPRESS_DEPRECATED_END } } // namespace reference } // namespace runtime diff --git a/ngraph/core/reference/include/ngraph/runtime/reference/matrix_nms.hpp b/ngraph/core/reference/include/ngraph/runtime/reference/matrix_nms.hpp new file mode 100644 index 00000000000..c1c045e4349 --- /dev/null +++ b/ngraph/core/reference/include/ngraph/runtime/reference/matrix_nms.hpp @@ -0,0 +1,41 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ngraph/node.hpp" +#include "ngraph/op/matrix_nms.hpp" +#include "ngraph/op/util/op_types.hpp" +#include "ngraph/ops.hpp" +#include "ngraph/shape_util.hpp" + +namespace ngraph +{ + namespace runtime + { + namespace reference + { + void matrix_nms(const float* boxes_data, + const Shape& boxes_data_shape, + const float* scores_data, + const Shape& scores_data_shape, + const op::v8::MatrixNms::Attributes& attrs, + float* selected_outputs, + const Shape& selected_outputs_shape, + int64_t* selected_indices, + const Shape& selected_indices_shape, + int64_t* valid_outputs); + + } // namespace reference + } // namespace runtime +} // namespace ngraph diff --git a/ngraph/core/reference/include/ngraph/runtime/reference/max_pool.hpp b/ngraph/core/reference/include/ngraph/runtime/reference/max_pool.hpp index 885c2115756..be02005aa99 100644 --- a/ngraph/core/reference/include/ngraph/runtime/reference/max_pool.hpp +++ b/ngraph/core/reference/include/ngraph/runtime/reference/max_pool.hpp @@ -24,6 +24,7 @@ namespace ngraph const Shape& padding_below, const Shape& padding_above) { + NGRAPH_SUPPRESS_DEPRECATED_START // At the outermost level we will walk over every output coordinate O. 
CoordinateTransform output_transform(out_shape); @@ -119,6 +120,7 @@ namespace ngraph out[output_transform.index(out_coord)] = result; } + NGRAPH_SUPPRESS_DEPRECATED_END } } // namespace reference } // namespace runtime diff --git a/ngraph/core/reference/include/ngraph/runtime/reference/multiclass_nms.hpp b/ngraph/core/reference/include/ngraph/runtime/reference/multiclass_nms.hpp new file mode 100644 index 00000000000..fe14f29689a --- /dev/null +++ b/ngraph/core/reference/include/ngraph/runtime/reference/multiclass_nms.hpp @@ -0,0 +1,41 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ngraph/node.hpp" +#include "ngraph/op/util/nms_base.hpp" +#include "ngraph/op/util/op_types.hpp" +#include "ngraph/ops.hpp" +#include "ngraph/shape_util.hpp" + +namespace ngraph +{ + namespace runtime + { + namespace reference + { + void multiclass_nms(const float* boxes_data, + const Shape& boxes_data_shape, + const float* scores_data, + const Shape& scores_data_shape, + const op::v8::MulticlassNms::Attributes& attrs, + float* selected_outputs, + const Shape& selected_outputs_shape, + int64_t* selected_indices, + const Shape& selected_indices_shape, + int64_t* valid_outputs); + + } // namespace reference + } // namespace runtime +} // namespace ngraph diff --git a/ngraph/core/reference/include/ngraph/runtime/reference/reverse_sequence.hpp b/ngraph/core/reference/include/ngraph/runtime/reference/reverse_sequence.hpp index 865e1dbbbd1..e0fb59b2b29 100644 --- a/ngraph/core/reference/include/ngraph/runtime/reference/reverse_sequence.hpp +++ b/ngraph/core/reference/include/ngraph/runtime/reference/reverse_sequence.hpp @@ -24,6 +24,7 @@ namespace ngraph size_t sequence_axis, const U* sequence_lengths) { + NGRAPH_SUPPRESS_DEPRECATED_START CoordinateTransform input_transform(arg_shape); for (const Coordinate& in_coord : 
input_transform) { @@ -51,6 +52,7 @@ namespace ngraph out_coord[sequence_axis] = sequence_index; out[input_transform.index(out_coord)] = arg[input_transform.index(in_coord)]; } + NGRAPH_SUPPRESS_DEPRECATED_END } } // namespace reference } // namespace runtime diff --git a/ngraph/core/reference/include/ngraph/runtime/reference/roi_align.hpp b/ngraph/core/reference/include/ngraph/runtime/reference/roi_align.hpp index 897513c2c8f..f368964b57c 100644 --- a/ngraph/core/reference/include/ngraph/runtime/reference/roi_align.hpp +++ b/ngraph/core/reference/include/ngraph/runtime/reference/roi_align.hpp @@ -35,6 +35,7 @@ namespace ngraph auto feature_map_width = feature_maps_shape[3]; auto num_rois = rois_shape[0]; + NGRAPH_SUPPRESS_DEPRECATED_START CoordinateTransform feature_maps_transform(feature_maps_shape); CoordinateTransform rois_transform(rois_shape); CoordinateTransform out_transform(out_shape); @@ -225,6 +226,7 @@ namespace ngraph tmp_out.clear(); } } + NGRAPH_SUPPRESS_DEPRECATED_END return; } } // namespace reference diff --git a/ngraph/core/reference/include/ngraph/runtime/reference/scatter_elements_update.hpp b/ngraph/core/reference/include/ngraph/runtime/reference/scatter_elements_update.hpp index 1d398db722d..cd507a1b21c 100644 --- a/ngraph/core/reference/include/ngraph/runtime/reference/scatter_elements_update.hpp +++ b/ngraph/core/reference/include/ngraph/runtime/reference/scatter_elements_update.hpp @@ -33,6 +33,7 @@ namespace ngraph // output[i][indices[i][j][k]][k] = updates[i][j][k] if axis = 1, // output[i][j][indices[i][j][k]] = updates[i][j][k] if axis = 2 + NGRAPH_SUPPRESS_DEPRECATED_START CoordinateTransform indices_transform{indices_shape}; CoordinateTransform data_transform{data_shape}; @@ -47,6 +48,7 @@ namespace ngraph "."); out_buf[data_transform.index(out_cord)] = updates[indices_idx]; } + NGRAPH_SUPPRESS_DEPRECATED_END } } // namespace reference } // namespace runtime diff --git 
a/ngraph/core/reference/include/ngraph/runtime/reference/scatter_update.hpp b/ngraph/core/reference/include/ngraph/runtime/reference/scatter_update.hpp index 3805e07773b..c310d054e4a 100644 --- a/ngraph/core/reference/include/ngraph/runtime/reference/scatter_update.hpp +++ b/ngraph/core/reference/include/ngraph/runtime/reference/scatter_update.hpp @@ -41,6 +41,7 @@ namespace ngraph // u_coord in slice updates[..., i_coord, ...] // data[index(d_coord)] = updates[index(u_coord)] + NGRAPH_SUPPRESS_DEPRECATED_START CoordinateTransform indices_transform{indices_shape}; CoordinateTransform data_transform{data_shape}; @@ -106,6 +107,7 @@ namespace ngraph } updates_indices_coord_iter++; } + NGRAPH_SUPPRESS_DEPRECATED_END } } // namespace reference } // namespace runtime diff --git a/ngraph/core/reference/include/ngraph/runtime/reference/sinh.hpp b/ngraph/core/reference/include/ngraph/runtime/reference/sinh.hpp index 3712bcd36aa..941ecd0f7f4 100644 --- a/ngraph/core/reference/include/ngraph/runtime/reference/sinh.hpp +++ b/ngraph/core/reference/include/ngraph/runtime/reference/sinh.hpp @@ -13,7 +13,8 @@ namespace ngraph { namespace reference { - template + template ::value, bool>::type = true> void sinh(const T* arg, T* out, size_t count) { for (size_t i = 0; i < count; i++) @@ -21,6 +22,15 @@ namespace ngraph out[i] = std::sinh(arg[i]); } } + template ::value, bool>::type = true> + void sinh(const T* arg, T* out, size_t count) + { + for (size_t i = 0; i < count; i++) + { + out[i] = std::roundl(std::sinh(arg[i])); + } + } } // namespace reference } // namespace runtime } // namespace ngraph diff --git a/ngraph/core/reference/include/ngraph/runtime/reference/softmax.hpp b/ngraph/core/reference/include/ngraph/runtime/reference/softmax.hpp index a8544177aec..d467714229e 100644 --- a/ngraph/core/reference/include/ngraph/runtime/reference/softmax.hpp +++ b/ngraph/core/reference/include/ngraph/runtime/reference/softmax.hpp @@ -25,6 +25,7 @@ namespace ngraph max(arg, temp_ptr, 
shape, axes); + NGRAPH_SUPPRESS_DEPRECATED_START CoordinateTransform transform(shape); CoordinateTransform temp_transform(temp_shape); for (const Coordinate& coord : transform) @@ -43,6 +44,7 @@ namespace ngraph } delete[] temp_ptr; + NGRAPH_SUPPRESS_DEPRECATED_END } } // namespace reference } // namespace runtime diff --git a/ngraph/core/reference/include/ngraph/runtime/reference/space_to_depth.hpp b/ngraph/core/reference/include/ngraph/runtime/reference/space_to_depth.hpp new file mode 100644 index 00000000000..e30e2a59276 --- /dev/null +++ b/ngraph/core/reference/include/ngraph/runtime/reference/space_to_depth.hpp @@ -0,0 +1,25 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/op/space_to_depth.hpp" +#include "ngraph/shape.hpp" + +namespace ngraph +{ + namespace runtime + { + namespace reference + { + void space_to_depth(const char* const in, + const Shape& in_shape, + char* const out, + const Shape& out_shape, + const size_t block_size, + const op::SpaceToDepth::SpaceToDepthMode mode, + const size_t elem_size); + } + } // namespace runtime +} // namespace ngraph diff --git a/ngraph/core/reference/include/ngraph/runtime/reference/topk.hpp b/ngraph/core/reference/include/ngraph/runtime/reference/topk.hpp index 99ad194c28a..6b7e57f764b 100644 --- a/ngraph/core/reference/include/ngraph/runtime/reference/topk.hpp +++ b/ngraph/core/reference/include/ngraph/runtime/reference/topk.hpp @@ -61,6 +61,7 @@ namespace ngraph bool compute_max, op::v1::TopK::SortType sort = op::v1::TopK::SortType::NONE) { + NGRAPH_SUPPRESS_DEPRECATED_START using namespace std; // reorder source axis visit order and make "axis" inner most size_t ndim = static_cast(in_shape.size()); @@ -133,6 +134,7 @@ namespace ngraph out_index += out_axis_stride; } } + NGRAPH_SUPPRESS_DEPRECATED_END } } // namespace reference } // namespace runtime diff --git 
a/ngraph/core/reference/include/ngraph/runtime/reference/utils/nms_common.hpp b/ngraph/core/reference/include/ngraph/runtime/reference/utils/nms_common.hpp new file mode 100644 index 00000000000..b44e8dbaa41 --- /dev/null +++ b/ngraph/core/reference/include/ngraph/runtime/reference/utils/nms_common.hpp @@ -0,0 +1,88 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include +#include +#include "ngraph/type/element_type.hpp" + +namespace ngraph +{ + namespace runtime + { + namespace reference + { + namespace nms_common + { + struct Rectangle + { + Rectangle(float x_left, float y_left, float x_right, float y_right) + : x1{x_left} + , y1{y_left} + , x2{x_right} + , y2{y_right} + { + } + + Rectangle() = default; + + float x1 = 0.0f; + float y1 = 0.0f; + float x2 = 0.0f; + float y2 = 0.0f; + }; + + struct BoxInfo + { + BoxInfo(const Rectangle& r, + int64_t idx, + float sc, + int64_t suppress_idx, + int64_t batch_idx, + int64_t class_idx) + : box{r} + , index{idx} + , suppress_begin_index{suppress_idx} + , batch_index{batch_idx} + , class_index{class_idx} + , score{sc} + { + } + + BoxInfo() = default; + + inline bool operator<(const BoxInfo& rhs) const + { + return score < rhs.score || (score == rhs.score && index > rhs.index); + } + + inline bool operator>(const BoxInfo& rhs) const + { + return !(score < rhs.score || (score == rhs.score && index > rhs.index)); + } + + Rectangle box; + int64_t index = 0; + int64_t suppress_begin_index = 0; + int64_t batch_index = 0; + int64_t class_index = 0; + float score = 0.0f; + }; + + void nms_common_postprocessing(void* prois, + void* pscores, + void* pselected_num, + const ngraph::element::Type& output_type, + const std::vector& selected_outputs, + const std::vector& selected_indices, + const std::vector& valid_outputs); + + } // namespace nms_common + } // namespace reference + } // namespace runtime +} // namespace ngraph diff --git 
a/ngraph/core/reference/src/coordinate_transform.cpp b/ngraph/core/reference/src/coordinate_transform.cpp index e1f849f2bed..9b57790b155 100644 --- a/ngraph/core/reference/src/coordinate_transform.cpp +++ b/ngraph/core/reference/src/coordinate_transform.cpp @@ -20,6 +20,7 @@ #include "ngraph/util.hpp" using namespace ngraph; +NGRAPH_SUPPRESS_DEPRECATED_START namespace { diff --git a/ngraph/core/reference/src/runtime/reference/gather_tree.cpp b/ngraph/core/reference/src/runtime/reference/gather_tree.cpp index 5ba310c1d9d..5de293f32bd 100644 --- a/ngraph/core/reference/src/runtime/reference/gather_tree.cpp +++ b/ngraph/core/reference/src/runtime/reference/gather_tree.cpp @@ -83,6 +83,7 @@ void runtime::reference::gather_tree(const char* step_ids, throw ngraph_error("max_seq_len must have size of BATCH_SIZE"); } + NGRAPH_SUPPRESS_DEPRECATED_START ngraph::CoordinateTransform cordinate_transform(step_ids_shape); for (const auto& coord : cordinate_transform) @@ -136,4 +137,5 @@ void runtime::reference::gather_tree(const char* step_ids, } } } + NGRAPH_SUPPRESS_DEPRECATED_END } diff --git a/ngraph/core/reference/src/runtime/reference/interpolate.cpp b/ngraph/core/reference/src/runtime/reference/interpolate.cpp index ac3f54c646b..8e04368edf0 100644 --- a/ngraph/core/reference/src/runtime/reference/interpolate.cpp +++ b/ngraph/core/reference/src/runtime/reference/interpolate.cpp @@ -137,6 +137,7 @@ InterpolateEvalHelper::InfoForLinearMode InterpolateEvalHelper::get_info_for_lin std::vector a(num_of_axes); std::vector r(num_of_axes); + NGRAPH_SUPPRESS_DEPRECATED_START CoordinateTransform output_transform(m_out_shape); CoordinateTransform input_transform(m_input_data_shape); @@ -159,6 +160,7 @@ InterpolateEvalHelper::InfoForLinearMode InterpolateEvalHelper::get_info_for_lin result.r = r; result.prod_a = prod_a; result.shape_for_indeces = shape_for_indeces; + NGRAPH_SUPPRESS_DEPRECATED_END return result; } diff --git a/ngraph/core/reference/src/runtime/reference/matrix_nms.cpp 
b/ngraph/core/reference/src/runtime/reference/matrix_nms.cpp new file mode 100644 index 00000000000..e82d0ee4de3 --- /dev/null +++ b/ngraph/core/reference/src/runtime/reference/matrix_nms.cpp @@ -0,0 +1,354 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ngraph/op/matrix_nms.hpp" +#include +#include +#include +#include +#include +#include "ngraph/runtime/reference/matrix_nms.hpp" +#include "ngraph/runtime/reference/utils/nms_common.hpp" +#include "ngraph/shape.hpp" + +using namespace ngraph; +using namespace ngraph::runtime::reference; + +namespace ngraph +{ + namespace runtime + { + namespace reference + { + namespace matrix_nms_v8 + { + template + struct decay_score; + + template + struct decay_score + { + T operator()(T iou, T max_iou, T sigma) + { + return std::exp((max_iou * max_iou - iou * iou) * sigma); + } + }; + + template + struct decay_score + { + T operator()(T iou, T max_iou, T sigma) + { + return (1. - iou) / (1. - max_iou + 1e-10f); + } + }; + + template + static inline T BBoxArea(const T* box, const bool normalized) + { + if (box[2] < box[0] || box[3] < box[1]) + { + // If coordinate values are is invalid + // (e.g. xmax < xmin or ymax < ymin), return 0. + return static_cast(0.); + } + else + { + const T w = box[2] - box[0]; + const T h = box[3] - box[1]; + if (normalized) + { + return w * h; + } + else + { + // If coordinate values are not within range [0, 1]. + return (w + 1) * (h + 1); + } + } + } + + template + static inline T + intersectionOverUnion(const T* box1, const T* box2, const bool normalized) + { + if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || + box2[3] < box1[1]) + { + return static_cast(0.); + } + else + { + const T inter_xmin = std::max(box1[0], box2[0]); + const T inter_ymin = std::max(box1[1], box2[1]); + const T inter_xmax = std::min(box1[2], box2[2]); + const T inter_ymax = std::min(box1[3], box2[3]); + T norm = normalized ? static_cast(0.) 
: static_cast(1.); + T inter_w = inter_xmax - inter_xmin + norm; + T inter_h = inter_ymax - inter_ymin + norm; + const T inter_area = inter_w * inter_h; + const T bbox1_area = BBoxArea(box1, normalized); + const T bbox2_area = BBoxArea(box2, normalized); + return inter_area / (bbox1_area + bbox2_area - inter_area); + } + } + } // namespace matrix_nms_v8 + + template + void nms_matrix(const T* boxes_data, + const Shape& boxes_data_shape, + const T* scores_data, + const Shape& scores_data_shape, + const T score_threshold, + const T post_threshold, + const float sigma, + const int64_t top_k, + const bool normalized, + std::vector* selected_indices, + std::vector* decayed_scores) + { + int64_t boxes_num = static_cast(boxes_data_shape[1]); + int64_t box_size = static_cast(boxes_data_shape[2]); + + std::vector candidate_index(boxes_num); + std::iota(candidate_index.begin(), candidate_index.end(), 0); + auto end = std::remove_if(candidate_index.begin(), + candidate_index.end(), + [&scores_data, score_threshold](int32_t idx) { + return scores_data[idx] <= score_threshold; + }); + + int64_t original_size = std::distance(candidate_index.begin(), end); + if (original_size <= 0) + { + return; + } + if (top_k > -1 && original_size > top_k) + { + original_size = top_k; + } + + std::partial_sort(candidate_index.begin(), + candidate_index.begin() + original_size, + end, + [&scores_data](int32_t a, int32_t b) { + return scores_data[a] > scores_data[b]; + }); + + std::vector iou_matrix((original_size * (original_size - 1)) >> 1); + std::vector iou_max(original_size); + + iou_max[0] = 0.; + for (int64_t i = 1; i < original_size; i++) + { + T max_iou = 0.; + auto idx_a = candidate_index[i]; + for (int64_t j = 0; j < i; j++) + { + auto idx_b = candidate_index[j]; + auto iou = + matrix_nms_v8::intersectionOverUnion(boxes_data + idx_a * box_size, + boxes_data + idx_b * box_size, + normalized); + max_iou = std::max(max_iou, iou); + iou_matrix[i * (i - 1) / 2 + j] = iou; + } + iou_max[i] = 
max_iou; + } + + if (scores_data[candidate_index[0]] > post_threshold) + { + selected_indices->push_back(candidate_index[0]); + decayed_scores->push_back(scores_data[candidate_index[0]]); + } + + matrix_nms_v8::decay_score decay_fn; + for (int64_t i = 1; i < original_size; i++) + { + T min_decay = 1.; + for (int64_t j = 0; j < i; j++) + { + auto max_iou = iou_max[j]; + auto iou = iou_matrix[i * (i - 1) / 2 + j]; + auto decay = decay_fn(iou, max_iou, sigma); + min_decay = std::min(min_decay, decay); + } + auto ds = min_decay * scores_data[candidate_index[i]]; + if (ds <= post_threshold) + continue; + selected_indices->push_back(candidate_index[i]); + decayed_scores->push_back(ds); + } + } + + void matrix_nms(const float* boxes_data, + const Shape& boxes_data_shape, + const float* scores_data, + const Shape& scores_data_shape, + const op::v8::MatrixNms::Attributes& attrs, + float* selected_outputs, + const Shape& selected_outputs_shape, + int64_t* selected_indices, + const Shape& selected_indices_shape, + int64_t* valid_outputs) + { + using Rectangle = runtime::reference::nms_common::Rectangle; + using BoxInfo = runtime::reference::nms_common::BoxInfo; + + // boxes shape: {num_batches, num_boxes, 4} + // scores shape: {num_batches, num_classes, num_boxes} + int64_t num_batches = static_cast(scores_data_shape[0]); + int64_t num_classes = static_cast(scores_data_shape[1]); + int64_t num_boxes = static_cast(boxes_data_shape[1]); + int64_t box_shape = static_cast(boxes_data_shape[2]); + + std::vector num_per_batch; + std::vector filtered_boxes; + filtered_boxes.reserve(6 * num_batches * num_classes * num_boxes); + + for (int64_t batch = 0; batch < num_batches; batch++) + { + const float* boxesPtr = boxes_data + batch * num_boxes * 4; + std::vector all_indices; + std::vector all_scores; + std::vector all_classes; + size_t num_det = 0; + + for (int64_t class_idx = 0; class_idx < num_classes; class_idx++) + { + if (class_idx == attrs.background_class) + continue; + const 
float* scoresPtr = + scores_data + batch * (num_classes * num_boxes) + class_idx * num_boxes; + if (attrs.decay_function == op::v8::MatrixNms::DecayFunction::GAUSSIAN) + { + nms_matrix(boxesPtr, + boxes_data_shape, + scoresPtr, + scores_data_shape, + attrs.score_threshold, + attrs.post_threshold, + attrs.gaussian_sigma, + attrs.nms_top_k, + attrs.normalized, + &all_indices, + &all_scores); + } + else + { + nms_matrix(boxesPtr, + boxes_data_shape, + scoresPtr, + scores_data_shape, + attrs.score_threshold, + attrs.post_threshold, + attrs.gaussian_sigma, + attrs.nms_top_k, + attrs.normalized, + &all_indices, + &all_scores); + } + for (size_t i = 0; i < all_indices.size() - num_det; i++) + { + all_classes.push_back(class_idx); + } + num_det = all_indices.size(); + } + + if (num_det <= 0) + { + break; + } + + if (attrs.keep_top_k > -1) + { + auto k = static_cast(attrs.keep_top_k); + if (num_det > k) + num_det = k; + } + + std::vector perm(all_indices.size()); + std::iota(perm.begin(), perm.end(), 0); + + std::partial_sort(perm.begin(), + perm.begin() + num_det, + perm.end(), + [&all_scores, &all_classes, &all_indices](int lhs, int rhs) { + return (all_scores[lhs] > all_scores[rhs]) || + (all_scores[lhs] == all_scores[rhs] && + all_classes[lhs] < all_classes[rhs]) || + (all_scores[lhs] == all_scores[rhs] && + all_classes[lhs] == all_classes[rhs] && + all_indices[lhs] < all_indices[rhs]); + }); + + for (size_t i = 0; i < num_det; i++) + { + auto p = perm[i]; + auto idx = all_indices[p]; + auto cls = all_classes[p]; + auto score = all_scores[p]; + auto bbox = boxesPtr + idx * box_shape; + + filtered_boxes.push_back( + BoxInfo{Rectangle{bbox[0], bbox[1], bbox[2], bbox[3]}, + batch * num_boxes + idx, + score, + 0, + batch, + cls}); + } + num_per_batch.push_back(num_det); + } + + if (attrs.sort_result_across_batch) + { /* sort across batch */ + if (attrs.sort_result_type == op::v8::MatrixNms::SortResultType::SCORE) + { + std::sort( + filtered_boxes.begin(), + 
filtered_boxes.end(), + [](const BoxInfo& l, const BoxInfo& r) { + return (l.score > r.score) || + (l.score == r.score && l.batch_index < r.batch_index) || + (l.score == r.score && l.batch_index == r.batch_index && + l.class_index < r.class_index) || + (l.score == r.score && l.batch_index == r.batch_index && + l.class_index == r.class_index && l.index < r.index); + }); + } + else if (attrs.sort_result_type == op::v8::MatrixNms::SortResultType::CLASSID) + { + std::sort(filtered_boxes.begin(), + filtered_boxes.end(), + [](const BoxInfo& l, const BoxInfo& r) { + return (l.class_index < r.class_index) || + (l.class_index == r.class_index && + l.batch_index < r.batch_index) || + (l.class_index == r.class_index && + l.batch_index == r.batch_index && + l.score > r.score) || + (l.class_index == r.class_index && + l.batch_index == r.batch_index && + l.score == r.score && l.index < r.index); + }); + } + } + + std::copy(num_per_batch.begin(), num_per_batch.end(), valid_outputs); + for (size_t i = 0; i < filtered_boxes.size(); i++) + { + selected_indices[i] = filtered_boxes[i].index; + auto selected_base = selected_outputs + i * 6; + selected_base[0] = filtered_boxes[i].class_index; + selected_base[1] = filtered_boxes[i].score; + selected_base[2] = filtered_boxes[i].box.x1; + selected_base[3] = filtered_boxes[i].box.y1; + selected_base[4] = filtered_boxes[i].box.x2; + selected_base[5] = filtered_boxes[i].box.y2; + } + } + } // namespace reference + } // namespace runtime +} // namespace ngraph diff --git a/ngraph/core/reference/src/runtime/reference/multiclass_nms.cpp b/ngraph/core/reference/src/runtime/reference/multiclass_nms.cpp new file mode 100644 index 00000000000..3328de9c3aa --- /dev/null +++ b/ngraph/core/reference/src/runtime/reference/multiclass_nms.cpp @@ -0,0 +1,350 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ngraph/op/multiclass_nms.hpp" +#include +#include +#include +#include +#include +#include 
"ngraph/runtime/reference/multiclass_nms.hpp" +#include "ngraph/runtime/reference/utils/nms_common.hpp" +#include "ngraph/shape.hpp" + +using namespace ngraph; +using namespace ngraph::runtime::reference; + +namespace ngraph +{ + namespace runtime + { + namespace reference + { + namespace multiclass_nms_v8 + { + using Rectangle = runtime::reference::nms_common::Rectangle; + using BoxInfo = runtime::reference::nms_common::BoxInfo; + static float intersectionOverUnion(const Rectangle& boxI, + const Rectangle& boxJ, + const bool normalized) + { + const float norm = static_cast(normalized == false); + + float areaI = (boxI.y2 - boxI.y1 + norm) * (boxI.x2 - boxI.x1 + norm); + float areaJ = (boxJ.y2 - boxJ.y1 + norm) * (boxJ.x2 - boxJ.x1 + norm); + + if (areaI <= 0.0f || areaJ <= 0.0f) + { + return 0.0f; + } + + float intersection_ymin = std::max(boxI.y1, boxJ.y1); + float intersection_xmin = std::max(boxI.x1, boxJ.x1); + float intersection_ymax = std::min(boxI.y2, boxJ.y2); + float intersection_xmax = std::min(boxI.x2, boxJ.x2); + + float intersection_area = + std::max(intersection_ymax - intersection_ymin + norm, 0.0f) * + std::max(intersection_xmax - intersection_xmin + norm, 0.0f); + + return intersection_area / (areaI + areaJ - intersection_area); + } + + struct SelectedIndex + { + SelectedIndex(int64_t batch_idx, int64_t box_idx, int64_t num_box) + : flattened_index(batch_idx * num_box + box_idx) + { + } + + SelectedIndex() = default; + + int64_t flattened_index = 0; + }; + + struct SelectedOutput + { + SelectedOutput( + float class_idx, float score, float x1, float y1, float x2, float y2) + : class_index{class_idx} + , box_score{score} + , xmin{x1} + , ymin{y1} + , xmax{x2} + , ymax{y2} + { + } + + SelectedOutput() = default; + + float class_index = 0.0f; + float box_score = 0.0f; + float xmin, ymin, xmax, ymax; + }; + } // namespace multiclass_nms_v8 + + void multiclass_nms(const float* boxes_data, + const Shape& boxes_data_shape, + const float* scores_data, + 
const Shape& scores_data_shape, + const op::v8::MulticlassNms::Attributes& attrs, + float* selected_outputs, + const Shape& selected_outputs_shape, + int64_t* selected_indices, + const Shape& selected_indices_shape, + int64_t* valid_outputs) + { + using SelectedIndex = multiclass_nms_v8::SelectedIndex; + using SelectedOutput = multiclass_nms_v8::SelectedOutput; + using BoxInfo = multiclass_nms_v8::BoxInfo; + using Rectangle = multiclass_nms_v8::Rectangle; + + auto func = [](float iou, float adaptive_threshold) { + return iou <= adaptive_threshold ? 1.0f : 0.0f; + }; + + // boxes shape: {num_batches, num_boxes, 4} + // scores shape: {num_batches, num_classes, num_boxes} + int64_t num_batches = static_cast(scores_data_shape[0]); + int64_t num_classes = static_cast(scores_data_shape[1]); + int64_t num_boxes = static_cast(boxes_data_shape[1]); + + SelectedIndex* selected_indices_ptr = + reinterpret_cast(selected_indices); + SelectedOutput* selected_scores_ptr = + reinterpret_cast(selected_outputs); + + std::vector filteredBoxes; // container for the whole batch + + for (int64_t batch = 0; batch < num_batches; batch++) + { + const float* boxesPtr = boxes_data + batch * num_boxes * 4; + Rectangle* r = reinterpret_cast(const_cast(boxesPtr)); + + int64_t num_dets = 0; + std::vector selected_boxes; // container for a batch element + + for (int64_t class_idx = 0; class_idx < num_classes; class_idx++) + { + if (class_idx == attrs.background_class) + continue; + + auto adaptive_threshold = attrs.iou_threshold; + + const float* scoresPtr = + scores_data + batch * (num_classes * num_boxes) + class_idx * num_boxes; + + std::vector candidate_boxes; + + for (int64_t box_idx = 0; box_idx < num_boxes; box_idx++) + { + if (scoresPtr[box_idx] >= + attrs.score_threshold) /* NOTE: ">=" instead of ">" used in PDPD */ + { + candidate_boxes.emplace_back( + r[box_idx], box_idx, scoresPtr[box_idx], 0, batch, class_idx); + } + } + + int candiate_size = candidate_boxes.size(); + + // threshold 
nms_top_k for each class + // NOTE: "nms_top_k" in PDPD not exactly equal to + // "max_output_boxes_per_class" in ONNX. + if (attrs.nms_top_k > -1 && attrs.nms_top_k < candiate_size) + { + candiate_size = attrs.nms_top_k; + } + + if (candiate_size <= 0) // early drop + { + continue; + } + + // sort by score in current class + std::partial_sort(candidate_boxes.begin(), + candidate_boxes.begin() + candiate_size, + candidate_boxes.end(), + std::greater()); + + std::priority_queue sorted_boxes(candidate_boxes.begin(), + candidate_boxes.begin() + + candiate_size, + std::less()); + + std::vector selected; // container for a class + + // Get the next box with top score, filter by iou_threshold + BoxInfo next_candidate; + float original_score; + + while (!sorted_boxes.empty()) + { + next_candidate = sorted_boxes.top(); + original_score = next_candidate.score; + sorted_boxes.pop(); + + bool should_hard_suppress = false; + for (int64_t j = static_cast(selected.size()) - 1; + j >= next_candidate.suppress_begin_index; + --j) + { + float iou = multiclass_nms_v8::intersectionOverUnion( + next_candidate.box, selected[j].box, attrs.normalized); + next_candidate.score *= func(iou, adaptive_threshold); + + if (iou >= adaptive_threshold) + { + should_hard_suppress = true; + break; + } + + if (next_candidate.score <= attrs.score_threshold) + { + break; + } + } + + next_candidate.suppress_begin_index = selected.size(); + + if (!should_hard_suppress) + { + if (attrs.nms_eta < 1 && adaptive_threshold > 0.5) + { + adaptive_threshold *= attrs.nms_eta; + } + if (next_candidate.score == original_score) + { + selected.push_back(next_candidate); + continue; + } + if (next_candidate.score > attrs.score_threshold) + { + sorted_boxes.push(next_candidate); + } + } + } + + for (const auto& box_info : selected) + { + selected_boxes.push_back(box_info); + } + num_dets += selected.size(); + } // for each class + + // sort inside batch element before go through keep_top_k + 
std::sort(selected_boxes.begin(), + selected_boxes.end(), + [](const BoxInfo& l, const BoxInfo& r) { + return ((l.batch_index == r.batch_index) && + ((l.score > r.score) || + ((std::fabs(l.score - r.score) < 1e-6) && + l.class_index < r.class_index) || + ((std::fabs(l.score - r.score) < 1e-6) && + l.class_index == r.class_index && l.index < r.index))); + }); + + // threshold keep_top_k for each batch element + if (attrs.keep_top_k > -1 && attrs.keep_top_k < num_dets) + { + num_dets = attrs.keep_top_k; + selected_boxes.resize(num_dets); + } + + // sort + if (!attrs.sort_result_across_batch) + { + if (attrs.sort_result_type == + op::v8::MulticlassNms::SortResultType::CLASSID) + { + std::sort( + selected_boxes.begin(), + selected_boxes.end(), + [](const BoxInfo& l, const BoxInfo& r) { + return ( + (l.batch_index == r.batch_index) && + ((l.class_index < r.class_index) || + ((l.class_index == r.class_index) && l.score > r.score) || + ((std::fabs(l.score - r.score) <= 1e-6) && + l.class_index == r.class_index && l.index < r.index))); + }); + } + // in case of "SCORE", pass through, as, + // it has already gurranteed. 
+ } + + *valid_outputs++ = num_dets; + for (auto& v : selected_boxes) + { + filteredBoxes.push_back(v); + } + } // for each batch element + + if (attrs.sort_result_across_batch) + { /* sort across batch */ + if (attrs.sort_result_type == op::v8::MulticlassNms::SortResultType::SCORE) + { + std::sort( + filteredBoxes.begin(), + filteredBoxes.end(), + [](const BoxInfo& l, const BoxInfo& r) { + return (l.score > r.score) || + (l.score == r.score && l.batch_index < r.batch_index) || + (l.score == r.score && l.batch_index == r.batch_index && + l.class_index < r.class_index) || + (l.score == r.score && l.batch_index == r.batch_index && + l.class_index == r.class_index && l.index < r.index); + }); + } + else if (attrs.sort_result_type == + op::v8::MulticlassNms::SortResultType::CLASSID) + { + std::sort(filteredBoxes.begin(), + filteredBoxes.end(), + [](const BoxInfo& l, const BoxInfo& r) { + return (l.class_index < r.class_index) || + (l.class_index == r.class_index && + l.batch_index < r.batch_index) || + (l.class_index == r.class_index && + l.batch_index == r.batch_index && + l.score > r.score) || + (l.class_index == r.class_index && + l.batch_index == r.batch_index && + l.score == r.score && l.index < r.index); + }); + } + } + + /* output */ + + size_t max_num_of_selected_indices = selected_indices_shape[0]; + size_t output_size = std::min(filteredBoxes.size(), max_num_of_selected_indices); + + size_t idx; + for (idx = 0; idx < output_size; idx++) + { + const auto& box_info = filteredBoxes[idx]; + SelectedIndex selected_index{box_info.batch_index, box_info.index, num_boxes}; + SelectedOutput selected_score{static_cast(box_info.class_index), + box_info.score, + box_info.box.x1, + box_info.box.y1, + box_info.box.x2, + box_info.box.y2}; + + selected_indices_ptr[idx] = selected_index; + selected_scores_ptr[idx] = selected_score; + } + + SelectedIndex selected_index_filler{0, 0, 0}; + SelectedOutput selected_score_filler{0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; + for (; idx < 
max_num_of_selected_indices; idx++) + { + selected_indices_ptr[idx] = selected_index_filler; + selected_scores_ptr[idx] = selected_score_filler; + } + } + } // namespace reference + } // namespace runtime +} // namespace ngraph diff --git a/ngraph/core/reference/src/runtime/reference/slice.cpp b/ngraph/core/reference/src/runtime/reference/slice.cpp index 66a9c0898b1..1a221ccc8d7 100644 --- a/ngraph/core/reference/src/runtime/reference/slice.cpp +++ b/ngraph/core/reference/src/runtime/reference/slice.cpp @@ -24,6 +24,7 @@ namespace ngraph const Shape& out_shape, size_t elem_size) { + NGRAPH_SUPPRESS_DEPRECATED_START const CoordinateTransform input_transform( arg_shape, lower_bounds, upper_bounds, strides); @@ -45,6 +46,7 @@ namespace ngraph std::advance(dst_mem, elem_size); } } + NGRAPH_SUPPRESS_DEPRECATED_END } } // namespace reference } // namespace runtime diff --git a/ngraph/core/reference/src/runtime/reference/space_to_depth.cpp b/ngraph/core/reference/src/runtime/reference/space_to_depth.cpp new file mode 100644 index 00000000000..11b88acc17c --- /dev/null +++ b/ngraph/core/reference/src/runtime/reference/space_to_depth.cpp @@ -0,0 +1,103 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ngraph/runtime/reference/space_to_depth.hpp" +#include +#include "ngraph/check.hpp" +#include "ngraph/runtime/opt_kernel/reshape.hpp" + +namespace ngraph +{ + namespace runtime + { + namespace reference + { + void space_to_depth(const char* const in, + const Shape& in_shape, + char* const out, + const Shape& out_shape, + const size_t block_size, + const op::SpaceToDepth::SpaceToDepthMode mode, + const size_t elem_size) + { + // SpaceToDepth runs in three steps: + // - disperse data from depth channel + // - rearrange data so that appropriate chunks of data were close to their + // destination place + // - squeeze data from respective dimensions + // + // The first and third steps don't change input data in memory; they change 
only the + // shape of input. From a data layout perspective the first and third steps may be + // omitted. The second operation has to be performed on input data with the dispersed + // shape (x'). + const size_t n_dim = in_shape.at(0); + const size_t c_dim = in_shape.at(1); + const size_t spatial_dim_index = 2; + const size_t spatial_dims = in_shape.size() - spatial_dim_index; + + for (size_t i = spatial_dim_index; i < in_shape.size(); ++i) + { + NGRAPH_CHECK(block_size > 0 && in_shape.at(i) % block_size == 0, + "SpaceToDepth: The dimension on position: ", + i, + " equal to: ", + in_shape.at(i), + " must be a multiple of blocksize: ", + block_size); + } + + Shape dispersed_shape{n_dim, c_dim}; + for (size_t i = 0; i < spatial_dims; ++i) + { + dispersed_shape.push_back(in_shape.at(i + spatial_dim_index) / block_size); + dispersed_shape.push_back(block_size); + } + + // calculate axes to transpose + // [0, 3, 5, ..., spatial_dims + (spatial_dims + 1), 2, 4, ..., K + K]) + std::vector axes_order{0}; + for (size_t i = 0, j = 3; i < spatial_dims; ++i, j += 2) + { + axes_order.push_back(j); + } + for (size_t i = 0, j = 2; i < spatial_dims; ++i, j += 2) + { + axes_order.push_back(j); + } + + switch (mode) + { + // x' = reshape(data, [N, C, D1/block_size, block_size, D2/block_size, block_size, + // ..., DK/block_size, block_size]) + // x'' = transpose(x', [0, 1, 3, 5, ..., K + (K + 1), 2, 4, ..., K + K]) + // y = reshape(x'', [N, C * (block_size ^ K), D1 / block_size, D2 / block_size, ..., + // DK / block_size]) + case op::SpaceToDepth::SpaceToDepthMode::DEPTH_FIRST: + { + axes_order.insert(axes_order.begin() + 1, 1); + break; + } + // x' = reshape(data, [N, C, D1/block_size, block_size, D2/block_size, block_size, + // ... 
, DK/block_size, block_size]) + // x'' = transpose(x', [0, 3, 5, ..., K + (K + 1), 1, 2, 4, ..., K + K]) + // y = reshape(x'', [N, C * (block_size ^ K), D1 / block_size, D2 / block_size, ..., + // DK / block_size]) + case op::SpaceToDepth::SpaceToDepthMode::BLOCKS_FIRST: + { + axes_order.insert(axes_order.begin() + spatial_dims + 1, 1); + } + } + + Shape post_transpose_shape(axes_order.size()); + for (size_t axis_idx = 0; axis_idx < axes_order.size(); ++axis_idx) + { + post_transpose_shape[axis_idx] = dispersed_shape[axes_order[axis_idx]]; + } + + runtime::opt_kernel::reshape( + in, out, dispersed_shape, axes_order, post_transpose_shape, elem_size); + } + } // namespace reference + } // namespace runtime +} // namespace ngraph diff --git a/ngraph/core/reference/src/runtime/reference/utils/nms_common.cpp b/ngraph/core/reference/src/runtime/reference/utils/nms_common.cpp new file mode 100644 index 00000000000..c658b0f0cad --- /dev/null +++ b/ngraph/core/reference/src/runtime/reference/utils/nms_common.cpp @@ -0,0 +1,72 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include + +#include "ngraph/runtime/reference/utils/nms_common.hpp" + +namespace ngraph +{ + namespace runtime + { + namespace reference + { + namespace nms_common + { + void nms_common_postprocessing(void* prois, + void* pscores, + void* pselected_num, + const ngraph::element::Type& output_type, + const std::vector& selected_outputs, + const std::vector& selected_indices, + const std::vector& valid_outputs) + { + int64_t total_num = + std::accumulate(valid_outputs.begin(), valid_outputs.end(), 0); + + float* ptr = static_cast(prois); + memcpy(ptr, selected_outputs.data(), total_num * sizeof(float) * 6); + + if (pscores) + { + if (output_type == ngraph::element::i64) + { + int64_t* indices_ptr = static_cast(pscores); + memcpy( + indices_ptr, selected_indices.data(), total_num * sizeof(int64_t)); + } + else + { + int32_t* indices_ptr = 
static_cast(pscores); + for (int64_t i = 0; i < total_num; ++i) + { + indices_ptr[i] = static_cast(selected_indices[i]); + } + } + } + + if (pselected_num) + { + if (output_type == ngraph::element::i64) + { + int64_t* valid_outputs_ptr = static_cast(pselected_num); + std::copy( + valid_outputs.begin(), valid_outputs.end(), valid_outputs_ptr); + } + else + { + int32_t* valid_outputs_ptr = static_cast(pselected_num); + for (size_t i = 0; i < valid_outputs.size(); ++i) + { + valid_outputs_ptr[i] = static_cast(valid_outputs[i]); + } + } + } + } + } // namespace nms_common + } // namespace reference + } // namespace runtime +} // namespace ngraph diff --git a/ngraph/core/src/op/atan.cpp b/ngraph/core/src/op/atan.cpp index c3f1cce9fb4..9344f7ea0b2 100644 --- a/ngraph/core/src/op/atan.cpp +++ b/ngraph/core/src/op/atan.cpp @@ -23,7 +23,7 @@ using namespace std; using namespace ngraph; -constexpr NodeTypeInfo op::Atan::type_info; +NGRAPH_RTTI_DEFINITION(op::v0::Atan, "Atan", 0, util::UnaryElementwiseArithmetic); op::Atan::Atan(const Output& arg) : UnaryElementwiseArithmetic(arg) diff --git a/ngraph/core/src/op/batch_to_space.cpp b/ngraph/core/src/op/batch_to_space.cpp index 40feec46384..8e2e98553cf 100644 --- a/ngraph/core/src/op/batch_to_space.cpp +++ b/ngraph/core/src/op/batch_to_space.cpp @@ -23,7 +23,7 @@ using namespace std; using namespace ngraph; -constexpr NodeTypeInfo op::v1::BatchToSpace::type_info; +NGRAPH_RTTI_DEFINITION(op::v1::BatchToSpace, "BatchToSpace", 1); ngraph::op::v1::BatchToSpace::BatchToSpace(const ngraph::Output& data, const ngraph::Output& block_shape, @@ -37,83 +37,135 @@ ngraph::op::v1::BatchToSpace::BatchToSpace(const ngraph::Output& d void op::v1::BatchToSpace::validate_and_infer_types() { NGRAPH_OP_SCOPE(v1_BatchToSpace_validate_and_infer_types); - PartialShape data_pshape = get_input_partial_shape(0); - const auto& data_type = get_input_element_type(0); - const auto& block_shape_type = get_input_element_type(1); - const auto& crops_begin_type 
= get_input_element_type(2); - const auto& crops_end_type = get_input_element_type(3); + const auto& data_et = get_input_element_type(0); + const auto& block_shape_et = get_input_element_type(1); + const auto& crops_begin_et = get_input_element_type(2); + const auto& crops_end_et = get_input_element_type(3); + + element::Type inputs_integer_et{}; + NODE_VALIDATION_CHECK( + this, + element::Type::merge(inputs_integer_et, crops_begin_et, crops_end_et) && + element::Type::merge(inputs_integer_et, inputs_integer_et, block_shape_et), + "block_shape, crops_begin and crops_end inputs must have same element type. Got: ", + block_shape_et, + ", ", + crops_begin_et, + " and ", + crops_end_et); NODE_VALIDATION_CHECK(this, - block_shape_type.is_integral_number(), - "block_shape must be an integral number but got (", - block_shape_type, - ")."); + inputs_integer_et.is_integral_number(), + "block_shape and crops inputs must have integer element type. Got: ", + inputs_integer_et); + const PartialShape& data_pshape = get_input_partial_shape(0); + const PartialShape& block_shape_ps = get_input_partial_shape(1); + const PartialShape& crops_begin_ps = get_input_partial_shape(2); + const PartialShape& crops_end_ps = get_input_partial_shape(3); + + PartialShape inputs_same_ps{PartialShape::dynamic()}; + NODE_VALIDATION_CHECK( + this, + PartialShape::merge_into(inputs_same_ps, crops_begin_ps) && + PartialShape::merge_into(inputs_same_ps, crops_end_ps) && + PartialShape::merge_into(inputs_same_ps, block_shape_ps), + "block_shape, crops_begin and crops_end inputs must have the same shape. Got: ", + block_shape_ps, + ", ", + crops_begin_ps, + " and ", + crops_end_ps); + + const Rank inputs_rank_one = inputs_same_ps.rank(); NODE_VALIDATION_CHECK(this, - crops_begin_type.is_integral_number(), - "crops_begin must be an integral number but got (", - crops_begin_type, - ")."); + inputs_rank_one.compatible(1), + "block_shape and crops inputs must have rank 1. 
Got: ", + inputs_rank_one); - NODE_VALIDATION_CHECK(this, - crops_end_type.is_integral_number(), - "crops_end must be an integral number but got (", - crops_end_type, - ")."); + const Rank data_rank = data_pshape.rank(); + if (data_rank.is_static()) + { + NODE_VALIDATION_CHECK(this, + (data_rank.get_length() >= 4), + "data input must have rank greater than or equal to 4. Got: ", + data_rank.get_length()); - auto data = input_value(0); - auto block = input_value(1); - auto crops_begin = input_value(2); - auto crops_end = input_value(3); + if (inputs_same_ps.is_static()) + { + NODE_VALIDATION_CHECK(this, + data_rank.get_length() == inputs_same_ps[0].get_length(), + "block_shape and crop inputs must have same number of elements " + "as data input rank. Got: ", + inputs_same_ps[0], + " and ", + data_rank); + } + } - auto block_const = get_constant_from_source(block); - auto crops_begin_const = get_constant_from_source(crops_begin); - auto crops_end_const = get_constant_from_source(crops_end); + const auto block_const = get_constant_from_source(input_value(1)); + const auto crops_begin_const = get_constant_from_source(input_value(2)); + const auto crops_end_const = get_constant_from_source(input_value(3)); if (block_const && crops_begin_const && crops_end_const && data_pshape.is_static()) { - const auto& data_shape = data.get_shape(); - - NODE_VALIDATION_CHECK( - this, - (data_shape.size() >= 2), - "The data tensor with rank lower than 2 is not supported (data rank: ", - data_shape.size(), - ")"); + const Shape& data_sshape = data_pshape.to_shape(); auto block_val = block_const->cast_vector(); auto crops_begin_val = crops_begin_const->cast_vector(); auto crops_end_val = crops_end_const->cast_vector(); - int64_t block_prod = 1; - for (long val : block_val) - { - NODE_VALIDATION_CHECK(this, val > 0, "block_shape values must be greater than 0"); - block_prod *= val; - } + bool block_vals_valid = + std::all_of(begin(block_val), end(block_val), [](int64_t elem) { return elem 
>= 1; }); + NODE_VALIDATION_CHECK(this, + block_vals_valid, + "Elements of block_shape input must be greater or equal to one."); + + bool crops_begin_vals_valid = std::all_of( + begin(crops_begin_val), end(crops_begin_val), [](int64_t elem) { return elem >= 0; }); + bool crops_end_vals_valid = std::all_of( + begin(crops_end_val), end(crops_end_val), [](int64_t elem) { return elem >= 0; }); + NODE_VALIDATION_CHECK( + this, + crops_begin_vals_valid && crops_end_vals_valid, + "Elements of crops_begin and crops_end inputs must be greater or equal to zero."); + + int64_t block_prod = + std::accumulate(begin(block_val), end(block_val), 1, std::multiplies()); NODE_VALIDATION_CHECK(this, - data_shape.at(0) % block_prod == 0, - "BatchToSpace: The input data's 'batch' axis size: ", - data_shape.at(0), - " must be a multiple of ", + data_sshape[0] % block_prod == 0, + "The input data's 'batch' axis size: ", + data_sshape[0], + " must be a multiple of", " product of block_shape values: ", block_prod); - Shape output_shape = {static_cast(data_shape[0] / block_prod)}; - for (size_t idx = 1; idx < data_shape.size(); ++idx) + for (size_t idx = 0; idx < data_sshape.size(); idx++) { - output_shape.push_back(static_cast(data_shape[idx] * block_val[idx] - - crops_begin_val[idx] - crops_end_val[idx])); + const bool is_valid_crops_and_shape = + crops_begin_val[idx] + crops_end_val[idx] <= + block_val[idx] * static_cast(data_sshape[idx]); + NODE_VALIDATION_CHECK(this, + is_valid_crops_and_shape, + "crops_begin[i] + crops_end[i] must be less or equal to " + "block_shape[i] * input_shape[i]"); + } + + Shape output_sshape = {static_cast(data_sshape[0] / block_prod)}; + for (size_t idx = 1; idx < data_sshape.size(); ++idx) + { + output_sshape.push_back(static_cast(data_sshape[idx] * block_val[idx] - + crops_begin_val[idx] - crops_end_val[idx])); } set_output_size(1); - set_output_type(0, data_type, output_shape); + set_output_type(0, data_et, output_sshape); } else { - set_output_type(0, 
data_type, PartialShape::dynamic(data_pshape.rank())); + set_output_type(0, data_et, PartialShape::dynamic(data_rank)); } } @@ -144,16 +196,52 @@ namespace return false; } auto data_shape = data->get_shape(); - - if (!(data->get_shape().size() == 4 || data->get_shape().size() == 5)) + auto data_rank = data_shape.size(); + if (!(data_rank == 4 || data_rank == 5)) { return false; } + size_t block_values_size = shape_size(inputs[1]->get_shape()); + size_t crops_begin_size = shape_size(inputs[2]->get_shape()); + size_t crops_end_size = shape_size(inputs[3]->get_shape()); + NGRAPH_CHECK( + block_values_size == data_rank && crops_begin_size == data_rank && + crops_end_size == data_rank, + "Invalid block_shape/crops_begin/crops_end shape with respect to rank of data input"); + const auto* block_values = inputs[1]->get_data_ptr(); const auto* crops_begin_values = inputs[2]->get_data_ptr(); const auto* crops_end_values = inputs[3]->get_data_ptr(); + const bool block_vals_valid = std::all_of( + block_values, block_values + block_values_size, [](int64_t elem) { return elem >= 1; }); + NGRAPH_CHECK(block_vals_valid, "Invalid element values of block_shape input"); + + const bool crops_begin_vals_valid = std::all_of(crops_begin_values, + crops_begin_values + crops_begin_size, + [](int64_t elem) { return elem >= 0; }); + const bool crops_end_vals_valid = std::all_of(crops_end_values, + crops_end_values + crops_end_size, + [](int64_t elem) { return elem >= 0; }); + NGRAPH_CHECK(crops_begin_vals_valid && crops_end_vals_valid, + "Invalid element values of crops_begin/crops_end input/s"); + + const std::size_t block_prod = std::accumulate( + block_values, block_values + block_values_size, 1UL, std::multiplies()); + NGRAPH_CHECK(data_shape[0] % block_prod == 0, + "Invalid batch axis of data input with respect to block_shape values"); + + for (size_t i = 0; i < data_rank; i++) + { + const bool is_valid_crops_and_shape = + crops_begin_values[i] + crops_end_values[i] <= + block_values[i] 
* static_cast(data_shape[i]); + NGRAPH_CHECK( + is_valid_crops_and_shape, + "Invalid crops values (out of bounds) with respect to the shape of data input"); + } + Shape dispersed_shape(1); dispersed_shape.insert(dispersed_shape.end(), data_shape.begin(), data_shape.end()); std::vector axes_order(block_values_size + 1); @@ -249,7 +337,9 @@ namespace bool ngraph::op::v1::BatchToSpace::evaluate(const HostTensorVector& outputs, const HostTensorVector& inputs) const { - NGRAPH_OP_SCOPE(v1_BatchToSpace); + NGRAPH_OP_SCOPE(v1_BatchToSpace_evaluate); + NGRAPH_CHECK(validate_host_tensor_vector(inputs, 4)); + NGRAPH_CHECK(validate_host_tensor_vector(outputs, 1)); return batch_to_space_evaluate(outputs, inputs); } diff --git a/ngraph/core/src/op/interpolate.cpp b/ngraph/core/src/op/interpolate.cpp index b39c0cfc23d..a0ea8f955d0 100644 --- a/ngraph/core/src/op/interpolate.cpp +++ b/ngraph/core/src/op/interpolate.cpp @@ -439,6 +439,7 @@ static void pad_input_data(const uint8_t* data_ptr, const Shape& padded_input_shape, const std::vector& pads_begin) { + NGRAPH_SUPPRESS_DEPRECATED_START CoordinateTransform input_transform(input_shape); CoordinateTransform padded_transform(padded_input_shape); @@ -455,6 +456,7 @@ static void pad_input_data(const uint8_t* data_ptr, const uint8_t* src_ptr = data_ptr + type_size * input_transform.index(input_coord); memcpy(dst_ptr, src_ptr, type_size); } + NGRAPH_SUPPRESS_DEPRECATED_END } bool op::v4::Interpolate::evaluate_interpolate(const HostTensorVector& outputs, diff --git a/ngraph/core/src/op/matrix_nms.cpp b/ngraph/core/src/op/matrix_nms.cpp new file mode 100644 index 00000000000..7d3731f3b11 --- /dev/null +++ b/ngraph/core/src/op/matrix_nms.cpp @@ -0,0 +1,92 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ngraph/op/matrix_nms.hpp" +#include +#include +#include "itt.hpp" +#include "ngraph/attribute_visitor.hpp" +#include "ngraph/op/constant.hpp" +#include "ngraph/op/util/op_types.hpp" 
+#include "ngraph/runtime/reference/matrix_nms.hpp" +#include "ngraph/type/bfloat16.hpp" +#include "ngraph/type/float16.hpp" +#include "ngraph/util.hpp" + +using namespace ngraph; + +NGRAPH_RTTI_DEFINITION(op::v8::MatrixNms, "MatrixNms", 8, op::util::NmsBase); + +op::v8::MatrixNms::MatrixNms() + : NmsBase(m_attrs.output_type, m_attrs.nms_top_k, m_attrs.keep_top_k) +{ +} + +op::v8::MatrixNms::MatrixNms(const Output& boxes, + const Output& scores, + const Attributes& attrs) + : NmsBase(boxes, scores, m_attrs.output_type, m_attrs.nms_top_k, m_attrs.keep_top_k) + , m_attrs{attrs} +{ + constructor_validate_and_infer_types(); +} + +std::shared_ptr op::v8::MatrixNms::clone_with_new_inputs(const OutputVector& new_args) const +{ + NGRAPH_OP_SCOPE(v8_MatrixNms_clone_with_new_inputs); + check_new_args_count(this, new_args); + NODE_VALIDATION_CHECK(this, new_args.size() == 2, "Number of inputs must be 2"); + + return std::make_shared(new_args.at(0), new_args.at(1), m_attrs); +} + +void op::v8::MatrixNms::validate() +{ + NGRAPH_OP_SCOPE(v8_MatrixNms_validate); + NmsBase::validate(); + + NODE_VALIDATION_CHECK(this, + m_attrs.background_class >= -1, + "The 'background_class' must be great or equal -1. 
Got:", + m_attrs.background_class); +} + +bool ngraph::op::v8::MatrixNms::visit_attributes(AttributeVisitor& visitor) +{ + NGRAPH_OP_SCOPE(v8_MatrixNms_visit_attributes); + + visitor.on_attribute("sort_result_type", m_attrs.sort_result_type); + visitor.on_attribute("output_type", m_attrs.output_type); + visitor.on_attribute("nms_top_k", m_attrs.nms_top_k); + visitor.on_attribute("keep_top_k", m_attrs.keep_top_k); + visitor.on_attribute("sort_result_across_batch", m_attrs.sort_result_across_batch); + visitor.on_attribute("score_threshold", m_attrs.score_threshold); + visitor.on_attribute("background_class", m_attrs.background_class); + visitor.on_attribute("decay_function", m_attrs.decay_function); + visitor.on_attribute("gaussian_sigma", m_attrs.gaussian_sigma); + visitor.on_attribute("post_threshold", m_attrs.post_threshold); + visitor.on_attribute("normalized", m_attrs.normalized); + + return true; +} + +namespace ngraph +{ + template <> + EnumNames& EnumNames::get() + { + static auto enum_names = EnumNames( + "op::v8::MatrixNms::DecayFunction", + {{"gaussian", op::v8::MatrixNms::DecayFunction::GAUSSIAN}, + {"linear", op::v8::MatrixNms::DecayFunction::LINEAR}}); + return enum_names; + } + + constexpr DiscreteTypeInfo AttributeAdapter::type_info; + + std::ostream& operator<<(std::ostream& s, const op::v8::MatrixNms::DecayFunction& type) + { + return s << as_string(type); + } +} // namespace ngraph diff --git a/ngraph/core/src/op/multiclass_nms.cpp b/ngraph/core/src/op/multiclass_nms.cpp new file mode 100644 index 00000000000..8f0e7cd5345 --- /dev/null +++ b/ngraph/core/src/op/multiclass_nms.cpp @@ -0,0 +1,77 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ngraph/op/multiclass_nms.hpp" +#include +#include +#include "itt.hpp" +#include "ngraph/attribute_visitor.hpp" +#include "ngraph/op/constant.hpp" +#include "ngraph/op/util/op_types.hpp" +#include "ngraph/runtime/reference/multiclass_nms.hpp" +#include 
"ngraph/type/bfloat16.hpp" +#include "ngraph/type/float16.hpp" +#include "ngraph/util.hpp" + +using namespace ngraph; + +NGRAPH_RTTI_DEFINITION(op::v8::MulticlassNms, "MulticlassNms", 8, op::util::NmsBase); + +op::v8::MulticlassNms::MulticlassNms() + : NmsBase(m_attrs.output_type, m_attrs.nms_top_k, m_attrs.keep_top_k) +{ +} + +op::v8::MulticlassNms::MulticlassNms(const Output& boxes, + const Output& scores, + const Attributes& attrs) + : NmsBase(boxes, scores, m_attrs.output_type, m_attrs.nms_top_k, m_attrs.keep_top_k) + , m_attrs{attrs} +{ + constructor_validate_and_infer_types(); +} + +std::shared_ptr + op::v8::MulticlassNms::clone_with_new_inputs(const OutputVector& new_args) const +{ + NGRAPH_OP_SCOPE(v8_MulticlassNms_clone_with_new_inputs); + check_new_args_count(this, new_args); + NODE_VALIDATION_CHECK(this, new_args.size() == 2, "Number of inputs must be 2"); + + return std::make_shared(new_args.at(0), new_args.at(1), m_attrs); +} + +void op::v8::MulticlassNms::validate() +{ + NGRAPH_OP_SCOPE(v8_MulticlassNms_validate); + NmsBase::validate(); + + NODE_VALIDATION_CHECK(this, + m_attrs.background_class >= -1, + "The 'background_class' must be great or equal -1. Got:", + m_attrs.background_class); + + NODE_VALIDATION_CHECK(this, + m_attrs.nms_eta >= 0.0f && m_attrs.nms_eta <= 1.0f, + "The 'nms_eta' must be in close range [0, 1.0]. 
Got:", + m_attrs.nms_eta); +} + +bool ngraph::op::v8::MulticlassNms::visit_attributes(AttributeVisitor& visitor) +{ + NGRAPH_OP_SCOPE(v8_MulticlassNms_visit_attributes); + + visitor.on_attribute("sort_result_type", m_attrs.sort_result_type); + visitor.on_attribute("output_type", m_attrs.output_type); + visitor.on_attribute("nms_top_k", m_attrs.nms_top_k); + visitor.on_attribute("keep_top_k", m_attrs.keep_top_k); + visitor.on_attribute("sort_result_across_batch", m_attrs.sort_result_across_batch); + visitor.on_attribute("iou_threshold", m_attrs.iou_threshold); + visitor.on_attribute("score_threshold", m_attrs.score_threshold); + visitor.on_attribute("background_class", m_attrs.background_class); + visitor.on_attribute("nms_eta", m_attrs.nms_eta); + visitor.on_attribute("normalized", m_attrs.normalized); + + return true; +} diff --git a/ngraph/core/src/op/prior_box.cpp b/ngraph/core/src/op/prior_box.cpp index 3af4618296f..1b27ae401e4 100644 --- a/ngraph/core/src/op/prior_box.cpp +++ b/ngraph/core/src/op/prior_box.cpp @@ -14,7 +14,7 @@ using namespace std; using namespace ngraph; -NGRAPH_RTTI_DEFINITION(op::PriorBox, "PriorBox", 0); +NGRAPH_RTTI_DEFINITION(op::v0::PriorBox, "PriorBox", 0); op::PriorBox::PriorBox(const Output& layer_shape, const Output& image_shape, @@ -69,7 +69,7 @@ void op::PriorBox::validate_and_infer_types() } else { - set_output_type(0, element::f32, PartialShape::dynamic()); + set_output_type(0, element::f32, PartialShape{2, Dimension::dynamic()}); } } diff --git a/ngraph/core/src/op/shape_of.cpp b/ngraph/core/src/op/shape_of.cpp index d929eb0c1c0..792bad19bae 100644 --- a/ngraph/core/src/op/shape_of.cpp +++ b/ngraph/core/src/op/shape_of.cpp @@ -51,7 +51,6 @@ shared_ptr op::v3::ShapeOf::clone_with_new_inputs(const OutputVector& new_ NGRAPH_OP_SCOPE(v3_ShapeOf_clone_with_new_inputs); check_new_args_count(this, new_args); auto new_shape_of = make_shared(new_args.at(0), m_output_type); - new_shape_of->set_is_foldable(m_is_foldable); return 
new_shape_of; } @@ -82,8 +81,7 @@ namespace shape_of bool constant_fold_shape_of(Node* shape_of_node, Output& replacement, - const Output& shape_of_input, - bool is_foldable) + const Output& shape_of_input) { auto partial_shape = shape_of_input.get_partial_shape(); auto output_type = shape_of_node->get_output_element_type(0); @@ -100,46 +98,6 @@ namespace shape_of } return false; } - else if (partial_shape.rank().is_static() && is_foldable) - { - auto shape_of = shape_of_node->copy_with_new_inputs({shape_of_input}); - // Ugly - if (auto ps = as_type_ptr(shape_of)) - { - ps->set_is_foldable(false); - } - else if (auto ps = as_type_ptr(shape_of)) - { - ps->set_is_foldable(false); - } - auto dimensions = OutputVector{}; - auto output_dimensions = vector(partial_shape); - for (size_t i = 0; i < output_dimensions.size(); ++i) - { - if (output_dimensions[i].is_static()) - { - auto temp = std::make_shared( - output_type, - Shape{1}, - std::vector{output_dimensions[i].get_length()}); - temp->set_friendly_name("ConstDim/" + temp->get_name()); - dimensions.emplace_back(temp); - } - else - { - auto index = std::make_shared( - output_type, Shape{1}, std::vector{static_cast(i)}); - auto axis = std::make_shared( - element::i64, Shape{}, std::vector{0}); - auto temp = make_shared(shape_of, index, axis); - temp->set_friendly_name("DynDim/" + temp->get_name()); - dimensions.emplace_back(temp); - } - } - - replacement = std::make_shared(dimensions, 0); - return true; - } return false; } @@ -250,7 +208,7 @@ bool op::v3::ShapeOf::constant_fold(OutputVector& output_values, const OutputVec OV_ITT_SCOPED_TASK(itt::domains::nGraph, "op::v3::ShapeOf::constant_fold"); if (get_rt_info().count("DISABLED_CONSTANT_FOLDING")) return false; - return shape_of::constant_fold_shape_of(this, output_values[0], input_values[0], m_is_foldable); + return shape_of::constant_fold_shape_of(this, output_values[0], input_values[0]); } // op::v0::ShapeOf @@ -286,7 +244,6 @@ shared_ptr 
op::v0::ShapeOf::clone_with_new_inputs(const OutputVector& new_ description(), " operation with name ", get_friendly_name()); - new_shape_of->set_is_foldable(m_is_foldable); return new_shape_of; } @@ -318,7 +275,7 @@ bool op::v0::ShapeOf::constant_fold(OutputVector& output_values, const OutputVec OV_ITT_SCOPED_TASK(itt::domains::nGraph, "op::v0::ShapeOf::constant_fold"); if (get_rt_info().count("DISABLED_CONSTANT_FOLDING")) return false; - return shape_of::constant_fold_shape_of(this, output_values[0], input_values[0], m_is_foldable); + return shape_of::constant_fold_shape_of(this, output_values[0], input_values[0]); } bool op::v0::ShapeOf::evaluate_lower(const HostTensorVector& output_values) const diff --git a/ngraph/core/src/op/sinh.cpp b/ngraph/core/src/op/sinh.cpp index d19d6cbe493..4cbe5bd13ab 100644 --- a/ngraph/core/src/op/sinh.cpp +++ b/ngraph/core/src/op/sinh.cpp @@ -2,19 +2,16 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "itt.hpp" - -#include "ngraph/op/cosh.hpp" -#include "ngraph/op/multiply.hpp" #include "ngraph/op/sinh.hpp" - -#include "ngraph/runtime/host_tensor.hpp" +#include #include "ngraph/runtime/reference/sinh.hpp" +#include "itt.hpp" + using namespace std; using namespace ngraph; -constexpr NodeTypeInfo op::Sinh::type_info; +NGRAPH_RTTI_DEFINITION(op::v0::Sinh, "Sinh", 0, util::UnaryElementwiseArithmetic); op::Sinh::Sinh(const Output& arg) : UnaryElementwiseArithmetic(arg) @@ -68,6 +65,7 @@ namespace sinhop bool op::Sinh::evaluate(const HostTensorVector& outputs, const HostTensorVector& inputs) const { NGRAPH_OP_SCOPE(v0_Sinh_evaluate); + NGRAPH_CHECK(validate_host_tensor_vector(outputs, 1) && validate_host_tensor_vector(inputs, 1)); return sinhop::evaluate_sinh(inputs[0], outputs[0], shape_size(get_output_shape(0))); } diff --git a/ngraph/core/src/op/space_to_depth.cpp b/ngraph/core/src/op/space_to_depth.cpp index a31af19558f..b886c1beff3 100644 --- a/ngraph/core/src/op/space_to_depth.cpp +++ b/ngraph/core/src/op/space_to_depth.cpp 
@@ -13,9 +13,8 @@ #include "ngraph/op/space_to_depth.hpp" #include "ngraph/shape.hpp" -#include "ngraph/runtime/opt_kernel/reshape.hpp" +#include "ngraph/runtime/reference/space_to_depth.hpp" -using namespace std; using namespace ngraph; NGRAPH_RTTI_DEFINITION(op::SpaceToDepth, "SpaceToDepth", 0); @@ -43,14 +42,14 @@ bool ngraph::op::v0::SpaceToDepth::visit_attributes(AttributeVisitor& visitor) return true; } -shared_ptr op::SpaceToDepth::clone_with_new_inputs(const OutputVector& new_args) const +std::shared_ptr op::SpaceToDepth::clone_with_new_inputs(const OutputVector& new_args) const { NGRAPH_OP_SCOPE(v0_SpaceToDepth_clone_with_new_inputs); if (new_args.size() != 1) { throw ngraph_error("Incorrect number of new arguments"); } - return make_shared(new_args.at(0), m_mode, m_blocksize); + return std::make_shared(new_args.at(0), m_mode, m_blocksize); } void ngraph::op::v0::SpaceToDepth::validate_and_infer_types() @@ -100,127 +99,34 @@ void ngraph::op::v0::SpaceToDepth::validate_and_infer_types() } } -bool ngraph::op::v0::SpaceToDepth::evaluate_space_to_depth(const HostTensorVector& outputs, - const HostTensorVector& inputs) const +bool evaluate_space_to_depth(const HostTensorVector& outputs, + const HostTensorVector& inputs, + const std::size_t block_size, + const op::SpaceToDepth::SpaceToDepthMode mode) { - const auto& data = inputs[0]; + const auto& in = inputs[0]; const auto& out = outputs[0]; - size_t elem_size = data->get_element_type().size(); + size_t elem_size = in->get_element_type().size(); - if (data->get_partial_shape().is_dynamic()) + if (in->get_partial_shape().is_dynamic()) { return false; } - auto data_shape = data->get_shape(); - const size_t n_dim = data_shape.at(0); - const size_t c_dim = data_shape.at(1); - const size_t spatial_dim_index = 2; - const size_t spatial_dims = data_shape.size() - spatial_dim_index; - for (size_t i = spatial_dim_index; i < data_shape.size(); ++i) - { - NODE_VALIDATION_CHECK(this, - m_blocksize > 0 && data_shape.at(i) % 
m_blocksize == 0, - "The dimension on position: ", - i, - " equal to: ", - data_shape.at(i), - " must be a multiple of m_blocksize: ", - m_blocksize); - } - - // First we have to disperse the data from spatial dimensions, then - // rearrange them so as appropriate chunks of data where close to their - // destination place. Finally squeeze data from respective dimensions. - Shape dispersed_shape{n_dim, c_dim}; - for (size_t i = 0; i < spatial_dims; ++i) - { - dispersed_shape.push_back(data_shape.at(i + spatial_dim_index) / m_blocksize); - dispersed_shape.push_back(m_blocksize); - } - std::vector plain_axes_order(data_shape.size()); - std::iota(plain_axes_order.begin(), plain_axes_order.end(), 0); - std::vector dispersed_data(shape_size(data_shape) * elem_size); - runtime::opt_kernel::reshape(data->get_data_ptr(), - dispersed_data.data(), - data_shape, - plain_axes_order, - dispersed_shape, - elem_size); - // calculate axes to transpose - // [0, 3, 5, ..., spatial_dims + (spatial_dims + 1), 2, 4, ..., K + K]) - vector axes_order{0}; - for (size_t i = 0, j = 3; i < spatial_dims; ++i, j += 2) - { - axes_order.push_back(j); - } - for (size_t i = 0, j = 2; i < spatial_dims; ++i, j += 2) - { - axes_order.push_back(j); - } - - switch (m_mode) - { - // x' = reshape(data, [N, C, D1/block_size, block_size, D2/block_size, block_size, ..., - // DK/block_size, block_size]) - // x'' = transpose(x', [0, 1, 3, 5, ..., K + (K + 1), 2, 4, ..., K + K]) - // y = reshape(x'', [N, C * (block_size ^ K), D1 / block_size, D2 / block_size, ..., DK - // / - // block_size]) - case SpaceToDepthMode::DEPTH_FIRST: - { - axes_order.insert(axes_order.begin() + 1, 1); - break; - } - // x' = reshape(data, [N, C, D1/block_size, block_size, D2/block_size, block_size, ... 
, - // DK/block_size, block_size]) - // x'' = transpose(x', [0, 3, 5, ..., K + (K + 1), 1, 2, 4, ..., K + K]) - // y = reshape(x'', [N, C * (block_size ^ K), D1 / block_size, D2 / block_size, ..., DK - // / - // block_size]) - case SpaceToDepthMode::BLOCKS_FIRST: - default: - { - axes_order.insert(axes_order.begin() + spatial_dims + 1, 1); - } - } - std::vector transposed_data(shape_size(data_shape) * elem_size); - Shape post_transpose_shape(axes_order.size()); - for (size_t axis_idx = 0; axis_idx < axes_order.size(); ++axis_idx) - { - post_transpose_shape[axis_idx] = dispersed_shape[axes_order[axis_idx]]; - } - - runtime::opt_kernel::reshape(dispersed_data.data(), - transposed_data.data(), - dispersed_shape, - axes_order, - post_transpose_shape, - elem_size); - - Shape squeezed_shape{n_dim}; - for (size_t i = 0; i < spatial_dims; ++i) - { - squeezed_shape.push_back(data_shape.at(spatial_dim_index + i) / m_blocksize); - } - squeezed_shape.insert(squeezed_shape.begin() + 1, c_dim * std::pow(m_blocksize, spatial_dims)); - for (size_t i = plain_axes_order.size() - 1; i < post_transpose_shape.size() - 1; ++i) - { - plain_axes_order.push_back(plain_axes_order[i] + 1); - } - runtime::opt_kernel::reshape(transposed_data.data(), - out->get_data_ptr(), - post_transpose_shape, - plain_axes_order, - squeezed_shape, - elem_size); + runtime::reference::space_to_depth(in->get_data_ptr(), + in->get_shape(), + out->get_data_ptr(), + out->get_shape(), + block_size, + mode, + elem_size); return true; } bool ngraph::op::v0::SpaceToDepth::evaluate(const HostTensorVector& outputs, const HostTensorVector& inputs) const { NGRAPH_OP_SCOPE(v0_SpaceToDepth_evaluate); - return evaluate_space_to_depth(outputs, inputs); + return evaluate_space_to_depth(outputs, inputs, m_blocksize, m_mode); } bool ngraph::op::v0::SpaceToDepth::has_evaluate() const diff --git a/ngraph/core/src/op/util/nms_base.cpp b/ngraph/core/src/op/util/nms_base.cpp new file mode 100644 index 00000000000..4fce4c46fc4 --- 
/dev/null +++ b/ngraph/core/src/op/util/nms_base.cpp @@ -0,0 +1,183 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ngraph/op/util/nms_base.hpp" +#include +#include +#include "itt.hpp" +#include "ngraph/attribute_visitor.hpp" +#include "ngraph/op/constant.hpp" +#include "ngraph/op/util/op_types.hpp" +#include "ngraph/type/bfloat16.hpp" +#include "ngraph/type/float16.hpp" +#include "ngraph/util.hpp" + +using namespace ngraph; + +NGRAPH_RTTI_DEFINITION(op::util::NmsBase, "NmsBase", 0); + +op::util::NmsBase::NmsBase(ngraph::element::Type& output_type, int& nms_top_k, int& keep_top_k) + : m_output_type(output_type) + , m_nms_top_k(nms_top_k) + , m_keep_top_k(keep_top_k) +{ +} + +op::util::NmsBase::NmsBase(const Output& boxes, + const Output& scores, + ngraph::element::Type& output_type, + int& nms_top_k, + int& keep_top_k) + : Op({boxes, scores}) + , m_output_type(output_type) + , m_nms_top_k(nms_top_k) + , m_keep_top_k(keep_top_k) +{ +} + +namespace +{ + inline bool is_float_type_admissible(const element::Type& t) + { + return t == element::f32 || t == element::f16 || t == element::bf16; + } +} // namespace + +void op::util::NmsBase::validate() +{ + NGRAPH_OP_SCOPE(util_NmsBase_validate); + + const auto boxes_ps = get_input_partial_shape(0); + const auto scores_ps = get_input_partial_shape(1); + + NODE_VALIDATION_CHECK(this, + m_output_type == element::i64 || m_output_type == element::i32, + "Output type must be i32 or i64"); + + if (boxes_ps.is_dynamic() || scores_ps.is_dynamic()) + { + return; + } + + NODE_VALIDATION_CHECK(this, + is_float_type_admissible(get_input_element_type(0)), + "Expected bf16, fp16 or fp32 as element type for the 'boxes' input."); + + NODE_VALIDATION_CHECK(this, + is_float_type_admissible(get_input_element_type(1)), + "Expected bf16, fp16 or fp32 as element type for the 'scores' input."); + + NODE_VALIDATION_CHECK(this, + boxes_ps.rank().is_static() && boxes_ps.rank().get_length() == 
3, + "Expected a 3D tensor for the 'boxes' input. Got: ", + boxes_ps); + + NODE_VALIDATION_CHECK(this, + boxes_ps[2].is_static() && boxes_ps[2].get_length() == 4, + "The third dimension of the 'boxes' must be 4. Got: ", + boxes_ps[2]); + + NODE_VALIDATION_CHECK(this, + scores_ps.rank().is_static() && scores_ps.rank().get_length() == 3, + "Expected a 3D tensor for the 'scores' input. Got: ", + scores_ps); + + NODE_VALIDATION_CHECK( + this, m_nms_top_k >= -1, "The 'nms_top_k' must be great or equal -1. Got:", m_nms_top_k); + + NODE_VALIDATION_CHECK( + this, m_keep_top_k >= -1, "The 'keep_top_k' must be great or equal -1. Got:", m_keep_top_k); + + const auto num_batches_boxes = boxes_ps[0]; + const auto num_batches_scores = scores_ps[0]; + + NODE_VALIDATION_CHECK(this, + num_batches_boxes.same_scheme(num_batches_scores), + "The first dimension of both 'boxes' and 'scores' must match. Boxes: ", + num_batches_boxes, + "; Scores: ", + num_batches_scores); + + const auto num_boxes_boxes = boxes_ps[1]; + const auto num_boxes_scores = scores_ps[2]; + NODE_VALIDATION_CHECK(this, + num_boxes_boxes.same_scheme(num_boxes_scores), + "'boxes' and 'scores' input shapes must match at the second and third " + "dimension respectively. 
Boxes: ", + num_boxes_boxes, + "; Scores: ", + num_boxes_scores); +} + +void op::util::NmsBase::validate_and_infer_types() +{ + NGRAPH_OP_SCOPE(util_NmsBase_validate_and_infer_types); + const auto boxes_ps = get_input_partial_shape(0); + const auto scores_ps = get_input_partial_shape(1); + + auto first_dim_shape = Dimension::dynamic(); + + validate(); + + if (boxes_ps.rank().is_static() && scores_ps.rank().is_static()) + { + const auto num_boxes_boxes = boxes_ps[1]; + if (num_boxes_boxes.is_static() && scores_ps[0].is_static() && scores_ps[1].is_static()) + { + const auto num_boxes = num_boxes_boxes.get_length(); + const auto num_classes = scores_ps[1].get_length(); + int64_t max_output_boxes_per_class = 0; + if (m_nms_top_k >= 0) + max_output_boxes_per_class = std::min(num_boxes, (int64_t)m_nms_top_k); + else + max_output_boxes_per_class = num_boxes; + + auto max_output_boxes_per_batch = max_output_boxes_per_class * num_classes; + if (m_keep_top_k >= 0) + max_output_boxes_per_batch = + std::min(max_output_boxes_per_batch, (int64_t)m_keep_top_k); + + first_dim_shape = Dimension(0, max_output_boxes_per_batch * scores_ps[0].get_length()); + } + } + + // 'selected_outputs' have the following format: + // [number of selected boxes, [class_id, box_score, xmin, ymin, xmax, ymax]] + set_output_type(0, element::f32, {first_dim_shape, 6}); + // 'selected_indices' have the following format: + // [number of selected boxes, ] + set_output_type(1, m_output_type, {first_dim_shape, 1}); + // 'selected_num' have the following format: + // [num_batches, ] + if (boxes_ps.rank().is_static() && boxes_ps.rank().get_length() > 0) + { + set_output_type(2, m_output_type, {boxes_ps[0]}); + } + else + { + set_output_type(2, m_output_type, {Dimension::dynamic()}); + } +} + +namespace ngraph +{ + template <> + EnumNames& + EnumNames::get() + { + static auto enum_names = EnumNames( + "op::util::NmsBase::SortResultType", + {{"classid", op::util::NmsBase::SortResultType::CLASSID}, + {"score", 
op::util::NmsBase::SortResultType::SCORE}, + {"none", op::util::NmsBase::SortResultType::NONE}}); + return enum_names; + } + + constexpr DiscreteTypeInfo AttributeAdapter::type_info; + + std::ostream& operator<<(std::ostream& s, const op::util::NmsBase::SortResultType& type) + { + return s << as_string(type); + } +} // namespace ngraph diff --git a/ngraph/core/src/opsets/opset.cpp b/ngraph/core/src/opsets/opset.cpp index ea09eec98c1..c3b791995e3 100644 --- a/ngraph/core/src/opsets/opset.cpp +++ b/ngraph/core/src/opsets/opset.cpp @@ -115,3 +115,15 @@ const ngraph::OpSet& ngraph::get_opset7() }); return opset; } + +const ngraph::OpSet& ngraph::get_opset8() +{ + static OpSet opset; + static std::once_flag flag; + std::call_once(flag, [&]() { +#define NGRAPH_OP(NAME, NAMESPACE) opset.insert(); +#include "ngraph/opsets/opset8_tbl.hpp" +#undef NGRAPH_OP + }); + return opset; +} diff --git a/ngraph/core/src/pass/visualize_tree.cpp b/ngraph/core/src/pass/visualize_tree.cpp index dfed1d05640..b963c9fee98 100644 --- a/ngraph/core/src/pass/visualize_tree.cpp +++ b/ngraph/core/src/pass/visualize_tree.cpp @@ -16,6 +16,7 @@ #include "ngraph/pass/pass.hpp" #include "ngraph/pass/visualize_tree.hpp" #include "ngraph/util.hpp" +#include "ngraph/variant.hpp" using namespace ngraph; using namespace std; @@ -167,6 +168,24 @@ static std::string label_edge(const std::shared_ptr& /* src */, return ss.str(); } +static std::string + get_attribute_values(const std::map>& attributes, + const std::string& delimiter = ", ") +{ + stringstream ss; + bool first = true; + for (const auto& item : attributes) + { + ss << (first ? " " : delimiter) << item.first; + const auto attributeValue = item.second == nullptr ? 
"[EMPTY]" : item.second->to_string(); + if (!attributeValue.empty()) + ss << "{" << attributeValue << "}"; + + first = false; + } + return ss.str(); +} + NGRAPH_RTTI_DEFINITION(ngraph::pass::VisualizeTree, "ngraph::pass::VisualizeTree", 0); bool pass::VisualizeTree::run_on_function(std::shared_ptr f) @@ -481,6 +500,7 @@ string pass::VisualizeTree::get_attributes(shared_ptr node) static const bool nvtos = getenv_bool("NGRAPH_VISUALIZE_TREE_OUTPUT_SHAPES"); static const bool nvtot = getenv_bool("NGRAPH_VISUALIZE_TREE_OUTPUT_TYPES"); static const bool nvtio = getenv_bool("NGRAPH_VISUALIZE_TREE_IO"); + static const bool nvtrti = getenv_bool("NGRAPH_VISUALIZE_TREE_RUNTIME_INFO"); if (nvtos || nvtot || nvtio) { @@ -495,6 +515,11 @@ string pass::VisualizeTree::get_attributes(shared_ptr node) label << pretty_partial_shape(input.get_partial_shape()); label << ": " << node->get_input_node_ptr(input.get_index())->get_name() << ": out" << input.get_source_output().get_index(); + + if (nvtrti) + { + label << get_attribute_values(input.get_rt_info()); + } } } for (const auto& output : node->outputs()) @@ -505,6 +530,11 @@ string pass::VisualizeTree::get_attributes(shared_ptr node) label << "{" << output.get_element_type().get_type_name() << "}"; if (nvtos) label << pretty_partial_shape(output.get_partial_shape()); + + if (nvtrti) + { + label << get_attribute_values(output.get_rt_info()); + } } } @@ -544,11 +574,7 @@ string pass::VisualizeTree::get_node_name(shared_ptr node) const auto rt = node->get_rt_info(); if (!rt.empty()) { - rc += "\\nrt info: "; - for (const auto& item : rt) - { - rc += item.first + " "; - } + rc += "\\nrt info: " + get_attribute_values(rt, "\\n"); } } return rc; diff --git a/ngraph/frontend/CMakeLists.txt b/ngraph/frontend/CMakeLists.txt index 3f37edd60b4..8a25cc9b650 100644 --- a/ngraph/frontend/CMakeLists.txt +++ b/ngraph/frontend/CMakeLists.txt @@ -3,7 +3,7 @@ # if(NOT WIN32) - message(${CMAKE_CURRENT_SOURCE_DIR}/cmake_static_protobuf) + 
message(${CMAKE_SOURCE_DIR}/thirdparty/cmake_static_protobuf) message(BINARY ${CMAKE_CURRENT_BINARY_DIR}) # There seems no suitable other way to identify exact output binary name for libprotobuf @@ -21,7 +21,7 @@ if(NOT WIN32) add_custom_command( OUTPUT ${PROTOBUF_STATIC_LIB_OUTPUT} - COMMAND ${CMAKE_COMMAND} ${CMAKE_CURRENT_SOURCE_DIR}/cmake_static_protobuf + COMMAND ${CMAKE_COMMAND} ${CMAKE_SOURCE_DIR}/thirdparty/cmake_static_protobuf -DCMAKE_LIBRARY_OUTPUT_DIRECTORY=${CMAKE_LIBRARY_OUTPUT_DIRECTORY} -DCMAKE_ARCHIVE_OUTPUT_DIRECTORY=${CMAKE_ARCHIVE_OUTPUT_DIRECTORY} -DCMAKE_COMPILE_PDB_OUTPUT_DIRECTORY=${CMAKE_COMPILE_PDB_OUTPUT_DIRECTORY} diff --git a/ngraph/frontend/onnx_common/include/onnx_common/utils.hpp b/ngraph/frontend/onnx_common/include/onnx_common/utils.hpp index 499875cc26b..c16d11f2a65 100644 --- a/ngraph/frontend/onnx_common/include/onnx_common/utils.hpp +++ b/ngraph/frontend/onnx_common/include/onnx_common/utils.hpp @@ -5,7 +5,7 @@ namespace ONNX_NAMESPACE { - enum TensorProto_DataType; + enum TensorProto_DataType : int; } namespace ngraph diff --git a/ngraph/frontend/paddlepaddle/CMakeLists.txt b/ngraph/frontend/paddlepaddle/CMakeLists.txt index bb08e116de6..bf5c396ac6c 100644 --- a/ngraph/frontend/paddlepaddle/CMakeLists.txt +++ b/ngraph/frontend/paddlepaddle/CMakeLists.txt @@ -64,10 +64,6 @@ if(NOT WIN32) add_dependencies(${TARGET_NAME} libprotobuf_static) endif() -if (TARGET ext_protobuf) - add_dependencies(${TARGET_NAME} ext_protobuf) -endif() - target_include_directories(${TARGET_NAME} PUBLIC $ diff --git a/ngraph/python/BUILDING.md b/ngraph/python/BUILDING.md index 5d4d9c72892..18e395fc9d8 100644 --- a/ngraph/python/BUILDING.md +++ b/ngraph/python/BUILDING.md @@ -48,7 +48,6 @@ set the mentioned flags to `ON`. 
Note the `CMAKE_INSTALL_PREFIX`, which defaults -DENABLE_OPENCV=OFF \ -DENABLE_VPU=OFF \ -DENABLE_PYTHON=ON \ - -DNGRAPH_PYTHON_BUILD_ENABLE=ON \ -DNGRAPH_ONNX_IMPORT_ENABLE=ON \ -DCMAKE_INSTALL_PREFIX="${OPENVINO_BASEDIR}/openvino_dist" @@ -111,7 +110,6 @@ cmake .. ^ -DENABLE_CLDNN=OFF ^ -DENABLE_OPENCV=OFF ^ -DENABLE_VPU=OFF ^ - -DNGRAPH_PYTHON_BUILD_ENABLE=ON ^ -DNGRAPH_ONNX_IMPORT_ENABLE=ON ^ -DENABLE_PYTHON=ON ^ -DCMAKE_CXX_COMPILER="C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.27.29110\bin\Hostx64\x64" diff --git a/ngraph/python/CMakeLists.txt b/ngraph/python/CMakeLists.txt index 3231b4cacda..82b414d0499 100644 --- a/ngraph/python/CMakeLists.txt +++ b/ngraph/python/CMakeLists.txt @@ -15,19 +15,7 @@ if(ngraph_FOUND) message("ngraph version = {${ngraph_VERSION}}") endif() -include(FetchContent) - -FetchContent_Declare( - pybind11 - GIT_REPOSITORY "https://github.com/pybind/pybind11.git" - GIT_TAG "v2.6.2" -) - -FetchContent_GetProperties(pybind11) -if(NOT pybind11_POPULATED) - FetchContent_Populate(pybind11) - add_subdirectory(${pybind11_SOURCE_DIR} ${pybind11_BINARY_DIR}) -endif() +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/pybind11 EXCLUDE_FROM_ALL) # PYTHON_VERSION_MAJOR and PYTHON_VERSION_MINOR are defined inside pybind11 set(PYTHON_VERSION python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}) diff --git a/ngraph/python/pybind11 b/ngraph/python/pybind11 new file mode 160000 index 00000000000..8de7772cc72 --- /dev/null +++ b/ngraph/python/pybind11 @@ -0,0 +1 @@ +Subproject commit 8de7772cc72daca8e947b79b83fea46214931604 diff --git a/ngraph/python/setup.py b/ngraph/python/setup.py index e90cc66d6d1..059ccd81acb 100644 --- a/ngraph/python/setup.py +++ b/ngraph/python/setup.py @@ -155,7 +155,7 @@ class BuildCMakeExt(build_ext): ext_args = self.cmake_args.split() if self.cmake_args else [] self.spawn(["cmake", "-H" + root_dir, "-B" + self.build_temp, "-DCMAKE_BUILD_TYPE={}".format(self.config), - 
"-DNGRAPH_PYTHON_BUILD_ENABLE=ON", + "-DENABLE_PYTHON=ON", "-DNGRAPH_ONNX_IMPORT_ENABLE=ON"] + ext_args) self.announce("Building binaries", level=3) diff --git a/ngraph/python/src/ngraph/__init__.py b/ngraph/python/src/ngraph/__init__.py index f51c5cea130..f7e498c4a60 100644 --- a/ngraph/python/src/ngraph/__init__.py +++ b/ngraph/python/src/ngraph/__init__.py @@ -28,159 +28,159 @@ from ngraph.frontend import OpValidationFailure from ngraph.frontend import Place from ngraph.helpers import function_from_cnn from ngraph.helpers import function_to_cnn -from ngraph.opset7 import absolute -from ngraph.opset7 import absolute as abs -from ngraph.opset7 import acos -from ngraph.opset7 import acosh -from ngraph.opset7 import add -from ngraph.opset7 import asin -from ngraph.opset7 import asinh -from ngraph.opset7 import assign -from ngraph.opset7 import atan -from ngraph.opset7 import atanh -from ngraph.opset7 import avg_pool -from ngraph.opset7 import batch_norm_inference -from ngraph.opset7 import batch_to_space -from ngraph.opset7 import binary_convolution -from ngraph.opset7 import broadcast -from ngraph.opset7 import bucketize -from ngraph.opset7 import ceiling -from ngraph.opset7 import ceiling as ceil -from ngraph.opset7 import clamp -from ngraph.opset7 import concat -from ngraph.opset7 import constant -from ngraph.opset7 import convert -from ngraph.opset7 import convert_like -from ngraph.opset7 import convolution -from ngraph.opset7 import convolution_backprop_data -from ngraph.opset7 import cos -from ngraph.opset7 import cosh -from ngraph.opset7 import ctc_greedy_decoder -from ngraph.opset7 import ctc_greedy_decoder_seq_len -from ngraph.opset7 import ctc_loss -from ngraph.opset7 import cum_sum -from ngraph.opset7 import cum_sum as cumsum -from ngraph.opset7 import deformable_convolution -from ngraph.opset7 import deformable_psroi_pooling -from ngraph.opset7 import depth_to_space -from ngraph.opset7 import detection_output -from ngraph.opset7 import dft -from 
ngraph.opset7 import divide -from ngraph.opset7 import einsum -from ngraph.opset7 import elu -from ngraph.opset7 import embedding_bag_offsets_sum -from ngraph.opset7 import embedding_bag_packed_sum -from ngraph.opset7 import embedding_segments_sum -from ngraph.opset7 import extract_image_patches -from ngraph.opset7 import equal -from ngraph.opset7 import erf -from ngraph.opset7 import exp -from ngraph.opset7 import fake_quantize -from ngraph.opset7 import floor -from ngraph.opset7 import floor_mod -from ngraph.opset7 import gather -from ngraph.opset7 import gather_elements -from ngraph.opset7 import gather_nd -from ngraph.opset7 import gather_tree -from ngraph.opset7 import gelu -from ngraph.opset7 import greater -from ngraph.opset7 import greater_equal -from ngraph.opset7 import grn -from ngraph.opset7 import group_convolution -from ngraph.opset7 import group_convolution_backprop_data -from ngraph.opset7 import gru_cell -from ngraph.opset7 import gru_sequence -from ngraph.opset7 import hard_sigmoid -from ngraph.opset7 import hsigmoid -from ngraph.opset7 import hswish -from ngraph.opset7 import idft -from ngraph.opset7 import interpolate -from ngraph.opset7 import less -from ngraph.opset7 import less_equal -from ngraph.opset7 import log -from ngraph.opset7 import logical_and -from ngraph.opset7 import logical_not -from ngraph.opset7 import logical_or -from ngraph.opset7 import logical_xor -from ngraph.opset7 import log_softmax -from ngraph.opset7 import loop -from ngraph.opset7 import lrn -from ngraph.opset7 import lstm_cell -from ngraph.opset7 import lstm_sequence -from ngraph.opset7 import matmul -from ngraph.opset7 import max_pool -from ngraph.opset7 import maximum -from ngraph.opset7 import minimum -from ngraph.opset7 import mish -from ngraph.opset7 import mod -from ngraph.opset7 import multiply -from ngraph.opset7 import mvn -from ngraph.opset7 import negative -from ngraph.opset7 import non_max_suppression -from ngraph.opset7 import non_zero -from 
ngraph.opset7 import normalize_l2 -from ngraph.opset7 import not_equal -from ngraph.opset7 import one_hot -from ngraph.opset7 import pad -from ngraph.opset7 import parameter -from ngraph.opset7 import power -from ngraph.opset7 import prelu -from ngraph.opset7 import prior_box -from ngraph.opset7 import prior_box_clustered -from ngraph.opset7 import psroi_pooling -from ngraph.opset7 import proposal -from ngraph.opset7 import range -from ngraph.opset7 import read_value -from ngraph.opset7 import reduce_l1 -from ngraph.opset7 import reduce_l2 -from ngraph.opset7 import reduce_logical_and -from ngraph.opset7 import reduce_logical_or -from ngraph.opset7 import reduce_max -from ngraph.opset7 import reduce_mean -from ngraph.opset7 import reduce_min -from ngraph.opset7 import reduce_prod -from ngraph.opset7 import reduce_sum -from ngraph.opset7 import region_yolo -from ngraph.opset7 import reorg_yolo -from ngraph.opset7 import relu -from ngraph.opset7 import reshape -from ngraph.opset7 import result -from ngraph.opset7 import reverse_sequence -from ngraph.opset7 import rnn_cell -from ngraph.opset7 import rnn_sequence -from ngraph.opset7 import roi_align -from ngraph.opset7 import roi_pooling -from ngraph.opset7 import roll -from ngraph.opset7 import round -from ngraph.opset7 import scatter_elements_update -from ngraph.opset7 import scatter_update -from ngraph.opset7 import select -from ngraph.opset7 import selu -from ngraph.opset7 import shape_of -from ngraph.opset7 import shuffle_channels -from ngraph.opset7 import sigmoid -from ngraph.opset7 import sign -from ngraph.opset7 import sin -from ngraph.opset7 import sinh -from ngraph.opset7 import softmax -from ngraph.opset7 import softplus -from ngraph.opset7 import space_to_batch -from ngraph.opset7 import space_to_depth -from ngraph.opset7 import split -from ngraph.opset7 import sqrt -from ngraph.opset7 import squared_difference -from ngraph.opset7 import squeeze -from ngraph.opset7 import strided_slice -from ngraph.opset7 
import subtract -from ngraph.opset7 import swish -from ngraph.opset7 import tan -from ngraph.opset7 import tanh -from ngraph.opset7 import tensor_iterator -from ngraph.opset7 import tile -from ngraph.opset7 import topk -from ngraph.opset7 import transpose -from ngraph.opset7 import unsqueeze -from ngraph.opset7 import variadic_split +from ngraph.opset8 import absolute +from ngraph.opset8 import absolute as abs +from ngraph.opset8 import acos +from ngraph.opset8 import acosh +from ngraph.opset8 import add +from ngraph.opset8 import asin +from ngraph.opset8 import asinh +from ngraph.opset8 import assign +from ngraph.opset8 import atan +from ngraph.opset8 import atanh +from ngraph.opset8 import avg_pool +from ngraph.opset8 import batch_norm_inference +from ngraph.opset8 import batch_to_space +from ngraph.opset8 import binary_convolution +from ngraph.opset8 import broadcast +from ngraph.opset8 import bucketize +from ngraph.opset8 import ceiling +from ngraph.opset8 import ceiling as ceil +from ngraph.opset8 import clamp +from ngraph.opset8 import concat +from ngraph.opset8 import constant +from ngraph.opset8 import convert +from ngraph.opset8 import convert_like +from ngraph.opset8 import convolution +from ngraph.opset8 import convolution_backprop_data +from ngraph.opset8 import cos +from ngraph.opset8 import cosh +from ngraph.opset8 import ctc_greedy_decoder +from ngraph.opset8 import ctc_greedy_decoder_seq_len +from ngraph.opset8 import ctc_loss +from ngraph.opset8 import cum_sum +from ngraph.opset8 import cum_sum as cumsum +from ngraph.opset8 import deformable_convolution +from ngraph.opset8 import deformable_psroi_pooling +from ngraph.opset8 import depth_to_space +from ngraph.opset8 import detection_output +from ngraph.opset8 import dft +from ngraph.opset8 import divide +from ngraph.opset8 import einsum +from ngraph.opset8 import elu +from ngraph.opset8 import embedding_bag_offsets_sum +from ngraph.opset8 import embedding_bag_packed_sum +from ngraph.opset8 import 
embedding_segments_sum +from ngraph.opset8 import extract_image_patches +from ngraph.opset8 import equal +from ngraph.opset8 import erf +from ngraph.opset8 import exp +from ngraph.opset8 import fake_quantize +from ngraph.opset8 import floor +from ngraph.opset8 import floor_mod +from ngraph.opset8 import gather +from ngraph.opset8 import gather_elements +from ngraph.opset8 import gather_nd +from ngraph.opset8 import gather_tree +from ngraph.opset8 import gelu +from ngraph.opset8 import greater +from ngraph.opset8 import greater_equal +from ngraph.opset8 import grn +from ngraph.opset8 import group_convolution +from ngraph.opset8 import group_convolution_backprop_data +from ngraph.opset8 import gru_cell +from ngraph.opset8 import gru_sequence +from ngraph.opset8 import hard_sigmoid +from ngraph.opset8 import hsigmoid +from ngraph.opset8 import hswish +from ngraph.opset8 import idft +from ngraph.opset8 import interpolate +from ngraph.opset8 import less +from ngraph.opset8 import less_equal +from ngraph.opset8 import log +from ngraph.opset8 import logical_and +from ngraph.opset8 import logical_not +from ngraph.opset8 import logical_or +from ngraph.opset8 import logical_xor +from ngraph.opset8 import log_softmax +from ngraph.opset8 import loop +from ngraph.opset8 import lrn +from ngraph.opset8 import lstm_cell +from ngraph.opset8 import lstm_sequence +from ngraph.opset8 import matmul +from ngraph.opset8 import max_pool +from ngraph.opset8 import maximum +from ngraph.opset8 import minimum +from ngraph.opset8 import mish +from ngraph.opset8 import mod +from ngraph.opset8 import multiply +from ngraph.opset8 import mvn +from ngraph.opset8 import negative +from ngraph.opset8 import non_max_suppression +from ngraph.opset8 import non_zero +from ngraph.opset8 import normalize_l2 +from ngraph.opset8 import not_equal +from ngraph.opset8 import one_hot +from ngraph.opset8 import pad +from ngraph.opset8 import parameter +from ngraph.opset8 import power +from ngraph.opset8 import 
prelu +from ngraph.opset8 import prior_box +from ngraph.opset8 import prior_box_clustered +from ngraph.opset8 import psroi_pooling +from ngraph.opset8 import proposal +from ngraph.opset8 import range +from ngraph.opset8 import read_value +from ngraph.opset8 import reduce_l1 +from ngraph.opset8 import reduce_l2 +from ngraph.opset8 import reduce_logical_and +from ngraph.opset8 import reduce_logical_or +from ngraph.opset8 import reduce_max +from ngraph.opset8 import reduce_mean +from ngraph.opset8 import reduce_min +from ngraph.opset8 import reduce_prod +from ngraph.opset8 import reduce_sum +from ngraph.opset8 import region_yolo +from ngraph.opset8 import reorg_yolo +from ngraph.opset8 import relu +from ngraph.opset8 import reshape +from ngraph.opset8 import result +from ngraph.opset8 import reverse_sequence +from ngraph.opset8 import rnn_cell +from ngraph.opset8 import rnn_sequence +from ngraph.opset8 import roi_align +from ngraph.opset8 import roi_pooling +from ngraph.opset8 import roll +from ngraph.opset8 import round +from ngraph.opset8 import scatter_elements_update +from ngraph.opset8 import scatter_update +from ngraph.opset8 import select +from ngraph.opset8 import selu +from ngraph.opset8 import shape_of +from ngraph.opset8 import shuffle_channels +from ngraph.opset8 import sigmoid +from ngraph.opset8 import sign +from ngraph.opset8 import sin +from ngraph.opset8 import sinh +from ngraph.opset8 import softmax +from ngraph.opset8 import softplus +from ngraph.opset8 import space_to_batch +from ngraph.opset8 import space_to_depth +from ngraph.opset8 import split +from ngraph.opset8 import sqrt +from ngraph.opset8 import squared_difference +from ngraph.opset8 import squeeze +from ngraph.opset8 import strided_slice +from ngraph.opset8 import subtract +from ngraph.opset8 import swish +from ngraph.opset8 import tan +from ngraph.opset8 import tanh +from ngraph.opset8 import tensor_iterator +from ngraph.opset8 import tile +from ngraph.opset8 import topk +from 
ngraph.opset8 import transpose +from ngraph.opset8 import unsqueeze +from ngraph.opset8 import variadic_split # Extend Node class to support binary operators diff --git a/ngraph/python/src/ngraph/opset8/__init__.py b/ngraph/python/src/ngraph/opset8/__init__.py new file mode 100644 index 00000000000..08a24529d41 --- /dev/null +++ b/ngraph/python/src/ngraph/opset8/__init__.py @@ -0,0 +1,156 @@ +# Copyright (C) 2018-2021 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from ngraph.opset1.ops import absolute +from ngraph.opset1.ops import absolute as abs +from ngraph.opset1.ops import acos +from ngraph.opset4.ops import acosh +from ngraph.opset1.ops import add +from ngraph.opset1.ops import asin +from ngraph.opset4.ops import asinh +from ngraph.opset3.ops import assign +from ngraph.opset1.ops import atan +from ngraph.opset4.ops import atanh +from ngraph.opset1.ops import avg_pool +from ngraph.opset5.ops import batch_norm_inference +from ngraph.opset2.ops import batch_to_space +from ngraph.opset1.ops import binary_convolution +from ngraph.opset3.ops import broadcast +from ngraph.opset3.ops import bucketize +from ngraph.opset1.ops import ceiling +from ngraph.opset1.ops import ceiling as ceil +from ngraph.opset1.ops import clamp +from ngraph.opset1.ops import concat +from ngraph.opset1.ops import constant +from ngraph.opset1.ops import convert +from ngraph.opset1.ops import convert_like +from ngraph.opset1.ops import convolution +from ngraph.opset1.ops import convolution_backprop_data +from ngraph.opset1.ops import cos +from ngraph.opset1.ops import cosh +from ngraph.opset1.ops import ctc_greedy_decoder +from ngraph.opset6.ops import ctc_greedy_decoder_seq_len +from ngraph.opset4.ops import ctc_loss +from ngraph.opset3.ops import cum_sum +from ngraph.opset3.ops import cum_sum as cumsum +from ngraph.opset1.ops import deformable_convolution +from ngraph.opset1.ops import deformable_psroi_pooling +from ngraph.opset1.ops import depth_to_space +from 
ngraph.opset1.ops import detection_output +from ngraph.opset7.ops import dft +from ngraph.opset1.ops import divide +from ngraph.opset7.ops import einsum +from ngraph.opset1.ops import elu +from ngraph.opset3.ops import embedding_bag_offsets_sum +from ngraph.opset3.ops import embedding_bag_packed_sum +from ngraph.opset3.ops import embedding_segments_sum +from ngraph.opset3.ops import extract_image_patches +from ngraph.opset1.ops import equal +from ngraph.opset1.ops import erf +from ngraph.opset1.ops import exp +from ngraph.opset1.ops import fake_quantize +from ngraph.opset1.ops import floor +from ngraph.opset1.ops import floor_mod +from ngraph.opset7.ops import gather +from ngraph.opset6.ops import gather_elements +from ngraph.opset5.ops import gather_nd +from ngraph.opset1.ops import gather_tree +from ngraph.opset7.ops import gelu +from ngraph.opset1.ops import greater +from ngraph.opset1.ops import greater_equal +from ngraph.opset1.ops import grn +from ngraph.opset1.ops import group_convolution +from ngraph.opset1.ops import group_convolution_backprop_data +from ngraph.opset3.ops import gru_cell +from ngraph.opset5.ops import gru_sequence +from ngraph.opset1.ops import hard_sigmoid +from ngraph.opset5.ops import hsigmoid +from ngraph.opset4.ops import hswish +from ngraph.opset7.ops import idft +from ngraph.opset1.ops import interpolate +from ngraph.opset1.ops import less +from ngraph.opset1.ops import less_equal +from ngraph.opset1.ops import log +from ngraph.opset1.ops import logical_and +from ngraph.opset1.ops import logical_not +from ngraph.opset1.ops import logical_or +from ngraph.opset1.ops import logical_xor +from ngraph.opset5.ops import log_softmax +from ngraph.opset5.ops import loop +from ngraph.opset1.ops import lrn +from ngraph.opset4.ops import lstm_cell +from ngraph.opset1.ops import lstm_sequence +from ngraph.opset1.ops import matmul +from ngraph.opset1.ops import max_pool +from ngraph.opset1.ops import maximum +from ngraph.opset1.ops import minimum 
+from ngraph.opset4.ops import mish +from ngraph.opset1.ops import mod +from ngraph.opset1.ops import multiply +from ngraph.opset6.ops import mvn +from ngraph.opset1.ops import negative +from ngraph.opset5.ops import non_max_suppression +from ngraph.opset3.ops import non_zero +from ngraph.opset1.ops import normalize_l2 +from ngraph.opset1.ops import not_equal +from ngraph.opset1.ops import one_hot +from ngraph.opset1.ops import pad +from ngraph.opset1.ops import parameter +from ngraph.opset1.ops import power +from ngraph.opset1.ops import prelu +from ngraph.opset1.ops import prior_box +from ngraph.opset1.ops import prior_box_clustered +from ngraph.opset1.ops import psroi_pooling +from ngraph.opset4.ops import proposal +from ngraph.opset1.ops import range +from ngraph.opset3.ops import read_value +from ngraph.opset4.ops import reduce_l1 +from ngraph.opset4.ops import reduce_l2 +from ngraph.opset1.ops import reduce_logical_and +from ngraph.opset1.ops import reduce_logical_or +from ngraph.opset1.ops import reduce_max +from ngraph.opset1.ops import reduce_mean +from ngraph.opset1.ops import reduce_min +from ngraph.opset1.ops import reduce_prod +from ngraph.opset1.ops import reduce_sum +from ngraph.opset1.ops import region_yolo +from ngraph.opset2.ops import reorg_yolo +from ngraph.opset1.ops import relu +from ngraph.opset1.ops import reshape +from ngraph.opset1.ops import result +from ngraph.opset1.ops import reverse_sequence +from ngraph.opset3.ops import rnn_cell +from ngraph.opset5.ops import rnn_sequence +from ngraph.opset3.ops import roi_align +from ngraph.opset2.ops import roi_pooling +from ngraph.opset7.ops import roll +from ngraph.opset5.ops import round +from ngraph.opset3.ops import scatter_elements_update +from ngraph.opset3.ops import scatter_update +from ngraph.opset1.ops import select +from ngraph.opset1.ops import selu +from ngraph.opset3.ops import shape_of +from ngraph.opset3.ops import shuffle_channels +from ngraph.opset1.ops import sigmoid +from 
ngraph.opset1.ops import sign +from ngraph.opset1.ops import sin +from ngraph.opset1.ops import sinh +from ngraph.opset1.ops import softmax +from ngraph.opset4.ops import softplus +from ngraph.opset2.ops import space_to_batch +from ngraph.opset1.ops import space_to_depth +from ngraph.opset1.ops import split +from ngraph.opset1.ops import sqrt +from ngraph.opset1.ops import squared_difference +from ngraph.opset1.ops import squeeze +from ngraph.opset1.ops import strided_slice +from ngraph.opset1.ops import subtract +from ngraph.opset4.ops import swish +from ngraph.opset1.ops import tan +from ngraph.opset1.ops import tanh +from ngraph.opset1.ops import tensor_iterator +from ngraph.opset1.ops import tile +from ngraph.opset3.ops import topk +from ngraph.opset1.ops import transpose +from ngraph.opset1.ops import unsqueeze +from ngraph.opset1.ops import variadic_split diff --git a/ngraph/python/src/ngraph/opset8/ops.py b/ngraph/python/src/ngraph/opset8/ops.py new file mode 100644 index 00000000000..6ef5990f079 --- /dev/null +++ b/ngraph/python/src/ngraph/opset8/ops.py @@ -0,0 +1,45 @@ +# Copyright (C) 2018-2021 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +"""Factory functions for all ngraph ops.""" +from functools import partial +from typing import Callable, Iterable, List, Optional, Set, Union + +import numpy as np +from ngraph.impl import Node, Shape +from ngraph.impl.op import Constant, Parameter +from ngraph.opset_utils import _get_node_factory +from ngraph.utils.decorators import binary_op, nameable_op, unary_op +from ngraph.utils.input_validation import ( + assert_list_of_ints, + check_valid_attributes, + is_non_negative_value, + is_positive_value, +) +from ngraph.utils.node_factory import NodeFactory +from ngraph.utils.tensor_iterator_types import ( + GraphBody, + TensorIteratorSliceInputDesc, + TensorIteratorMergedInputDesc, + TensorIteratorInvariantInputDesc, + TensorIteratorBodyOutputDesc, + TensorIteratorConcatOutputDesc, +) +from 
ngraph.utils.types import ( + NodeInput, + NumericData, + NumericType, + ScalarData, + TensorShape, + as_node, + as_nodes, + get_dtype, + get_element_type, + get_element_type_str, + make_constant_node, +) + +_get_node_factory_opset8 = partial(_get_node_factory, "opset8") + + +# -------------------------------------------- ops ------------------------------------------------ diff --git a/ngraph/python/src/ngraph/utils/node_factory.py b/ngraph/python/src/ngraph/utils/node_factory.py index bfac8c5bdca..67e1825a4b3 100644 --- a/ngraph/python/src/ngraph/utils/node_factory.py +++ b/ngraph/python/src/ngraph/utils/node_factory.py @@ -8,7 +8,7 @@ from _pyngraph import NodeFactory as _NodeFactory from ngraph.impl import Node, Output -DEFAULT_OPSET = "opset6" +DEFAULT_OPSET = "opset8" class NodeFactory(object): diff --git a/ngraph/python/src/pyngraph/node_factory.cpp b/ngraph/python/src/pyngraph/node_factory.cpp index 0e90989dc3e..0f3a10dde69 100644 --- a/ngraph/python/src/pyngraph/node_factory.cpp +++ b/ngraph/python/src/pyngraph/node_factory.cpp @@ -75,6 +75,7 @@ namespace {"opset5", OpsetFunction(ngraph::get_opset5)}, {"opset6", OpsetFunction(ngraph::get_opset6)}, {"opset7", OpsetFunction(ngraph::get_opset7)}, + {"opset8", OpsetFunction(ngraph::get_opset8)}, }; auto it = s_opsets.find(opset_ver); diff --git a/ngraph/test/CMakeLists.txt b/ngraph/test/CMakeLists.txt index f516c0d0708..8147e9008bd 100644 --- a/ngraph/test/CMakeLists.txt +++ b/ngraph/test/CMakeLists.txt @@ -96,6 +96,7 @@ set(SRC type_prop/adaptive_max_pool.cpp type_prop/asin.cpp type_prop/assign.cpp + type_prop/atan.cpp type_prop/avg_pool.cpp type_prop/batch_norm.cpp type_prop/batch_to_space.cpp @@ -156,11 +157,13 @@ set(SRC type_prop/lstm_sequence.cpp type_prop/loop.cpp type_prop/matmul.cpp + type_prop/matrix_nms.cpp type_prop/maximum.cpp type_prop/max_pool.cpp type_prop/minimum.cpp type_prop/mish.cpp type_prop/mod.cpp + type_prop/multiclass_nms.cpp type_prop/mvn.cpp type_prop/negative.cpp 
type_prop/non_max_suppression.cpp @@ -171,6 +174,7 @@ set(SRC type_prop/parameter.cpp type_prop/power.cpp type_prop/prelu.cpp + type_prop/prior_box.cpp type_prop/proposal.cpp type_prop/psroi_pooling.cpp type_prop/range.cpp @@ -204,6 +208,7 @@ set(SRC type_prop/shape_of.cpp type_prop/shuffle_channels.cpp type_prop/sin.cpp + type_prop/sinh.cpp type_prop/softmax.cpp type_prop/softplus.cpp type_prop/space_to_batch.cpp @@ -226,7 +231,9 @@ set(SRC visitors/value_map.cpp visitors/op/adaptive_avg_pool.cpp visitors/op/adaptive_max_pool.cpp + visitors/op/atan.cpp visitors/op/batch_norm.cpp + visitors/op/batch_to_space.cpp visitors/op/broadcast.cpp visitors/op/bucketize.cpp visitors/op/ceiling.cpp @@ -255,9 +262,11 @@ set(SRC visitors/op/lstm_cell.cpp visitors/op/lstm_sequence.cpp visitors/op/matmul.cpp + visitors/op/matrix_nms.cpp visitors/op/max_pool.cpp visitors/op/mish.cpp visitors/op/mod.cpp + visitors/op/multiclass_nms.cpp visitors/op/mvn.cpp visitors/op/negative.cpp visitors/op/non_max_suppression.cpp @@ -287,8 +296,10 @@ set(SRC visitors/op/rnn_cell.cpp visitors/op/roi_pooling.cpp visitors/op/round.cpp + visitors/op/space_to_depth.cpp visitors/op/selu.cpp visitors/op/shuffle_channels.cpp + visitors/op/sinh.cpp visitors/op/softmax.cpp visitors/op/softplus.cpp visitors/op/space_to_batch.cpp @@ -355,6 +366,8 @@ set(MULTI_TEST_SRC backend/abs.in.cpp backend/acos.in.cpp backend/acosh.in.cpp + backend/adaptive_avg_pool.in.cpp + backend/adaptive_max_pool.in.cpp backend/add.in.cpp backend/aliased_output.in.cpp backend/api.in.cpp @@ -365,6 +378,7 @@ set(MULTI_TEST_SRC backend/auto_broadcast.in.cpp backend/avg_pool.in.cpp backend/batch_norm.in.cpp + backend/batch_to_space.in.cpp backend/broadcast.in.cpp backend/bucketize.in.cpp backend/builder_reduce_ops_opset1.in.cpp @@ -420,11 +434,13 @@ set(MULTI_TEST_SRC backend/logical_xor.in.cpp backend/lrn.in.cpp backend/matmul.in.cpp + backend/matrix_nms.in.cpp backend/maximum.in.cpp backend/max_pool.in.cpp backend/minimum.in.cpp 
backend/mish.in.cpp backend/mod.in.cpp + backend/multiclass_nms.in.cpp backend/multiple_backends.in.cpp backend/multiple_result.in.cpp backend/multiply.in.cpp @@ -440,6 +456,7 @@ set(MULTI_TEST_SRC backend/parameter_as_output.in.cpp backend/power.in.cpp backend/prelu.in.cpp + backend/prior_box.in.cpp backend/proposal.in.cpp backend/psroi_pooling.in.cpp backend/range.in.cpp @@ -462,6 +479,7 @@ set(MULTI_TEST_SRC backend/roll.in.cpp backend/round.in.cpp backend/scatter_nd_update.in.cpp + backend/space_to_depth.in.cpp backend/select.in.cpp backend/selu.in.cpp backend/shape_of.in.cpp @@ -528,27 +546,9 @@ set(SRC ${FRONTEND_SHARED_TESTS_SRC} ${SRC}) # ---- PaddlePaddle FrontEnd testing ------ if (NGRAPH_PDPD_FRONTEND_ENABLE) - find_package (PythonInterp 3 REQUIRED) - set(PDPD_PYTHON_OK TRUE) - if(NOT PYTHON_EXECUTABLE) - message("Python3 is required to build the PDPD frontend unit tests") - set(PDPD_PYTHON_OK FALSE) - endif() + ie_check_pip_package(paddlepaddle WARNING) - if (PDPD_PYTHON_OK) - execute_process( - COMMAND ${PYTHON_EXECUTABLE} -m pip show paddlepaddle - RESULT_VARIABLE PIP_EXIT_CODE - OUTPUT_QUIET - ) - - if (NOT ${PIP_EXIT_CODE} EQUAL 0) - message("Python paddlepaddle package is not installed. 
Please use \"pip install paddlepaddle==2.0.1\".") - set(PDPD_PYTHON_OK FALSE) - endif() - endif() - - if (PDPD_PYTHON_OK) + if(paddlepaddle_FOUND) file(GLOB FRONTEND_PDPD_TESTS_SRC ${CMAKE_CURRENT_SOURCE_DIR}/frontend/paddlepaddle/*.cpp) set(SRC ${FRONTEND_PDPD_TESTS_SRC} ${SRC}) set(TEST_PDPD_MODELS ${CMAKE_CURRENT_BINARY_DIR}/pdpd_test_models/) @@ -651,7 +651,7 @@ add_subdirectory(frontend) ### END FRONTEND ### #PaddlePaddle - test models generator -if (NGRAPH_PDPD_FRONTEND_ENABLE AND PDPD_PYTHON_OK) +if (NGRAPH_PDPD_FRONTEND_ENABLE AND paddlepaddle_FOUND) file(GLOB_RECURSE PDPD_GEN_SCRIPTS ${CMAKE_CURRENT_SOURCE_DIR}/files/paddlepaddle/gen_scripts/generate_*.py) set(OUT_FILES "") foreach(GEN_SCRIPT ${PDPD_GEN_SCRIPTS}) @@ -671,4 +671,3 @@ if (NGRAPH_PDPD_FRONTEND_ENABLE AND PDPD_PYTHON_OK) add_dependencies(unit-test pdpd_test_models) add_dependencies(unit-test paddlepaddle_ngraph_frontend) endif() - diff --git a/ngraph/test/backend/adaptive_avg_pool.in.cpp b/ngraph/test/backend/adaptive_avg_pool.in.cpp new file mode 100644 index 00000000000..eeb8c5ea0f9 --- /dev/null +++ b/ngraph/test/backend/adaptive_avg_pool.in.cpp @@ -0,0 +1,131 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "gtest/gtest.h" +#include "ngraph/ngraph.hpp" +#include "util/engine/test_engines.hpp" +#include "util/test_case.hpp" +#include "util/test_control.hpp" + +using namespace std; +using namespace ngraph; + +static string s_manifest = "${MANIFEST}"; +using TestEngine = test::ENGINE_CLASS_NAME(${BACKEND_NAME}); + +NGRAPH_TEST(${BACKEND_NAME}, adaptive_avg_pool_1d) +{ + auto data = make_shared(element::f32, Shape{2, 3, 7}); + auto output_shape = op::Constant::create(element::i64, Shape{1}, {3}); + auto adaptive_pool = make_shared(data, output_shape); + auto fun = make_shared(OutputVector{adaptive_pool}, ParameterVector{data}); + + std::vector inputs{ 0, 4, 1, 3, -2, -5, -2, + -2, 1, -3, 1, -3, -4, 0, + -2, 1, -1, -2, 3, -1, -3, + + -1, 
-2, 3, 4, -3, -4, 1, + 2, 0, -4, -5, -2, -2, -3, + 2, 3, 1, -5, 2, -4, -2}; + std::vector expected_result{ 1.66666663, 0.66666669, -3. , + -1.33333337, -1.66666663, -2.33333325, + -0.66666669, 0. , -0.33333334, + + 0. , 1.33333337, -2. , + -0.66666669, -3.66666675, -2.33333325, + 2. , -0.66666669, -1.33333337}; + + auto test_case = test::TestCase(fun); + test_case.add_input(Shape{2, 3, 7}, inputs); + test_case.add_expected_output(Shape{2, 3, 3}, expected_result); + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, adaptive_avg_pool_2d) +{ + auto data = make_shared(element::f32, Shape{1, 3, 7, 10}); + auto output_shape = op::Constant::create(element::i64, Shape{2}, {3, 3}); + auto adaptive_pool = make_shared(data, output_shape); + auto fun = make_shared(OutputVector{adaptive_pool}, ParameterVector{data}); + + std::vector inputs{-2, -3, -4, 3, -5, 4, 0, -4, -2, -4, + -5, 0, -3, 0, -2, 0, 0, -5, -4, -1, + 3, -1, 0, -1, 0, -2, 0, 4, 1, 4, + 0, -1, -4, 2, -2, -5, -1, -1, -2, 1, + 2, -2, -1, 2, 0, -1, 0, -5, 4, 4, + 3, 0, -4, -4, -4, -2, 0, 1, -2, -1, + 4, -2, -4, 1, -1, -3, -4, -1, 1, -4, + + -2, -4, -5, 0, -4, 3, 4, -5, -4, -2, + 0, 2, -4, -3, 3, -1, 1, -4, -5, 4, + 2, -5, 2, -3, 0, 4, 3, 3, 1, 2, + -1, -4, 1, -3, -3, -2, 3, 4, -2, -5, + 1, 4, 4, -2, 2, 1, -5, -2, -5, 1, + 1, -2, -3, -3, -1, -5, 1, -3, -5, -3, + -4, -1, 4, -3, 4, -1, 4, 3, 1, 4, + + -2, -4, -4, 4, -3, 4, 2, -3, -2, 4, + -3, 0, 1, -4, 4, 4, 0, 3, -1, 3, + 3, -5, 0, 3, -3, 1, -2, 4, -5, -5, + 1, 0, -1, 0, -3, -2, 0, -3, 3, -2, + -2, 0, -3, 4, -1, 2, -2, 2, -3, -1, + -4, -2, 0, 2, 0, 2, 0, -3, 4, 3, + -5, -3, -5, 1, -5, -3, -5, 4, -3, 3}; + std::vector expected_result{-1.08333337, -0.25000000, -0.91666669, + -0.08333334, -0.66666669, 0.75000000, + -0.41666666, -1.33333337, -0.58333331, + + -1.66666663, 0.58333331, -0.16666667, + -0.33333334, -0.41666666, -0.16666667, + -0.33333334, -0.66666669, -0.75000000, + + -0.91666669, 0.83333331, -0.16666667, + 0. 
, -0.25000000, -1.16666663, + -1.41666663, -0.41666666, -0.08333334}; + + auto test_case = test::TestCase(fun); + test_case.add_input(Shape{1, 3, 7, 10}, inputs); + test_case.add_expected_output(Shape{1, 3, 3, 3}, expected_result); + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, adaptive_avg_pool_3d) +{ + auto data = make_shared(element::f32, Shape{2, 2, 3, 3, 3}); + auto output_shape = op::Constant::create(element::i64, Shape{3}, {2, 2, 2}); + auto adaptive_pool = make_shared(data, output_shape); + auto fun = make_shared(OutputVector{adaptive_pool}, ParameterVector{data}); + + std::vector inputs{-5, 1, -3, -4, 4, -4, 3, -3, -1, + 0, 0, -2, -4, 2, 0, -4, -5, -2, + -4, -4, 0, -2, 3, -3, 4, -1, -4, + + -1, -1, -5, 4, -1, -2, -3, 0, 4, + -1, -5, -4, 1, 1, 4, -5, -5, -5, + 4, -3, -3, -3, 4, 0, -3, -5, 1, + + 4, 2, 1, -5, -5, 1, 0, -4, -1, + 2, -4, -2, 4, 3, 1, -3, -3, -2, + -4, -3, -3, 3, -1, 1, 2, 2, -4, + + -5, -4, 1, 3, -4, -1, 2, 4, -5, + 0, 1, -2, 0, 0, -2, 3, -2, -5, + -3, -5, -2, -1, 3, -2, 4, 3, -3}; + std::vector expected_result{-0.750, -0.250, -1.375, -1.125, + -1.125, -0.500, -0.875, -1.250, + + -0.375, -1.625, -1. , -0.500, + -0.250, -0.750, -1.875, -0.625, + + 0.125, -0.375, -1.625, -1.250, + 0. , -1. 
, 0.875, -0.375, + + -1.125, -1.375, 0.750, -1.875, + -0.625, -1.125, 1.250, -1.}; + + auto test_case = test::TestCase(fun); + test_case.add_input(Shape{2, 2, 3, 3, 3}, inputs); + test_case.add_expected_output(Shape{2, 2, 2, 2, 2}, expected_result); + test_case.run(); +} diff --git a/ngraph/test/backend/adaptive_max_pool.in.cpp b/ngraph/test/backend/adaptive_max_pool.in.cpp new file mode 100644 index 00000000000..a2685f3ad2a --- /dev/null +++ b/ngraph/test/backend/adaptive_max_pool.in.cpp @@ -0,0 +1,161 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include "gtest/gtest.h" +#include "ngraph/ngraph.hpp" +#include "util/engine/test_engines.hpp" +#include "util/test_case.hpp" +#include "util/test_control.hpp" + +using namespace std; +using namespace ngraph; + +static string s_manifest = "${MANIFEST}"; +using TestEngine = test::ENGINE_CLASS_NAME(${BACKEND_NAME}); + +NGRAPH_TEST(${BACKEND_NAME}, adaptive_max_pool_1d) +{ + auto data = make_shared(element::f32, Shape{2, 3, 7}); + auto output_shape = op::Constant::create(element::i64, Shape{1}, {3}); + auto adaptive_pool = make_shared(data, output_shape); + auto fun = make_shared(adaptive_pool->outputs(), ParameterVector{data}); + + std::vector inputs{ 0, 4, 1, 3, -2, -5, -2, + -2, 1, -3, 1, -3, -4, 0, + -2, 1, -1, -2, 3, -1, -3, + + -1, -2, 3, 4, -3, -4, 1, + 2, 0, -4, -5, -2, -2, -3, + 2, 3, 1, -5, 2, -4, -2}; + std::vector expected_result{ 4, 3, -2, + 1, 1, 0, + 1, 3, 3, + + 3, 4, 1, + 2, -2, -2, + 3, 2, 2}; + std::vector expected_indices{1, 3, 4, + 1, 3, 6, + 1, 4, 4, + + 2, 3, 6, + 0, 4, 4, + 1, 4, 4}; + auto test_case = test::TestCase(fun); + test_case.add_input(Shape{2, 3, 7}, inputs); + test_case.add_expected_output(Shape{2, 3, 3}, expected_result); + test_case.add_expected_output(Shape{2, 3, 3}, expected_indices); + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, adaptive_max_pool_2d) +{ + auto data = make_shared(element::f32, Shape{1, 3, 7, 10}); + auto 
output_shape = op::Constant::create(element::i64, Shape{2}, {3, 3}); + auto adaptive_pool = make_shared(data, output_shape); + auto fun = make_shared(adaptive_pool->outputs(), ParameterVector{data}); + + std::vector inputs{ 0, -2, -5, -5, 2, 3, 2, -3, 1, -2, + -4, -1, -1, -1, 2, -4, 3, -5, -1, -1, + 1, 2, 4, -2, -3, -2, 0, -5, 2, -4, + -1, -4, 4, 2, 1, -2, 2, -3, 0, 1, + -3, 3, -1, 4, 0, 2, 0, 3, 4, -4, + 1, 4, -1, -5, -2, 4, -3, 3, 2, 1, + 0, 4, 2, -5, 2, -5, -2, -1, 4, 2, + + 0, 4, -2, 0, -5, -3, 4, -4, -2, -2, + 2, 1, 4, 3, 2, -5, -4, -4, 0, 1, + 4, -4, -3, 3, 3, 4, -2, -3, -4, -2, + 0, 1, -1, 3, -2, 2, 0, -3, -1, -1, + 0, 0, 2, 2, -2, 1, -3, 1, 2, 4, + 3, -5, -4, 1, -4, 2, 0, -2, -5, 2, + -3, -2, -3, -4, 2, -2, -4, 2, -4, -3, + + 1, -5, -1, -5, 2, 1, 3, 4, 3, 0, + -5, 4, -3, -4, -1, 2, -4, 2, 0, -5, + -3, 0, 2, -3, -5, 3, -2, -1, -5, -4, + -5, 0, -5, -1, -3, 3, 3, -4, -3, -4, + -5, 4, -1, 1, -1, -4, 1, -3, -4, -1, + -2, -3, -5, 2, 2, -5, 1, 1, -5, -4, + 0, 2, 4, 2, 0, 2, 4, 0, -5, 2}; + std::vector expected_result{4, 3, 3, + 4, 4, 4, + 4, 4, 4, + + 4, 4, 4, + 4, 4, 4, + 3, 2, 4, + + 4, 3, 4, + 4, 3, 3, + 4, 4, 4}; + std::vector expected_indices{22, 5 , 16, + 22, 43, 48, + 43, 43, 48, + + 1 , 6 , 6 , + 20, 25, 49, + 50, 43, 49, + + 11, 6 , 7 , + 41, 25, 36, + 41, 66, 66}; + + auto test_case = test::TestCase(fun); + test_case.add_input(Shape{1, 3, 7, 10}, inputs); + test_case.add_expected_output(Shape{1, 3, 3, 3}, expected_result); + test_case.add_expected_output(Shape{1, 3, 3, 3}, expected_indices); + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, adaptive_max_pool_3d) +{ + auto data = make_shared(element::f32, Shape{2, 2, 3, 3, 3}); + auto output_shape = op::Constant::create(element::i64, Shape{3}, {2, 2, 2}); + auto adaptive_pool = make_shared(data, output_shape); + auto fun = make_shared(adaptive_pool->outputs(), ParameterVector{data}); + + std::vector inputs{-5, 1, -3, -4, 4, -4, 3, -3, -1, + 0, 0, -2, -4, 2, 0, -4, -5, -2, + -4, -4, 0, -2, 3, -3, 4, 
-1, -4, + + -1, -1, -5, 4, -1, -2, -3, 0, 4, + -1, -5, -4, 1, 1, 4, -5, -5, -5, + 4, -3, -3, -3, 4, 0, -3, -5, 1, + + 4, 2, 1, -5, -5, 1, 0, -4, -1, + 2, -4, -2, 4, 3, 1, -3, -3, -2, + -4, -3, -3, 3, -1, 1, 2, 2, -4, + + -5, -4, 1, 3, -4, -1, 2, 4, -5, + 0, 1, -2, 0, 0, -2, 3, -2, -5, + -3, -5, -2, -1, 3, -2, 4, 3, -3}; + std::vector expected_result{4, 4, 4, 4, + 3, 3, 4, 3, + + 4, 4, 4, 4, + 4, 4, 4, 4, + + 4, 3, 4, 3, + 4, 3, 4, 3, + + 3, 1, 4, 4, + 3, 3, 4, 3}; + std::vector expected_indices{4 , 4 , 4 , 4 , + 22, 22, 24, 22, + + 3 , 14, 3 , 8 , + 18, 14, 22, 14, + + 0 , 13, 12, 13, + 12, 13, 12, 13, + + 3 , 2 , 7 , 7 , + 22, 22, 24, 22}; + + auto test_case = test::TestCase(fun); + test_case.add_input(Shape{2, 2, 3, 3, 3}, inputs); + test_case.add_expected_output(Shape{2, 2, 2, 2, 2}, expected_result); + test_case.add_expected_output(Shape{2, 2, 2, 2, 2}, expected_indices); + test_case.run(); +} diff --git a/ngraph/test/backend/atan.in.cpp b/ngraph/test/backend/atan.in.cpp index 8a8f29bd0ab..5c5e4f79bfc 100644 --- a/ngraph/test/backend/atan.in.cpp +++ b/ngraph/test/backend/atan.in.cpp @@ -31,7 +31,7 @@ using namespace ngraph; static string s_manifest = "${MANIFEST}"; using TestEngine = test::ENGINE_CLASS_NAME(${BACKEND_NAME}); -NGRAPH_TEST(${BACKEND_NAME}, atan) +NGRAPH_TEST(${BACKEND_NAME}, atan_float) { Shape shape{11}; auto A = make_shared(element::f32, shape); @@ -53,3 +53,16 @@ NGRAPH_TEST(${BACKEND_NAME}, atan) 1.32581766f}); test_case.run(); } + +NGRAPH_TEST(${BACKEND_NAME}, atan_int) +{ + Shape shape{5}; + auto A = make_shared(element::i32, shape); + auto f = make_shared(make_shared(A), ParameterVector{A}); + + auto test_case = test::TestCase(f); + test_case.add_input({-2, -1, 0, 1, 2}); + test_case.add_expected_output(shape, + {-1, -1, 0, 1, 1}); + test_case.run(); +} diff --git a/ngraph/test/backend/batch_to_space.in.cpp b/ngraph/test/backend/batch_to_space.in.cpp new file mode 100644 index 00000000000..ac6f07e3002 --- /dev/null +++ 
b/ngraph/test/backend/batch_to_space.in.cpp @@ -0,0 +1,179 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "gtest/gtest.h" +#include "ngraph/ngraph.hpp" +#include "util/ndarray.hpp" +#include "util/test_case.hpp" +#include "util/engine/test_engines.hpp" +#include "util/test_control.hpp" + +using namespace std; +using namespace ngraph; + +static string s_manifest = "${MANIFEST}"; +using TestEngine = test::ENGINE_CLASS_NAME(${BACKEND_NAME}); + +namespace +{ + template + struct BatchToSpaceParams + { + using Data = test::NDArrayBase; + using BlockShape = test::NDArrayBase; + using Crops = test::NDArrayBase; + + BatchToSpaceParams(Data in_data, + BlockShape block_shape, + Crops crops_begin, + Crops crops_end, + Data expected_output) + : m_data{std::move(in_data)} + , m_block_shape{std::move(block_shape)} + , m_crops_begin{std::move(crops_begin)} + , m_crops_end{std::move(crops_end)} + , m_expected_output{std::move(expected_output)} + { + } + + Data m_data; + BlockShape m_block_shape; + Crops m_crops_begin; + Crops m_crops_end; + Data m_expected_output; + }; + + template + static void BatchToSpaceTestExecute(const BatchToSpaceParams& params) + { + const auto data = + make_shared(element::from(), params.m_data.get_shape()); + + const auto block_shape = op::Constant::create( + element::i64, params.m_block_shape.get_shape(), params.m_block_shape.get_vector()); + + const auto crops_begin = op::Constant::create( + element::i64, params.m_crops_begin.get_shape(), params.m_crops_begin.get_vector()); + + const auto crops_end = op::Constant::create( + element::i64, params.m_crops_end.get_shape(), params.m_crops_end.get_vector()); + + const auto batch_to_space = + make_shared(data, block_shape, crops_begin, crops_end); + + auto f = make_shared(batch_to_space, ParameterVector{data}); + auto test_case = test::TestCase(f); + test_case.add_input(params.m_data.get_vector()); + 
test_case.add_expected_output(params.m_expected_output.get_vector()); + test_case.run_with_tolerance_as_fp(1e-4f); + } + + class BatchToSpaceTestFloat : public testing::TestWithParam> + { + }; +} // namespace + +NGRAPH_TEST_P(${BACKEND_NAME}, BatchToSpaceTestFloat, BatchToSpaceTestFloatCases) +{ + BatchToSpaceTestExecute(GetParam()); +} + +const test::NDArray input_with_shape_4x1x1x3( + {{{{1.0f, 2.0f, 3.0f}}}, + {{{4.0f, 5.0f, 6.0f}}}, + {{{7.0f, 8.0f, 9.0f}}}, + {{{10.0f, 11.0f, 12.0f}}}}); + +const test::NDArray input_with_shape_4x1x2x3( + {{{{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}}}, + {{{7.0f, 8.0f, 9.0f}, {10.0f, 11.0f, 12.0f}}}, + {{{13.0f, 14.0f, 15.0f}, {16.0f, 17.0f, 18.0f}}}, + {{{19.0f, 20.0f, 21.0f}, {22.0f, 23.0f, 24.0f}}}}); + +const test::NDArray zero_crops_4d({0, 0, 0, 0}); + +NGRAPH_INSTANTIATE_TEST_SUITE_P( + ${BACKEND_NAME}, + batch_to_space_4d_without_crops, + BatchToSpaceTestFloat, + testing::Values( + BatchToSpaceParams{input_with_shape_4x1x1x3, + test::NDArray({1, 1, 1, 2}), + zero_crops_4d, + zero_crops_4d, + test::NDArray( + {{{{1.0f, 7.0f, 2.0f, 8.0f, 3.0f, 9.0f}}}, + {{{4.0f, 10.0f, 5.0f, 11.0f, 6.0f, 12.0f}}}})}, + BatchToSpaceParams{input_with_shape_4x1x1x3, + test::NDArray({1, 1, 2, 1}), + zero_crops_4d, + zero_crops_4d, + test::NDArray( + {{{{1.0f, 2.0f, 3.0f}, {7.0f, 8.0f, 9.0f}}}, + {{{4.0f, 5.0f, 6.0f}, {10.0f, 11.0f, 12.0f}}}})}, + BatchToSpaceParams{input_with_shape_4x1x1x3, + test::NDArray({1, 1, 2, 2}), + zero_crops_4d, + zero_crops_4d, + test::NDArray( + {{{{1.0f, 4.0f, 2.0f, 5.0f, 3.0f, 6.0f}, + {7.0f, 10.0f, 8.0f, 11.0f, 9.0f, 12.0f}}}})}, + BatchToSpaceParams{input_with_shape_4x1x2x3, + test::NDArray({1, 1, 1, 2}), + zero_crops_4d, + zero_crops_4d, + test::NDArray( + {{{{1.0f, 13.0f, 2.0f, 14.0f, 3.0f, 15.0f}, + {4.0f, 16.0f, 5.0f, 17.0f, 6.0f, 18.0f}}}, + {{{7.0f, 19.0f, 8.0f, 20.0f, 9.0f, 21.0f}, + {10.0f, 22.0f, 11.0f, 23.0f, 12.0f, 24.0f}}}})}, + BatchToSpaceParams{input_with_shape_4x1x2x3, + test::NDArray({1, 1, 2, 
1}), + zero_crops_4d, + zero_crops_4d, + test::NDArray( + {{{{1.0f, 2.0f, 3.0f}, {13.0f, 14.0f, 15.0f}, + {4.0f, 5.0f, 6.0f}, {16.0f, 17.0f, 18.0f}}}, + {{{7.0f, 8.0f, 9.0f}, {19.0f, 20.0f, 21.0f}, + {10.0f, 11.0f, 12.0f}, {22.0f, 23.0f, 24.0f}}}})}, + BatchToSpaceParams{input_with_shape_4x1x2x3, + test::NDArray({1, 1, 2, 2}), + zero_crops_4d, + zero_crops_4d, + test::NDArray( + {{{{1.0f, 7.0f, 2.0f, 8.0f, 3.0f, 9.0f}, + {13.0f, 19.0f, 14.0f, 20.0f, 15.0f, 21.0f}, + {4.0f, 10.0f, 5.0f, 11.0f, 6.0f, 12.0f}, + {16.0f, 22.0f, 17.0f, 23.0f, 18.0f, 24.0f}}}})})); + +NGRAPH_INSTANTIATE_TEST_SUITE_P( + ${BACKEND_NAME}, + batch_to_space_4d_crops, + BatchToSpaceTestFloat, + testing::Values( + BatchToSpaceParams{input_with_shape_4x1x2x3, + test::NDArray({1, 1, 2, 2}), + test::NDArray({0, 0, 0, 0}), + test::NDArray({0, 0, 0, 2}), + test::NDArray( + {{{{1.0f, 7.0f, 2.0f, 8.0f}, + {13.0f, 19.0f, 14.0f, 20.0f}, + {4.0f, 10.0f, 5.0f, 11.0f}, + {16.0f, 22.0f, 17.0f, 23.0f}}}})}, + BatchToSpaceParams{input_with_shape_4x1x2x3, + test::NDArray({1, 1, 2, 2}), + test::NDArray({0, 0, 0, 2}), + test::NDArray({0, 0, 0, 0}), + test::NDArray( + {{{{2.0f, 8.0f, 3.0f, 9.0f}, + {14.0f, 20.0f, 15.0f, 21.0f}, + {5.0f, 11.0f, 6.0f, 12.0f}, + {17.0f, 23.0f, 18.0f, 24.0f}}}})}, + BatchToSpaceParams{input_with_shape_4x1x2x3, + test::NDArray({1, 1, 2, 2}), + test::NDArray({0, 0, 1, 0}), + test::NDArray({0, 0, 1, 0}), + test::NDArray( + {{{{13.0f, 19.0f, 14.0f, 20.0f, 15.0f, 21.0f}, + {4.0f, 10.0f, 5.0f, 11.0f, 6.0f, 12.0f}}}})})); diff --git a/ngraph/test/backend/fused_op.in.cpp b/ngraph/test/backend/fused_op.in.cpp index d8da27bb704..4899bd2a3e7 100644 --- a/ngraph/test/backend/fused_op.in.cpp +++ b/ngraph/test/backend/fused_op.in.cpp @@ -78,46 +78,6 @@ NGRAPH_TEST(${BACKEND_NAME}, hardsigmoid) test_case.run(); } - -NGRAPH_TEST(${BACKEND_NAME}, space_to_depth_block_first) -{ - auto A = make_shared(element::f32, Shape{1, 2, 4, 4}); - const auto mode = 
ngraph::op::SpaceToDepth::SpaceToDepthMode::BLOCKS_FIRST; - auto space_to_depth = make_shared(A, mode, 2); - auto function = make_shared(NodeVector{space_to_depth}, ParameterVector{A}); - - auto test_case = test::TestCase(function); - test_case.add_input({0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, - 11.f, 12.f, 13.f, 14.f, 15.f, 16.f, 17.f, 18.f, 19.f, 20.f, 21.f, - 22.f, 23.f, 24.f, 25.f, 26.f, 27.f, 28.f, 29.f, 30.f, 31.f}); - test_case.add_expected_output(Shape{1, 8, 2, 2}, - { - 0.f, 2.f, 8.f, 10.f, 16.f, 18.f, 24.f, 26.f, - 1.f, 3.f, 9.f, 11.f, 17.f, 19.f, 25.f, 27.f, - 4.f, 6.f, 12.f, 14.f, 20.f, 22.f, 28.f, 30.f, - 5.f, 7.f, 13.f, 15.f, 21.f, 23.f, 29.f, 31.f, - }); - test_case.run(); -} - -NGRAPH_TEST(${BACKEND_NAME}, space_to_depth_depth_first) -{ - auto A = make_shared(element::f32, Shape{1, 2, 4, 4}); - const auto mode = ngraph::op::SpaceToDepth::SpaceToDepthMode::DEPTH_FIRST; - auto space_to_depth = make_shared(A, mode, 2); - auto function = make_shared(NodeVector{space_to_depth}, ParameterVector{A}); - - auto test_case = test::TestCase(function); - test_case.add_input({0.f, 16.f, 2.f, 18.f, 1.f, 17.f, 3.f, 19.f, 8.f, 24.f, 10.f, - 26.f, 9.f, 25.f, 11.f, 27.f, 4.f, 20.f, 6.f, 22.f, 5.f, 21.f, - 7.f, 23.f, 12.f, 28.f, 14.f, 30.f, 13.f, 29.f, 15.f, 31.f}); - test_case.add_expected_output( - Shape{1, 8, 2, 2}, {0.f, 2.f, 8.f, 10.f, 16.f, 18.f, 24.f, 26.f, 1.f, 3.f, 9.f, - 11.f, 17.f, 19.f, 25.f, 27.f, 4.f, 6.f, 12.f, 14.f, 20.f, 22.f, - 28.f, 30.f, 5.f, 7.f, 13.f, 15.f, 21.f, 23.f, 29.f, 31.f}); - test_case.run(); -} - // TODO: Issue: 37521 NGRAPH_TEST(${BACKEND_NAME}, DISABLED_normalize_across_chw_4d) { diff --git a/ngraph/test/backend/matrix_nms.in.cpp b/ngraph/test/backend/matrix_nms.in.cpp new file mode 100644 index 00000000000..0c691831356 --- /dev/null +++ b/ngraph/test/backend/matrix_nms.in.cpp @@ -0,0 +1,667 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +// clang-format off +#ifdef 
${BACKEND_NAME}_FLOAT_TOLERANCE_BITS +#define DEFAULT_FLOAT_TOLERANCE_BITS ${BACKEND_NAME}_FLOAT_TOLERANCE_BITS +#endif + +#ifdef ${BACKEND_NAME}_DOUBLE_TOLERANCE_BITS +#define DEFAULT_DOUBLE_TOLERANCE_BITS ${BACKEND_NAME}_DOUBLE_TOLERANCE_BITS +#endif +// clang-format on + +#include "gtest/gtest.h" +#include "ngraph/ngraph.hpp" +#include "util/engine/test_engines.hpp" +#include "util/test_case.hpp" +#include "util/test_control.hpp" + +using namespace std; +using namespace ngraph; + +static string s_manifest = "${MANIFEST}"; +using TestEngine = test::ENGINE_CLASS_NAME(${BACKEND_NAME}); + +NGRAPH_TEST(${BACKEND_NAME}, matrix_nms_output_type_i64) +{ + std::vector boxes_data = {0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, + 0.0, -0.1, 1.0, 0.9, 0.0, 10.0, 1.0, 11.0, + 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0}; + + std::vector scores_data = { + 0.9, 0.75, 0.6, 0.95, 0.5, 0.3, + 0.95, 0.75, 0.6, 0.80, 0.5, 0.3}; + + op::v8::MatrixNms::Attributes attrs; + attrs.nms_top_k = 3; + attrs.score_threshold = 0.0f; + attrs.sort_result_type = op::v8::MatrixNms::SortResultType::SCORE; + attrs.keep_top_k = -1; + attrs.background_class = 0; + + const auto boxes_shape = Shape{1, 6, 4}; // N 1, C 2, M 6 + const auto scores_shape = Shape{1, 2, 6}; + + const auto boxes = make_shared(element::f32, boxes_shape); + const auto scores = make_shared(element::f32, scores_shape); + attrs.decay_function = op::v8::MatrixNms::DecayFunction::LINEAR; + attrs.gaussian_sigma = 2.0f; + attrs.post_threshold = 0.0f; + + auto nms = make_shared(boxes, scores, attrs); + + auto f = make_shared(nms, ParameterVector{boxes, scores}); + + std::vector expected_selected_indices = {0, 3, 1}; + std::vector expected_selected_scores = {1.00, 0.95, 0.00, 0.00, 1.00, 1.00, + 1.00, 0.8, 0.00, 10.00, 1.00, 11.00, + 1.00, 0.13636364, 0.0, 0.1, 1.0, 1.1}; + std::vector expected_valid_outputs = {3}; + + auto test_case = test::TestCase(f); + test_case.add_multiple_inputs({boxes_data, scores_data}); + 
test_case.add_expected_output({3, 6}, expected_selected_scores); + test_case.add_expected_output({3, 1}, expected_selected_indices); + test_case.add_expected_output({1}, expected_valid_outputs); + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, matrix_nms_output_type_i32) +{ + std::vector boxes_data = {0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, + 0.0, -0.1, 1.0, 0.9, 0.0, 10.0, 1.0, 11.0, + 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0}; + + std::vector scores_data = { + 0.9, 0.75, 0.6, 0.95, 0.5, 0.3, + 0.95, 0.75, 0.6, 0.80, 0.5, 0.3}; + + op::v8::MatrixNms::Attributes attrs; + attrs.nms_top_k = 3; + attrs.score_threshold = 0.0f; + attrs.sort_result_type = op::v8::MatrixNms::SortResultType::SCORE; + attrs.keep_top_k = -1; + attrs.background_class = 0; + + const auto boxes_shape = Shape{1, 6, 4}; // N 1, C 2, M 6 + const auto scores_shape = Shape{1, 2, 6}; + attrs.decay_function = op::v8::MatrixNms::DecayFunction::LINEAR; + attrs.gaussian_sigma = 2.0f; + attrs.post_threshold = 0.0f; + attrs.output_type = ngraph::element::i32; + + const auto boxes = make_shared(element::f32, boxes_shape); + const auto scores = make_shared(element::f32, scores_shape); + + auto nms = make_shared(boxes, scores, attrs); + + auto f = make_shared(nms, ParameterVector{boxes, scores}); + + std::vector expected_selected_indices = {0, 3, 1}; + std::vector expected_selected_scores = {1.00, 0.95, 0.00, 0.00, 1.00, 1.00, + 1.00, 0.8, 0.00, 10.00, 1.00, 11.00, + 1.00, 0.13636364, 0.0, 0.1, 1.0, 1.1}; + std::vector expected_valid_outputs = {3}; + + auto test_case = test::TestCase(f); + test_case.add_multiple_inputs({boxes_data, scores_data}); + test_case.add_expected_output({3, 6}, expected_selected_scores); + test_case.add_expected_output({3, 1}, expected_selected_indices); + test_case.add_expected_output({1}, expected_valid_outputs); + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, matrix_nms_gaussian) +{ + std::vector boxes_data = {0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, + 0.0, -0.1, 1.0, 
0.9, 0.0, 10.0, 1.0, 11.0, + 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0}; + + std::vector scores_data = { + 0.9, 0.75, 0.6, 0.95, 0.5, 0.3, + 0.95, 0.75, 0.6, 0.80, 0.5, 0.3}; + + op::v8::MatrixNms::Attributes attrs; + attrs.nms_top_k = 3; + attrs.score_threshold = 0.0f; + attrs.sort_result_type = op::v8::MatrixNms::SortResultType::SCORE; + attrs.keep_top_k = -1; + attrs.background_class = 0; + + const auto boxes_shape = Shape{1, 6, 4}; // N 1, C 2, M 6 + const auto scores_shape = Shape{1, 2, 6}; + + const auto boxes = make_shared(element::f32, boxes_shape); + const auto scores = make_shared(element::f32, scores_shape); + attrs.decay_function = op::v8::MatrixNms::DecayFunction::GAUSSIAN; + attrs.gaussian_sigma = 2.0f; + attrs.post_threshold = 0.0f; + + auto nms = make_shared(boxes, scores, attrs); + + auto f = make_shared(nms, ParameterVector{boxes, scores}); + + std::vector expected_selected_indices = {0, 3, 1}; + std::vector expected_selected_scores = {1.00, 0.95, 0.00, 0.00, 1.00, 1.00, + 1.00, 0.8, 0.00, 10.00, 1.00, 11.00, + 1.00, 0.1966116, 0.0, 0.1, 1.0, 1.1}; + std::vector expected_valid_outputs = {3}; + + auto test_case = test::TestCase(f); + test_case.add_multiple_inputs({boxes_data, scores_data}); + test_case.add_expected_output({3, 6}, expected_selected_scores); + test_case.add_expected_output({3, 1}, expected_selected_indices); + test_case.add_expected_output({1}, expected_valid_outputs); + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, matrix_nms_two_batches_two_classes) +{ + std::vector boxes_data = {0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, + 0.0, -0.1, 1.0, 0.9, 0.0, 10.0, 1.0, 11.0, + 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0, // 0 + 0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, + 0.0, -0.1, 1.0, 0.9, 0.0, 10.0, 1.0, 11.0, + 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0}; // 1 + + std::vector scores_data = { + 0.9, 0.75, 0.6, 0.95, 0.5, 0.3, + 0.95, 0.75, 0.6, 0.80, 0.5, 0.3, // 0 + 0.9, 0.75, 0.6, 0.95, 0.5, 0.3, + 0.95, 0.75, 0.6, 0.80, 0.5, 
0.3}; // 1 + + op::v8::MatrixNms::Attributes attrs; + attrs.nms_top_k = 3; + attrs.score_threshold = 0.0f; + attrs.sort_result_type = op::v8::MatrixNms::SortResultType::SCORE; + attrs.keep_top_k = -1; + attrs.background_class = 0; + + const auto boxes_shape = Shape{2, 6, 4}; // N 2, C 2, M 6 + const auto scores_shape = Shape{2, 2, 6}; + attrs.decay_function = op::v8::MatrixNms::DecayFunction::LINEAR; + attrs.gaussian_sigma = 2.0f; + attrs.post_threshold = 0.0f; + + const auto boxes = make_shared(element::f32, boxes_shape); + const auto scores = make_shared(element::f32, scores_shape); + + auto nms = make_shared(boxes, scores, attrs); + + auto f = make_shared(nms, ParameterVector{boxes, scores}); + + std::vector expected_selected_indices = {0, 3, 1, + 6, 9, 7}; + std::vector expected_selected_scores = {1.00, 0.95, 0.00, 0.00, 1.00, 1.00, + 1.00, 0.8, 0.00, 10.00, 1.00, 11.00, + 1.00, 0.13636364, 0.0, 0.1, 1.0, 1.1, + 1.00, 0.95, 0.00, 0.00, 1.00, 1.00, + 1.00, 0.8, 0.00, 10.00, 1.00, 11.00, + 1.00, 0.13636364, 0.0, 0.1, 1.0, 1.1}; + std::vector expected_valid_outputs = {3, 3}; + + auto test_case = test::TestCase(f); + test_case.add_multiple_inputs({boxes_data, scores_data}); + test_case.add_expected_output({6, 6}, expected_selected_scores); + test_case.add_expected_output({6, 1}, expected_selected_indices); + test_case.add_expected_output({2}, expected_valid_outputs); + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, matrix_nms_two_batches_two_classes_by_score_cross_batch) +{ + std::vector boxes_data = {0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, + 0.0, -0.1, 1.0, 0.9, 0.0, 10.0, 1.0, 11.0, + 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0, // 0 + 0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, + 0.0, -0.1, 1.0, 0.9, 0.0, 10.0, 1.0, 11.0, + 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0}; // 1 + + std::vector scores_data = { + 0.9, 0.75, 0.6, 0.95, 0.5, 0.3, + 0.95, 0.75, 0.6, 0.80, 0.5, 0.3, // 0 + 0.9, 0.75, 0.6, 0.95, 0.5, 0.3, + 0.95, 0.75, 0.6, 0.80, 0.5, 0.3}; // 1 + + 
op::v8::MatrixNms::Attributes attrs; + attrs.nms_top_k = 3; + attrs.score_threshold = 0.0f; + attrs.sort_result_type = op::v8::MatrixNms::SortResultType::SCORE; + attrs.keep_top_k = -1; + attrs.background_class = -1; + + const auto boxes_shape = Shape{2, 6, 4}; // N 2, C 2, M 6 + const auto scores_shape = Shape{2, 2, 6}; + attrs.decay_function = op::v8::MatrixNms::DecayFunction::LINEAR; + attrs.gaussian_sigma = 2.0f; + attrs.post_threshold = 0.5; + attrs.sort_result_across_batch = true; + + const auto boxes = make_shared(element::f32, boxes_shape); + const auto scores = make_shared(element::f32, scores_shape); + + auto nms = make_shared(boxes, scores, attrs); + + auto f = make_shared(nms, ParameterVector{boxes, scores}); + + std::vector expected_selected_indices = {3, 0, 9, 6, + 0, 6, 3, 9}; + std::vector expected_selected_scores = {0.00, 0.95, 0.00, 10.00, 1.00, 11.00, //3 + 1.00, 0.95, 0.00, 0.00, 1.00, 1.00, //0 + 0.00, 0.95, 0.00, 10.00, 1.00, 11.00, //9 + 1.00, 0.95, 0.00, 0.00, 1.00, 1.00, //6 + 0.00, 0.90, 0.00, 0.00, 1.00, 1.00, //0 + 0.00, 0.90, 0.00, 0.00, 1.00, 1.00, //6 + 1.00, 0.80, 0.00, 10.00, 1.00, 11.00, //3 + 1.00, 0.80, 0.00, 10.00, 1.00, 11.00}; // 9 + std::vector expected_valid_outputs = {4, 4}; + + auto test_case = test::TestCase(f); + test_case.add_multiple_inputs({boxes_data, scores_data}); + test_case.add_expected_output({8, 6}, expected_selected_scores); + test_case.add_expected_output({8, 1}, expected_selected_indices); + test_case.add_expected_output({2}, expected_valid_outputs); + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, matrix_nms_two_batches_two_classes_by_classid_cross_batch) +{ + std::vector boxes_data = {0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, + 0.0, -0.1, 1.0, 0.9, 0.0, 10.0, 1.0, 11.0, + 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0, // 0 + 0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, + 0.0, -0.1, 1.0, 0.9, 0.0, 10.0, 1.0, 11.0, + 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0}; // 1 + + std::vector scores_data = { + 0.9, 
0.75, 0.6, 0.95, 0.5, 0.3, + 0.95, 0.75, 0.6, 0.80, 0.5, 0.3, // 0 + 0.9, 0.75, 0.6, 0.95, 0.5, 0.3, + 0.95, 0.75, 0.6, 0.80, 0.5, 0.3}; // 1 + + op::v8::MatrixNms::Attributes attrs; + attrs.nms_top_k = 3; + attrs.score_threshold = 0.0f; + attrs.sort_result_type = op::v8::MatrixNms::SortResultType::CLASSID; + attrs.keep_top_k = -1; + attrs.background_class = -1; + + const auto boxes_shape = Shape{2, 6, 4}; // N 2, C 2, M 6 + const auto scores_shape = Shape{2, 2, 6}; + attrs.decay_function = op::v8::MatrixNms::DecayFunction::LINEAR; + attrs.gaussian_sigma = 2.0f; + attrs.post_threshold = 0.5; + attrs.sort_result_across_batch = true; + + const auto boxes = make_shared(element::f32, boxes_shape); + const auto scores = make_shared(element::f32, scores_shape); + + auto nms = make_shared(boxes, scores, attrs); + + auto f = make_shared(nms, ParameterVector{boxes, scores}); + + std::vector expected_selected_indices = {3, 0, 9, 6, + 0, 3, 6, 9}; + std::vector expected_selected_scores = {0.00, 0.95, 0.00, 10.00, 1.00, 11.00, //3 + 0.00, 0.90, 0.00, 0.00, 1.00, 1.00, //0 + 0.00, 0.95, 0.00, 10.00, 1.00, 11.00, //9 + 0.00, 0.90, 0.00, 0.00, 1.00, 1.00, //6 + 1.00, 0.95, 0.00, 0.00, 1.00, 1.00, //0 + 1.00, 0.80, 0.00, 10.00, 1.00, 11.00, // 3 + 1.00, 0.95, 0.00, 0.00, 1.00, 1.00, //6 + 1.00, 0.80, 0.00, 10.00, 1.00, 11.00 }; // 9 + std::vector expected_valid_outputs = {4, 4}; + + auto test_case = test::TestCase(f); + test_case.add_multiple_inputs({boxes_data, scores_data}); + test_case.add_expected_output({8, 6}, expected_selected_scores); + test_case.add_expected_output({8, 1}, expected_selected_indices); + test_case.add_expected_output({2}, expected_valid_outputs); + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, matrix_nms_by_keep_top_k) +{ + std::vector boxes_data = {0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, + 0.0, -0.1, 1.0, 0.9, 0.0, 10.0, 1.0, 11.0, + 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0, // 0 + 0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, + 0.0, -0.1, 1.0, 0.9, 
0.0, 10.0, 1.0, 11.0, + 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0}; // 1 + + std::vector scores_data = { + 0.9, 0.75, 0.6, 0.95, 0.5, 0.3, + 0.95, 0.75, 0.6, 0.80, 0.5, 0.3, // 0 + 0.9, 0.75, 0.6, 0.95, 0.5, 0.3, + 0.95, 0.75, 0.6, 0.80, 0.5, 0.3}; // 1 + + op::v8::MatrixNms::Attributes attrs; + attrs.nms_top_k = 3; + attrs.score_threshold = 0.0f; + attrs.sort_result_type = op::v8::MatrixNms::SortResultType::CLASSID; + attrs.keep_top_k = 3; + attrs.background_class = 0; + + const auto boxes_shape = Shape{2, 6, 4}; // N 2, C 2, M 6 + const auto scores_shape = Shape{2, 2, 6}; + attrs.decay_function = op::v8::MatrixNms::DecayFunction::LINEAR; + attrs.gaussian_sigma = 2.0f; + attrs.post_threshold = 0.0f; + + const auto boxes = make_shared(element::f32, boxes_shape); + const auto scores = make_shared(element::f32, scores_shape); + auto nms = make_shared(boxes, scores, attrs); + + auto f = make_shared(nms, ParameterVector{boxes, scores}); + + std::vector expected_selected_indices = {0, 3, 1, + 6, 9, 7}; + std::vector expected_selected_scores = {1.00, 0.95, 0.00, 0.00, 1.00, 1.00, + 1.00, 0.8, 0.00, 10.00, 1.00, 11.00, + 1.00, 0.13636364, 0.0, 0.1, 1.0, 1.1, + 1.00, 0.95, 0.00, 0.00, 1.00, 1.00, + 1.00, 0.8, 0.00, 10.00, 1.00, 11.00, + 1.00, 0.13636364, 0.0, 0.1, 1.0, 1.1}; + std::vector expected_valid_outputs = {3, 3}; + + auto test_case = test::TestCase(f); + test_case.add_multiple_inputs({boxes_data, scores_data}); + test_case.add_expected_output({6, 6}, expected_selected_scores); + test_case.add_expected_output({6, 1}, expected_selected_indices); + test_case.add_expected_output({2}, expected_valid_outputs); + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, matrix_nms_background) +{ + std::vector boxes_data = {0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, + 0.0, -0.1, 1.0, 0.9, 0.0, 10.0, 1.0, 11.0, + 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0}; + + std::vector scores_data = { + 0.9, 0.75, 0.6, 0.95, 0.5, 0.3, + 0.95, 0.75, 0.6, 0.80, 0.5, 0.3}; + + 
op::v8::MatrixNms::Attributes attrs; + attrs.nms_top_k = 3; + attrs.score_threshold = 0.0f; + attrs.sort_result_type = op::v8::MatrixNms::SortResultType::SCORE; + attrs.keep_top_k = -1; + attrs.background_class = -1; + + const auto boxes_shape = Shape{1, 6, 4}; // N 1, C 2, M 6 + const auto scores_shape = Shape{1, 2, 6}; + + const auto boxes = make_shared(element::f32, boxes_shape); + const auto scores = make_shared(element::f32, scores_shape); + attrs.decay_function = op::v8::MatrixNms::DecayFunction::LINEAR; + attrs.gaussian_sigma = 2.0f; + attrs.post_threshold = 0.0f; + + auto nms = make_shared(boxes, scores, attrs); + + auto f = make_shared(nms, ParameterVector{boxes, scores}); + + std::vector expected_selected_indices = {3, 0, 0, 3, 1, 1}; + std::vector expected_selected_scores = {0.00, 0.95, 0.0, 10.0, 1.0, 11.0, + 1.00, 0.95, 0.0, 0.0, 1.0, 1.0, + 0.00, 0.9, 0.0, 0.0, 1.0, 1.0, + 1.00, 0.8, 0.0, 10.0, 1.0, 11.0, + 0.00, 0.13636364, 0.0, 0.1, 1.0, 1.1, + 1.00, 0.13636364, 0.0, 0.1, 1.0, 1.1}; + std::vector expected_valid_outputs = {6}; + + auto test_case = test::TestCase(f); + test_case.add_multiple_inputs({boxes_data, scores_data}); + test_case.add_expected_output({6, 6}, expected_selected_scores); + test_case.add_expected_output({6, 1}, expected_selected_indices); + test_case.add_expected_output({1}, expected_valid_outputs); + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, matrix_nms_flipped_coordinates) +{ + std::vector boxes_data = {1.0, 1.0, 0.0, 0.0, 0.0, 0.1, 1.0, 1.1, + 0.0, 0.9, 1.0, -0.1, 0.0, 10.0, 1.0, 11.0, + 1.0, 10.1, 0.0, 11.1, 1.0, 101.0, 0.0, 100.0}; + + std::vector scores_data = {0.9, 0.75, 0.6, 0.95, 0.5, 0.3}; + + op::v8::MatrixNms::Attributes attrs; + attrs.nms_top_k = 3; + attrs.score_threshold = 0.0f; + attrs.sort_result_type = op::v8::MatrixNms::SortResultType::SCORE; + attrs.keep_top_k = -1; + attrs.background_class = -1; + + const auto boxes_shape = Shape{1, 6, 4}; // N 1, C 1, M 6 + const auto scores_shape = Shape{1, 1, 6}; + 
+ const auto boxes = make_shared(element::f32, boxes_shape); + const auto scores = make_shared(element::f32, scores_shape); + attrs.decay_function = op::v8::MatrixNms::DecayFunction::LINEAR; + attrs.gaussian_sigma = 2.0f; + attrs.post_threshold = 0.0f; + + auto nms = make_shared(boxes, scores, attrs); + + auto f = make_shared(nms, ParameterVector{boxes, scores}); + + std::vector expected_selected_indices = {3, 0, 1}; + std::vector expected_selected_scores = {0.00, 0.95, 0.0, 10.0, 1.0, 11.0, + 0.00, 0.9, 1.0, 1.0, 0.0, 0.0, + 0.00, 0.75, 0.0, 0.1, 1.0, 1.1}; + std::vector expected_valid_outputs = {3}; + + auto test_case = test::TestCase(f); + test_case.add_multiple_inputs({boxes_data, scores_data}); + test_case.add_expected_output({3, 6}, expected_selected_scores); + test_case.add_expected_output({3, 1}, expected_selected_indices); + test_case.add_expected_output({1}, expected_valid_outputs); + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, matrix_nms_post_threshold) +{ + std::vector boxes_data = {0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, + 0.0, -0.1, 1.0, 0.9, 0.0, 10.0, 1.0, 11.0, + 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0}; + + std::vector scores_data = { + 0.9, 0.75, 0.6, 0.95, 0.5, 0.3}; + + op::v8::MatrixNms::Attributes attrs; + attrs.nms_top_k = 3; + attrs.score_threshold = 0.00; + attrs.sort_result_type = op::v8::MatrixNms::SortResultType::SCORE; + attrs.keep_top_k = -1; + attrs.background_class = -1; + + const auto boxes_shape = Shape{1, 6, 4}; // N 1, C 2, M 6 + const auto scores_shape = Shape{1, 1, 6}; + + const auto boxes = make_shared(element::f32, boxes_shape); + const auto scores = make_shared(element::f32, scores_shape); + attrs.decay_function = op::v8::MatrixNms::DecayFunction::LINEAR; + attrs.gaussian_sigma = 2.0f; + attrs.post_threshold = 0.8; + + auto nms = make_shared(boxes, scores, attrs); + + auto f = make_shared(nms, ParameterVector{boxes, scores}); + + std::vector expected_selected_indices = {3, 0}; + std::vector 
expected_selected_scores = {0.00, 0.95, 0.00, 10.00, 1.00, 11.00, + 0.00, 0.9, 0.00, 0.00, 1.00, 1.00}; + std::vector expected_valid_outputs = {2}; + + auto test_case = test::TestCase(f); + test_case.add_multiple_inputs({boxes_data, scores_data}); + test_case.add_expected_output({2, 6}, expected_selected_scores); + test_case.add_expected_output({2, 1}, expected_selected_indices); + test_case.add_expected_output({1}, expected_valid_outputs); + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, matrix_nms_identical_boxes) +{ + std::vector boxes_data = {0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, + 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, + 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, + 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0}; + + std::vector scores_data = {0.4, 0.01, 0.2, 0.09, 0.15, 0.05, 0.02, 0.03, 0.05, 0.0}; + + op::v8::MatrixNms::Attributes attrs; + attrs.nms_top_k = 3; + attrs.score_threshold = 0.0f; + attrs.sort_result_type = op::v8::MatrixNms::SortResultType::SCORE; + attrs.keep_top_k = -1; + attrs.background_class = -1; + + const auto boxes_shape = Shape{1, 10, 4}; // N 1, C 1, M 10 + const auto scores_shape = Shape{1, 1, 10}; + + const auto boxes = make_shared(element::f32, boxes_shape); + const auto scores = make_shared(element::f32, scores_shape); + attrs.decay_function = op::v8::MatrixNms::DecayFunction::LINEAR; + attrs.gaussian_sigma = 2.0f; + attrs.post_threshold = 0.3; + + auto nms = make_shared(boxes, scores, attrs); + + auto f = make_shared(nms, ParameterVector{boxes, scores}); + + std::vector expected_selected_indices = {0}; + std::vector expected_selected_scores = {0.00, 0.40, 0.00, 0.00, 1.00, 1.00}; + std::vector expected_valid_outputs = {1}; + + auto test_case = test::TestCase(f); + test_case.add_multiple_inputs({boxes_data, scores_data}); + test_case.add_expected_output({1, 6}, expected_selected_scores); + test_case.add_expected_output({1, 1}, expected_selected_indices); + test_case.add_expected_output({1}, 
expected_valid_outputs); + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, matrix_nms_nms_top_k) +{ + std::vector boxes_data = {0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, + 0.0, -0.1, 1.0, 0.9, 0.0, 10.0, 1.0, 11.0, + 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0}; + + std::vector scores_data = {0.9, 0.75, 0.6, 0.95, 0.5, 0.3}; + + op::v8::MatrixNms::Attributes attrs; + attrs.nms_top_k = 2; + attrs.score_threshold = 0.0f; + attrs.sort_result_type = op::v8::MatrixNms::SortResultType::SCORE; + attrs.keep_top_k = -1; + attrs.background_class = -1; + + const auto boxes_shape = Shape{1, 6, 4}; + const auto scores_shape = Shape{1, 1, 6}; + + const auto boxes = make_shared(element::f32, boxes_shape); + const auto scores = make_shared(element::f32, scores_shape); + attrs.decay_function = op::v8::MatrixNms::DecayFunction::LINEAR; + attrs.gaussian_sigma = 2.0f; + attrs.post_threshold = 0.0f; + + auto nms = make_shared(boxes, scores, attrs); + + auto f = make_shared(nms, ParameterVector{boxes, scores}); + + std::vector expected_selected_indices = {3, 0}; + std::vector expected_selected_scores = {0.00, 0.95, 0.00, 10.00, 1.00, 11.00 , + 0.00, 0.90, 0.00, 0.00, 1.00, 1.00 }; + std::vector expected_valid_outputs = {2}; + + auto test_case = test::TestCase(f); + test_case.add_multiple_inputs({boxes_data, scores_data}); + test_case.add_expected_output({2, 6}, expected_selected_scores); + test_case.add_expected_output({2, 1}, expected_selected_indices); + test_case.add_expected_output({1}, expected_valid_outputs); + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, matrix_nms_single_box) +{ + std::vector boxes_data = {0.0, 0.0, 1.0, 1.0}; + + std::vector scores_data = {0.9}; + + op::v8::MatrixNms::Attributes attrs; + attrs.nms_top_k = 3; + attrs.score_threshold = 0.0f; + attrs.sort_result_type = op::v8::MatrixNms::SortResultType::SCORE; + attrs.keep_top_k = -1; + attrs.background_class = -1; + + const auto boxes_shape = Shape{1, 1, 4}; + const auto scores_shape = Shape{1, 1, 1}; + 
+ const auto boxes = make_shared(element::f32, boxes_shape); + const auto scores = make_shared(element::f32, scores_shape); + attrs.decay_function = op::v8::MatrixNms::DecayFunction::LINEAR; + attrs.gaussian_sigma = 2.0f; + attrs.post_threshold = 0.0f; + + auto nms = make_shared(boxes, scores, attrs); + + auto f = make_shared(nms, ParameterVector{boxes, scores}); + + std::vector expected_selected_indices = {0}; + std::vector expected_selected_scores = {0.00, 0.90, 0.00, 0.00, 1.00, 1.00}; + std::vector expected_valid_outputs = {1}; + + auto test_case = test::TestCase(f); + test_case.add_multiple_inputs({boxes_data, scores_data}); + test_case.add_expected_output({1, 6}, expected_selected_scores); + test_case.add_expected_output({1, 1}, expected_selected_indices); + test_case.add_expected_output({1}, expected_valid_outputs); + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, matrix_nms_no_output) +{ + std::vector boxes_data = {0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, + 0.0, -0.1, 1.0, 0.9, 0.0, 10.0, 1.0, 11.0, + 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0}; + + std::vector scores_data = {0.9, 0.75, 0.6, 0.95, 0.5, 0.3}; + + op::v8::MatrixNms::Attributes attrs; + attrs.nms_top_k = 3; + attrs.score_threshold = 2.0f; + attrs.sort_result_type = op::v8::MatrixNms::SortResultType::SCORE; + attrs.keep_top_k = -1; + attrs.background_class = -1; + + const auto boxes_shape = Shape{1, 6, 4}; + const auto scores_shape = Shape{1, 1, 6}; + + const auto boxes = make_shared(element::f32, boxes_shape); + const auto scores = make_shared(element::f32, scores_shape); + attrs.decay_function = op::v8::MatrixNms::DecayFunction::LINEAR; + attrs.gaussian_sigma = 2.0f; + attrs.post_threshold = 0.0f; + + auto nms = make_shared(boxes, scores, attrs); + + + auto f = make_shared(nms, ParameterVector{boxes, scores}); + + std::vector expected_selected_indices = {}; + std::vector expected_selected_scores = {}; + std::vector expected_valid_outputs = {0}; + + auto test_case = test::TestCase(f); + 
test_case.add_multiple_inputs({boxes_data, scores_data}); + test_case.add_expected_output({0, 6}, expected_selected_scores); + test_case.add_expected_output({0, 1}, expected_selected_indices); + test_case.add_expected_output({1}, expected_valid_outputs); + test_case.run(); +} diff --git a/ngraph/test/backend/multiclass_nms.in.cpp b/ngraph/test/backend/multiclass_nms.in.cpp new file mode 100644 index 00000000000..3c0d3153765 --- /dev/null +++ b/ngraph/test/backend/multiclass_nms.in.cpp @@ -0,0 +1,802 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +// clang-format off +#ifdef ${BACKEND_NAME}_FLOAT_TOLERANCE_BITS +#define DEFAULT_FLOAT_TOLERANCE_BITS ${BACKEND_NAME}_FLOAT_TOLERANCE_BITS +#endif + +#ifdef ${BACKEND_NAME}_DOUBLE_TOLERANCE_BITS +#define DEFAULT_DOUBLE_TOLERANCE_BITS ${BACKEND_NAME}_DOUBLE_TOLERANCE_BITS +#endif +// clang-format on + +#include "gtest/gtest.h" +#include "ngraph/ngraph.hpp" +#include "util/engine/test_engines.hpp" +#include "util/test_case.hpp" +#include "util/test_control.hpp" + +using namespace std; +using namespace ngraph; + +static string s_manifest = "${MANIFEST}"; +using TestEngine = test::ENGINE_CLASS_NAME(${BACKEND_NAME}); + +NGRAPH_TEST(${BACKEND_NAME}, multiclass_nms_by_score) +{ + std::vector boxes_data = {0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, + 0.0, -0.1, 1.0, 0.9, 0.0, 10.0, 1.0, 11.0, + 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0}; + + std::vector scores_data = { + 0.9, 0.75, 0.6, 0.95, 0.5, 0.3, + 0.95, 0.75, 0.6, 0.80, 0.5, 0.3}; + + op::v8::MulticlassNms::Attributes attrs; + attrs.nms_top_k = 3; + attrs.iou_threshold = 0.5f; + attrs.score_threshold = 0.0f; + attrs.sort_result_type = op::v8::MulticlassNms::SortResultType::SCORE; + attrs.keep_top_k = -1; + attrs.background_class = -1; + attrs.nms_eta = 1.0f; + + const auto boxes_shape = Shape{1, 6, 4}; // N 1, C 2, M 6 + const auto scores_shape = Shape{1, 2, 6}; + + const auto boxes = make_shared(element::f32, boxes_shape); + 
const auto scores = make_shared(element::f32, scores_shape); + + auto nms = make_shared(boxes, scores, attrs); + + auto f = make_shared(nms, ParameterVector{boxes, scores}); + + std::vector expected_selected_indices = {3, 0, 0, 3}; + std::vector expected_selected_scores = {0.00, 0.95, 0.00, 10.00, 1.00, 11.00 , + 1.00, 0.95, 0.00, 0.00, 1.00, 1.00 , + 0.00, 0.90, 0.00, 0.00, 1.00, 1.00, + 1.00, 0.80, 0.00, 10.00, 1.00, 11.00 }; + std::vector expected_valid_outputs = {4}; + + auto test_case = test::TestCase(f); + test_case.add_multiple_inputs({boxes_data, scores_data}); + test_case.add_expected_output({4, 6}, expected_selected_scores); + test_case.add_expected_output({4, 1}, expected_selected_indices); + test_case.add_expected_output({1}, expected_valid_outputs); + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, multiclass_nms_by_class_id) +{ + std::vector boxes_data = {0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, + 0.0, -0.1, 1.0, 0.9, 0.0, 10.0, 1.0, 11.0, + 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0}; + + std::vector scores_data = { + 0.9, 0.75, 0.6, 0.95, 0.5, 0.3, + 0.95, 0.75, 0.6, 0.80, 0.5, 0.3}; + + op::v8::MulticlassNms::Attributes attrs; + attrs.nms_top_k = 3; + attrs.iou_threshold = 0.5f; + attrs.score_threshold = 0.0f; + attrs.sort_result_type = op::v8::MulticlassNms::SortResultType::CLASSID; + attrs.keep_top_k = -1; + attrs.background_class = -1; + attrs.nms_eta = 1.0f; + + const auto boxes_shape = Shape{1, 6, 4}; // N 1, C 2, M 6 + const auto scores_shape = Shape{1, 2, 6}; + + const auto boxes = make_shared(element::f32, boxes_shape); + const auto scores = make_shared(element::f32, scores_shape); + + auto nms = make_shared(boxes, scores, attrs); + + auto f = make_shared(nms, ParameterVector{boxes, scores}); + + std::vector expected_selected_indices = {3, 0, 0, 3}; + std::vector expected_selected_scores = {0.00, 0.95, 0.00, 10.00, 1.00, 11.00 , + 0.00, 0.90, 0.00, 0.00, 1.00, 1.00 , + 1.00, 0.95, 0.00, 0.00, 1.00, 1.00 , + 1.00, 0.80, 0.00, 10.00, 
1.00, 11.00 }; + std::vector expected_valid_outputs = {4}; + + auto test_case = test::TestCase(f); + test_case.add_multiple_inputs({boxes_data, scores_data}); + test_case.add_expected_output({4, 6}, expected_selected_scores); + test_case.add_expected_output({4, 1}, expected_selected_indices); + test_case.add_expected_output({1}, expected_valid_outputs); + test_case.run(); + +} + +NGRAPH_TEST(${BACKEND_NAME}, multiclass_nms_output_type_i32) +{ + std::vector boxes_data = {0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, + 0.0, -0.1, 1.0, 0.9, 0.0, 10.0, 1.0, 11.0, + 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0}; + + std::vector scores_data = { + 0.9, 0.75, 0.6, 0.95, 0.5, 0.3, + 0.95, 0.75, 0.6, 0.80, 0.5, 0.3}; + + op::v8::MulticlassNms::Attributes attrs; + attrs.nms_top_k = 3; + attrs.iou_threshold = 0.5f; + attrs.score_threshold = 0.0f; + attrs.sort_result_type = op::v8::MulticlassNms::SortResultType::CLASSID; + attrs.keep_top_k = -1; + attrs.background_class = -1; + attrs.nms_eta = 1.0f; + attrs.output_type = element::i32; + + const auto boxes_shape = Shape{1, 6, 4}; // N 1, C 2, M 6 + const auto scores_shape = Shape{1, 2, 6}; + + const auto boxes = make_shared(element::f32, boxes_shape); + const auto scores = make_shared(element::f32, scores_shape); + + auto nms = make_shared(boxes, scores, attrs); + + auto f = make_shared(nms, ParameterVector{boxes, scores}); + + std::vector expected_selected_indices = {3, 0, 0, 3}; + std::vector expected_selected_scores = {0.00, 0.95, 0.00, 10.00, 1.00, 11.00 , + 0.00, 0.90, 0.00, 0.00, 1.00, 1.00 , + 1.00, 0.95, 0.00, 0.00, 1.00, 1.00 , + 1.00, 0.80, 0.00, 10.00, 1.00, 11.00 }; + std::vector expected_valid_outputs = {4}; + + auto test_case = test::TestCase(f); + test_case.add_multiple_inputs({boxes_data, scores_data}); + test_case.add_expected_output({4, 6}, expected_selected_scores); + test_case.add_expected_output({4, 1}, expected_selected_indices); + test_case.add_expected_output({1}, expected_valid_outputs); + test_case.run(); +} 
+ +NGRAPH_TEST(${BACKEND_NAME}, multiclass_nms_two_batches_two_classes_by_score) +{ + std::vector boxes_data = {0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, + 0.0, -0.1, 1.0, 0.9, 0.0, 10.0, 1.0, 11.0, + 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0, // 0 + 0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, + 0.0, -0.1, 1.0, 0.9, 0.0, 10.0, 1.0, 11.0, + 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0 // 1 + }; + + std::vector scores_data = { + 0.9, 0.75, 0.6, 0.95, 0.5, 0.3, + 0.95, 0.75, 0.6, 0.80, 0.5, 0.3, // 0 + 0.9, 0.75, 0.6, 0.95, 0.5, 0.3, + 0.95, 0.75, 0.6, 0.80, 0.5, 0.3 // 1 + }; + + op::v8::MulticlassNms::Attributes attrs; + attrs.nms_top_k = 3; + attrs.iou_threshold = 0.5f; + attrs.score_threshold = 0.0f; + attrs.sort_result_type = op::v8::MulticlassNms::SortResultType::SCORE; + attrs.keep_top_k = -1; + attrs.background_class = -1; + attrs.nms_eta = 1.0f; + + const auto boxes_shape = Shape{2, 6, 4}; // N 2, C 2, M 6 + const auto scores_shape = Shape{2, 2, 6}; + + const auto boxes = make_shared(element::f32, boxes_shape); + const auto scores = make_shared(element::f32, scores_shape); + + auto nms = make_shared(boxes, scores, attrs); + + auto f = make_shared(nms, ParameterVector{boxes, scores}); + + std::vector expected_selected_indices = {3, 0, 0, 3, + 9, 6, 6, 9}; + std::vector expected_selected_scores = {0.00, 0.95, 0.00, 10.00, 1.00, 11.00, 1.00, 0.95, 0.00, 0.00, 1.00, 1.00, + 0.00, 0.90, 0.00, 0.00, 1.00, 1.00, 1.00, 0.80, 0.00, 10.00, 1.00, 11.00, // 0 + 0.00, 0.95, 0.00, 10.00, 1.00, 11.00, 1.00, 0.95, 0.00, 0.00, 1.00, 1.00, + 0.00, 0.90, 0.00, 0.00, 1.00, 1.00, 1.00, 0.80, 0.00, 10.00, 1.00, 11.00 }; // 1 + std::vector expected_valid_outputs = {4, 4}; + + auto test_case = test::TestCase(f); + test_case.add_multiple_inputs({boxes_data, scores_data}); + test_case.add_expected_output({8, 6}, expected_selected_scores); + test_case.add_expected_output({8, 1}, expected_selected_indices); + test_case.add_expected_output({2}, expected_valid_outputs); + test_case.run(); +} 
+ +NGRAPH_TEST(${BACKEND_NAME}, multiclass_nms_two_batches_two_classes_by_class_id) +{ + std::vector boxes_data = {0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, + 0.0, -0.1, 1.0, 0.9, 0.0, 10.0, 1.0, 11.0, + 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0, // 0 + 0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, + 0.0, -0.1, 1.0, 0.9, 0.0, 10.0, 1.0, 11.0, + 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0 // 1 + }; + + std::vector scores_data = { + 0.9, 0.75, 0.6, 0.95, 0.5, 0.3, + 0.95, 0.75, 0.6, 0.80, 0.5, 0.3, // 0 + 0.9, 0.75, 0.6, 0.95, 0.5, 0.3, + 0.95, 0.75, 0.6, 0.80, 0.5, 0.3 // 1 + }; + + op::v8::MulticlassNms::Attributes attrs; + attrs.nms_top_k = 3; + attrs.iou_threshold = 0.5f; + attrs.score_threshold = 0.0f; + attrs.sort_result_type = op::v8::MulticlassNms::SortResultType::CLASSID; + attrs.keep_top_k = -1; + attrs.background_class = -1; + attrs.nms_eta = 1.0f; + + const auto boxes_shape = Shape{2, 6, 4}; // N 2, C 2, M 6 + const auto scores_shape = Shape{2, 2, 6}; + + const auto boxes = make_shared(element::f32, boxes_shape); + const auto scores = make_shared(element::f32, scores_shape); + auto nms = make_shared(boxes, scores, attrs); + + auto f = make_shared(nms, ParameterVector{boxes, scores}); + + std::vector expected_selected_indices = {3, 0, 0, 3, + 9, 6, 6, 9}; + std::vector expected_selected_scores = {0.00, 0.95, 0.00, 10.00, 1.00, 11.00, 0.00, 0.90, 0.00, 0.00, 1.00, 1.00, + 1.00, 0.95, 0.00, 0.00, 1.00, 1.00, 1.00, 0.80, 0.00, 10.00, 1.00, 11.00, // 0 + 0.00, 0.95, 0.00, 10.00, 1.00, 11.00, 0.00, 0.90, 0.00, 0.00, 1.00, 1.00, + 1.00, 0.95, 0.00, 0.00, 1.00, 1.00, 1.00, 0.80, 0.00, 10.00, 1.00, 11.00 }; // 1 + std::vector expected_valid_outputs = {4, 4}; + + auto test_case = test::TestCase(f); + test_case.add_multiple_inputs({boxes_data, scores_data}); + test_case.add_expected_output({8, 6}, expected_selected_scores); + test_case.add_expected_output({8, 1}, expected_selected_indices); + test_case.add_expected_output({2}, expected_valid_outputs); + test_case.run(); 
+} + +NGRAPH_TEST(${BACKEND_NAME}, multiclass_nms_two_batches_two_classes_by_score_cross_batch) +{ + std::vector boxes_data = {0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, + 0.0, -0.1, 1.0, 0.9, 0.0, 10.0, 1.0, 11.0, + 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0, // 0 + 0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, + 0.0, -0.1, 1.0, 0.9, 0.0, 10.0, 1.0, 11.0, + 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0 // 1 + }; + + std::vector scores_data = { + 0.9, 0.75, 0.6, 0.95, 0.5, 0.3, + 0.95, 0.75, 0.6, 0.80, 0.5, 0.3, // 0 + 0.9, 0.75, 0.6, 0.95, 0.5, 0.3, + 0.95, 0.75, 0.6, 0.80, 0.5, 0.3 // 1 + }; + + op::v8::MulticlassNms::Attributes attrs; + attrs.nms_top_k = 3; + attrs.iou_threshold = 0.5f; + attrs.score_threshold = 0.0f; + attrs.sort_result_type = op::v8::MulticlassNms::SortResultType::SCORE; + attrs.keep_top_k = -1; + attrs.background_class = -1; + attrs.nms_eta = 1.0f; + attrs.sort_result_across_batch = true; + + const auto boxes_shape = Shape{2, 6, 4}; // N 2, C 2, M 6 + const auto scores_shape = Shape{2, 2, 6}; + + const auto boxes = make_shared(element::f32, boxes_shape); + const auto scores = make_shared(element::f32, scores_shape); + + auto nms = make_shared(boxes, scores, attrs); + + auto f = make_shared(nms, ParameterVector{boxes, scores}); + + std::vector expected_selected_indices = {3, 0, 9, 6, + 0, 6, 3, 9}; + std::vector expected_selected_scores = {0.00, 0.95, 0.00, 10.00, 1.00, 11.00, //3 + 1.00, 0.95, 0.00, 0.00, 1.00, 1.00, //0 + 0.00, 0.95, 0.00, 10.00, 1.00, 11.00, //9 + 1.00, 0.95, 0.00, 0.00, 1.00, 1.00, //6 + 0.00, 0.90, 0.00, 0.00, 1.00, 1.00, //0 + 0.00, 0.90, 0.00, 0.00, 1.00, 1.00, //6 + 1.00, 0.80, 0.00, 10.00, 1.00, 11.00, //3 + 1.00, 0.80, 0.00, 10.00, 1.00, 11.00 }; // 9 + std::vector expected_valid_outputs = {4, 4}; + + auto test_case = test::TestCase(f); + test_case.add_multiple_inputs({boxes_data, scores_data}); + test_case.add_expected_output({8, 6}, expected_selected_scores); + test_case.add_expected_output({8, 1}, 
expected_selected_indices); + test_case.add_expected_output({2}, expected_valid_outputs); + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, multiclass_nms_two_batches_two_classes_by_class_id_cross_batch) +{ + std::vector boxes_data = {0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, + 0.0, -0.1, 1.0, 0.9, 0.0, 10.0, 1.0, 11.0, + 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0, // 0 + 0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, + 0.0, -0.1, 1.0, 0.9, 0.0, 10.0, 1.0, 11.0, + 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0 // 1 + }; + + std::vector scores_data = { + 0.9, 0.75, 0.6, 0.95, 0.5, 0.3, + 0.95, 0.75, 0.6, 0.80, 0.5, 0.3, // 0 + 0.9, 0.75, 0.6, 0.95, 0.5, 0.3, + 0.95, 0.75, 0.6, 0.80, 0.5, 0.3 // 1 + }; + + op::v8::MulticlassNms::Attributes attrs; + attrs.nms_top_k = 3; + attrs.iou_threshold = 0.5f; + attrs.score_threshold = 0.0f; + attrs.sort_result_type = op::v8::MulticlassNms::SortResultType::CLASSID; + attrs.keep_top_k = -1; + attrs.background_class = -1; + attrs.nms_eta = 1.0f; + attrs.sort_result_across_batch = true; + + const auto boxes_shape = Shape{2, 6, 4}; // N 2, C 2, M 6 + const auto scores_shape = Shape{2, 2, 6}; + + const auto boxes = make_shared(element::f32, boxes_shape); + const auto scores = make_shared(element::f32, scores_shape); + auto nms = make_shared(boxes, scores, attrs); + + auto f = make_shared(nms, ParameterVector{boxes, scores}); + + std::vector expected_selected_indices = {3, 0, 9, 6, + 0, 3, 6, 9}; + std::vector expected_selected_scores = {0.00, 0.95, 0.00, 10.00, 1.00, 11.00, //3 + 0.00, 0.90, 0.00, 0.00, 1.00, 1.00, //0 + 0.00, 0.95, 0.00, 10.00, 1.00, 11.00, //9 + 0.00, 0.90, 0.00, 0.00, 1.00, 1.00, //6 + 1.00, 0.95, 0.00, 0.00, 1.00, 1.00, //0 + 1.00, 0.80, 0.00, 10.00, 1.00, 11.00, // 3 + 1.00, 0.95, 0.00, 0.00, 1.00, 1.00, //6 + 1.00, 0.80, 0.00, 10.00, 1.00, 11.00 }; // 9 + std::vector expected_valid_outputs = {4, 4}; + + auto test_case = test::TestCase(f); + test_case.add_multiple_inputs({boxes_data, scores_data}); + 
test_case.add_expected_output({8, 6}, expected_selected_scores); + test_case.add_expected_output({8, 1}, expected_selected_indices); + test_case.add_expected_output({2}, expected_valid_outputs); + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, multiclass_nms_flipped_coordinates) +{ + std::vector boxes_data = {1.0, 1.0, 0.0, 0.0, 0.0, 0.1, 1.0, 1.1, + 0.0, 0.9, 1.0, -0.1, 0.0, 10.0, 1.0, 11.0, + 1.0, 10.1, 0.0, 11.1, 1.0, 101.0, 0.0, 100.0}; + + std::vector scores_data = {0.9, 0.75, 0.6, 0.95, 0.5, 0.3}; + + op::v8::MulticlassNms::Attributes attrs; + attrs.nms_top_k = 3; + attrs.iou_threshold = 0.5f; + attrs.score_threshold = 0.0f; + attrs.sort_result_type = op::v8::MulticlassNms::SortResultType::SCORE; + attrs.keep_top_k = -1; + attrs.background_class = -1; + attrs.nms_eta = 1.0f; + + const auto boxes_shape = Shape{1, 6, 4}; // N 1, C 1, M 6 + const auto scores_shape = Shape{1, 1, 6}; + + const auto boxes = make_shared(element::f32, boxes_shape); + const auto scores = make_shared(element::f32, scores_shape); + auto nms = make_shared(boxes, scores, attrs); + + auto f = make_shared(nms, ParameterVector{boxes, scores}); + + std::vector expected_selected_indices = {3, 0, 1}; + std::vector expected_selected_scores = {0.00, 0.95, 0.00, 10.00, 1.00, 11.00 , + 0.00, 0.90, 1.00, 1.00, 0.00, 0.00 , + 0.00, 0.75, 0.00, 0.10, 1.00, 1.10}; + std::vector expected_valid_outputs = {3}; + + auto test_case = test::TestCase(f); + test_case.add_multiple_inputs({boxes_data, scores_data}); + test_case.add_expected_output({3, 6}, expected_selected_scores); + test_case.add_expected_output({3, 1}, expected_selected_indices); + test_case.add_expected_output({1}, expected_valid_outputs); + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, multiclass_nms_identical_boxes) +{ + std::vector boxes_data = {0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, + 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, + 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, + 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 
0.0, 0.0, 1.0, 1.0}; + + std::vector scores_data = {0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9}; + + op::v8::MulticlassNms::Attributes attrs; + attrs.nms_top_k = 3; + attrs.iou_threshold = 0.5f; + attrs.score_threshold = 0.0f; + attrs.sort_result_type = op::v8::MulticlassNms::SortResultType::SCORE; + attrs.keep_top_k = -1; + attrs.background_class = -1; + attrs.nms_eta = 1.0f; + + const auto boxes_shape = Shape{1, 10, 4}; // N 1, C 1, M 10 + const auto scores_shape = Shape{1, 1, 10}; + + const auto boxes = make_shared(element::f32, boxes_shape); + const auto scores = make_shared(element::f32, scores_shape); + auto nms = make_shared(boxes, scores, attrs); + + auto f = make_shared(nms, ParameterVector{boxes, scores}); + + std::vector expected_selected_indices = {0}; + std::vector expected_selected_scores = {0.00, 0.90, 0.00, 0.00, 1.00, 1.00}; + std::vector expected_valid_outputs = {1}; + + auto test_case = test::TestCase(f); + test_case.add_multiple_inputs({boxes_data, scores_data}); + test_case.add_expected_output({1, 6}, expected_selected_scores); + test_case.add_expected_output({1, 1}, expected_selected_indices); + test_case.add_expected_output({1}, expected_valid_outputs); + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, multiclass_nms_limit_output_size) +{ + std::vector boxes_data = {0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, + 0.0, -0.1, 1.0, 0.9, 0.0, 10.0, 1.0, 11.0, + 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0}; + + std::vector scores_data = {0.9, 0.75, 0.6, 0.95, 0.5, 0.3}; + + op::v8::MulticlassNms::Attributes attrs; + attrs.nms_top_k = 2; + attrs.iou_threshold = 0.5f; + attrs.score_threshold = 0.0f; + attrs.sort_result_type = op::v8::MulticlassNms::SortResultType::SCORE; + attrs.keep_top_k = -1; + attrs.background_class = -1; + attrs.nms_eta = 1.0f; + + const auto boxes_shape = Shape{1, 6, 4}; + const auto scores_shape = Shape{1, 1, 6}; + + const auto boxes = make_shared(element::f32, boxes_shape); + const auto scores = 
make_shared(element::f32, scores_shape); + auto nms = make_shared(boxes, scores, attrs); + + auto f = make_shared(nms, ParameterVector{boxes, scores}); + + std::vector expected_selected_indices = {3, 0}; + std::vector expected_selected_scores = {0.00, 0.95, 0.00, 10.00, 1.00, 11.00 , + 0.00, 0.90, 0.00, 0.00, 1.00, 1.00 }; + std::vector expected_valid_outputs = {2}; + + auto test_case = test::TestCase(f); + test_case.add_multiple_inputs({boxes_data, scores_data}); + test_case.add_expected_output({2, 6}, expected_selected_scores); + test_case.add_expected_output({2, 1}, expected_selected_indices); + test_case.add_expected_output({1}, expected_valid_outputs); + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, multiclass_nms_single_box) +{ + std::vector boxes_data = {0.0, 0.0, 1.0, 1.0}; + + std::vector scores_data = {0.9}; + + op::v8::MulticlassNms::Attributes attrs; + attrs.nms_top_k = 3; + attrs.iou_threshold = 0.5f; + attrs.score_threshold = 0.0f; + attrs.sort_result_type = op::v8::MulticlassNms::SortResultType::SCORE; + attrs.keep_top_k = -1; + attrs.background_class = -1; + attrs.nms_eta = 1.0f; + + const auto boxes_shape = Shape{1, 1, 4}; + const auto scores_shape = Shape{1, 1, 1}; + + const auto boxes = make_shared(element::f32, boxes_shape); + const auto scores = make_shared(element::f32, scores_shape); + auto nms = make_shared(boxes, scores, attrs); + + auto f = make_shared(nms, ParameterVector{boxes, scores}); + + std::vector expected_selected_indices = {0}; + std::vector expected_selected_scores = {0.00, 0.90, 0.00, 0.00, 1.00, 1.00}; + std::vector expected_valid_outputs = {1}; + + auto test_case = test::TestCase(f); + test_case.add_multiple_inputs({boxes_data, scores_data}); + test_case.add_expected_output({1, 6}, expected_selected_scores); + test_case.add_expected_output({1, 1}, expected_selected_indices); + test_case.add_expected_output({1}, expected_valid_outputs); + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, multiclass_nms_by_IOU) +{ + 
std::vector boxes_data = {0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, + 0.0, -0.1, 1.0, 0.9, 0.0, 10.0, 1.0, 11.0, + 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0}; + + std::vector scores_data = {0.9, 0.75, 0.6, 0.95, 0.5, 0.3}; + + op::v8::MulticlassNms::Attributes attrs; + attrs.nms_top_k = 3; + attrs.iou_threshold = 0.2f; + attrs.score_threshold = 0.0f; + attrs.sort_result_type = op::v8::MulticlassNms::SortResultType::SCORE; + attrs.keep_top_k = -1; + attrs.background_class = -1; + attrs.nms_eta = 1.0f; + + const auto boxes_shape = Shape{1, 6, 4}; + const auto scores_shape = Shape{1, 1, 6}; + + const auto boxes = make_shared(element::f32, boxes_shape); + const auto scores = make_shared(element::f32, scores_shape); + auto nms = make_shared(boxes, scores, attrs); + + auto f = make_shared(nms, ParameterVector{boxes, scores}); + + std::vector expected_selected_indices = {3, 0}; + std::vector expected_selected_scores = {0.00, 0.95, 0.00, 10.00, 1.00, 11.00 , + 0.00, 0.90, 0.00, 0.00, 1.00, 1.00}; + std::vector expected_valid_outputs = {2}; + + auto test_case = test::TestCase(f); + test_case.add_multiple_inputs({boxes_data, scores_data}); + test_case.add_expected_output({2, 6}, expected_selected_scores); + test_case.add_expected_output({2, 1}, expected_selected_indices); + test_case.add_expected_output({1}, expected_valid_outputs); + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, multiclass_nms_by_IOU_and_scores) +{ + std::vector boxes_data = {0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, + 0.0, -0.1, 1.0, 0.9, 0.0, 10.0, 1.0, 11.0, + 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0}; + + std::vector scores_data = {0.9, 0.75, 0.6, 0.95, 0.5, 0.3}; + + op::v8::MulticlassNms::Attributes attrs; + attrs.nms_top_k = 3; + attrs.iou_threshold = 0.5f; + attrs.score_threshold = 0.95f; + attrs.sort_result_type = op::v8::MulticlassNms::SortResultType::SCORE; + attrs.keep_top_k = -1; + attrs.background_class = -1; + attrs.nms_eta = 1.0f; + + const auto boxes_shape = Shape{1, 6, 4}; + const 
auto scores_shape = Shape{1, 1, 6}; + + const auto boxes = make_shared(element::f32, boxes_shape); + const auto scores = make_shared(element::f32, scores_shape); + auto nms = make_shared(boxes, scores, attrs); + + auto f = make_shared(nms, ParameterVector{boxes, scores}); + + std::vector expected_selected_indices = {3}; + std::vector expected_selected_scores = {0.00, 0.95, 0.00, 10.00, 1.00, 11.00}; + std::vector expected_valid_outputs = {1}; + + auto test_case = test::TestCase(f); + test_case.add_multiple_inputs({boxes_data, scores_data}); + test_case.add_expected_output({1, 6}, expected_selected_scores); + test_case.add_expected_output({1, 1}, expected_selected_indices); + test_case.add_expected_output({1}, expected_valid_outputs); + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, multiclass_nms_no_output) +{ + std::vector boxes_data = {0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, + 0.0, -0.1, 1.0, 0.9, 0.0, 10.0, 1.0, 11.0, + 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0}; + + std::vector scores_data = {0.9, 0.75, 0.6, 0.95, 0.5, 0.3}; + + op::v8::MulticlassNms::Attributes attrs; + attrs.nms_top_k = 3; + attrs.iou_threshold = 0.5f; + attrs.score_threshold = 2.0f; + attrs.sort_result_type = op::v8::MulticlassNms::SortResultType::SCORE; + attrs.keep_top_k = -1; + attrs.background_class = -1; + attrs.nms_eta = 1.0f; + + const auto boxes_shape = Shape{1, 6, 4}; + const auto scores_shape = Shape{1, 1, 6}; + + const auto boxes = make_shared(element::f32, boxes_shape); + const auto scores = make_shared(element::f32, scores_shape); + auto nms = make_shared(boxes, scores, attrs); + + auto f = make_shared(nms, ParameterVector{boxes, scores}); + + std::vector expected_selected_indices = {}; + std::vector expected_selected_scores = {}; + std::vector expected_valid_outputs = {0}; + + auto test_case = test::TestCase(f); + test_case.add_multiple_inputs({boxes_data, scores_data}); + test_case.add_expected_output({0, 6}, expected_selected_scores); + 
test_case.add_expected_output({0, 1}, expected_selected_indices); + test_case.add_expected_output({1}, expected_valid_outputs); + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, multiclass_nms_by_background) +{ + std::vector boxes_data = {0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, + 0.0, -0.1, 1.0, 0.9, 0.0, 10.0, 1.0, 11.0, + 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0, // 0 + 0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, + 0.0, -0.1, 1.0, 0.9, 0.0, 10.0, 1.0, 11.0, + 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0 // 1 + }; + + std::vector scores_data = { + 0.9, 0.75, 0.6, 0.95, 0.5, 0.3, + 0.95, 0.75, 0.6, 0.80, 0.5, 0.3, // 0 + 0.9, 0.75, 0.6, 0.95, 0.5, 0.3, + 0.95, 0.75, 0.6, 0.80, 0.5, 0.3 // 1 + }; + + op::v8::MulticlassNms::Attributes attrs; + attrs.nms_top_k = 3; + attrs.iou_threshold = 0.5f; + attrs.score_threshold = 0.0f; + attrs.sort_result_type = op::v8::MulticlassNms::SortResultType::CLASSID; + attrs.keep_top_k = -1; + attrs.background_class = 0; + attrs.nms_eta = 1.0f; + + const auto boxes_shape = Shape{2, 6, 4}; // N 2, C 2, M 6 + const auto scores_shape = Shape{2, 2, 6}; + + const auto boxes = make_shared(element::f32, boxes_shape); + const auto scores = make_shared(element::f32, scores_shape); + auto nms = make_shared(boxes, scores, attrs); + + auto f = make_shared(nms, ParameterVector{boxes, scores}); + + std::vector expected_selected_indices = {0, 3, 6, 9}; + std::vector expected_selected_scores = {1.00, 0.95, 0.00, 0.00, 1.00, 1.00, 1.00, 0.80, 0.00, 10.00, 1.00, 11.00, // 0 + 1.00, 0.95, 0.00, 0.00, 1.00, 1.00, 1.00, 0.80, 0.00, 10.00, 1.00, 11.00 }; // 1 + std::vector expected_valid_outputs = {2, 2}; + + auto test_case = test::TestCase(f); + test_case.add_multiple_inputs({boxes_data, scores_data}); + test_case.add_expected_output({4, 6}, expected_selected_scores); + test_case.add_expected_output({4, 1}, expected_selected_indices); + test_case.add_expected_output({2}, expected_valid_outputs); + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, 
multiclass_nms_by_keep_top_k) +{ + std::vector boxes_data = {0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, + 0.0, -0.1, 1.0, 0.9, 0.0, 10.0, 1.0, 11.0, + 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0, // 0 + 0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, + 0.0, -0.1, 1.0, 0.9, 0.0, 10.0, 1.0, 11.0, + 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0 // 1 + }; + + std::vector scores_data = { + 0.9, 0.75, 0.6, 0.95, 0.5, 0.3, + 0.95, 0.75, 0.6, 0.80, 0.5, 0.3, // 0 + 0.9, 0.75, 0.6, 0.95, 0.5, 0.3, + 0.95, 0.75, 0.6, 0.80, 0.5, 0.3 // 1 + }; + + op::v8::MulticlassNms::Attributes attrs; + attrs.nms_top_k = 3; + attrs.iou_threshold = 0.5f; + attrs.score_threshold = 0.0f; + attrs.sort_result_type = op::v8::MulticlassNms::SortResultType::CLASSID; + attrs.keep_top_k = 3; + attrs.background_class = -1; + attrs.nms_eta = 1.0f; + + const auto boxes_shape = Shape{2, 6, 4}; // N 2, C 2, M 6 + const auto scores_shape = Shape{2, 2, 6}; + + const auto boxes = make_shared(element::f32, boxes_shape); + const auto scores = make_shared(element::f32, scores_shape); + auto nms = make_shared(boxes, scores, attrs); + + auto f = make_shared(nms, ParameterVector{boxes, scores}); + + std::vector expected_selected_indices = {3, 0, 0, + 9, 6, 6}; + std::vector expected_selected_scores = {0.00, 0.95, 0.00, 10.00, 1.00, 11.00, 0.00, 0.90, 0.00, 0.00, 1.00, 1.00, + 1.00, 0.95, 0.00, 0.00, 1.00, 1.00, // 0 + 0.00, 0.95, 0.00, 10.00, 1.00, 11.00, 0.00, 0.90, 0.00, 0.00, 1.00, 1.00, + 1.00, 0.95, 0.00, 0.00, 1.00, 1.00 }; // 1 + std::vector expected_valid_outputs = {3, 3}; + + auto test_case = test::TestCase(f); + test_case.add_multiple_inputs({boxes_data, scores_data}); + test_case.add_expected_output({6, 6}, expected_selected_scores); + test_case.add_expected_output({6, 1}, expected_selected_indices); + test_case.add_expected_output({2}, expected_valid_outputs); + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, multiclass_nms_by_nms_eta) +{ + std::vector boxes_data = {0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, + 
0.0, -0.1, 1.0, 0.9, 0.0, 10.0, 1.0, 11.0, + 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0, // 0 + 0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, + 0.0, -0.1, 1.0, 0.9, 0.0, 10.0, 1.0, 11.0, + 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0 // 1 + }; + + std::vector scores_data = { + 0.9, 0.75, 0.6, 0.95, 0.5, 0.3, + 0.95, 0.75, 0.6, 0.80, 0.5, 0.3, // 0 + 0.9, 0.75, 0.6, 0.95, 0.5, 0.3, + 0.95, 0.75, 0.6, 0.80, 0.5, 0.3 // 1 + }; + + op::v8::MulticlassNms::Attributes attrs; + attrs.nms_top_k = -1; + attrs.iou_threshold = 1.0f; + attrs.score_threshold = 0.0f; + attrs.sort_result_type = op::v8::MulticlassNms::SortResultType::CLASSID; + attrs.keep_top_k = -1; + attrs.background_class = -1; + attrs.nms_eta = 0.1f; + + const auto boxes_shape = Shape{2, 6, 4}; // N 2, C 2, M 6 + const auto scores_shape = Shape{2, 2, 6}; + + const auto boxes = make_shared(element::f32, boxes_shape); + const auto scores = make_shared(element::f32, scores_shape); + auto nms = make_shared(boxes, scores, attrs); + + auto f = make_shared(nms, ParameterVector{boxes, scores}); + + std::vector expected_selected_indices = {3, 0, 5, 0, 3, 5, + 9, 6, 11, 6, 9, 11}; + std::vector expected_selected_scores = {0.00, 0.95, 0.00, 10.00, 1.00, 11.00 , + 0.00, 0.90, 0.00, 0.00, 1.00, 1.00 , + 0.00, 0.30, 0.00, 100.00, 1.00, 101.00 , + 1.00, 0.95, 0.00, 0.00, 1.00, 1.00 , + 1.00, 0.80, 0.00, 10.00, 1.00, 11.00 , + 1.00, 0.30, 0.00, 100.00, 1.00, 101.00 , + 0.00, 0.95, 0.00, 10.00, 1.00, 11.00 , + 0.00, 0.90, 0.00, 0.00, 1.00, 1.00 , + 0.00, 0.30, 0.00, 100.00, 1.00, 101.00 , + 1.00, 0.95, 0.00, 0.00, 1.00, 1.00 , + 1.00, 0.80, 0.00, 10.00, 1.00, 11.00 , + 1.00, 0.30, 0.00, 100.00, 1.00, 101.00 }; + std::vector expected_valid_outputs = {6, 6}; + + auto test_case = test::TestCase(f); + test_case.add_multiple_inputs({boxes_data, scores_data}); + test_case.add_expected_output({12, 6}, expected_selected_scores); + test_case.add_expected_output({12, 1}, expected_selected_indices); + test_case.add_expected_output({2}, 
expected_valid_outputs); + test_case.run(); +} diff --git a/ngraph/test/backend/prior_box.in.cpp b/ngraph/test/backend/prior_box.in.cpp new file mode 100644 index 00000000000..f79130ee3f9 --- /dev/null +++ b/ngraph/test/backend/prior_box.in.cpp @@ -0,0 +1,47 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include "gtest/gtest.h" + +#include "ngraph/ngraph.hpp" +#include "ngraph/op/prior_box.hpp" + +#include "util/engine/test_engines.hpp" +#include "util/test_case.hpp" +#include "util/test_control.hpp" + +using namespace std; +using namespace ngraph; + +static string s_manifest = "${MANIFEST}"; +using TestEngine = test::ENGINE_CLASS_NAME(${BACKEND_NAME}); + +NGRAPH_TEST(${BACKEND_NAME}, prior_box) +{ + op::PriorBoxAttrs attrs; + attrs.min_size = {2.0f}; + attrs.aspect_ratio = {1.5f}; + attrs.scale_all_sizes = false; + + Shape layer_shape_shape{2}; + Shape image_shape_shape{2}; + vector layer_shape{2, 2}; + vector image_shape{10, 10}; + + auto LS = op::Constant::create(element::i64, layer_shape_shape, layer_shape); + auto IS = op::Constant::create(element::i64, image_shape_shape, image_shape); + auto f = make_shared(make_shared(LS, IS, attrs), ParameterVector{}); + const auto exp_shape = Shape{2, 32}; + vector out{-0.75, -0.75, 1.25, 1.25, -0.974745, -0.566497, 1.47474, 1.0665, + -0.25, -0.75, 1.75, 1.25, -0.474745, -0.566497, 1.97474, 1.0665, + -0.75, -0.25, 1.25, 1.75, -0.974745, -0.0664966, 1.47474, 1.5665, + -0.25, -0.25, 1.75, 1.75, -0.474745, -0.0664966, 1.97474, 1.5665, + 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, + 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, + 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, + 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1}; + + auto test_case = test::TestCase(f); + test_case.add_expected_output(exp_shape, out); + test_case.run_with_tolerance_as_fp(1.0e-5f); +} diff --git a/ngraph/test/backend/sinh.in.cpp b/ngraph/test/backend/sinh.in.cpp index 903a2bf3649..414c203766b 100644 --- 
a/ngraph/test/backend/sinh.in.cpp +++ b/ngraph/test/backend/sinh.in.cpp @@ -20,38 +20,86 @@ // clang-format on #include "gtest/gtest.h" -#include "runtime/backend.hpp" -#include "ngraph/runtime/tensor.hpp" #include "ngraph/ngraph.hpp" -#include "util/all_close.hpp" -#include "util/all_close_f.hpp" -#include "util/ndarray.hpp" +#include "util/engine/test_engines.hpp" +#include "util/test_case.hpp" #include "util/test_control.hpp" -#include "util/test_tools.hpp" using namespace std; using namespace ngraph; static string s_manifest = "${MANIFEST}"; +using TestEngine = test::ENGINE_CLASS_NAME(${BACKEND_NAME}); + NGRAPH_TEST(${BACKEND_NAME}, sinh) { - Shape shape{6}; + Shape shape{8}; auto A = make_shared(element::f32, shape); auto f = make_shared(make_shared(A), ParameterVector{A}); - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape); - vector input{1.0f, 0.0f, -0.0f, -1.0f, 5.0f, -5.0f}; - copy_data(a, input); - auto result = backend->create_tensor(element::f32, shape); - - std::transform( - input.begin(), input.end(), input.begin(), [](float x) -> float { return sinhf(x); }); - - auto handle = backend->compile(f); - handle->call_with_validate({result}, {a}); - EXPECT_TRUE(test::all_close_f(input, read_vector(result))); + auto test_case = test::TestCase(f); + test_case.add_input({-4, -3, -2, -1, 0, 1, 2, 3}); + test_case.add_expected_output( + shape, {sinhf(-4), sinhf(-3), sinhf(-2), sinhf(-1), sinhf(0), sinhf(1), sinhf(2), sinhf(3)}); + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, sinh_negative) +{ + Shape shape{5}; + auto A = make_shared(element::f32, shape); + auto f = make_shared(make_shared(A), ParameterVector{A}); + + auto test_case = test::TestCase(f); + test_case.add_input({-4, -3, -2, -1, -5}); + test_case.add_expected_output( + shape, {sinhf(-4), sinhf(-3), sinhf(-2), sinhf(-1), sinhf(-5)}); + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, 
sinh_scalar) +{ + Shape shape{}; + auto A = make_shared(element::f32, shape); + auto f = make_shared(make_shared(A), ParameterVector{A}); + + const vector a{13}; + + auto test_case = test::TestCase(f); + test_case.add_input({a}); + test_case.add_expected_output(shape, {sinhf(13)}); + test_case.run(DEFAULT_FLOAT_TOLERANCE_BITS + 2); +} + +NGRAPH_TEST(${BACKEND_NAME}, sinh_in_place) +{ + Shape shape{2}; + auto A = make_shared(element::f32, shape); + auto T = make_shared(A); + auto T2 = make_shared(T); + + auto f = make_shared(T2, ParameterVector{A}); + + const vector a{1, 3}; + + auto test_case = test::TestCase(f); + test_case.add_input({a}); + test_case.add_expected_output(shape, {sinhf(sinhf(1)), sinhf(sinhf(3))}); + test_case.run(DEFAULT_FLOAT_TOLERANCE_BITS + 2); +} + +NGRAPH_TEST(${BACKEND_NAME}, sinh_i32) +{ + Shape shape{5}; + auto A = make_shared(element::i32, shape);; + auto f = make_shared(make_shared(A), ParameterVector{A}); + + const vector input{2, 1, 0, -1, -2}; + const vector expected{4, 1, 0, -1, -4}; + + auto test_case = test::TestCase(f); + test_case.add_input({input}); + test_case.add_expected_output(shape, {expected}); + test_case.run(); } diff --git a/ngraph/test/backend/space_to_depth.in.cpp b/ngraph/test/backend/space_to_depth.in.cpp new file mode 100644 index 00000000000..5cd32880178 --- /dev/null +++ b/ngraph/test/backend/space_to_depth.in.cpp @@ -0,0 +1,120 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "gtest/gtest.h" + +#include "ngraph/op/space_to_depth.hpp" +#include "util/engine/test_engines.hpp" +#include "util/test_case.hpp" +#include "util/test_control.hpp" + +using namespace ngraph; + +static std::string s_manifest = "${MANIFEST}"; + +using TestEngine = test::ENGINE_CLASS_NAME(${BACKEND_NAME}); + +NGRAPH_TEST(${BACKEND_NAME}, space_to_depth_block_first_K2_BS2) +{ + auto A = std::make_shared(element::f32, Shape{1, 2, 4, 4}); + const auto mode = 
op::SpaceToDepth::SpaceToDepthMode::BLOCKS_FIRST; + auto space_to_depth = std::make_shared(A, mode, 2); + auto function = std::make_shared(NodeVector{space_to_depth}, ParameterVector{A}); + + auto test_case = test::TestCase(function); + test_case.add_input({0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, + 11.f, 12.f, 13.f, 14.f, 15.f, 16.f, 17.f, 18.f, 19.f, 20.f, 21.f, + 22.f, 23.f, 24.f, 25.f, 26.f, 27.f, 28.f, 29.f, 30.f, 31.f}); + test_case.add_expected_output( + Shape{1, 8, 2, 2}, {0.f, 2.f, 8.f, 10.f, 16.f, 18.f, 24.f, 26.f, 1.f, 3.f, 9.f, + 11.f, 17.f, 19.f, 25.f, 27.f, 4.f, 6.f, 12.f, 14.f, 20.f, 22.f, + 28.f, 30.f, 5.f, 7.f, 13.f, 15.f, 21.f, 23.f, 29.f, 31.f}); + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, space_to_depth_block_first_K2_BS3) +{ + auto A = std::make_shared(element::f32, Shape{1, 2, 6, 3}); + const auto mode = op::SpaceToDepth::SpaceToDepthMode::BLOCKS_FIRST; + auto space_to_depth = std::make_shared(A, mode, 3); + auto function = std::make_shared(NodeVector{space_to_depth}, ParameterVector{A}); + + auto test_case = test::TestCase(function); + test_case.add_input({0.f, 4.f, 8.f, 12.f, 16.f, 20.f, 24.f, 28.f, 32.f, + 1.f, 5.f, 9.f, 13.f, 17.f, 21.f, 25.f, 29.f, 33.f, + 2.f, 6.f, 10.f, 14.f, 18.f, 22.f, 26.f, 30.f, 34.f, + 3.f, 7.f, 11.f, 15.f, 19.f, 23.f, 27.f, 31.f, 35.f}); + test_case.add_expected_output(Shape{1, 18, 2, 1}, + {0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, + 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f, 16.f, 17.f, + 18.f, 19.f, 20.f, 21.f, 22.f, 23.f, 24.f, 25.f, 26.f, + 27.f, 28.f, 29.f, 30.f, 31.f, 32.f, 33.f, 34.f, 35.f}); + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, space_to_depth_block_first_K1_BS3) +{ + auto A = std::make_shared(element::f32, Shape{1, 2, 6}); + const auto mode = op::SpaceToDepth::SpaceToDepthMode::BLOCKS_FIRST; + auto space_to_depth = std::make_shared(A, mode, 3); + auto function = std::make_shared(NodeVector{space_to_depth}, ParameterVector{A}); + + auto test_case = 
test::TestCase(function); + test_case.add_input({0.f, 4.f, 8.f, 1.f, 5.f, 9.f, 2.f, 6.f, 10.f, 3.f, 7.f, 11.f}); + test_case.add_expected_output( + Shape{1, 6, 2}, {0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f}); + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, space_to_depth_depth_first_K2_BS2) +{ + auto A = std::make_shared(element::f32, Shape{1, 2, 4, 4}); + const auto mode = op::SpaceToDepth::SpaceToDepthMode::DEPTH_FIRST; + auto space_to_depth = std::make_shared(A, mode, 2); + auto function = std::make_shared(NodeVector{space_to_depth}, ParameterVector{A}); + + auto test_case = test::TestCase(function); + test_case.add_input({0.f, 16.f, 2.f, 18.f, 1.f, 17.f, 3.f, 19.f, 8.f, 24.f, 10.f, + 26.f, 9.f, 25.f, 11.f, 27.f, 4.f, 20.f, 6.f, 22.f, 5.f, 21.f, + 7.f, 23.f, 12.f, 28.f, 14.f, 30.f, 13.f, 29.f, 15.f, 31.f}); + test_case.add_expected_output( + Shape{1, 8, 2, 2}, {0.f, 2.f, 8.f, 10.f, 16.f, 18.f, 24.f, 26.f, 1.f, 3.f, 9.f, + 11.f, 17.f, 19.f, 25.f, 27.f, 4.f, 6.f, 12.f, 14.f, 20.f, 22.f, + 28.f, 30.f, 5.f, 7.f, 13.f, 15.f, 21.f, 23.f, 29.f, 31.f}); + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, space_to_depth_depth_first_K2_BS3) +{ + auto A = std::make_shared(element::f32, Shape{1, 2, 6, 3}); + const auto mode = op::SpaceToDepth::SpaceToDepthMode::DEPTH_FIRST; + auto space_to_depth = std::make_shared(A, mode, 3); + auto function = std::make_shared(NodeVector{space_to_depth}, ParameterVector{A}); + + auto test_case = test::TestCase(function); + test_case.add_input({0.f, 2.f, 4.f, 6.f, 8.f, 10.f, 12.f, 14.f, 16.f, + 1.f, 3.f, 5.f, 7.f, 9.f, 11.f, 13.f, 15.f, 17.f, + 18.f, 20.f, 22.f, 24.f, 26.f, 28.f, 30.f, 32.f, 34.f, + 19.f, 21.f, 23.f, 25.f, 27.f, 29.f, 31.f, 33.f, 35.f}); + test_case.add_expected_output(Shape{1, 18, 2, 1}, + {0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, + 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f, 16.f, 17.f, + 18.f, 19.f, 20.f, 21.f, 22.f, 23.f, 24.f, 25.f, 26.f, + 27.f, 28.f, 29.f, 30.f, 31.f, 32.f, 33.f, 34.f, 
35.f}); + test_case.run(); +} + +NGRAPH_TEST(${BACKEND_NAME}, space_to_depth_depth_first_K1_BS3) +{ + auto A = std::make_shared(element::f32, Shape{1, 2, 6}); + const auto mode = op::SpaceToDepth::SpaceToDepthMode::DEPTH_FIRST; + auto space_to_depth = std::make_shared(A, mode, 3); + auto function = std::make_shared(NodeVector{space_to_depth}, ParameterVector{A}); + + auto test_case = test::TestCase(function); + test_case.add_input({0.f, 2.f, 4.f, 1.f, 3.f, 5.f, 6.f, 8.f, 10.f, 7.f, 9.f, 11.f}); + test_case.add_expected_output( + Shape{1, 6, 2}, {0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f}); + test_case.run(); +} diff --git a/ngraph/test/constant_folding.cpp b/ngraph/test/constant_folding.cpp index 0f5ce320531..d7efe623708 100644 --- a/ngraph/test/constant_folding.cpp +++ b/ngraph/test/constant_folding.cpp @@ -567,16 +567,11 @@ TEST(constant_folding, shape_of_dynamic_v0) pass_manager.register_pass(); pass_manager.run_passes(f); - ASSERT_EQ(count_ops_of_type(f), 1); - ASSERT_EQ(count_ops_of_type(f), 1); - ASSERT_EQ(count_ops_of_type(f), 1); - ASSERT_EQ(count_ops_of_type(f), 8); + ASSERT_EQ(f->get_ops().size(), 3); - auto result_as_concat = - as_type_ptr(f->get_results().at(0)->input_value(0).get_node_shared_ptr()); - ASSERT_TRUE(result_as_concat); - ASSERT_EQ(result_as_concat->get_friendly_name(), "test"); - ASSERT_EQ(result_as_concat->get_output_shape(0), Shape{7}); + auto result_shape_of = f->get_results().at(0)->get_input_node_shared_ptr(0); + ASSERT_EQ(result_shape_of, shape_of); + ASSERT_EQ(result_shape_of->get_friendly_name(), "test"); } TEST(constant_folding, shape_of_dynamic_v3) @@ -592,17 +587,11 @@ TEST(constant_folding, shape_of_dynamic_v3) pass_manager.register_pass(); pass_manager.run_passes(f); - ASSERT_EQ(count_ops_of_type(f), 1); - ASSERT_EQ(count_ops_of_type(f), 1); - ASSERT_EQ(count_ops_of_type(f), 1); - ASSERT_EQ(count_ops_of_type(f), 8); + ASSERT_EQ(f->get_ops().size(), 3); - auto result_as_concat = - 
as_type_ptr(f->get_results().at(0)->input_value(0).get_node_shared_ptr()); - ASSERT_TRUE(result_as_concat); - ASSERT_EQ(result_as_concat->get_friendly_name(), "test"); - ASSERT_EQ(result_as_concat->get_output_shape(0), Shape{7}); - ASSERT_EQ(result_as_concat->get_output_element_type(0), element::i64); + auto result_shape_of = f->get_results().at(0)->get_input_node_shared_ptr(0); + ASSERT_EQ(result_shape_of, shape_of); + ASSERT_EQ(result_shape_of->get_friendly_name(), "test"); } TEST(constant_folding, shape_of_dynamic_i32_v3) @@ -618,17 +607,11 @@ TEST(constant_folding, shape_of_dynamic_i32_v3) pass_manager.register_pass(); pass_manager.run_passes(f); - ASSERT_EQ(count_ops_of_type(f), 1); - ASSERT_EQ(count_ops_of_type(f), 1); - ASSERT_EQ(count_ops_of_type(f), 1); - ASSERT_EQ(count_ops_of_type(f), 8); + ASSERT_EQ(f->get_ops().size(), 3); - auto result_as_concat = - as_type_ptr(f->get_results().at(0)->input_value(0).get_node_shared_ptr()); - ASSERT_TRUE(result_as_concat); - ASSERT_EQ(result_as_concat->get_friendly_name(), "test"); - ASSERT_EQ(result_as_concat->get_output_shape(0), Shape{7}); - ASSERT_EQ(result_as_concat->get_output_element_type(0), element::i32); + auto result_shape_of = f->get_results().at(0)->get_input_node_shared_ptr(0); + ASSERT_EQ(result_shape_of, shape_of); + ASSERT_EQ(result_shape_of->get_friendly_name(), "test"); } // We need to be sure that constant folding won't be calculated endlessly. 
@@ -646,16 +629,11 @@ TEST(constant_folding, shape_of_dynamic_double_folding_v0) pass_manager.run_passes(f); pass_manager.run_passes(f); - ASSERT_EQ(count_ops_of_type(f), 1); - ASSERT_EQ(count_ops_of_type(f), 1); - ASSERT_EQ(count_ops_of_type(f), 1); - ASSERT_EQ(count_ops_of_type(f), 8); + ASSERT_EQ(f->get_ops().size(), 3); - auto result_as_concat = - as_type_ptr(f->get_results().at(0)->input_value(0).get_node_shared_ptr()); - ASSERT_TRUE(result_as_concat); - ASSERT_EQ(result_as_concat->get_friendly_name(), "test"); - ASSERT_EQ(result_as_concat->get_output_shape(0), Shape{7}); + auto result_shape_of = f->get_results().at(0)->get_input_node_shared_ptr(0); + ASSERT_EQ(result_shape_of, shape_of); + ASSERT_EQ(result_shape_of->get_friendly_name(), "test"); } TEST(constant_folding, shape_of_dynamic_double_folding_v3) @@ -672,16 +650,11 @@ TEST(constant_folding, shape_of_dynamic_double_folding_v3) pass_manager.run_passes(f); pass_manager.run_passes(f); - ASSERT_EQ(count_ops_of_type(f), 1); - ASSERT_EQ(count_ops_of_type(f), 1); - ASSERT_EQ(count_ops_of_type(f), 1); - ASSERT_EQ(count_ops_of_type(f), 8); + ASSERT_EQ(f->get_ops().size(), 3); - auto result_as_concat = - as_type_ptr(f->get_results().at(0)->input_value(0).get_node_shared_ptr()); - ASSERT_TRUE(result_as_concat); - ASSERT_EQ(result_as_concat->get_friendly_name(), "test"); - ASSERT_EQ(result_as_concat->get_output_shape(0), Shape{7}); + auto result_shape_of = f->get_results().at(0)->get_input_node_shared_ptr(0); + ASSERT_EQ(result_shape_of, shape_of); + ASSERT_EQ(result_shape_of->get_friendly_name(), "test"); } // Constant folding will not succeed on ShapeOf if the argument rank is dynamic. 
diff --git a/ngraph/test/coordinate.cpp b/ngraph/test/coordinate.cpp index 9206b54a27c..55dc6c6dca9 100644 --- a/ngraph/test/coordinate.cpp +++ b/ngraph/test/coordinate.cpp @@ -15,6 +15,7 @@ using namespace std; using namespace ngraph; +NGRAPH_SUPPRESS_DEPRECATED_START TEST(coordinate, shape0d) { diff --git a/ngraph/test/runtime/ie/unit_test.manifest b/ngraph/test/runtime/ie/unit_test.manifest index 75d42a3a568..3483b054310 100644 --- a/ngraph/test/runtime/ie/unit_test.manifest +++ b/ngraph/test/runtime/ie/unit_test.manifest @@ -676,10 +676,6 @@ conv_bias_bprop_2d # Cannot cast ngraph node ConvolutionBiasAdd to CNNLayer! conv_bias_add_2d -# Unsupported primitive of type: SpaceToBatch -space_to_batch -batch_to_space - # [Validation] Argument must have rank >= 2 and <= 4 (argument shape: {1,2,2,2,3}) normalize_across_1axis_5d normalize_across_123axes_5d @@ -1109,6 +1105,41 @@ IE_CPU.onnx_model_nonmaxsuppression_center_point_box_format IE_CPU.onnx_model_nonmaxsuppression_single_box IE_CPU.nonmaxsuppression_suppress_by_IOU_and_scores_without_constants +# Unsupported dynamic op +IE_CPU.multiclass_nms_by_score +IE_CPU.multiclass_nms_by_class_id +IE_CPU.multiclass_nms_output_type_i32 +IE_CPU.multiclass_nms_two_batches_two_classes_by_score +IE_CPU.multiclass_nms_two_batches_two_classes_by_class_id +IE_CPU.multiclass_nms_two_batches_two_classes_by_score_cross_batch +IE_CPU.multiclass_nms_two_batches_two_classes_by_class_id_cross_batch +IE_CPU.multiclass_nms_no_output +IE_CPU.multiclass_nms_by_background +IE_CPU.multiclass_nms_by_keep_top_k +IE_CPU.multiclass_nms_by_nms_eta +IE_CPU.multiclass_nms_flipped_coordinates +IE_CPU.multiclass_nms_identical_boxes +IE_CPU.multiclass_nms_limit_output_size +IE_CPU.multiclass_nms_single_box +IE_CPU.multiclass_nms_by_IOU +IE_CPU.multiclass_nms_by_IOU_and_scores + +# Unsupported dynamic op +IE_CPU.matrix_nms_output_type_i64 +IE_CPU.matrix_nms_output_type_i32 +IE_CPU.matrix_nms_gaussian +IE_CPU.matrix_nms_two_batches_two_classes 
+IE_CPU.matrix_nms_two_batches_two_classes_by_score_cross_batch +IE_CPU.matrix_nms_two_batches_two_classes_by_classid_cross_batch +IE_CPU.matrix_nms_by_keep_top_k +IE_CPU.matrix_nms_background +IE_CPU.matrix_nms_flipped_coordinates +IE_CPU.matrix_nms_post_threshold +IE_CPU.matrix_nms_identical_boxes +IE_CPU.matrix_nms_nms_top_k +IE_CPU.matrix_nms_single_box +IE_CPU.matrix_nms_no_output + # Unsupported dynamic op IE_CPU.range_v4_trunc_inputs IE_CPU.onnx_model_reduce_sum_13_axes_as_input @@ -1623,3 +1654,11 @@ IE_CPU.deformable_convolution_2D_integral_offsets_groups_and_deforgroups IE_CPU.deformable_convolution_2D_real_offsets_groups_basic IE_CPU.deformable_convolution_2D_real_offsets_groups_complex IE_CPU.deformable_convolution_2D_real_offsets_groups_and_deforgroups + +# No plugin support for AdaptiveAvgPool and AdaptiveMaxPool +adaptive_avg_pool_1d +adaptive_avg_pool_2d +adaptive_avg_pool_3d +adaptive_max_pool_1d +adaptive_max_pool_2d +adaptive_max_pool_3d diff --git a/ngraph/test/runtime/interpreter/evaluates_map.cpp b/ngraph/test/runtime/interpreter/evaluates_map.cpp index 168f02e4abf..6bad2c31fa4 100644 --- a/ngraph/test/runtime/interpreter/evaluates_map.cpp +++ b/ngraph/test/runtime/interpreter/evaluates_map.cpp @@ -8,6 +8,8 @@ #include "ngraph/ops.hpp" #include +#include +#include #include #include #include @@ -47,7 +49,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -68,6 +72,7 @@ #include #include #include +#include using namespace ngraph; using namespace std; @@ -920,6 +925,285 @@ namespace return true; } + namespace matrix_nms_v8 + { + using SortResultType = op::v8::MatrixNms::SortResultType; + struct InfoForNMS + { + Shape selected_outputs_shape; + Shape selected_indices_shape; + Shape boxes_shape; + Shape scores_shape; + std::vector boxes_data; + std::vector scores_data; + size_t selected_outputs_shape_size; + size_t selected_indices_shape_size; + }; + + constexpr size_t boxes_port = 0; + constexpr size_t 
scores_port = 1; + + PartialShape + infer_selected_outputs_shape(const std::vector>& inputs, + int nms_top_k, int keep_top_k) + { + const auto boxes_ps = inputs[boxes_port]->get_partial_shape(); + const auto scores_ps = inputs[scores_port]->get_partial_shape(); + + PartialShape result = {Dimension::dynamic(), 6}; + + if (boxes_ps.rank().is_static() && scores_ps.rank().is_static()) + { + const auto num_boxes_boxes = boxes_ps[1]; + if (num_boxes_boxes.is_static() && scores_ps[0].is_static() && scores_ps[1].is_static()) + { + const auto num_boxes = num_boxes_boxes.get_length(); + const auto num_classes = scores_ps[1].get_length(); + int64_t max_output_boxes_per_class = 0; + if (nms_top_k >= 0) + max_output_boxes_per_class = std::min(num_boxes, (int64_t)nms_top_k); + else + max_output_boxes_per_class = num_boxes; + + auto max_output_boxes_per_batch = max_output_boxes_per_class * num_classes; + if (keep_top_k >= 0) + max_output_boxes_per_batch = + std::min(max_output_boxes_per_batch, (int64_t)keep_top_k); + + result[0] = max_output_boxes_per_batch * scores_ps[0].get_length(); + } + } + + return result; + } + + std::vector prepare_boxes_data(const std::shared_ptr& boxes, + const Shape& boxes_shape) + { + auto result = get_floats(boxes, boxes_shape); + return result; + } + + std::vector prepare_scores_data(const std::shared_ptr& scores, + const Shape& scores_shape) + { + auto result = get_floats(scores, scores_shape); + return result; + } + + InfoForNMS get_info_for_nms_eval(const std::shared_ptr& nms, + const std::vector>& inputs) + { + InfoForNMS result; + + auto selected_outputs_shape = + infer_selected_outputs_shape(inputs, nms->get_nms_top_k(), nms->get_keep_top_k()); + result.selected_outputs_shape = selected_outputs_shape.to_shape(); + result.selected_indices_shape = {result.selected_outputs_shape[0], 1}; + + result.boxes_shape = inputs[boxes_port]->get_shape(); + result.scores_shape = inputs[scores_port]->get_shape(); + + result.boxes_data = 
prepare_boxes_data(inputs[boxes_port], result.boxes_shape); + result.scores_data = prepare_scores_data(inputs[scores_port], result.scores_shape); + + result.selected_outputs_shape_size = shape_size(result.selected_outputs_shape); + result.selected_indices_shape_size = shape_size(result.selected_indices_shape); + + return result; + } + } // namespace matrix_nms_v8 + + template + bool evaluate(const shared_ptr& op, + const HostTensorVector& outputs, + const HostTensorVector& inputs) + { + auto info = matrix_nms_v8::get_info_for_nms_eval(op, inputs); + + std::vector selected_outputs(info.selected_outputs_shape_size); + std::vector selected_indices(info.selected_indices_shape_size); + std::vector valid_outputs(info.boxes_shape[0]); + + runtime::reference::matrix_nms(info.boxes_data.data(), + info.boxes_shape, + info.scores_data.data(), + info.scores_shape, + op->get_attrs(), + selected_outputs.data(), + info.selected_outputs_shape, + selected_indices.data(), + info.selected_indices_shape, + valid_outputs.data()); + + void* pscores = nullptr; + void* pselected_num = nullptr; + void* prois; + size_t num_selected = static_cast(std::accumulate(valid_outputs.begin(), valid_outputs.end(), 0)); + + outputs[0]->set_shape({num_selected, 6}); + prois = outputs[0]->get_data_ptr(); + + if (outputs.size() >= 2) + { + outputs[1]->set_shape({num_selected, 1}); + pscores = outputs[1]->get_data_ptr(); + } + if (outputs.size() >= 3) + { + pselected_num = outputs[2]->get_data_ptr(); + } + + runtime::reference::nms_common::nms_common_postprocessing(prois, + pscores, + pselected_num, + op->get_output_type(), + selected_outputs, + selected_indices, + valid_outputs); + return true; + } + + namespace multiclass_nms_v8 + { + using SortResultType = op::v8::MulticlassNms::SortResultType; + struct InfoForNMS + { + Shape selected_outputs_shape; + Shape selected_indices_shape; + Shape boxes_shape; + Shape scores_shape; + std::vector boxes_data; + std::vector scores_data; + size_t 
selected_outputs_shape_size; + size_t selected_indices_shape_size; + }; + + constexpr size_t boxes_port = 0; + constexpr size_t scores_port = 1; + + PartialShape + infer_selected_outputs_shape(const std::vector>& inputs, + int nms_top_k, int keep_top_k) + { + const auto boxes_ps = inputs[boxes_port]->get_partial_shape(); + const auto scores_ps = inputs[scores_port]->get_partial_shape(); + + PartialShape result = {Dimension::dynamic(), 6}; + + if (boxes_ps.rank().is_static() && scores_ps.rank().is_static()) + { + const auto num_boxes_boxes = boxes_ps[1]; + if (num_boxes_boxes.is_static() && scores_ps[0].is_static() && scores_ps[1].is_static()) + { + const auto num_boxes = num_boxes_boxes.get_length(); + const auto num_classes = scores_ps[1].get_length(); + int64_t max_output_boxes_per_class = 0; + if (nms_top_k >= 0) + max_output_boxes_per_class = std::min(num_boxes, (int64_t)nms_top_k); + else + max_output_boxes_per_class = num_boxes; + + auto max_output_boxes_per_batch = max_output_boxes_per_class * num_classes; + if (keep_top_k >= 0) + max_output_boxes_per_batch = + std::min(max_output_boxes_per_batch, (int64_t)keep_top_k); + + result[0] = max_output_boxes_per_batch * scores_ps[0].get_length(); + } + } + + return result; + } + + std::vector prepare_boxes_data(const std::shared_ptr& boxes, + const Shape& boxes_shape) + { + auto result = get_floats(boxes, boxes_shape); + return result; + } + + std::vector prepare_scores_data(const std::shared_ptr& scores, + const Shape& scores_shape) + { + auto result = get_floats(scores, scores_shape); + return result; + } + + InfoForNMS get_info_for_nms_eval(const std::shared_ptr& nms, + const std::vector>& inputs) + { + InfoForNMS result; + + auto selected_outputs_shape = + infer_selected_outputs_shape(inputs, nms->get_nms_top_k(), nms->get_keep_top_k()); + result.selected_outputs_shape = selected_outputs_shape.to_shape(); + result.selected_indices_shape = {result.selected_outputs_shape[0], 1}; + + result.boxes_shape = 
inputs[boxes_port]->get_shape(); + result.scores_shape = inputs[scores_port]->get_shape(); + + result.boxes_data = prepare_boxes_data(inputs[boxes_port], result.boxes_shape); + result.scores_data = prepare_scores_data(inputs[scores_port], result.scores_shape); + + result.selected_outputs_shape_size = shape_size(result.selected_outputs_shape); + result.selected_indices_shape_size = shape_size(result.selected_indices_shape); + + return result; + } + } // namespace multiclass_nms_v8 + + template + bool evaluate(const shared_ptr& op, + const HostTensorVector& outputs, + const HostTensorVector& inputs) + { + auto info = multiclass_nms_v8::get_info_for_nms_eval(op, inputs); + + std::vector selected_outputs(info.selected_outputs_shape_size); + std::vector selected_indices(info.selected_indices_shape_size); + std::vector valid_outputs(inputs[0]->get_shape()[0]); + + runtime::reference::multiclass_nms(info.boxes_data.data(), + info.boxes_shape, + info.scores_data.data(), + info.scores_shape, + op->get_attrs(), + selected_outputs.data(), + info.selected_outputs_shape, + selected_indices.data(), + info.selected_indices_shape, + valid_outputs.data()); + + void* pscores = nullptr; + void* pselected_num = nullptr; + void* prois; + size_t num_selected = static_cast(std::accumulate(valid_outputs.begin(), valid_outputs.end(), 0)); + + outputs[0]->set_shape({num_selected, 6}); + prois = outputs[0]->get_data_ptr(); + + if (outputs.size() >= 2) + { + outputs[1]->set_shape({num_selected, 1}); + pscores = outputs[1]->get_data_ptr(); + } + if (outputs.size() >= 3) + { + pselected_num = outputs[2]->get_data_ptr(); + } + + runtime::reference::nms_common::nms_common_postprocessing(prois, + pscores, + pselected_num, + op->get_output_type(), + selected_outputs, + selected_indices, + valid_outputs); + + return true; + } + namespace experimental_prior_grid { struct InfoForEDPriorGrid @@ -2539,6 +2823,33 @@ namespace return true; } + template + bool evaluate(const shared_ptr& op, + const 
HostTensorVector& outputs, + const HostTensorVector& inputs) + { + using T = typename element_type_traits::value_type; + runtime::reference::adaptive_avg_pool(inputs[0]->get_data_ptr(), + outputs[0]->get_data_ptr(), + inputs[0]->get_shape(), + op->get_output_shape(0)); + return true; + } + + template + bool evaluate(const shared_ptr& op, + const HostTensorVector& outputs, + const HostTensorVector& inputs) + { + using T = typename element_type_traits::value_type; + runtime::reference::adaptive_max_pool(inputs[0]->get_data_ptr(), + outputs[0]->get_data_ptr(), + outputs[1]->get_data_ptr(), + inputs[0]->get_shape(), + op->get_output_shape(0)); + return true; + } + template bool evaluate_node(std::shared_ptr node, const HostTensorVector& outputs, @@ -2556,14 +2867,14 @@ namespace for (size_t i = 1; i < node->outputs().size(); i++) { if ((is_type(node) || - is_type(node)) && i == 1) + is_type(node) || + is_type(node) || + is_type(node) || + is_type(node)) && + i == 1) { continue; } - if (element_type != node->get_output_element_type(i)) - { - throw std::logic_error("Output node element types is not equal"); - } } switch (element_type) { diff --git a/ngraph/test/runtime/interpreter/opset_int_tbl.hpp b/ngraph/test/runtime/interpreter/opset_int_tbl.hpp index f9e51669c9e..2893e716af5 100644 --- a/ngraph/test/runtime/interpreter/opset_int_tbl.hpp +++ b/ngraph/test/runtime/interpreter/opset_int_tbl.hpp @@ -96,3 +96,8 @@ NGRAPH_OP(DFT, op::v7) NGRAPH_OP(Einsum, op::v7) NGRAPH_OP(IDFT, op::v7) NGRAPH_OP(Roll, ngraph::op::v7) + +NGRAPH_OP(AdaptiveAvgPool, ngraph::op::v8) +NGRAPH_OP(AdaptiveMaxPool, ngraph::op::v8) +NGRAPH_OP(MatrixNms, op::v8) +NGRAPH_OP(MulticlassNms, op::v8) diff --git a/ngraph/test/type_prop/atan.cpp b/ngraph/test/type_prop/atan.cpp new file mode 100644 index 00000000000..3b21f2a686d --- /dev/null +++ b/ngraph/test/type_prop/atan.cpp @@ -0,0 +1,9 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "unary_ops.hpp" 
+ +using Type = ::testing::Types; + +INSTANTIATE_TYPED_TEST_SUITE_P(type_prop_atan, UnaryOperator, Type); diff --git a/ngraph/test/type_prop/batch_to_space.cpp b/ngraph/test/type_prop/batch_to_space.cpp index 885db5cd933..c324e72e270 100644 --- a/ngraph/test/type_prop/batch_to_space.cpp +++ b/ngraph/test/type_prop/batch_to_space.cpp @@ -2,6 +2,8 @@ // SPDX-License-Identifier: Apache-2.0 // +#include + #include "gtest/gtest.h" #include "ngraph/ngraph.hpp" #include "util/type_prop.hpp" @@ -9,18 +11,407 @@ using namespace std; using namespace ngraph; -TEST(type_prop, batch_to_space_output_shape_2D) +namespace { + constexpr size_t data_input_idx = 0; + constexpr size_t block_shape_input_idx = 1; + constexpr size_t crops_begin_input_idx = 2; + constexpr size_t crops_end_input_idx = 3; + constexpr size_t batch_to_space_required_inputs = 4; + struct InputInfo + { + element::Type in_et; + PartialShape in_pshape; + }; + + using BatchToSpaceInputParams = std::array; + + std::shared_ptr makeBatchToSpaceOp(const BatchToSpaceInputParams& p) + { + if(p.size() != batch_to_space_required_inputs) + { + throw runtime_error("BatchToSpace requires 4 inputs"); + } + auto data = make_shared( + p.at(data_input_idx).in_et, p.at(data_input_idx).in_pshape); + auto block_shape = make_shared( + p.at(block_shape_input_idx).in_et, p.at(block_shape_input_idx).in_pshape); + auto crops_begin = make_shared( + p.at(crops_begin_input_idx).in_et, p.at(crops_begin_input_idx).in_pshape); + auto crops_end = make_shared( + p.at(crops_end_input_idx).in_et, p.at(crops_end_input_idx).in_pshape); + return make_shared(data, block_shape, crops_begin, crops_end); + } +} // namespace + +TEST(type_prop, batch_to_space_incompatible_input_element_types) { - auto data = make_shared(element::f32, Shape{10, 26}); - auto block_shape = make_shared(element::i64, Shape{2}, vector{1, 5}); - auto pads_begin = make_shared(element::i64, Shape{2}, vector{0, 2}); - auto pads_end = make_shared(element::i64, Shape{2}, vector{0, 
0}); + element::Type float_et = element::f32; + element::Type integer64_et = element::i64; + element::Type integer32_et = element::i32; - auto batch_to_space = - make_shared(data, block_shape, pads_begin, pads_end); + Shape data_sshape{10, 26, 4, 4}; + Shape inputs_sshape{4}; - ASSERT_EQ(batch_to_space->get_element_type(), element::f32); - ASSERT_EQ(batch_to_space->get_shape(), (Shape{10 / 5, 26 * 5 - 2})); + vector test_cases; + test_cases.push_back( + BatchToSpaceInputParams{ + InputInfo{float_et, data_sshape}, + InputInfo{integer64_et, inputs_sshape}, + InputInfo{integer32_et, inputs_sshape}, + InputInfo{integer32_et, inputs_sshape}}); + + test_cases.push_back( + BatchToSpaceInputParams{ + InputInfo{float_et, data_sshape}, + InputInfo{integer32_et, inputs_sshape}, + InputInfo{integer64_et, inputs_sshape}, + InputInfo{integer32_et, inputs_sshape}}); + + test_cases.push_back( + BatchToSpaceInputParams{ + InputInfo{float_et, data_sshape}, + InputInfo{integer64_et, inputs_sshape}, + InputInfo{float_et, inputs_sshape}, + InputInfo{float_et, inputs_sshape}}); + + for (const auto& test_case : test_cases) + { + try + { + auto batch_to_space = makeBatchToSpaceOp(test_case); + FAIL() << "Incompatible element types for block_shape/crops_begin/crops_end inputs not detected"; + } + catch(const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), + "block_shape, crops_begin and crops_end inputs must have same element type."); + } + catch (...) 
+ { + FAIL() << "Element type check for block_shape/crops_begin/crops_end inputs failed for unexpected reason"; + } + } +} + +TEST(type_prop, batch_to_space_invalid_input_element_types) +{ + element::Type float_et = element::f32; + + Shape data_sshape{10, 26, 4, 4}; + Shape inputs_sshape{4}; + + const BatchToSpaceInputParams params{ + InputInfo{float_et, data_sshape}, + InputInfo{float_et, inputs_sshape}, + InputInfo{float_et, inputs_sshape}, + InputInfo{float_et, inputs_sshape}}; + + try + { + auto batch_to_space = makeBatchToSpaceOp(params); + FAIL() << "Invalid non-integer element type for block_shape/crops_begin/crops_end inputs not detected"; + } + catch(const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), + "block_shape and crops inputs must have integer element type."); + } + catch (...) + { + FAIL() << "Element type check for block_shape/crops_begin/crops_end inputs failed for unexpected reason"; + } +} + +TEST(type_prop, batch_to_space_invalid_data_input_rank) +{ + Shape data_sshape{4, 2}; + element::Type data_et = element::f32; + + Shape inputs_sshape{2}; + element::Type inputs_et = element::i64; + + const BatchToSpaceInputParams params{ + InputInfo{data_et, data_sshape}, + InputInfo{inputs_et, inputs_sshape}, + InputInfo{inputs_et, inputs_sshape}, + InputInfo{inputs_et, inputs_sshape}}; + + try + { + auto batch_to_space = makeBatchToSpaceOp(params); + FAIL() << "Invalid rank of data input not detected"; + } + catch(const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), "data input must have rank greater than or equal to 4"); + } + catch (...) 
+ { + FAIL() << "Rank check for data input failed for unexpected reason"; + } +} + +TEST(type_prop, batch_to_space_incompatible_secondary_inputs_shapes) +{ + Shape data_sshape{10, 26, 4, 4}; + element::Type data_et = element::f32; + + Shape inputs_sshape_1D{4}; + Shape inputs_sshape_2D{4, 1}; + element::Type inputs_et = element::i64; + + vector test_cases; + test_cases.push_back( + BatchToSpaceInputParams{ + InputInfo{data_et, data_sshape}, + InputInfo{inputs_et, inputs_sshape_2D}, + InputInfo{inputs_et, inputs_sshape_1D}, + InputInfo{inputs_et, inputs_sshape_1D}}); + + test_cases.push_back( + BatchToSpaceInputParams{ + InputInfo{data_et, data_sshape}, + InputInfo{inputs_et, inputs_sshape_1D}, + InputInfo{inputs_et, inputs_sshape_2D}, + InputInfo{inputs_et, inputs_sshape_1D}}); + + test_cases.push_back( + BatchToSpaceInputParams{ + InputInfo{data_et, data_sshape}, + InputInfo{inputs_et, inputs_sshape_1D}, + InputInfo{inputs_et, inputs_sshape_2D}, + InputInfo{inputs_et, inputs_sshape_2D}}); + + for (const auto& test_case : test_cases) + { + try + { + auto batch_to_space = makeBatchToSpaceOp(test_case); + FAIL() << "Incompatible shapes for block_shape/crops_begin/crops_end inputs not detected"; + } + catch(const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), + "block_shape, crops_begin and crops_end inputs must have the same shape."); + } + catch (...) 
+ { + FAIL() << "Shapes check for block_shape/crops_begin/crops_end inputs failed for unexpected reason"; + } + } +} + +TEST(type_prop, batch_to_space_invalid_secondary_inputs_rank) +{ + Shape data_sshape{10, 26, 4, 4}; + element::Type data_et = element::f32; + + Shape inputs_sshape_2D{4, 1}; + element::Type inputs_et = element::i64; + + const BatchToSpaceInputParams params{ + InputInfo{data_et, data_sshape}, + InputInfo{inputs_et, inputs_sshape_2D}, + InputInfo{inputs_et, inputs_sshape_2D}, + InputInfo{inputs_et, inputs_sshape_2D}}; + + try + { + auto batch_to_space = makeBatchToSpaceOp(params); + FAIL() << "Invalid rank for block_shape/crops_begin/crops_end inputs not detected"; + } + catch(const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), + "block_shape and crops inputs must have rank 1."); + } + catch (...) + { + FAIL() << "Rank check for block_shape/crops_begin/crops_end inputs failed for unexpected reason"; + } +} + +TEST(type_prop, batch_to_space_incompatible_data_and_secondary_inputs_shapes) +{ + Shape data_sshape{10, 26, 4, 4}; + element::Type data_et = element::f32; + + Shape inputs_sshape{5}; + element::Type inputs_et = element::i64; + + const BatchToSpaceInputParams params{ + InputInfo{data_et, data_sshape}, + InputInfo{inputs_et, inputs_sshape}, + InputInfo{inputs_et, inputs_sshape}, + InputInfo{inputs_et, inputs_sshape}}; + + try + { + auto batch_to_space = makeBatchToSpaceOp(params); + FAIL() << "Incompatible shapes for data and block_shape/crops_begin/crops_end inputs not detected"; + } + catch(const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), + "block_shape and crop inputs must have same number of elements " + "as data input rank."); + } + catch (...) 
+ { + FAIL() << "Compatibility shape check for data and block_shape/crops_begin/crops_end inputs failed for unexpected reason"; + } +} + +TEST(type_prop, batch_to_space_invalid_block_shape_input) +{ + Shape data_sshape{100, 7, 13, 3}; + element::Type data_et = element::f32; + + Shape inputs_sshape{4}; + element::Type inputs_et = element::i64; + + auto data = make_shared(data_et, data_sshape); + auto block_shape = make_shared(inputs_et, inputs_sshape, vector{0, 10, 5, 1}); + auto crops_begin = make_shared(inputs_et, inputs_sshape, vector{0, 3, 1, 0}); + auto crops_end = make_shared(inputs_et, inputs_sshape, vector{0, 3, 0, 0}); + + try + { + auto batch_to_space = + make_shared(data, block_shape, crops_begin, crops_end); + FAIL() << "Invalid elements of block_shape input not detected"; + } + catch (const NodeValidationFailure& error) + { + + EXPECT_HAS_SUBSTRING(error.what(), + "Elements of block_shape input must be greater or equal to one."); + } + catch (...) + { + FAIL() << "Greater than zero elements of block_shape input check failed for unexpected reason"; + } +} + +TEST(type_prop, batch_to_space_invalid_crops_input_values) +{ + Shape data_sshape{100, 7, 13, 3}; + element::Type data_et = element::f32; + + Shape inputs_sshape{4}; + element::Type inputs_et = element::i64; + + try + { + auto data = make_shared(data_et, data_sshape); + auto block_shape = + make_shared(inputs_et, inputs_sshape, vector{1, 10, 5, 1}); + auto crops_begin = + make_shared(inputs_et, inputs_sshape, vector{0, 3, 1, -1}); + auto crops_end = + make_shared(inputs_et, inputs_sshape, vector{0, 3, 0, 0}); + auto batch_to_space = + make_shared(data, block_shape, crops_begin, crops_end); + FAIL() << "Invalid crops_begin input values not detected"; + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), + "Elements of crops_begin and crops_end inputs must be greater or equal to zero."); + } + catch (...) 
+ { + FAIL() << "Non-negative element check of crops_begin input values failed for unexpected reason"; + } + + try + { + auto data = make_shared(data_et, data_sshape); + auto block_shape = + make_shared(inputs_et, inputs_sshape, vector{1, 10, 5, 1}); + auto crops_begin = + make_shared(inputs_et, inputs_sshape, vector{0, 3, 1, 0}); + auto crops_end = + make_shared(inputs_et, inputs_sshape, vector{0, 3, -1, 0}); + auto batch_to_space = + make_shared(data, block_shape, crops_begin, crops_end); + FAIL() << "Invalid crops_end input values not detected"; + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), + "Elements of crops_begin and crops_end inputs must be greater or equal to zero."); + } + catch (...) + { + FAIL() << "Non-negative element check of crops_end input values failed for unexpected reason"; + } +} + +TEST(type_prop, batch_to_space_incompatible_block_shape_input_values_with_data_shape) +{ + Shape data_sshape{80, 7, 13, 3}; + element::Type data_et = element::f32; + + Shape inputs_sshape{4}; + element::Type inputs_et = element::i64; + + auto data = make_shared(data_et, data_sshape); + auto block_shape = + make_shared(inputs_et, inputs_sshape, vector{1, 10, 5, 1}); + auto crops_begin = + make_shared(inputs_et, inputs_sshape, vector{0, 3, 1, 0}); + auto crops_end = + make_shared(inputs_et, inputs_sshape, vector{0, 3, 0, 0}); + + try + { + auto batch_to_space = + make_shared(data, block_shape, crops_begin, crops_end); + FAIL() << "Incompatible data shape and block_shape input values not detected"; + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), + "The input data's 'batch' axis size: 80 must be a multiple of product of block_shape values: 50"); + } + catch (...) 
+ { + FAIL() << "Data shape and block_shape input values check failed for unexpected reason"; + } +} + +TEST(type_prop, batch_to_space_invalid_crops_out_of_bounds) +{ + Shape data_sshape{32, 4, 1, 3}; + element::Type data_et = element::f32; + + Shape inputs_sshape{4}; + element::Type inputs_et = element::i64; + + auto data = make_shared(data_et, data_sshape); + auto block_shape = + make_shared(inputs_et, inputs_sshape, vector{1, 2, 2, 1}); + auto crops_begin = + make_shared(inputs_et, inputs_sshape, vector{0, 3, 1, 2}); + auto crops_end = + make_shared(inputs_et, inputs_sshape, vector{0, 3, 0, 2}); + + try + { + auto batch_to_space = + make_shared(data, block_shape, crops_begin, crops_end); + FAIL() << "Invalid out of bound crops values not detected"; + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), + "crops_begin[i] + crops_end[i] must be less or equal to block_shape[i] * input_shape[i]"); + } + catch (...) + { + FAIL() << "Crops values check failed for unexpected reason"; + } } TEST(type_prop, batch_to_space_output_shape_4D) @@ -28,12 +419,12 @@ TEST(type_prop, batch_to_space_output_shape_4D) auto data = make_shared(element::f32, Shape{100, 7, 13, 3}); auto block_shape = make_shared(element::i64, Shape{4}, vector{1, 10, 5, 1}); - auto pads_begin = + auto crops_begin = make_shared(element::i64, Shape{4}, vector{0, 3, 1, 0}); - auto pads_end = make_shared(element::i64, Shape{4}, vector{0, 3, 0, 0}); - + auto crops_end = + make_shared(element::i64, Shape{4}, vector{0, 3, 0, 0}); auto batch_to_space = - make_shared(data, block_shape, pads_begin, pads_end); + make_shared(data, block_shape, crops_begin, crops_end); ASSERT_EQ(batch_to_space->get_element_type(), element::f32); ASSERT_EQ(batch_to_space->get_shape(), (Shape{100 / (10 * 5), 7 * 10 - 3 - 3, 13 * 5 - 1, 3})); @@ -44,13 +435,12 @@ TEST(type_prop, batch_to_space_output_shape_5D) auto data = make_shared(element::f32, Shape{960, 6, 13, 128, 16}); auto block_shape = 
make_shared(element::i32, Shape{5}, vector{1, 6, 5, 1, 16}); - auto pads_begin = + auto crops_begin = make_shared(element::i32, Shape{5}, vector{0, 2, 0, 0, 0}); - auto pads_end = + auto crops_end = make_shared(element::i32, Shape{5}, vector{0, 2, 1, 0, 0}); - auto batch_to_space = - make_shared(data, block_shape, pads_begin, pads_end); + make_shared(data, block_shape, crops_begin, crops_end); ASSERT_EQ(batch_to_space->get_element_type(), element::f32); ASSERT_EQ(batch_to_space->get_shape(), @@ -62,19 +452,19 @@ TEST(type_prop, batch_to_space_and_space_to_batch) auto data = make_shared(element::f32, Shape{4800, 9, 11, 2}); auto block_shape = make_shared(element::i64, Shape{4}, vector{1, 12, 100, 2}); - auto pads_begin = + auto crops_begin = make_shared(element::i64, Shape{4}, vector{0, 3, 38, 1}); - auto pads_end = make_shared(element::i64, Shape{4}, vector{0, 5, 38, 0}); - + auto crops_end = + make_shared(element::i64, Shape{4}, vector{0, 5, 38, 0}); auto batch_to_space = - make_shared(data, block_shape, pads_begin, pads_end); + make_shared(data, block_shape, crops_begin, crops_end); ASSERT_EQ(batch_to_space->get_element_type(), element::f32); ASSERT_EQ(batch_to_space->get_shape(), (Shape{4800 / (12 * 100 * 2), 9 * 12 - 3 - 5, 11 * 100 - 38 - 38, 2 * 2 - 1})); auto space_to_batch = - make_shared(batch_to_space, block_shape, pads_begin, pads_end); + make_shared(batch_to_space, block_shape, crops_begin, crops_end); ASSERT_EQ(space_to_batch->get_element_type(), element::f32); ASSERT_EQ(space_to_batch->get_shape(), (Shape{4800, 9, 11, 2})); } @@ -84,12 +474,12 @@ TEST(type_prop, batch_to_space_dynamic_shape_static_rank) auto data = make_shared(element::f32, PartialShape::dynamic(4)); auto block_shape = make_shared(element::i64, Shape{4}, vector{1, 10, 5, 1}); - auto pads_begin = + auto crops_begin = make_shared(element::i64, Shape{4}, vector{0, 3, 1, 0}); - auto pads_end = make_shared(element::i64, Shape{4}, vector{0, 3, 0, 0}); - + auto crops_end = + 
make_shared(element::i64, Shape{4}, vector{0, 3, 0, 0}); auto batch_to_space = - make_shared(data, block_shape, pads_begin, pads_end); + make_shared(data, block_shape, crops_begin, crops_end); ASSERT_EQ(batch_to_space->get_element_type(), element::f32); ASSERT_EQ(batch_to_space->get_output_partial_shape(0), PartialShape::dynamic(4)); @@ -100,12 +490,12 @@ TEST(type_prop, batch_to_space_dynamic_shape_dynamic_rank) auto data = make_shared(element::f32, PartialShape::dynamic()); auto block_shape = make_shared(element::i64, Shape{4}, vector{1, 10, 5, 1}); - auto pads_begin = + auto crops_begin = make_shared(element::i64, Shape{4}, vector{0, 3, 1, 0}); - auto pads_end = make_shared(element::i64, Shape{4}, vector{0, 3, 0, 0}); - + auto crops_end = + make_shared(element::i64, Shape{4}, vector{0, 3, 0, 0}); auto batch_to_space = - make_shared(data, block_shape, pads_begin, pads_end); + make_shared(data, block_shape, crops_begin, crops_end); ASSERT_EQ(batch_to_space->get_element_type(), element::f32); ASSERT_EQ(batch_to_space->get_output_partial_shape(0), PartialShape::dynamic()); diff --git a/ngraph/test/type_prop/matrix_nms.cpp b/ngraph/test/type_prop/matrix_nms.cpp new file mode 100644 index 00000000000..6817786cd1f --- /dev/null +++ b/ngraph/test/type_prop/matrix_nms.cpp @@ -0,0 +1,263 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "gtest/gtest.h" +#include "ngraph/ngraph.hpp" +#include "util/type_prop.hpp" + +using namespace std; +using namespace ngraph; + +TEST(type_prop, matrix_nms_incorrect_boxes_rank) +{ + try + { + const auto boxes = make_shared(element::f32, Shape{1, 2, 3, 4}); + const auto scores = make_shared(element::f32, Shape{1, 2, 3}); + + make_shared(boxes, scores, op::v8::MatrixNms::Attributes()); + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), "Expected a 3D tensor for the 'boxes' input"); + } +} + +TEST(type_prop, matrix_nms_incorrect_scores_rank) +{ + 
try + { + const auto boxes = make_shared(element::f32, Shape{1, 2, 4}); + const auto scores = make_shared(element::f32, Shape{1, 2}); + + make_shared(boxes, scores, op::v8::MatrixNms::Attributes()); + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), "Expected a 3D tensor for the 'scores' input"); + } +} + +TEST(type_prop, matrix_nms_incorrect_scheme_num_batches) +{ + try + { + const auto boxes = make_shared(element::f32, Shape{1, 2, 4}); + const auto scores = make_shared(element::f32, Shape{2, 2, 3}); + + make_shared(boxes, scores, op::v8::MatrixNms::Attributes()); + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), + "The first dimension of both 'boxes' and 'scores' must match"); + } +} + +TEST(type_prop, matrix_nms_incorrect_scheme_num_boxes) +{ + try + { + const auto boxes = make_shared(element::f32, Shape{1, 2, 4}); + const auto scores = make_shared(element::f32, Shape{1, 2, 3}); + + make_shared(boxes, scores, op::v8::MatrixNms::Attributes()); + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), + "'boxes' and 'scores' input shapes must match at the second and third " + "dimension respectively"); + } +} + +TEST(type_prop, matrix_nms_incorrect_boxes_rank2) +{ + try + { + const auto boxes = make_shared(element::f32, Shape{1, 2, 3}); + const auto scores = make_shared(element::f32, Shape{2, 2, 2}); + + make_shared(boxes, scores, op::v8::MatrixNms::Attributes()); + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), + "The third dimension of the 'boxes' must be 4"); + } +} + +TEST(type_prop, matrix_nms_incorrect_output_type) +{ + try + { + const auto boxes = make_shared(element::f32, Shape{1, 2, 4}); + const auto scores = make_shared(element::f32, Shape{1, 2, 2}); + op::v8::MatrixNms::Attributes attrs; + attrs.output_type = ngraph::element::f32; + + make_shared(boxes, scores, attrs); + } + catch (const 
NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), + "Output type must be i32 or i64"); + } +} + +TEST(type_prop, matrix_nms_incorrect_nms_topk) +{ + try + { + const auto boxes = make_shared(element::f32, Shape{1, 2, 4}); + const auto scores = make_shared(element::f32, Shape{1, 2, 2}); + op::v8::MatrixNms::Attributes attrs; + attrs.nms_top_k = -2; + + make_shared(boxes, scores, attrs); + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), + "The 'nms_top_k' must be great or equal -1"); + } +} + +TEST(type_prop, matrix_nms_incorrect_keep_topk) +{ + try + { + const auto boxes = make_shared(element::f32, Shape{1, 2, 4}); + const auto scores = make_shared(element::f32, Shape{1, 2, 2}); + op::v8::MatrixNms::Attributes attrs; + attrs.keep_top_k = -2; + + make_shared(boxes, scores, attrs); + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), + "The 'keep_top_k' must be great or equal -1"); + } +} + +TEST(type_prop, matrix_nms_incorrect_background_class) +{ + try + { + const auto boxes = make_shared(element::f32, Shape{1, 2, 4}); + const auto scores = make_shared(element::f32, Shape{1, 2, 2}); + op::v8::MatrixNms::Attributes attrs; + attrs.background_class = -2; + + make_shared(boxes, scores, attrs); + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), + "The 'background_class' must be great or equal -1"); + } +} + +TEST(type_prop, matrix_nms_output_shape_1dim_dynamic) +{ + const auto boxes = make_shared(element::f32, Shape{5, 2, 4}); + const auto scores = make_shared(element::f32, Shape{5, 3, 2}); + + const auto nms = make_shared(boxes, scores, op::v8::MatrixNms::Attributes()); + + ASSERT_TRUE( + nms->get_output_partial_shape(0).same_scheme(PartialShape{Dimension::dynamic(), 6})); + ASSERT_TRUE( + nms->get_output_partial_shape(1).same_scheme(PartialShape{Dimension::dynamic(), 1})); + + EXPECT_EQ(nms->get_output_shape(2), (Shape{5})); +} + 
+TEST(type_prop, matrix_nms_output_shape_1dim_max_out) +{ + const auto boxes = make_shared(element::f32, Shape{2, 7, 4}); + const auto scores = make_shared(element::f32, Shape{2, 5, 7}); + + const auto nms = make_shared(boxes, scores, op::v8::MatrixNms::Attributes()); + + ASSERT_EQ(nms->get_output_element_type(0), element::f32); + ASSERT_EQ(nms->get_output_element_type(1), element::i64); + ASSERT_EQ(nms->get_output_element_type(2), element::i64); + + // batch * class * box + EXPECT_EQ(nms->get_output_partial_shape(0), PartialShape({Dimension(0, 2 * 5 * 7), Dimension(6)})); + EXPECT_EQ(nms->get_output_partial_shape(1), PartialShape({Dimension(0, 2 * 5 * 7), 1})); + EXPECT_EQ(nms->get_output_shape(2), (Shape{2})); +} + +TEST(type_prop, matrix_nms_output_shape_1dim_nms_topk) +{ + const auto boxes = make_shared(element::f32, Shape{2, 7, 4}); + const auto scores = make_shared(element::f32, Shape{2, 5, 7}); + op::v8::MatrixNms::Attributes attrs; + attrs.nms_top_k = 3; + + const auto nms = make_shared(boxes, scores, attrs); + + ASSERT_EQ(nms->get_output_element_type(0), element::f32); + ASSERT_EQ(nms->get_output_element_type(1), element::i64); + ASSERT_EQ(nms->get_output_element_type(2), element::i64); + // batch * class * min(nms_topk, box) + EXPECT_EQ(nms->get_output_partial_shape(0), PartialShape({Dimension(0, 2 * 5 * 3), Dimension(6)})); + EXPECT_EQ(nms->get_output_partial_shape(1), PartialShape({Dimension(0, 2 * 5 * 3), 1})); + EXPECT_EQ(nms->get_output_shape(2), (Shape{2})); +} + +TEST(type_prop, matrix_nms_output_shape_1dim_keep_topk) +{ + const auto boxes = make_shared(element::f32, Shape{2, 7, 4}); + const auto scores = make_shared(element::f32, Shape{2, 5, 7}); + op::v8::MatrixNms::Attributes attrs; + attrs.nms_top_k = 3; + attrs.keep_top_k = 8; + + const auto nms = make_shared(boxes, scores, attrs); + + ASSERT_EQ(nms->get_output_element_type(0), element::f32); + ASSERT_EQ(nms->get_output_element_type(1), element::i64); + 
ASSERT_EQ(nms->get_output_element_type(2), element::i64); + // batch * min(keep_topk, class * box)) + EXPECT_EQ(nms->get_output_partial_shape(0), PartialShape({Dimension(0, 2 * 8), Dimension(6)})); + EXPECT_EQ(nms->get_output_partial_shape(1), PartialShape({Dimension(0, 2 * 8), 1})); + EXPECT_EQ(nms->get_output_shape(2), (Shape{2})); +} + +TEST(type_prop, matrix_nms_output_shape_i32) +{ + const auto boxes = make_shared(element::f32, Shape{2, 7, 4}); + const auto scores = make_shared(element::f32, Shape{2, 5, 7}); + op::v8::MatrixNms::Attributes attrs; + attrs.output_type = ngraph::element::i32; + + const auto nms = make_shared(boxes, scores, attrs); + + ASSERT_EQ(nms->get_output_element_type(0), element::f32); + ASSERT_EQ(nms->get_output_element_type(1), element::i32); + ASSERT_EQ(nms->get_output_element_type(2), element::i32); + // batch * class * box + EXPECT_EQ(nms->get_output_partial_shape(0), PartialShape({Dimension(0, 2 * 5 * 7), Dimension(6)})); + EXPECT_EQ(nms->get_output_partial_shape(1), PartialShape({Dimension(0, 2 * 5 * 7), 1})); + EXPECT_EQ(nms->get_output_shape(2), (Shape{2})); +} + +TEST(type_prop, matrix_nms_dynamic_boxes_and_scores) +{ + const auto boxes = make_shared(element::f32, PartialShape::dynamic()); + const auto scores = make_shared(element::f32, PartialShape::dynamic()); + + const auto nms = make_shared(boxes, scores, op::v8::MatrixNms::Attributes()); + + ASSERT_EQ(nms->get_output_element_type(0), element::f32); + ASSERT_EQ(nms->get_output_element_type(1), element::i64); + ASSERT_EQ(nms->get_output_element_type(2), element::i64); + EXPECT_EQ(nms->get_output_partial_shape(0), PartialShape({Dimension::dynamic(), 6})); + EXPECT_EQ(nms->get_output_partial_shape(1), PartialShape({Dimension::dynamic(), 1})); + EXPECT_EQ(nms->get_output_partial_shape(2), PartialShape({Dimension::dynamic()})); +} diff --git a/ngraph/test/type_prop/multiclass_nms.cpp b/ngraph/test/type_prop/multiclass_nms.cpp new file mode 100644 index 00000000000..ad13d8e2dec --- 
/dev/null +++ b/ngraph/test/type_prop/multiclass_nms.cpp @@ -0,0 +1,281 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "gtest/gtest.h" +#include "ngraph/ngraph.hpp" +#include "util/type_prop.hpp" + +using namespace std; +using namespace ngraph; + +TEST(type_prop, multiclass_nms_incorrect_boxes_rank) +{ + try + { + const auto boxes = make_shared(element::f32, Shape{1, 2, 3, 4}); + const auto scores = make_shared(element::f32, Shape{1, 2, 3}); + + make_shared(boxes, scores, op::v8::MulticlassNms::Attributes()); + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), "Expected a 3D tensor for the 'boxes' input"); + } +} + +TEST(type_prop, multiclass_nms_incorrect_scores_rank) +{ + try + { + const auto boxes = make_shared(element::f32, Shape{1, 2, 4}); + const auto scores = make_shared(element::f32, Shape{1, 2}); + + make_shared(boxes, scores, op::v8::MulticlassNms::Attributes()); + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), "Expected a 3D tensor for the 'scores' input"); + } +} + +TEST(type_prop, multiclass_nms_incorrect_scheme_num_batches) +{ + try + { + const auto boxes = make_shared(element::f32, Shape{1, 2, 4}); + const auto scores = make_shared(element::f32, Shape{2, 2, 3}); + + make_shared(boxes, scores, op::v8::MulticlassNms::Attributes()); + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), + "The first dimension of both 'boxes' and 'scores' must match"); + } +} + +TEST(type_prop, multiclass_nms_incorrect_scheme_num_boxes) +{ + try + { + const auto boxes = make_shared(element::f32, Shape{1, 2, 4}); + const auto scores = make_shared(element::f32, Shape{1, 2, 3}); + + make_shared(boxes, scores, op::v8::MulticlassNms::Attributes()); + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), + "'boxes' and 'scores' input shapes must match at the second and third 
" + "dimension respectively"); + } +} + +TEST(type_prop, multiclass_nms_incorrect_boxes_rank2) +{ + try + { + const auto boxes = make_shared(element::f32, Shape{1, 2, 3}); + const auto scores = make_shared(element::f32, Shape{2, 2, 2}); + + make_shared(boxes, scores, op::v8::MulticlassNms::Attributes()); + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), + "The third dimension of the 'boxes' must be 4"); + } +} + +TEST(type_prop, multiclass_nms_incorrect_output_type) +{ + try + { + const auto boxes = make_shared(element::f32, Shape{1, 2, 4}); + const auto scores = make_shared(element::f32, Shape{1, 2, 2}); + op::v8::MulticlassNms::Attributes attrs; + attrs.output_type = ngraph::element::f32; + + make_shared(boxes, scores, attrs); + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), + "Output type must be i32 or i64"); + } +} + +TEST(type_prop, multiclass_nms_incorrect_nms_topk) +{ + try + { + const auto boxes = make_shared(element::f32, Shape{1, 2, 4}); + const auto scores = make_shared(element::f32, Shape{1, 2, 2}); + op::v8::MulticlassNms::Attributes attrs; + attrs.nms_top_k = -2; + + make_shared(boxes, scores, attrs); + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), + "The 'nms_top_k' must be great or equal -1"); + } +} + +TEST(type_prop, multiclass_nms_incorrect_keep_topk) +{ + try + { + const auto boxes = make_shared(element::f32, Shape{1, 2, 4}); + const auto scores = make_shared(element::f32, Shape{1, 2, 2}); + op::v8::MulticlassNms::Attributes attrs; + attrs.keep_top_k = -2; + + make_shared(boxes, scores, attrs); + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), + "The 'keep_top_k' must be great or equal -1"); + } +} + +TEST(type_prop, multiclass_nms_incorrect_background_class) +{ + try + { + const auto boxes = make_shared(element::f32, Shape{1, 2, 4}); + const auto scores = make_shared(element::f32, 
Shape{1, 2, 2}); + op::v8::MulticlassNms::Attributes attrs; + attrs.background_class = -2; + + make_shared(boxes, scores, attrs); + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), + "The 'background_class' must be great or equal -1"); + } +} + +TEST(type_prop, multiclass_nms_incorrect_eta) +{ + try + { + const auto boxes = make_shared(element::f32, Shape{1, 2, 4}); + const auto scores = make_shared(element::f32, Shape{1, 2, 2}); + op::v8::MulticlassNms::Attributes attrs; + attrs.nms_eta = 2.0f; + + make_shared(boxes, scores, attrs); + } + catch (const NodeValidationFailure& error) + { + EXPECT_HAS_SUBSTRING(error.what(), + "The 'nms_eta' must be in close range [0, 1.0]"); + } +} + +TEST(type_prop, multiclass_nms_output_shape_1dim_dynamic) +{ + const auto boxes = make_shared(element::f32, Shape{5, 2, 4}); + const auto scores = make_shared(element::f32, Shape{5, 3, 2}); + + const auto nms = make_shared(boxes, scores, op::v8::MulticlassNms::Attributes()); + + ASSERT_TRUE( + nms->get_output_partial_shape(0).same_scheme(PartialShape{Dimension::dynamic(), 6})); + ASSERT_TRUE( + nms->get_output_partial_shape(1).same_scheme(PartialShape{Dimension::dynamic(), 1})); + + EXPECT_EQ(nms->get_output_shape(2), (Shape{5})); +} + +TEST(type_prop, multiclass_nms_output_shape_1dim_max_out) +{ + const auto boxes = make_shared(element::f32, Shape{2, 7, 4}); + const auto scores = make_shared(element::f32, Shape{2, 5, 7}); + + const auto nms = make_shared(boxes, scores, op::v8::MulticlassNms::Attributes()); + + ASSERT_EQ(nms->get_output_element_type(0), element::f32); + ASSERT_EQ(nms->get_output_element_type(1), element::i64); + ASSERT_EQ(nms->get_output_element_type(2), element::i64); + + // batch * class * box + EXPECT_EQ(nms->get_output_partial_shape(0), PartialShape({Dimension(0, 2 * 5 * 7), Dimension(6)})); + EXPECT_EQ(nms->get_output_partial_shape(1), PartialShape({Dimension(0, 2 * 5 * 7), 1})); + EXPECT_EQ(nms->get_output_shape(2), 
(Shape{2})); +} + +TEST(type_prop, multiclass_nms_output_shape_1dim_nms_topk) +{ + const auto boxes = make_shared(element::f32, Shape{2, 7, 4}); + const auto scores = make_shared(element::f32, Shape{2, 5, 7}); + op::v8::MulticlassNms::Attributes attrs; + attrs.nms_top_k = 3; + + const auto nms = make_shared(boxes, scores, attrs); + + ASSERT_EQ(nms->get_output_element_type(0), element::f32); + ASSERT_EQ(nms->get_output_element_type(1), element::i64); + ASSERT_EQ(nms->get_output_element_type(2), element::i64); + // batch * class * min(nms_topk, box) + EXPECT_EQ(nms->get_output_partial_shape(0), PartialShape({Dimension(0, 2 * 5 * 3), Dimension(6)})); + EXPECT_EQ(nms->get_output_partial_shape(1), PartialShape({Dimension(0, 2 * 5 * 3), 1})); + EXPECT_EQ(nms->get_output_shape(2), (Shape{2})); +} + +TEST(type_prop, multiclass_nms_output_shape_1dim_keep_topk) +{ + const auto boxes = make_shared(element::f32, Shape{2, 7, 4}); + const auto scores = make_shared(element::f32, Shape{2, 5, 7}); + op::v8::MulticlassNms::Attributes attrs; + attrs.nms_top_k = 3; + attrs.keep_top_k = 8; + + const auto nms = make_shared(boxes, scores, attrs); + + ASSERT_EQ(nms->get_output_element_type(0), element::f32); + ASSERT_EQ(nms->get_output_element_type(1), element::i64); + ASSERT_EQ(nms->get_output_element_type(2), element::i64); + // batch * min(keep_topk, class * box)) + EXPECT_EQ(nms->get_output_partial_shape(0), PartialShape({Dimension(0, 2 * 8), Dimension(6)})); + EXPECT_EQ(nms->get_output_partial_shape(1), PartialShape({Dimension(0, 2 * 8), 1})); + EXPECT_EQ(nms->get_output_shape(2), (Shape{2})); +} + +TEST(type_prop, multiclass_nms_output_shape_i32) +{ + const auto boxes = make_shared(element::f32, Shape{2, 7, 4}); + const auto scores = make_shared(element::f32, Shape{2, 5, 7}); + op::v8::MulticlassNms::Attributes attrs; + attrs.output_type = ngraph::element::i32; + + const auto nms = make_shared(boxes, scores, attrs); + + ASSERT_EQ(nms->get_output_element_type(0), element::f32); + 
ASSERT_EQ(nms->get_output_element_type(1), element::i32); + ASSERT_EQ(nms->get_output_element_type(2), element::i32); + // batch * class * box + EXPECT_EQ(nms->get_output_partial_shape(0), PartialShape({Dimension(0, 2 * 5 * 7), Dimension(6)})); + EXPECT_EQ(nms->get_output_partial_shape(1), PartialShape({Dimension(0, 2 * 5 * 7), 1})); + EXPECT_EQ(nms->get_output_shape(2), (Shape{2})); +} + +TEST(type_prop, multiclass_nms_dynamic_boxes_and_scores) +{ + const auto boxes = make_shared(element::f32, PartialShape::dynamic()); + const auto scores = make_shared(element::f32, PartialShape::dynamic()); + + const auto nms = make_shared(boxes, scores, op::v8::MulticlassNms::Attributes()); + + ASSERT_EQ(nms->get_output_element_type(0), element::f32); + ASSERT_EQ(nms->get_output_element_type(1), element::i64); + ASSERT_EQ(nms->get_output_element_type(2), element::i64); + EXPECT_EQ(nms->get_output_partial_shape(0), PartialShape({Dimension::dynamic(), 6})); + EXPECT_EQ(nms->get_output_partial_shape(1), PartialShape({Dimension::dynamic(), 1})); + EXPECT_EQ(nms->get_output_partial_shape(2), PartialShape({Dimension::dynamic()})); +} diff --git a/ngraph/test/type_prop/prior_box.cpp b/ngraph/test/type_prop/prior_box.cpp new file mode 100644 index 00000000000..4304593844c --- /dev/null +++ b/ngraph/test/type_prop/prior_box.cpp @@ -0,0 +1,51 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "gtest/gtest.h" + +#include "ngraph/ngraph.hpp" +#include "ngraph/op/prior_box.hpp" + +using namespace ngraph; + +TEST(type_prop, prior_box1) +{ + op::PriorBoxAttrs attrs; + attrs.min_size = {2.0f, 3.0f}; + attrs.aspect_ratio = {1.5f, 2.0f, 2.5f}; + attrs.scale_all_sizes = false; + + auto layer_shape = op::Constant::create(element::i64, Shape{2}, {32, 32}); + auto image_shape = op::Constant::create(element::i64, Shape{2}, {300, 300}); + auto pb = std::make_shared(layer_shape, image_shape, attrs); + ASSERT_EQ(pb->get_shape(), (Shape{2, 20480})); +} 
+ +TEST(type_prop, prior_box2) +{ + op::PriorBoxAttrs attrs; + attrs.min_size = {2.0f, 3.0f}; + attrs.aspect_ratio = {1.5f, 2.0f, 2.5f}; + attrs.flip = true; + attrs.scale_all_sizes = false; + + auto layer_shape = op::Constant::create(element::i64, Shape{2}, {32, 32}); + auto image_shape = op::Constant::create(element::i64, Shape{2}, {300, 300}); + auto pb = std::make_shared(layer_shape, image_shape, attrs); + ASSERT_EQ(pb->get_shape(), (Shape{2, 32768})); +} + +TEST(type_prop, prior_box3) +{ + op::PriorBoxAttrs attrs; + attrs.min_size = {256.0f}; + attrs.max_size = {315.0f}; + attrs.aspect_ratio = {2.0f}; + attrs.flip = true; + + auto layer_shape = op::Constant::create(element::i64, Shape{2}, {1, 1}); + auto image_shape = op::Constant::create(element::i64, Shape{2}, {300, 300}); + auto pb = std::make_shared(layer_shape, image_shape, attrs); + ASSERT_EQ(pb->get_shape(), (Shape{2, 16})); +} \ No newline at end of file diff --git a/ngraph/test/type_prop/sinh.cpp b/ngraph/test/type_prop/sinh.cpp new file mode 100644 index 00000000000..252fc35cdc2 --- /dev/null +++ b/ngraph/test/type_prop/sinh.cpp @@ -0,0 +1,9 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "unary_ops.hpp" + +using Type = ::testing::Types; + +INSTANTIATE_TYPED_TEST_SUITE_P(type_prop_sinh, UnaryOperator, Type); diff --git a/ngraph/test/type_prop_layers.cpp b/ngraph/test/type_prop_layers.cpp index fec662e75e0..efb6a49ee14 100644 --- a/ngraph/test/type_prop_layers.cpp +++ b/ngraph/test/type_prop_layers.cpp @@ -7,7 +7,6 @@ #include "ngraph/ngraph.hpp" #include "ngraph/op/ctc_greedy_decoder.hpp" #include "ngraph/op/interpolate.hpp" -#include "ngraph/op/prior_box.hpp" #include "ngraph/op/prior_box_clustered.hpp" #include "ngraph/op/region_yolo.hpp" #include "ngraph/op/reorg_yolo.hpp" @@ -46,47 +45,6 @@ TEST(type_prop_layers, interpolate) .same_scheme(PartialShape{2, 2, Dimension::dynamic(), Dimension::dynamic()})); } -TEST(type_prop_layers, prior_box1) -{ - 
op::PriorBoxAttrs attrs; - attrs.min_size = {2.0f, 3.0f}; - attrs.aspect_ratio = {1.5f, 2.0f, 2.5f}; - attrs.scale_all_sizes = false; - - auto layer_shape = op::Constant::create(element::i64, Shape{2}, {32, 32}); - auto image_shape = op::Constant::create(element::i64, Shape{2}, {300, 300}); - auto pb = make_shared(layer_shape, image_shape, attrs); - ASSERT_EQ(pb->get_shape(), (Shape{2, 20480})); -} - -TEST(type_prop_layers, prior_box2) -{ - op::PriorBoxAttrs attrs; - attrs.min_size = {2.0f, 3.0f}; - attrs.aspect_ratio = {1.5f, 2.0f, 2.5f}; - attrs.flip = true; - attrs.scale_all_sizes = false; - - auto layer_shape = op::Constant::create(element::i64, Shape{2}, {32, 32}); - auto image_shape = op::Constant::create(element::i64, Shape{2}, {300, 300}); - auto pb = make_shared(layer_shape, image_shape, attrs); - ASSERT_EQ(pb->get_shape(), (Shape{2, 32768})); -} - -TEST(type_prop_layers, prior_box3) -{ - op::PriorBoxAttrs attrs; - attrs.min_size = {256.0f}; - attrs.max_size = {315.0f}; - attrs.aspect_ratio = {2.0f}; - attrs.flip = true; - - auto layer_shape = op::Constant::create(element::i64, Shape{2}, {1, 1}); - auto image_shape = op::Constant::create(element::i64, Shape{2}, {300, 300}); - auto pb = make_shared(layer_shape, image_shape, attrs); - ASSERT_EQ(pb->get_shape(), (Shape{2, 16})); -} - TEST(type_prop_layers, prior_box_clustered) { op::PriorBoxClusteredAttrs attrs; diff --git a/ngraph/test/visitors/op/atan.cpp b/ngraph/test/visitors/op/atan.cpp new file mode 100644 index 00000000000..72b5931c72b --- /dev/null +++ b/ngraph/test/visitors/op/atan.cpp @@ -0,0 +1,12 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "unary_ops.hpp" + +using Types = ::testing::Types>; + +INSTANTIATE_TYPED_TEST_SUITE_P(visitor_without_attribute, + UnaryOperatorVisitor, + Types, + UnaryOperatorTypeName); \ No newline at end of file diff --git a/ngraph/test/visitors/op/batch_to_space.cpp b/ngraph/test/visitors/op/batch_to_space.cpp new 
file mode 100644 index 00000000000..7e200bacafc --- /dev/null +++ b/ngraph/test/visitors/op/batch_to_space.cpp @@ -0,0 +1,29 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "gtest/gtest.h" + +#include "ngraph/ngraph.hpp" +#include "ngraph/op/util/attr_types.hpp" + +#include "util/visitor.hpp" + +using namespace std; +using namespace ngraph; +using ngraph::test::NodeBuilder; + +TEST(attributes, batch_to_space_op) +{ + NodeBuilder::get_ops().register_factory(); + auto data = make_shared(element::f32, Shape{128, 4, 2, 2}); + auto block_shape = make_shared(element::i64, Shape{4}, vector{1, 2, 2, 2}); + auto crops_begin = make_shared(element::i64, Shape{4}, vector{0, 2, 0, 1}); + auto crops_end = make_shared(element::i64, Shape{4}, vector{0, 0, 1, 0}); + auto batch2space = make_shared(data, block_shape, crops_begin, crops_end); + + NodeBuilder builder(batch2space); + const auto expected_attr_count = 0; + + EXPECT_EQ(builder.get_value_map_size(), expected_attr_count); +} diff --git a/ngraph/test/visitors/op/matrix_nms.cpp b/ngraph/test/visitors/op/matrix_nms.cpp new file mode 100644 index 00000000000..7486e7791e2 --- /dev/null +++ b/ngraph/test/visitors/op/matrix_nms.cpp @@ -0,0 +1,101 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "gtest/gtest.h" + +#include "ngraph/ngraph.hpp" +#include "ngraph/op/util/attr_types.hpp" +#include "ngraph/opsets/opset1.hpp" +#include "ngraph/opsets/opset3.hpp" +#include "ngraph/opsets/opset4.hpp" +#include "ngraph/opsets/opset5.hpp" +#include "ngraph/opsets/opset8.hpp" + +#include "util/visitor.hpp" + +using namespace std; +using namespace ngraph; +using ngraph::test::NodeBuilder; +using ngraph::test::ValueMap; + +TEST(attributes, matrix_nms_v8_op_custom_attributes) +{ + NodeBuilder::get_ops().register_factory(); + auto boxes = make_shared(element::f32, Shape{1, 1, 4}); + auto scores = make_shared(element::f32, Shape{1, 1, 1}); + 
+ opset8::MatrixNms::Attributes attrs; + attrs.sort_result_type = opset8::MatrixNms::SortResultType::SCORE; + attrs.output_type = ngraph::element::i32; + attrs.nms_top_k = 100; + attrs.keep_top_k = 10; + attrs.sort_result_across_batch = true; + attrs.score_threshold = 0.1f; + attrs.background_class = 2; + attrs.decay_function = opset8::MatrixNms::DecayFunction::GAUSSIAN; + attrs.gaussian_sigma = 0.2f; + attrs.post_threshold = 0.3f; + attrs.normalized = false; + + auto nms = make_shared(boxes, scores, attrs); + NodeBuilder builder(nms); + auto g_nms = as_type_ptr(builder.create()); + const auto expected_attr_count = 11; + EXPECT_EQ(builder.get_value_map_size(), expected_attr_count); + + auto& g_nms_attrs = g_nms->get_attrs(); + auto& nms_attrs = nms->get_attrs(); + + EXPECT_EQ(g_nms_attrs.sort_result_type, nms_attrs.sort_result_type); + EXPECT_EQ(g_nms_attrs.output_type, nms_attrs.output_type); + EXPECT_EQ(g_nms_attrs.nms_top_k, nms_attrs.nms_top_k); + EXPECT_EQ(g_nms_attrs.keep_top_k, nms_attrs.keep_top_k); + EXPECT_EQ(g_nms_attrs.sort_result_across_batch, nms_attrs.sort_result_across_batch); + EXPECT_EQ(g_nms_attrs.score_threshold, nms_attrs.score_threshold); + EXPECT_EQ(g_nms_attrs.background_class, nms_attrs.background_class); + EXPECT_EQ(g_nms_attrs.decay_function, nms_attrs.decay_function); + EXPECT_EQ(g_nms_attrs.gaussian_sigma, nms_attrs.gaussian_sigma); + EXPECT_EQ(g_nms_attrs.post_threshold, nms_attrs.post_threshold); + EXPECT_EQ(g_nms_attrs.normalized, nms_attrs.normalized); + + EXPECT_EQ(attrs.sort_result_type, nms_attrs.sort_result_type); + EXPECT_EQ(attrs.output_type, nms_attrs.output_type); + EXPECT_EQ(attrs.nms_top_k, nms_attrs.nms_top_k); + EXPECT_EQ(attrs.keep_top_k, nms_attrs.keep_top_k); + EXPECT_EQ(attrs.sort_result_across_batch, nms_attrs.sort_result_across_batch); + EXPECT_EQ(attrs.score_threshold, nms_attrs.score_threshold); + EXPECT_EQ(attrs.background_class, nms_attrs.background_class); + EXPECT_EQ(attrs.decay_function, 
nms_attrs.decay_function); + EXPECT_EQ(attrs.gaussian_sigma, nms_attrs.gaussian_sigma); + EXPECT_EQ(attrs.post_threshold, nms_attrs.post_threshold); + EXPECT_EQ(attrs.normalized, nms_attrs.normalized); +} + +TEST(attributes, matrix_nms_v8_op_default_attributes) +{ + NodeBuilder::get_ops().register_factory(); + auto boxes = make_shared(element::f32, Shape{1, 1, 4}); + auto scores = make_shared(element::f32, Shape{1, 1, 1}); + + auto nms = make_shared(boxes, scores, opset8::MatrixNms::Attributes()); + NodeBuilder builder(nms); + auto g_nms = as_type_ptr(builder.create()); + const auto expected_attr_count = 11; + EXPECT_EQ(builder.get_value_map_size(), expected_attr_count); + + auto& g_nms_attrs = g_nms->get_attrs(); + auto& nms_attrs = nms->get_attrs(); + + EXPECT_EQ(g_nms_attrs.sort_result_type, nms_attrs.sort_result_type); + EXPECT_EQ(g_nms_attrs.output_type, nms_attrs.output_type); + EXPECT_EQ(g_nms_attrs.nms_top_k, nms_attrs.nms_top_k); + EXPECT_EQ(g_nms_attrs.keep_top_k, nms_attrs.keep_top_k); + EXPECT_EQ(g_nms_attrs.sort_result_across_batch, nms_attrs.sort_result_across_batch); + EXPECT_EQ(g_nms_attrs.score_threshold, nms_attrs.score_threshold); + EXPECT_EQ(g_nms_attrs.background_class, nms_attrs.background_class); + EXPECT_EQ(g_nms_attrs.decay_function, nms_attrs.decay_function); + EXPECT_EQ(g_nms_attrs.gaussian_sigma, nms_attrs.gaussian_sigma); + EXPECT_EQ(g_nms_attrs.post_threshold, nms_attrs.post_threshold); + EXPECT_EQ(g_nms_attrs.normalized, nms_attrs.normalized); +} diff --git a/ngraph/test/visitors/op/multiclass_nms.cpp b/ngraph/test/visitors/op/multiclass_nms.cpp new file mode 100644 index 00000000000..50bb9ec5e23 --- /dev/null +++ b/ngraph/test/visitors/op/multiclass_nms.cpp @@ -0,0 +1,97 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "gtest/gtest.h" + +#include "ngraph/ngraph.hpp" +#include "ngraph/op/util/attr_types.hpp" +#include "ngraph/opsets/opset1.hpp" +#include 
"ngraph/opsets/opset3.hpp" +#include "ngraph/opsets/opset4.hpp" +#include "ngraph/opsets/opset5.hpp" +#include "ngraph/opsets/opset8.hpp" + +#include "util/visitor.hpp" + +using namespace std; +using namespace ngraph; +using ngraph::test::NodeBuilder; +using ngraph::test::ValueMap; + +TEST(attributes, multiclass_nms_v8_op_custom_attributes) +{ + NodeBuilder::get_ops().register_factory(); + auto boxes = make_shared(element::f32, Shape{1, 1, 4}); + auto scores = make_shared(element::f32, Shape{1, 1, 1}); + + opset8::MulticlassNms::Attributes attrs; + attrs.sort_result_type = opset8::MulticlassNms::SortResultType::SCORE; + attrs.sort_result_across_batch = true; + attrs.output_type = ngraph::element::i32; + attrs.nms_top_k = 100; + attrs.keep_top_k = 10; + attrs.iou_threshold = 0.1f; + attrs.score_threshold = 0.2f; + attrs.background_class = 2; + attrs.nms_eta = 0.3f; + attrs.normalized = false; + + auto nms = make_shared(boxes, scores, attrs); + NodeBuilder builder(nms); + auto g_nms = as_type_ptr(builder.create()); + const auto expected_attr_count = 10; + EXPECT_EQ(builder.get_value_map_size(), expected_attr_count); + + auto& g_nms_attrs = g_nms->get_attrs(); + auto& nms_attrs = nms->get_attrs(); + + EXPECT_EQ(g_nms_attrs.sort_result_type, nms_attrs.sort_result_type); + EXPECT_EQ(g_nms_attrs.sort_result_across_batch, nms_attrs.sort_result_across_batch); + EXPECT_EQ(g_nms_attrs.output_type, nms_attrs.output_type); + EXPECT_EQ(g_nms_attrs.nms_top_k, nms_attrs.nms_top_k); + EXPECT_EQ(g_nms_attrs.keep_top_k, nms_attrs.keep_top_k); + EXPECT_EQ(g_nms_attrs.iou_threshold, nms_attrs.iou_threshold); + EXPECT_EQ(g_nms_attrs.score_threshold, nms_attrs.score_threshold); + EXPECT_EQ(g_nms_attrs.background_class, nms_attrs.background_class); + EXPECT_EQ(g_nms_attrs.nms_eta, nms_attrs.nms_eta); + EXPECT_EQ(g_nms_attrs.normalized, nms_attrs.normalized); + + EXPECT_EQ(attrs.sort_result_type, nms_attrs.sort_result_type); + EXPECT_EQ(attrs.sort_result_across_batch, 
nms_attrs.sort_result_across_batch); + EXPECT_EQ(attrs.output_type, nms_attrs.output_type); + EXPECT_EQ(attrs.nms_top_k, nms_attrs.nms_top_k); + EXPECT_EQ(attrs.keep_top_k, nms_attrs.keep_top_k); + EXPECT_EQ(attrs.iou_threshold, nms_attrs.iou_threshold); + EXPECT_EQ(attrs.score_threshold, nms_attrs.score_threshold); + EXPECT_EQ(attrs.background_class, nms_attrs.background_class); + EXPECT_EQ(attrs.nms_eta, nms_attrs.nms_eta); + EXPECT_EQ(attrs.normalized, nms_attrs.normalized); +} + +TEST(attributes, multiclass_nms_v8_op_default_attributes) +{ + NodeBuilder::get_ops().register_factory(); + auto boxes = make_shared(element::f32, Shape{1, 1, 4}); + auto scores = make_shared(element::f32, Shape{1, 1, 1}); + + auto nms = make_shared(boxes, scores, opset8::MulticlassNms::Attributes()); + NodeBuilder builder(nms); + auto g_nms = as_type_ptr(builder.create()); + const auto expected_attr_count = 10; + EXPECT_EQ(builder.get_value_map_size(), expected_attr_count); + + auto& g_nms_attrs = g_nms->get_attrs(); + auto& nms_attrs = nms->get_attrs(); + + EXPECT_EQ(g_nms_attrs.sort_result_type, nms_attrs.sort_result_type); + EXPECT_EQ(g_nms_attrs.sort_result_across_batch, nms_attrs.sort_result_across_batch); + EXPECT_EQ(g_nms_attrs.output_type, nms_attrs.output_type); + EXPECT_EQ(g_nms_attrs.nms_top_k, nms_attrs.nms_top_k); + EXPECT_EQ(g_nms_attrs.keep_top_k, nms_attrs.keep_top_k); + EXPECT_EQ(g_nms_attrs.iou_threshold, nms_attrs.iou_threshold); + EXPECT_EQ(g_nms_attrs.score_threshold, nms_attrs.score_threshold); + EXPECT_EQ(g_nms_attrs.background_class, nms_attrs.background_class); + EXPECT_EQ(g_nms_attrs.nms_eta, nms_attrs.nms_eta); + EXPECT_EQ(g_nms_attrs.normalized, nms_attrs.normalized); +} diff --git a/ngraph/test/visitors/op/prior_box.cpp b/ngraph/test/visitors/op/prior_box.cpp index 3e0c5c706d2..0c4ce74aa4b 100644 --- a/ngraph/test/visitors/op/prior_box.cpp +++ b/ngraph/test/visitors/op/prior_box.cpp @@ -45,6 +45,8 @@ TEST(attributes, prior_box_op) const auto 
prior_box_attrs = prior_box->get_attrs(); const auto g_prior_box_attrs = g_prior_box->get_attrs(); + const auto expected_attr_count = 12; + EXPECT_EQ(builder.get_value_map_size(), expected_attr_count); EXPECT_EQ(g_prior_box_attrs.min_size, prior_box_attrs.min_size); EXPECT_EQ(g_prior_box_attrs.max_size, prior_box_attrs.max_size); EXPECT_EQ(g_prior_box_attrs.aspect_ratio, prior_box_attrs.aspect_ratio); diff --git a/ngraph/test/visitors/op/sinh.cpp b/ngraph/test/visitors/op/sinh.cpp new file mode 100644 index 00000000000..db35d185c4a --- /dev/null +++ b/ngraph/test/visitors/op/sinh.cpp @@ -0,0 +1,11 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "unary_ops.hpp" +using Type = ::testing::Types>; + +INSTANTIATE_TYPED_TEST_SUITE_P(visitor_without_attribute, + UnaryOperatorVisitor, + Type, + UnaryOperatorTypeName); diff --git a/ngraph/test/visitors/op/space_to_depth.cpp b/ngraph/test/visitors/op/space_to_depth.cpp index da3cfd16177..0dff49e19ac 100644 --- a/ngraph/test/visitors/op/space_to_depth.cpp +++ b/ngraph/test/visitors/op/space_to_depth.cpp @@ -22,12 +22,19 @@ TEST(attributes, space_to_depth_op) { NodeBuilder::get_ops().register_factory(); auto data = make_shared(element::i32, Shape{2, 3, 50, 50}); + auto block_size = 2; auto mode = opset1::SpaceToDepth::SpaceToDepthMode::BLOCKS_FIRST; + auto space_to_depth = make_shared(data, mode, block_size); NodeBuilder builder(space_to_depth); auto g_space_to_depth = as_type_ptr(builder.create()); + // attribute count + const auto expected_attr_count = 2; + EXPECT_EQ(builder.get_value_map_size(), expected_attr_count); + + // space_to_depth attributes EXPECT_EQ(g_space_to_depth->get_block_size(), space_to_depth->get_block_size()); EXPECT_EQ(g_space_to_depth->get_mode(), space_to_depth->get_mode()); } diff --git a/scripts/CMakeLists.txt b/scripts/CMakeLists.txt index 45395f910fc..ac094ce648b 100644 --- a/scripts/CMakeLists.txt +++ b/scripts/CMakeLists.txt @@ -10,6 +10,7 
@@ ie_shellcheck_process(DIRECTORY "${OpenVINO_SOURCE_DIR}" SKIP "${OpenVINO_SOURCE_DIR}/bin" "${OpenVINO_SOURCE_DIR}/build" "${OpenVINO_SOURCE_DIR}/thirdparty" + "${OpenVINO_SOURCE_DIR}/ngraph/python/pybind11" "${IE_MAIN_SOURCE_DIR}/thirdparty" "${TEMP}" # TODO fix and enable back: diff --git a/scripts/demo/demo_benchmark_app.bat b/scripts/demo/demo_benchmark_app.bat index 3ca0d6c7bdc..88928fddc67 100644 --- a/scripts/demo/demo_benchmark_app.bat +++ b/scripts/demo/demo_benchmark_app.bat @@ -6,6 +6,7 @@ setlocal enabledelayedexpansion set TARGET=CPU set BUILD_FOLDER=%USERPROFILE%\Documents\Intel\OpenVINO +set VENV_DIR=%USERPROFILE%\Documents\Intel\OpenVINO\venv_openvino :: command line arguments parsing :input_arguments_loop @@ -21,10 +22,12 @@ if not "%1"=="" ( shift ) if "%1"=="-help" ( - echo %~n0%~x0 is benchmark demo using public SqueezeNet topology + echo Benchmark demo using public SqueezeNet topology echo. echo Options: - echo -d name Specify the target device to infer on; CPU, GPU, FPGA, HDDL or MYRIAD are acceptable. Sample will look for a suitable plugin for device specified + echo -help Print help message + echo -d DEVICE Specify the target device to infer on; CPU, GPU, HDDL or MYRIAD are acceptable. Sample will look for a suitable plugin for device specified + echo -sample-options OPTIONS Specify command line arguments for the sample exit /b ) shift @@ -88,7 +91,21 @@ if not "%python_ver%"=="okay" ( ) :: install yaml python modules required for downloader.py -pip3 install --user -r "%ROOT_DIR%..\open_model_zoo\tools\downloader\requirements.in" +if exist "%VENV_DIR%" ( + echo. + echo ###############^|^| Using the existing python virtual environment ^|^|############### + echo. +) else ( + echo. + echo ###############^|^| Creating the python virtual environment ^|^|############### + echo. 
+ python -m venv "%VENV_DIR%" +) + +call "%VENV_DIR%\Scripts\activate.bat" +python -m pip install -U pip +python -m pip install -r "%ROOT_DIR%..\open_model_zoo\tools\downloader\requirements.in" + if ERRORLEVEL 1 GOTO errorHandling set downloader_dir=%INTEL_OPENVINO_DIR%\deployment_tools\open_model_zoo\tools\downloader @@ -121,8 +138,8 @@ echo. echo ###############^|^| Install Model Optimizer prerequisites ^|^|############### echo. CALL :delay 3 -cd /d "%INTEL_OPENVINO_DIR%\deployment_tools\model_optimizer\install_prerequisites" -call install_prerequisites_caffe.bat +cd /d "%INTEL_OPENVINO_DIR%\deployment_tools\model_optimizer" +python -m pip install -r requirements.txt if ERRORLEVEL 1 GOTO errorHandling CALL :delay 7 @@ -209,7 +226,7 @@ if "!MSBUILD_BIN!" == "" ( GOTO errorHandling ) -set "SOLUTION_DIR64=%BUILD_FOLDER%\inference_engine_samples_build" +set "SOLUTION_DIR64=%BUILD_FOLDER%\inference_engine_cpp_samples_build" echo Creating Visual Studio !MSBUILD_VERSION! %PLATFORM% files in %SOLUTION_DIR64%... && ^ if exist "%SOLUTION_DIR64%\CMakeCache.txt" del "%SOLUTION_DIR64%\CMakeCache.txt" diff --git a/scripts/demo/demo_benchmark_app.sh b/scripts/demo/demo_benchmark_app.sh index a287c406d2b..0c2869037c4 100755 --- a/scripts/demo/demo_benchmark_app.sh +++ b/scripts/demo/demo_benchmark_app.sh @@ -3,14 +3,21 @@ # Copyright (C) 2018-2021 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +echo -ne "\e[0;33mWARNING: If you get an error when running the demo in the Docker container, you may need to install additional packages. To do this, run the container as root (-u 0) and run install_openvino_dependencies.sh script. If you get a package-independent error, try setting additional parameters using -sample-options.\e[0m\n" + ROOT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]-$0}" )" && pwd )" +VENV_DIR="$HOME/venv_openvino" . 
"$ROOT_DIR/utils.sh" usage() { echo "Benchmark demo using public SqueezeNet topology" - echo "-d name specify the target device to infer on; CPU, GPU, FPGA, HDDL or MYRIAD are acceptable. Sample will look for a suitable plugin for device specified" - echo "-help print help message" + echo + echo "Options:" + echo " -help Print help message" + echo " -d DEVICE Specify the target device to infer on; CPU, GPU, HDDL or MYRIAD are acceptable. Sample will look for a suitable plugin for device specified" + echo " -sample-options OPTIONS Specify command line arguments for the sample" + echo exit 1 } @@ -50,7 +57,7 @@ fi target_precision="FP16" -printf "target_precision = %s\n" ${target_precision} +echo -ne "target_precision = ${target_precision}\n" models_path="$HOME/openvino_models/models" models_cache="$HOME/openvino_models/cache" @@ -61,13 +68,11 @@ model_name="squeezenet1.1" target_image_path="$ROOT_DIR/car.png" run_again="Then run the script again\n\n" -dashes="\n\n###################################################\n\n" - if [ -e "$ROOT_DIR/../../bin/setupvars.sh" ]; then setupvars_path="$ROOT_DIR/../../bin/setupvars.sh" else - printf "Error: setupvars.sh is not found\n" + echo -ne "Error: setupvars.sh is not found\n" fi if ! . "$setupvars_path" ; then @@ -75,14 +80,6 @@ if ! . "$setupvars_path" ; then exit 1 fi -# Step 1. 
Download the Caffe model and the prototxt of the model -echo -ne "${dashes}" -printf "\n\nDownloading the Caffe model and the prototxt" - -cur_path=$PWD - -printf "\nInstalling dependencies\n" - if [[ -f /etc/centos-release ]]; then DISTRO="centos" elif [[ -f /etc/lsb-release ]]; then @@ -90,55 +87,27 @@ elif [[ -f /etc/lsb-release ]]; then fi if [[ $DISTRO == "centos" ]]; then - sudo -E yum install -y centos-release-scl epel-release - sudo -E yum install -y gcc gcc-c++ make glibc-static glibc-devel libstdc++-static libstdc++-devel libstdc++ libgcc \ - glibc-static.i686 glibc-devel.i686 libstdc++-static.i686 libstdc++.i686 libgcc.i686 cmake - - sudo -E rpm -Uvh http://li.nux.ro/download/nux/dextop/el7/x86_64/nux-dextop-release-0-1.el7.nux.noarch.rpm || true - sudo -E yum install -y epel-release - sudo -E yum install -y cmake ffmpeg gstreamer1 gstreamer1-plugins-base libusbx-devel - # check installed Python version if command -v python3.5 >/dev/null 2>&1; then python_binary=python3.5 - pip_binary=pip3.5 fi if command -v python3.6 >/dev/null 2>&1; then python_binary=python3.6 - pip_binary=pip3.6 - fi - if [ -z "$python_binary" ]; then - sudo -E yum install -y rh-python36 || true - . 
scl_source enable rh-python36 - python_binary=python3.6 - pip_binary=pip3.6 fi elif [[ $DISTRO == "ubuntu" ]]; then - sudo -E apt update - print_and_run sudo -E apt -y install build-essential python3-pip virtualenv cmake libcairo2-dev libpango1.0-dev libglib2.0-dev libgtk2.0-dev libswscale-dev libavcodec-dev libavformat-dev libgstreamer1.0-0 gstreamer1.0-plugins-base python_binary=python3 - pip_binary=pip3 - - system_ver=$(grep -i "DISTRIB_RELEASE" -f /etc/lsb-release | cut -d "=" -f2) - if [ "$system_ver" = "16.04" ]; then - sudo -E apt-get install -y libpng12-dev - else - sudo -E apt-get install -y libpng-dev - fi elif [[ "$OSTYPE" == "darwin"* ]]; then # check installed Python version - if command -v python3.7 >/dev/null 2>&1; then + if command -v python3.8 >/dev/null 2>&1; then + python_binary=python3.8 + elif command -v python3.7 >/dev/null 2>&1; then python_binary=python3.7 - pip_binary=pip3.7 elif command -v python3.6 >/dev/null 2>&1; then python_binary=python3.6 - pip_binary=pip3.6 elif command -v python3.5 >/dev/null 2>&1; then python_binary=python3.5 - pip_binary=pip3.5 else python_binary=python3 - pip_binary=pip3 fi fi @@ -147,47 +116,52 @@ if ! command -v $python_binary &>/dev/null; then exit 1 fi -if [[ "$OSTYPE" == "darwin"* ]]; then - $pip_binary install -r "$ROOT_DIR/../open_model_zoo/tools/downloader/requirements.in" +if [ -e "$VENV_DIR" ]; then + echo -ne "\n###############|| Using the existing python virtual environment ||###############\n\n" else - sudo -E "$pip_binary" install -r "$ROOT_DIR/../open_model_zoo/tools/downloader/requirements.in" + echo -ne "\n###############|| Creating the python virtual environment ||###############\n\n" + "$python_binary" -m venv "$VENV_DIR" fi +. "$VENV_DIR/bin/activate" +python -m pip install -U pip +python -m pip install -r "$ROOT_DIR/../open_model_zoo/tools/downloader/requirements.in" + +# Step 1. 
Download the Caffe model and the prototxt of the model +echo -ne "\n###############|| Downloading the Caffe model and the prototxt ||###############\n\n" + downloader_dir="${INTEL_OPENVINO_DIR}/deployment_tools/open_model_zoo/tools/downloader" -model_dir=$("$python_binary" "$downloader_dir/info_dumper.py" --name "$model_name" | - "$python_binary" -c 'import sys, json; print(json.load(sys.stdin)[0]["subdirectory"])') +model_dir=$(python "$downloader_dir/info_dumper.py" --name "$model_name" | + python -c 'import sys, json; print(json.load(sys.stdin)[0]["subdirectory"])') downloader_path="$downloader_dir/downloader.py" -print_and_run "$python_binary" "$downloader_path" --name "$model_name" --output_dir "${models_path}" --cache_dir "${models_cache}" +print_and_run python "$downloader_path" --name "$model_name" --output_dir "${models_path}" --cache_dir "${models_cache}" ir_dir="${irs_path}/${model_dir}/${target_precision}" if [ ! -e "$ir_dir" ]; then # Step 2. Configure Model Optimizer - echo -ne "${dashes}" - printf "Install Model Optimizer dependencies\n\n" - cd "${INTEL_OPENVINO_DIR}/deployment_tools/model_optimizer/install_prerequisites" - . ./install_prerequisites.sh caffe - cd "$cur_path" + echo -ne "\n###############|| Install Model Optimizer dependencies ||###############\n\n" + cd "${INTEL_OPENVINO_DIR}/deployment_tools/model_optimizer" + python -m pip install -r requirements.txt + cd "$PWD" # Step 3. 
Convert a model with Model Optimizer - echo -ne "${dashes}" - printf "Convert a model with Model Optimizer\n\n" + echo -ne "\n###############|| Convert a model with Model Optimizer ||###############\n\n" mo_path="${INTEL_OPENVINO_DIR}/deployment_tools/model_optimizer/mo.py" export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=cpp - print_and_run "$python_binary" "$downloader_dir/converter.py" --mo "$mo_path" --name "$model_name" -d "$models_path" -o "$irs_path" --precisions "$target_precision" + print_and_run python "$downloader_dir/converter.py" --mo "$mo_path" --name "$model_name" -d "$models_path" -o "$irs_path" --precisions "$target_precision" else - printf "\n\nTarget folder %s already exists. Skipping IR generation with Model Optimizer." "${ir_dir}" + echo -ne "\n\nTarget folder ${ir_dir} already exists. Skipping IR generation with Model Optimizer." echo -ne "If you want to convert a model again, remove the entire ${ir_dir} folder. ${run_again}" fi # Step 4. Build samples -echo -ne "${dashes}" -printf "Build Inference Engine samples\n\n" +echo -ne "\n###############|| Build Inference Engine samples ||###############\n\n" OS_PATH=$(uname -m) NUM_THREADS="-j2" @@ -198,7 +172,7 @@ if [ "$OS_PATH" == "x86_64" ]; then fi samples_path="${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/samples/cpp" -build_dir="$HOME/inference_engine_samples_build" +build_dir="$HOME/inference_engine_cpp_samples_build" binaries_dir="${build_dir}/${OS_PATH}/Release" if [ -e "$build_dir/CMakeCache.txt" ]; then @@ -211,8 +185,7 @@ cmake -DCMAKE_BUILD_TYPE=Release "$samples_path" make $NUM_THREADS benchmark_app # Step 5. 
Run samples -echo -ne "${dashes}" -printf "Run Inference Engine benchmark app\n\n" +echo -ne "\n###############|| Run Inference Engine benchmark app ||###############\n\n" cd "$binaries_dir" @@ -220,6 +193,4 @@ cp -f "$ROOT_DIR/${model_name}.labels" "${ir_dir}/" print_and_run ./benchmark_app -d "$target" -i "$target_image_path" -m "${ir_dir}/${model_name}.xml" -pc "${sampleoptions[@]}" -echo -ne "${dashes}" - -printf "Inference Engine benchmark app completed successfully.\n\n" +echo -ne "\n###############|| Inference Engine benchmark app completed successfully ||###############\n\n" diff --git a/scripts/demo/demo_security_barrier_camera.bat b/scripts/demo/demo_security_barrier_camera.bat index 6e2e1b99bc6..21db90e6d54 100644 --- a/scripts/demo/demo_security_barrier_camera.bat +++ b/scripts/demo/demo_security_barrier_camera.bat @@ -7,6 +7,7 @@ setlocal enabledelayedexpansion set TARGET=CPU set SAMPLE_OPTIONS= set BUILD_FOLDER=%USERPROFILE%\Documents\Intel\OpenVINO +set VENV_DIR=%USERPROFILE%\Documents\Intel\OpenVINO\venv_openvino :: command line arguments parsing :input_arguments_loop @@ -22,10 +23,12 @@ if not "%1"=="" ( shift ) if "%1"=="-help" ( - echo %~n0%~x0 is security barrier camera demo that showcases three models coming with the product + echo Security barrier camera demo that showcases three models coming with the product echo. echo Options: - echo -d name Specify the target device to infer on; CPU, GPU, FPGA, HDDL or MYRIAD are acceptable. Sample will look for a suitable plugin for device specified + echo -help Print help message + echo -d DEVICE Specify the target device to infer on; CPU, GPU, HDDL or MYRIAD are acceptable. 
Sample will look for a suitable plugin for device specified + echo -sample-options OPTIONS Specify command line arguments for the sample exit /b ) shift @@ -78,9 +81,22 @@ if not "%python_ver%"=="okay" ( ) :: install yaml python modules required for downloader.py -pip3 install --user -r "%ROOT_DIR%..\open_model_zoo\tools\downloader\requirements.in" -if ERRORLEVEL 1 GOTO errorHandling +if exist "%VENV_DIR%" ( + echo. + echo ###############^|^| Using the existing python virtual environment ^|^|############### + echo. +) else ( + echo. + echo ###############^|^| Creating the python virtual environment ^|^|############### + echo. + python -m venv "%VENV_DIR%" +) +call "%VENV_DIR%\Scripts\activate.bat" +python -m pip install -U pip +python -m pip install -r "%ROOT_DIR%..\open_model_zoo\tools\downloader\requirements.in" + +if ERRORLEVEL 1 GOTO errorHandling set models_path=%BUILD_FOLDER%\openvino_models\ir set models_cache=%BUILD_FOLDER%\openvino_models\cache diff --git a/scripts/demo/demo_security_barrier_camera.sh b/scripts/demo/demo_security_barrier_camera.sh index 36e09cab7ba..982e7f57f3c 100755 --- a/scripts/demo/demo_security_barrier_camera.sh +++ b/scripts/demo/demo_security_barrier_camera.sh @@ -3,14 +3,21 @@ # Copyright (C) 2018-2021 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +echo -ne "\e[0;33mWARNING: If you get an error when running the demo in the Docker container, you may need to install additional packages. To do this, run the container as root (-u 0) and run install_openvino_dependencies.sh script. If you get a package-independent error, try setting additional parameters using -sample-options.\e[0m\n" + ROOT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]-$0}" )" && pwd )" +VENV_DIR="$HOME/venv_openvino" . "$ROOT_DIR/utils.sh" usage() { echo "Security barrier camera demo that showcases three models coming with the product" - echo "-d name specify the target device to infer on; CPU, GPU, FPGA, HDDL or MYRIAD are acceptable. 
Sample will look for a suitable plugin for device specified" - echo "-help print help message" + echo + echo "Options:" + echo " -help Print help message" + echo " -d DEVICE Specify the target device to infer on; CPU, GPU, HDDL or MYRIAD are acceptable. Sample will look for a suitable plugin for device specified" + echo " -sample-options OPTIONS Specify command line arguments for the sample" + echo exit 1 } @@ -44,11 +51,19 @@ esac shift done - target_image_path="$ROOT_DIR/car_1.bmp" run_again="Then run the script again\n\n" -dashes="\n\n###################################################\n\n" + +if [ -e "$ROOT_DIR/../../bin/setupvars.sh" ]; then + setupvars_path="$ROOT_DIR/../../bin/setupvars.sh" +else + echo -ne "Error: setupvars.sh is not found\n" +fi +if ! . "$setupvars_path" ; then + echo -ne "Unable to run ./setupvars.sh. Please check its presence. ${run_again}" + exit 1 +fi if [[ -f /etc/centos-release ]]; then DISTRO="centos" @@ -59,55 +74,27 @@ elif [[ "$OSTYPE" == "darwin"* ]]; then fi if [[ $DISTRO == "centos" ]]; then - sudo -E yum install -y centos-release-scl epel-release - sudo -E yum install -y gcc gcc-c++ make glibc-static glibc-devel libstdc++-static libstdc++-devel libstdc++ libgcc \ - glibc-static.i686 glibc-devel.i686 libstdc++-static.i686 libstdc++.i686 libgcc.i686 cmake - - sudo -E rpm -Uvh http://li.nux.ro/download/nux/dextop/el7/x86_64/nux-dextop-release-0-1.el7.nux.noarch.rpm || true - sudo -E yum install -y epel-release - sudo -E yum install -y cmake ffmpeg gstreamer1 gstreamer1-plugins-base libusbx-devel - # check installed Python version if command -v python3.5 >/dev/null 2>&1; then python_binary=python3.5 - pip_binary=pip3.5 fi if command -v python3.6 >/dev/null 2>&1; then python_binary=python3.6 - pip_binary=pip3.6 - fi - if [ -z "$python_binary" ]; then - sudo -E yum install -y rh-python36 || true - . 
scl_source enable rh-python36 - python_binary=python3.6 - pip_binary=pip3.6 fi elif [[ $DISTRO == "ubuntu" ]]; then - sudo -E apt update - print_and_run sudo -E apt -y install build-essential python3-pip virtualenv cmake libcairo2-dev libpango1.0-dev libglib2.0-dev libgtk2.0-dev libswscale-dev libavcodec-dev libavformat-dev libgstreamer1.0-0 gstreamer1.0-plugins-base python_binary=python3 - pip_binary=pip3 - - system_ver=$(grep -i "DISTRIB_RELEASE" -f /etc/lsb-release | cut -d "=" -f2) - if [ "$system_ver" = "16.04" ]; then - sudo -E apt-get install -y libpng12-dev - else - sudo -E apt-get install -y libpng-dev - fi elif [[ "$OSTYPE" == "darwin"* ]]; then # check installed Python version - if command -v python3.7 >/dev/null 2>&1; then + if command -v python3.8 >/dev/null 2>&1; then + python_binary=python3.8 + elif command -v python3.7 >/dev/null 2>&1; then python_binary=python3.7 - pip_binary=pip3.7 elif command -v python3.6 >/dev/null 2>&1; then python_binary=python3.6 - pip_binary=pip3.6 elif command -v python3.5 >/dev/null 2>&1; then python_binary=python3.5 - pip_binary=pip3.5 else python_binary=python3 - pip_binary=pip3 fi fi @@ -116,30 +103,23 @@ if ! command -v $python_binary &>/dev/null; then exit 1 fi -if [[ $DISTRO == "macos" ]]; then - "$pip_binary" install -r "$ROOT_DIR/../open_model_zoo/tools/downloader/requirements.in" +if [ -e "$VENV_DIR" ]; then + echo -ne "\n###############|| Using the existing python virtual environment ||###############\n\n" else - sudo -E "$pip_binary" install -r "$ROOT_DIR/../open_model_zoo/tools/downloader/requirements.in" + echo -ne "\n###############|| Creating the python virtual environment ||###############\n\n" + "$python_binary" -m venv "$VENV_DIR" fi -if [ -e "$ROOT_DIR/../../bin/setupvars.sh" ]; then - setupvars_path="$ROOT_DIR/../../bin/setupvars.sh" -else - printf "Error: setupvars.sh is not found\n" -fi -if ! . "$setupvars_path" ; then - echo -ne "Unable to run ./setupvars.sh. Please check its presence. 
${run_again}" - exit 1 -fi +. "$VENV_DIR/bin/activate" +python -m pip install -U pip +python -m pip install -r "$ROOT_DIR/../open_model_zoo/tools/downloader/requirements.in" # Step 1. Downloading Intel models -echo -ne "${dashes}" -printf "Downloading Intel models\n\n" - +echo -ne "\n###############|| Downloading Intel models ||###############\n\n" target_precision="FP16" -printf "target_precision = %s\n" "${target_precision}" +echo -ne "target_precision = ${target_precision}\n" downloader_dir="${INTEL_OPENVINO_DIR}/deployment_tools/open_model_zoo/tools/downloader" @@ -150,19 +130,18 @@ models_cache="$HOME/openvino_models/cache" declare -a model_args while read -r model_opt model_name; do - model_subdir=$("$python_binary" "$downloader_dir/info_dumper.py" --name "$model_name" | - "$python_binary" -c 'import sys, json; print(json.load(sys.stdin)[0]["subdirectory"])') + model_subdir=$(python "$downloader_dir/info_dumper.py" --name "$model_name" | + python -c 'import sys, json; print(json.load(sys.stdin)[0]["subdirectory"])') model_path="$models_path/$model_subdir/$target_precision/$model_name" - print_and_run "$python_binary" "$downloader_path" --name "$model_name" --output_dir "$models_path" --cache_dir "$models_cache" + print_and_run python "$downloader_path" --name "$model_name" --output_dir "$models_path" --cache_dir "$models_cache" model_args+=("$model_opt" "${model_path}.xml") done < "$ROOT_DIR/demo_security_barrier_camera.conf" # Step 2. Build samples -echo -ne "${dashes}" -printf "Build Inference Engine demos\n\n" +echo -ne "\n###############|| Build Inference Engine demos ||###############\n\n" demos_path="${INTEL_OPENVINO_DIR}/deployment_tools/open_model_zoo/demos" @@ -189,13 +168,11 @@ cmake -DCMAKE_BUILD_TYPE=Release "$demos_path" make $NUM_THREADS security_barrier_camera_demo # Step 3. 
Run samples -echo -ne "${dashes}" -printf "Run Inference Engine security_barrier_camera demo\n\n" +echo -ne "\n###############|| Run Inference Engine security_barrier_camera demo ||###############\n\n" binaries_dir="${build_dir}/${OS_PATH}/Release" cd "$binaries_dir" print_and_run ./security_barrier_camera_demo -d "$target" -d_va "$target" -d_lpr "$target" -i "$target_image_path" "${model_args[@]}" "${sampleoptions[@]}" -echo -ne "${dashes}" -printf "Demo completed successfully.\n\n" +echo -ne "\n###############|| Demo completed successfully ||###############\n\n" diff --git a/scripts/demo/demo_squeezenet_download_convert_run.bat b/scripts/demo/demo_squeezenet_download_convert_run.bat index ad317ab4fd9..c0bd34614da 100644 --- a/scripts/demo/demo_squeezenet_download_convert_run.bat +++ b/scripts/demo/demo_squeezenet_download_convert_run.bat @@ -6,6 +6,7 @@ setlocal enabledelayedexpansion set TARGET=CPU set BUILD_FOLDER=%USERPROFILE%\Documents\Intel\OpenVINO +set VENV_DIR=%USERPROFILE%\Documents\Intel\OpenVINO\venv_openvino :: command line arguments parsing :input_arguments_loop @@ -21,10 +22,12 @@ if not "%1"=="" ( shift ) if "%1"=="-help" ( - echo %~n0%~x0 is classification demo using public SqueezeNet topology + echo Classification demo using public SqueezeNet topology echo. echo Options: - echo -d name Specify the target device to infer on; CPU, GPU, FPGA, HDDL or MYRIAD are acceptable. Sample will look for a suitable plugin for device specified + echo -help Print help message + echo -d DEVICE Specify the target device to infer on; CPU, GPU, HDDL or MYRIAD are acceptable. Sample will look for a suitable plugin for device specified + echo -sample-options OPTIONS Specify command line arguments for the sample exit /b ) shift @@ -84,7 +87,21 @@ if not "%python_ver%"=="okay" ( ) :: install yaml python modules required for downloader.py -pip3 install --user -r "%ROOT_DIR%..\open_model_zoo\tools\downloader\requirements.in" +if exist "%VENV_DIR%" ( + echo. 
+ echo ###############^|^| Using the existing python virtual environment ^|^|############### + echo. +) else ( + echo. + echo ###############^|^| Creating the python virtual environment ^|^|############### + echo. + python -m venv "%VENV_DIR%" +) + +call "%VENV_DIR%\Scripts\activate.bat" +python -m pip install -U pip +python -m pip install -r "%ROOT_DIR%..\open_model_zoo\tools\downloader\requirements.in" + if ERRORLEVEL 1 GOTO errorHandling set downloader_dir=%INTEL_OPENVINO_DIR%\deployment_tools\open_model_zoo\tools\downloader @@ -117,8 +134,8 @@ echo. echo ###############^|^| Install Model Optimizer prerequisites ^|^|############### echo. CALL :delay 3 -cd /d "%INTEL_OPENVINO_DIR%\deployment_tools\model_optimizer\install_prerequisites" -call install_prerequisites_caffe.bat +cd /d "%INTEL_OPENVINO_DIR%\deployment_tools\model_optimizer" +python -m pip install -r requirements.txt if ERRORLEVEL 1 GOTO errorHandling CALL :delay 7 @@ -205,7 +222,7 @@ if "!MSBUILD_BIN!" == "" ( GOTO errorHandling ) -set "SOLUTION_DIR64=%BUILD_FOLDER%\inference_engine_samples_build" +set "SOLUTION_DIR64=%BUILD_FOLDER%\inference_engine_cpp_samples_build" echo Creating Visual Studio !MSBUILD_VERSION! %PLATFORM% files in %SOLUTION_DIR64%... && ^ if exist "%SOLUTION_DIR64%\CMakeCache.txt" del "%SOLUTION_DIR64%\CMakeCache.txt" diff --git a/scripts/demo/demo_squeezenet_download_convert_run.sh b/scripts/demo/demo_squeezenet_download_convert_run.sh index 24fc26c335c..7f8427db8b9 100755 --- a/scripts/demo/demo_squeezenet_download_convert_run.sh +++ b/scripts/demo/demo_squeezenet_download_convert_run.sh @@ -3,14 +3,21 @@ # Copyright (C) 2018-2021 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +echo -ne "\e[0;33mWARNING: If you get an error when running the demo in the Docker container, you may need to install additional packages. To do this, run the container as root (-u 0) and run install_openvino_dependencies.sh script. 
If you get a package-independent error, try setting additional parameters using -sample-options.\e[0m\n" + ROOT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]-$0}" )" && pwd )" +VENV_DIR="$HOME/venv_openvino" . "$ROOT_DIR/utils.sh" usage() { echo "Classification demo using public SqueezeNet topology" - echo "-d name specify the target device to infer on; CPU, GPU, FPGA, HDDL or MYRIAD are acceptable. Sample will look for a suitable plugin for device specified" - echo "-help print help message" + echo + echo "Options:" + echo " -help Print help message" + echo " -d DEVICE Specify the target device to infer on; CPU, GPU, HDDL or MYRIAD are acceptable. Sample will look for a suitable plugin for device specified" + echo " -sample-options OPTIONS Specify command line arguments for the sample" + echo exit 1 } @@ -46,7 +53,7 @@ done target_precision="FP16" -printf "target_precision = %s\n" "${target_precision}" +echo -ne "target_precision = ${target_precision}\n" models_path="$HOME/openvino_models/models" models_cache="$HOME/openvino_models/cache" @@ -57,13 +64,11 @@ model_name="squeezenet1.1" target_image_path="$ROOT_DIR/car.png" run_again="Then run the script again\n\n" -dashes="\n\n###################################################\n\n" - if [ -e "$ROOT_DIR/../../bin/setupvars.sh" ]; then setupvars_path="$ROOT_DIR/../../bin/setupvars.sh" else - printf "Error: setupvars.sh is not found\n" + echo -ne "Error: setupvars.sh is not found\n" fi if ! . "$setupvars_path" ; then @@ -71,14 +76,6 @@ if ! . "$setupvars_path" ; then exit 1 fi -# Step 1. 
Download the Caffe model and the prototxt of the model -echo -ne "${dashes}" -printf "\n\nDownloading the Caffe model and the prototxt" - -cur_path=$PWD - -printf "\nInstalling dependencies\n" - if [[ -f /etc/centos-release ]]; then DISTRO="centos" elif [[ -f /etc/lsb-release ]]; then @@ -86,55 +83,27 @@ elif [[ -f /etc/lsb-release ]]; then fi if [[ $DISTRO == "centos" ]]; then - sudo -E yum install -y centos-release-scl epel-release - sudo -E yum install -y gcc gcc-c++ make glibc-static glibc-devel libstdc++-static libstdc++-devel libstdc++ libgcc \ - glibc-static.i686 glibc-devel.i686 libstdc++-static.i686 libstdc++.i686 libgcc.i686 cmake - - sudo -E rpm -Uvh http://li.nux.ro/download/nux/dextop/el7/x86_64/nux-dextop-release-0-1.el7.nux.noarch.rpm || true - sudo -E yum install -y epel-release - sudo -E yum install -y cmake ffmpeg gstreamer1 gstreamer1-plugins-base libusbx-devel - # check installed Python version if command -v python3.5 >/dev/null 2>&1; then python_binary=python3.5 - pip_binary=pip3.5 fi if command -v python3.6 >/dev/null 2>&1; then python_binary=python3.6 - pip_binary=pip3.6 - fi - if [ -z "$python_binary" ]; then - sudo -E yum install -y rh-python36 || true - . 
scl_source enable rh-python36 - python_binary=python3.6 - pip_binary=pip3.6 fi elif [[ $DISTRO == "ubuntu" ]]; then - sudo -E apt update - print_and_run sudo -E apt -y install build-essential python3-pip virtualenv cmake libcairo2-dev libpango1.0-dev libglib2.0-dev libgtk2.0-dev libswscale-dev libavcodec-dev libavformat-dev libgstreamer1.0-0 gstreamer1.0-plugins-base python_binary=python3 - pip_binary=pip3 - - system_ver=$(grep -i "DISTRIB_RELEASE" -f /etc/lsb-release | cut -d "=" -f2) - if [ "$system_ver" = "16.04" ]; then - sudo -E apt-get install -y libpng12-dev - else - sudo -E apt-get install -y libpng-dev - fi elif [[ "$OSTYPE" == "darwin"* ]]; then # check installed Python version - if command -v python3.7 >/dev/null 2>&1; then + if command -v python3.8 >/dev/null 2>&1; then + python_binary=python3.8 + elif command -v python3.7 >/dev/null 2>&1; then python_binary=python3.7 - pip_binary=pip3.7 elif command -v python3.6 >/dev/null 2>&1; then python_binary=python3.6 - pip_binary=pip3.6 elif command -v python3.5 >/dev/null 2>&1; then python_binary=python3.5 - pip_binary=pip3.5 else python_binary=python3 - pip_binary=pip3 fi fi @@ -143,47 +112,52 @@ if ! command -v $python_binary &>/dev/null; then exit 1 fi -if [[ "$OSTYPE" == "darwin"* ]]; then - "$pip_binary" install -r "$ROOT_DIR/../open_model_zoo/tools/downloader/requirements.in" +if [ -e "$VENV_DIR" ]; then + echo -ne "\n###############|| Using the existing python virtual environment ||###############\n\n" else - sudo -E "$pip_binary" install -r "$ROOT_DIR/../open_model_zoo/tools/downloader/requirements.in" + echo -ne "\n###############|| Creating the python virtual environment ||###############\n\n" + "$python_binary" -m venv "$VENV_DIR" fi +. "$VENV_DIR/bin/activate" +python -m pip install -U pip +python -m pip install -r "$ROOT_DIR/../open_model_zoo/tools/downloader/requirements.in" + +# Step 1. 
Download the Caffe model and the prototxt of the model +echo -ne "\n###############|| Downloading the Caffe model and the prototxt ||###############\n\n" + downloader_dir="${INTEL_OPENVINO_DIR}/deployment_tools/open_model_zoo/tools/downloader" -model_dir=$("$python_binary" "$downloader_dir/info_dumper.py" --name "$model_name" | - "$python_binary" -c 'import sys, json; print(json.load(sys.stdin)[0]["subdirectory"])') +model_dir=$(python "$downloader_dir/info_dumper.py" --name "$model_name" | + python -c 'import sys, json; print(json.load(sys.stdin)[0]["subdirectory"])') downloader_path="$downloader_dir/downloader.py" -print_and_run "$python_binary" "$downloader_path" --name "$model_name" --output_dir "${models_path}" --cache_dir "${models_cache}" +print_and_run python "$downloader_path" --name "$model_name" --output_dir "${models_path}" --cache_dir "${models_cache}" ir_dir="${irs_path}/${model_dir}/${target_precision}" if [ ! -e "$ir_dir" ]; then # Step 2. Configure Model Optimizer - echo -ne "${dashes}" - printf "Install Model Optimizer dependencies\n\n" - cd "${INTEL_OPENVINO_DIR}/deployment_tools/model_optimizer/install_prerequisites" - . ./install_prerequisites.sh caffe - cd "$cur_path" + echo -ne "\n###############|| Install Model Optimizer dependencies ||###############\n\n" + cd "${INTEL_OPENVINO_DIR}/deployment_tools/model_optimizer" + python -m pip install -r requirements.txt + cd "$PWD" # Step 3. 
Convert a model with Model Optimizer - echo -ne "${dashes}" - printf "Convert a model with Model Optimizer\n\n" + echo -ne "\n###############|| Convert a model with Model Optimizer ||###############\n\n" mo_path="${INTEL_OPENVINO_DIR}/deployment_tools/model_optimizer/mo.py" export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=cpp - print_and_run "$python_binary" "$downloader_dir/converter.py" --mo "$mo_path" --name "$model_name" -d "$models_path" -o "$irs_path" --precisions "$target_precision" + print_and_run python "$downloader_dir/converter.py" --mo "$mo_path" --name "$model_name" -d "$models_path" -o "$irs_path" --precisions "$target_precision" else - printf "\n\nTarget folder %s already exists. Skipping IR generation with Model Optimizer." "${ir_dir}" + echo -ne "\n\nTarget folder ${ir_dir} already exists. Skipping IR generation with Model Optimizer." echo -ne "If you want to convert a model again, remove the entire ${ir_dir} folder. ${run_again}" fi # Step 4. Build samples -echo -ne "${dashes}" -printf "Build Inference Engine samples\n\n" +echo -ne "\n###############|| Build Inference Engine samples ||###############\n\n" OS_PATH=$(uname -m) NUM_THREADS="-j2" @@ -194,7 +168,7 @@ if [ "$OS_PATH" == "x86_64" ]; then fi samples_path="${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/samples/cpp" -build_dir="$HOME/inference_engine_samples_build" +build_dir="$HOME/inference_engine_cpp_samples_build" binaries_dir="${build_dir}/${OS_PATH}/Release" if [ -e "$build_dir/CMakeCache.txt" ]; then @@ -207,8 +181,7 @@ cmake -DCMAKE_BUILD_TYPE=Release "$samples_path" make $NUM_THREADS classification_sample_async # Step 5. 
Run samples -echo -ne "${dashes}" -printf "Run Inference Engine classification sample\n\n" +echo -ne "\n###############|| Run Inference Engine classification sample ||###############\n\n" cd "$binaries_dir" @@ -216,5 +189,4 @@ cp -f "$ROOT_DIR/${model_name}.labels" "${ir_dir}/" print_and_run ./classification_sample_async -d "$target" -i "$target_image_path" -m "${ir_dir}/${model_name}.xml" "${sampleoptions[@]}" -echo -ne "${dashes}" -printf "Demo completed successfully.\n\n" +echo -ne "\n###############|| Demo completed successfully ||###############\n\n" diff --git a/tests/conditional_compilation/test_config.yml b/tests/conditional_compilation/test_config.yml index ac03e092fe1..a9f8a463d7a 100644 --- a/tests/conditional_compilation/test_config.yml +++ b/tests/conditional_compilation/test_config.yml @@ -7,8 +7,8 @@ path: ${TESTDATA}/models/mobilenet_v2_1.0_224/mobilenet_v2_1.0_224_i8.xml - model: path: ${TESTDATA}/models/inception_v3/inception_v3_i8.xml -- model: - path: ${TESTDATA}/models/resnet_v1_50/resnet_v1_50_i8.xml +#- model: +# path: ${TESTDATA}/models/resnet_v1_50/resnet_v1_50_i8.xml - model: path: ${TESTDATA}/models/test_model/test_model_fp16.xml - model: diff --git a/tests/stress_tests/scripts/run_memcheck.py b/tests/stress_tests/scripts/run_memcheck.py index b8f32b4c589..561ff1b9127 100755 --- a/tests/stress_tests/scripts/run_memcheck.py +++ b/tests/stress_tests/scripts/run_memcheck.py @@ -9,24 +9,22 @@ upload memory measurment results to database and generate reports. 
""" import argparse -from glob import glob import json import logging import os import subprocess import sys -from pathlib import Path +from glob import glob +from compare_memcheck_2_runs import compare_memcheck_2_runs, \ + get_memcheck_records, get_db_memcheck_records +# Database arguments +from memcheck_upload import DATABASE, DB_COLLECTIONS from memcheck_upload import create_memcheck_records, \ upload_memcheck_records, \ create_memcheck_report, \ metadata_from_manifest, \ info_from_test_config -from compare_memcheck_2_runs import compare_memcheck_2_runs, \ - get_memcheck_records, get_db_memcheck_records - -# Database arguments -from memcheck_upload import DATABASE, DB_COLLECTIONS def run(args, log=None, verbose=True): @@ -142,12 +140,12 @@ def main(): else: if list(glob(os.path.join(args.output_dir, '**', '*.log'), recursive=True)): logging.error( - 'Output directory %s already has test logs.' \ + 'Output directory %s already has test logs.' 'Please specify an empty directory for output logs', args.output_dir) sys.exit(1) - returncode, _ = run([sys.executable, args.gtest_parallel] + + return_code, _ = run([sys.executable, args.gtest_parallel] + (['--output_dir', f'{args.output_dir}'] if args.output_dir else []) + (['--workers', f'{args.workers}'] if args.workers else []) + (['--timeout', f'{args.timeout}'] if args.timeout else []) + @@ -189,8 +187,11 @@ def main(): # create timeline report if args.timeline_report: - create_memcheck_report(records, args.db_url, args.db_collection, args.timeline_report) - logging.info('Created memcheck timeline report %s', args.timeline_report) + try: + create_memcheck_report(records, args.db_url, args.db_collection, args.timeline_report) + logging.info('Created memcheck timeline report %s', args.timeline_report) + except Exception as ex: + logging.warning(f'Failed to create timeline report: {ex}') # compare runs and prepare report if args.compare: @@ -203,9 +204,9 @@ def main(): db_name=DATABASE, db_url=args.db_url) 
compare_retcode = compare_memcheck_2_runs(cur_values=records, references=references, output_file=args.comparison_report) - returncode = returncode if returncode else compare_retcode + return_code = return_code if return_code else compare_retcode - sys.exit(returncode) + sys.exit(return_code) if __name__ == "__main__": diff --git a/thirdparty/CMakeLists.txt b/thirdparty/CMakeLists.txt index 5476b1d9cae..bda132a4a51 100644 --- a/thirdparty/CMakeLists.txt +++ b/thirdparty/CMakeLists.txt @@ -80,7 +80,7 @@ function(add_gtest_libraries) endif() set(BUILD_SHARED_LIBS OFF) - set(INSTALL_GTEST OFF) + set(INSTALL_GTEST OFF CACHE BOOL "" FORCE) add_subdirectory(gtest EXCLUDE_FROM_ALL) get_target_property(gtest_include_dirs gtest INTERFACE_INCLUDE_DIRECTORIES) diff --git a/ngraph/frontend/cmake_static_protobuf/CMakeLists.txt b/thirdparty/cmake_static_protobuf/CMakeLists.txt similarity index 89% rename from ngraph/frontend/cmake_static_protobuf/CMakeLists.txt rename to thirdparty/cmake_static_protobuf/CMakeLists.txt index 4c37abac25d..121b20659ce 100644 --- a/ngraph/frontend/cmake_static_protobuf/CMakeLists.txt +++ b/thirdparty/cmake_static_protobuf/CMakeLists.txt @@ -16,7 +16,7 @@ set(BUILD_SHARED_LIBS OFF) set(BUILD_STANDALONE_STATIC ON) set(USE_STATIC_PROTOBUF ON) -include(../../cmake/external_protobuf.cmake) +add_subdirectory(../protobuf ${CMAKE_BINARY_DIR}/_deps/static-protobuf) set(BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS_OLD}) set(BUILD_STANDALONE_STATIC ${BUILD_STANDALONE_STATIC_OLD}) diff --git a/thirdparty/ittapi/CMakeLists.txt b/thirdparty/ittapi/CMakeLists.txt index 9cec9b6126a..c50ff92fa27 100644 --- a/thirdparty/ittapi/CMakeLists.txt +++ b/thirdparty/ittapi/CMakeLists.txt @@ -11,18 +11,7 @@ if(ENABLE_PROFILING_ITT) message(WARNING "Profiling option enabled, but no ITT library was found under INTEL_VTUNE_DIR") endif() else() - include(FetchContent) - FetchContent_Declare( - ext_ittapi - GIT_REPOSITORY https://github.com/intel/ittapi.git - GIT_TAG v3.18.6 - ) - - 
FetchContent_GetProperties(ext_ittapi) - if(NOT ext_ittapi_POPULATED) - FetchContent_Populate(ext_ittapi) - add_subdirectory(${ext_ittapi_SOURCE_DIR} ${ext_ittapi_BINARY_DIR}) - endif() + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ittapi ${CMAKE_BINARY_DIR}/_deps/ittapi) target_compile_definitions(ittnotify INTERFACE ENABLE_PROFILING_ITT) if (UNIX) diff --git a/thirdparty/ittapi/ittapi b/thirdparty/ittapi/ittapi new file mode 160000 index 00000000000..5416ee060ad --- /dev/null +++ b/thirdparty/ittapi/ittapi @@ -0,0 +1 @@ +Subproject commit 5416ee060ad51fd9eac3d6fcd4c7274c1d3370d8 diff --git a/ngraph/cmake/external_onnx.cmake b/thirdparty/onnx/CMakeLists.txt similarity index 51% rename from ngraph/cmake/external_onnx.cmake rename to thirdparty/onnx/CMakeLists.txt index a345e62c3e4..9c212d03a01 100644 --- a/ngraph/cmake/external_onnx.cmake +++ b/thirdparty/onnx/CMakeLists.txt @@ -2,31 +2,11 @@ # SPDX-License-Identifier: Apache-2.0 # -include(FetchContent) - #------------------------------------------------------------------------------ -# ONNX.proto definition version +# Configure and install libonnx ... #------------------------------------------------------------------------------ -set(ONNX_VERSION 1.8.1) - -#------------------------------------------------------------------------------ -# Download and install libonnx ... 
-#------------------------------------------------------------------------------ - -set(ONNX_GIT_REPO_URL https://github.com/onnx/onnx.git) -set(ONNX_GIT_BRANCH rel-${ONNX_VERSION}) set(NGRAPH_ONNX_NAMESPACE ngraph_onnx) -set(ONNX_PATCH_FILE "${CMAKE_CURRENT_SOURCE_DIR}/cmake/patches/onnx_patch.diff") - -FetchContent_Declare( - ext_onnx - GIT_REPOSITORY ${ONNX_GIT_REPO_URL} - GIT_TAG ${ONNX_GIT_BRANCH} - GIT_SHALLOW TRUE - # apply patch to fix problems with symbols visibility for MSVC - PATCH_COMMAND git reset --hard HEAD && git apply --ignore-space-change --ignore-whitespace --verbose ${ONNX_PATCH_FILE} -) macro(onnx_set_target_properties) target_include_directories(onnx SYSTEM PRIVATE "${Protobuf_INCLUDE_DIRS}") @@ -55,19 +35,13 @@ macro(onnx_set_target_properties) export(TARGETS onnx onnx_proto NAMESPACE ngraph:: APPEND FILE "${NGRAPH_TARGETS_FILE}") endmacro() -FetchContent_GetProperties(ext_onnx) -if(NOT ext_onnx_POPULATED) - FetchContent_Populate(ext_onnx) - set(ONNX_USE_PROTOBUF_SHARED_LIBS ${BUILD_SHARED_LIBS} CACHE BOOL "Use dynamic protobuf by ONNX library") - set(ONNX_NAMESPACE ${NGRAPH_ONNX_NAMESPACE}) - set(ONNX_USE_LITE_PROTO ${NGRAPH_USE_PROTOBUF_LITE} CACHE BOOL "Use protobuf lite for ONNX library") - set(ONNX_ML ON CACHE BOOL "Use ONNX ML") - if(CMAKE_CROSSCOMPILING) - set(ONNX_CUSTOM_PROTOC_EXECUTABLE ${SYSTEM_PROTOC}) - endif() - - add_subdirectory(${ext_onnx_SOURCE_DIR} ${ext_onnx_BINARY_DIR} EXCLUDE_FROM_ALL) - onnx_set_target_properties() -else() - onnx_set_target_properties() +set(ONNX_USE_PROTOBUF_SHARED_LIBS ${BUILD_SHARED_LIBS} CACHE BOOL "Use dynamic protobuf by ONNX library") +set(ONNX_NAMESPACE ${NGRAPH_ONNX_NAMESPACE}) +set(ONNX_USE_LITE_PROTO ${NGRAPH_USE_PROTOBUF_LITE} CACHE BOOL "Use protobuf lite for ONNX library") +set(ONNX_ML ON CACHE BOOL "Use ONNX ML") +if(CMAKE_CROSSCOMPILING) + set(ONNX_CUSTOM_PROTOC_EXECUTABLE ${SYSTEM_PROTOC}) endif() + +add_subdirectory(onnx EXCLUDE_FROM_ALL) +onnx_set_target_properties() diff --git 
a/thirdparty/onnx/onnx b/thirdparty/onnx/onnx new file mode 160000 index 00000000000..0807930c7f4 --- /dev/null +++ b/thirdparty/onnx/onnx @@ -0,0 +1 @@ +Subproject commit 0807930c7f46f3bac1f520c4a2e78710aa5d0af7 diff --git a/thirdparty/protobuf/CMakeLists.txt b/thirdparty/protobuf/CMakeLists.txt new file mode 100644 index 00000000000..611e26d200b --- /dev/null +++ b/thirdparty/protobuf/CMakeLists.txt @@ -0,0 +1,125 @@ +# Copyright (C) 2018-2021 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# + +#------------------------------------------------------------------------------ +# Configure and install Google Protobuf ... +#------------------------------------------------------------------------------ + +set(CMAKE_INTERPROCEDURAL_OPTIMIZATION_RELEASE OFF) + +if (MSVC) + set(protobuf_MSVC_STATIC_RUNTIME OFF CACHE BOOL "") +endif() + +if(CMAKE_CROSSCOMPILING) + find_program(SYSTEM_PROTOC protoc PATHS ENV PATH) + + if(SYSTEM_PROTOC) + execute_process( + COMMAND ${SYSTEM_PROTOC} --version + OUTPUT_VARIABLE PROTOC_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + + string(REPLACE " " ";" PROTOC_VERSION ${PROTOC_VERSION}) + list(GET PROTOC_VERSION -1 PROTOC_VERSION) + + message("Detected system protoc version: ${PROTOC_VERSION}") + else() + message(FATAL_ERROR "System Protobuf is needed while cross-compiling") + endif() + + set(protobuf_BUILD_PROTOC_BINARIES OFF CACHE BOOL "Build libprotoc and protoc compiler" FORCE) +endif() + +if (CMAKE_GENERATOR STREQUAL "Ninja") + set(MAKE_UTIL make) +else() + set(MAKE_UTIL $(MAKE)) +endif() + +set(protobuf_BUILD_TESTS OFF CACHE BOOL "Build tests") +set(protobuf_WITH_ZLIB OFF CACHE BOOL "Build with zlib support") + +if (NOT BUILD_STANDALONE_STATIC) + add_subdirectory(protobuf/cmake EXCLUDE_FROM_ALL) + get_directory_property(protobuf_VERSION DIRECTORY protobuf/cmake DEFINITION protobuf_VERSION) +endif() +if (USE_STATIC_PROTOBUF) + include(FetchContent) + FetchContent_Declare( + ext_protobuf_static + URL 
${CMAKE_CURRENT_SOURCE_DIR}/protobuf + ) + FetchContent_GetProperties(ext_protobuf_static) + if((NOT ext_protobuf_static_POPULATED) AND BUILD_STANDALONE_STATIC) + FetchContent_Populate(ext_protobuf_static) + add_subdirectory(${ext_protobuf_static_SOURCE_DIR}/cmake ${ext_protobuf_static_BINARY_DIR} EXCLUDE_FROM_ALL) + get_directory_property(protobuf_VERSION DIRECTORY ${ext_protobuf_static_SOURCE_DIR}/cmake DEFINITION protobuf_VERSION) + endif() +endif() + +if (BUILD_STANDALONE_STATIC) + set(Protobuf_INCLUDE_DIRS ${ext_protobuf_static_SOURCE_DIR}/src) +else() + set(Protobuf_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/protobuf/src) +endif() + +if(NGRAPH_USE_PROTOBUF_LITE) + set(Protobuf_LIBRARIES libprotobuf-lite) +else() + set(Protobuf_LIBRARIES libprotobuf) +endif() + +if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "^(Apple)?Clang$") + set(_proto_libs ${Protobuf_LIBRARIES}) + if(TARGET libprotoc) + list(APPEND _proto_libs libprotoc) + target_compile_options(libprotoc PRIVATE -Wno-all -Wno-unused-variable) + endif() + set_target_properties(${_proto_libs} PROPERTIES + CXX_VISIBILITY_PRESET default + C_VISIBILITY_PRESET default + VISIBILITY_INLINES_HIDDEN OFF) + foreach(target libprotobuf libprotobuf-lite) + target_compile_options(${target} + PRIVATE -Wno-all -Wno-unused-variable -Wno-inconsistent-missing-override + PUBLIC -Wno-undef) + endforeach() +endif() + +if(NGRAPH_USE_PROTOBUF_LITE) + # if only libprotobuf-lite is used, both libprotobuf and libprotobuf-lite are built + # libprotoc target needs symbols from libprotobuf, even in libprotobuf-lite configuration + set_target_properties(libprotobuf PROPERTIES + CXX_VISIBILITY_PRESET default + C_VISIBILITY_PRESET default + VISIBILITY_INLINES_HIDDEN OFF) +endif() + +if(protobuf_VERSION VERSION_LESS "3.9" AND NGRAPH_USE_PROTOBUF_LITE) + message(FATAL_ERROR "Minimum supported version of protobuf-lite library is 3.9.0") +endif() + +if(ENABLE_LTO AND protobuf_VERSION VERSION_GREATER_EQUAL "3.8") + message(WARNING 
"Protobuf in version 3.8.0+ can throw runtime exceptions if LTO is enabled.") +endif() + +if(CMAKE_CROSSCOMPILING AND NOT PROTOC_VERSION VERSION_EQUAL protobuf_VERSION) + message(WARNING "system protobuf version does not match with the compiled one, please update system protobuf or submodule") +endif() + +if (NOT BUILD_STANDALONE_STATIC) + message("NGRAPH_INSTALL_LIB = ${NGRAPH_INSTALL_LIB}") + install(TARGETS ${Protobuf_LIBRARIES} + RUNTIME DESTINATION ${NGRAPH_INSTALL_LIB} COMPONENT ngraph + ARCHIVE DESTINATION ${NGRAPH_INSTALL_LIB} COMPONENT ngraph + LIBRARY DESTINATION ${NGRAPH_INSTALL_LIB} COMPONENT ngraph) + export(TARGETS ${Protobuf_LIBRARIES} NAMESPACE ngraph:: APPEND FILE "${NGRAPH_TARGETS_FILE}") +endif() + +# forward variables used in the other places +set(SYSTEM_PROTOC ${SYSTEM_PROTOC} PARENT_SCOPE) +set(Protobuf_LIBRARIES ${Protobuf_LIBRARIES} PARENT_SCOPE) +set(Protobuf_INCLUDE_DIRS ${Protobuf_INCLUDE_DIRS} PARENT_SCOPE) diff --git a/thirdparty/protobuf/protobuf b/thirdparty/protobuf/protobuf new file mode 160000 index 00000000000..52b2447247f --- /dev/null +++ b/thirdparty/protobuf/protobuf @@ -0,0 +1 @@ +Subproject commit 52b2447247f535663ac1c292e088b4b27d2910ef