Merge remote-tracking branch 'upstream/master'

Steve Yoo
2021-06-08 09:49:38 +09:00
234 changed files with 7665 additions and 1825 deletions

View File

@@ -82,9 +82,10 @@ jobs:
- script: |
sudo apt --assume-yes install libusb-1.0-0-dev
# For opencv-python: setuptools and upgrade
sudo apt-get install python3-setuptools
sudo apt-get install python3-setuptools patchelf
python3 -m pip install --upgrade pip
python3 -m pip install -r $(REPO_DIR)/inference-engine/ie_bridges/python/requirements.txt
python3 -m pip install -r $(REPO_DIR)/inference-engine/ie_bridges/python/wheel/requirements-dev.txt
# For running Python API tests
python3 -m pip install -r $(REPO_DIR)/inference-engine/ie_bridges/python/src/requirements-dev.txt
# Speed up build
@@ -106,6 +107,7 @@ jobs:
-DCMAKE_BUILD_TYPE=$(BUILD_TYPE)
-DENABLE_PYTHON=ON
-DPYTHON_EXECUTABLE=/usr/bin/python3.6
-DENABLE_WHEEL=ON
-DENABLE_TESTS=ON
-DNGRAPH_ONNX_IMPORT_ENABLE=ON
-DNGRAPH_ONNX_EDITOR_ENABLE=ON

View File

@@ -94,7 +94,6 @@ jobs:
-DENABLE_PROFILING_ITT=OFF
-DENABLE_SAMPLES=OFF
-DENABLE_SPEECH_DEMO=OFF
-DENABLE_PYTHON=ON
-DNGRAPH_ONNX_IMPORT_ENABLE=ON
-DNGRAPH_ONNX_EDITOR_ENABLE=ON
-DNGRAPH_INTERPRETER_ENABLE=ON

View File

@@ -1,3 +0,0 @@
# Copyright (C) 2018-2021 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

View File

@@ -5,12 +5,13 @@
Check GitHub organization and invite members
"""
# pylint: disable=fixme,no-member
# pylint: disable=fixme,no-member,too-many-locals
from argparse import ArgumentParser
import github_api
from configs import Config
from github_api import GithubOrgApi, get_dev_emails
from ldap_api import LdapApi, print_user_info, InfoLevel
def main():
@@ -19,32 +20,74 @@ def main():
arg_parser.add_argument("--cfg-file", metavar="PATH", default=Config.default_cfg_path,
help=f"Path to json configuration file, e.g. {Config.default_cfg_path}")
arg_parser.add_argument("--teams", action="store_true", help="Check GitHub teams")
arg_parser.add_argument("--no-ldap", action="store_true", help="Don't use LDAP info")
args, unknown_args = arg_parser.parse_known_args()
Config(args.cfg_file, unknown_args)
gh_api = github_api.GithubOrgApi()
gh_api = GithubOrgApi()
if args.teams:
gh_api.get_org_teams()
else:
dev_emails = github_api.get_dev_emails()
print(f'\nDeveloper emails {len(dev_emails)}:', '; '.join(dev_emails))
return
org_emails = gh_api.get_org_emails()
print(f'\nOrg emails {len(org_emails)}:', '; '.join(org_emails))
cfg_emails = get_dev_emails()
print(f'\nCfg developer emails {len(cfg_emails)}:', '; '.join(sorted(cfg_emails)))
org_pendig_invitation_emails = gh_api.get_org_invitation_emails()
dev_emails = set()
dev_emails.update(cfg_emails)
invite_emails = dev_emails.difference(org_emails).difference(org_pendig_invitation_emails)
print(f'\nInvite emails {len(invite_emails)}:', '; '.join(invite_emails))
if not args.no_ldap:
ldap_api = LdapApi()
ldap_emails = ldap_api.get_user_emails()
dev_emails.update(ldap_emails)
print(f'\nLDAP developer emails {len(ldap_emails)}:', '; '.join(sorted(ldap_emails)))
no_in_dev_emails = org_emails.difference(dev_emails)
print(f'\nOrg members - no in developers list {len(no_in_dev_emails)}:',
'; '.join(no_in_dev_emails))
cfg_emails_no_in_ldap = ldap_api.get_absent_emails(cfg_emails)
print(f'\nCfg developer emails - absent in LDAP at all {len(cfg_emails_no_in_ldap)}:',
'; '.join(sorted(cfg_emails_no_in_ldap)))
valid_github_users = gh_api.get_valid_github_users(invite_emails)
cfg_ldap_inters = cfg_emails.intersection(ldap_emails)
print(f'\nCfg developer emails - present in LDAP developers {len(cfg_ldap_inters)}:',
'; '.join(sorted(cfg_ldap_inters)))
gh_api.invite_users(valid_github_users)
org_emails, org_logins_no_intel_email = gh_api.get_org_emails()
print(f'\nOrg emails {len(org_emails)}:', '; '.join(sorted(org_emails)))
org_emails_no_in_ldap = set()
if not args.no_ldap:
org_ldap_diff = org_emails.difference(ldap_emails)
print(f'\nOrg member emails - absent in LDAP developers {len(org_ldap_diff)}:',
'; '.join(sorted(org_ldap_diff)))
for email in org_ldap_diff:
user_info = ldap_api.get_user_info_by_email(email)
if user_info:
print_user_info(user_info, InfoLevel.PDL)
else:
org_emails_no_in_ldap.add(email)
org_pendig_invitation_emails = gh_api.get_org_invitation_emails()
invite_emails = dev_emails.difference(org_emails).difference(org_pendig_invitation_emails)
print(f'\nInvite emails {len(invite_emails)}:', '; '.join(sorted(invite_emails)))
valid_github_users = gh_api.get_valid_github_users(invite_emails)
gh_api.invite_users(valid_github_users)
print('\nCheck accounts below and remove from the GitHub organization and cfg list')
cfg_emails_no_in_org = sorted(cfg_emails.difference(org_emails))
print(f'\nCfg developer emails - absent in GitHub organization {len(cfg_emails_no_in_org)}:',
'; '.join(cfg_emails_no_in_org))
org_emails_no_in_dev = sorted(org_emails.difference(dev_emails))
print(f'\nOrg member emails - absent in cfg and LDAP developers {len(org_emails_no_in_dev)}:',
'; '.join(org_emails_no_in_dev))
print(f'\nOrg member emails - absent in LDAP at all {len(org_emails_no_in_ldap)}:',
'; '.join(sorted(org_emails_no_in_ldap)))
print(f'\nOrg member logins - absent Intel email {len(org_logins_no_intel_email)}:',
'; '.join(sorted(org_logins_no_intel_email)))
if __name__ == '__main__':
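The new invite flow above is plain set arithmetic over the cfg, LDAP, and GitHub email sets. A minimal sketch of that logic, with invented addresses (not part of the diff):

```python
cfg_emails = {"a@intel.com", "b@intel.com", "c@intel.com"}   # from the json cfg
ldap_emails = {"b@intel.com", "d@intel.com"}                 # from LdapApi.get_user_emails()
org_emails = {"a@intel.com", "d@intel.com"}                  # current GitHub org members
pending = {"c@intel.com"}                                    # already-sent invitations

dev_emails = cfg_emails | ldap_emails            # union of both developer sources
invite = dev_emails - org_emails - pending       # not a member and not invited yet
print(sorted(invite))                            # ['b@intel.com']
```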

View File

@@ -8,6 +8,7 @@ Check GitHub PRs and set labels by type and categories, e.g. 'ExternalPR', 'cate
# pylint: disable=fixme,no-member
import re
import sys
import datetime
from argparse import ArgumentParser
from enum import Enum
@@ -18,10 +19,11 @@ from configs import Config
class PrType(Enum):
"""Constants for type of GitHub pull request by author membership"""
EXTERNAL = 'ExternalPR'
INTEL = 'ExternalIntelPR'
ORG = 'OpenvinoPR'
BAD = 'BadPR'
EXTERNAL = "ExternalPR"
INTEL = "ExternalIntelPR"
ORG = "OpenvinoPR"
BAD = "BadPR"
def get_pr_labels(pull):
@@ -33,13 +35,23 @@ def get_pr_labels(pull):
def set_pr_labels(pull, labels):
"""Sets PR labels"""
"""Sets new PR labels (all previously set labels are removed)"""
if not labels or Config().DRY_RUN:
return
print(f'Set PR labels:', labels)
print("Set PR labels:", labels)
# set_labels() should accept a list but fails with an empty "AssertionError:"
pull.set_labels(labels)
def add_pr_labels(pull, labels):
"""Adds PR labels"""
if not labels or Config().DRY_RUN:
return
print("Add PR labels:", labels)
for label in labels:
pull.add_to_labels(label)
def get_pr_type_by_labels(pull):
"""Gets PR type using labels"""
pr_lables = get_pr_labels(pull)
@@ -48,19 +60,19 @@ def get_pr_type_by_labels(pull):
if not pr_types_labels:
return None
if len(pr_types_labels) > 1:
print(f'Duplicated labels: {pr_types_labels}')
print(f"Duplicated labels: {pr_types_labels}")
return PrType.BAD
return PrType(PrType(pr_types_labels.pop()))
def get_label_by_team_name_re(team_name):
"""Generates label by PR reviwer team name using regular expressions"""
if 'admins' in team_name:
return 'category: ci'
re_compile_label = re.compile(rf'{Config().GITHUB_REPO}-(.+)-maintainers')
if "admins" in team_name:
return "category: ci"
re_compile_label = re.compile(rf"{Config().GITHUB_REPO}-(.+)-maintainers")
re_label = re_compile_label.match(team_name)
if re_label:
return f'category: {re_label.group(1).strip()}'
return f"category: {re_label.group(1).strip()}"
return None
@@ -80,17 +92,105 @@ def get_category_labels(pull):
return labels
def get_pr_info_str(pull):
"""Gets info about PR using a few workarounds"""
pr_title = pull.title.encode("ASCII", "ignore").decode()
# Workaround for PyGithub issue: https://github.com/PyGithub/PyGithub/issues/512
pr_created_at = pull.created_at.replace(tzinfo=datetime.timezone.utc).astimezone()
return (
f"PR: {pull.number} - {pr_title} - Created: {pr_created_at} - "
f"Labels: {get_pr_labels(pull)} - Type: {get_pr_type_by_labels(pull)}"
)
def update_labels(gh_api, pull, non_org_intel_pr_users, non_org_pr_users):
"""Checks and updates labels"""
print("Check and update labels:")
pr_type_by_labels = get_pr_type_by_labels(pull)
add_labels = []
# Checks PR source type
if gh_api.is_org_user(pull.user):
print(" - Org user")
elif github_api.is_intel_email(pull.user.email) or github_api.is_intel_company(
pull.user.company
):
print(" - Non org user with Intel email or company")
non_org_intel_pr_users.add(pull.user)
if pr_type_by_labels is not PrType.INTEL:
print(f'NO "{PrType.INTEL.value}" label: ', end="")
github_api.print_users(pull.user)
add_labels.append(PrType.INTEL.value)
elif github_api.is_user_ignored(pull.user):
print(" - IGNORED non org user with NO Intel email or company")
else:
print(" - Non org user with NO Intel email or company")
non_org_pr_users.add(pull.user)
if pr_type_by_labels is not PrType.EXTERNAL:
print(f'NO "{PrType.EXTERNAL.value}" label: ', end="")
github_api.print_users(pull.user)
add_labels.append(PrType.EXTERNAL.value)
add_labels += get_category_labels(pull)
add_pr_labels(pull, add_labels)
def get_wrong_commits(pull):
"""Returns commits with incorrect user and email"""
pr_author_email = pull.user.email.lower()
print("GitHub PR author email:", pr_author_email)
print("Check commits:")
wrong_commits = set()
for commit in pull.get_commits():
# import pprint; pprint.pprint(commit.raw_data)
print("Commit SHA:", commit.sha)
# Use raw data because the commit author can be a non-GitHub user
commit_email = commit.raw_data["commit"]["author"]["email"].lower()
print(" Commit email:", commit_email)
if not github_api.is_valid_user(commit.author):
print(
" ERROR: User with the commit email is absent in GitHub:",
commit.raw_data["commit"]["author"]["name"],
)
wrong_commits.add(commit.sha)
if not commit.raw_data["commit"]["verification"]["verified"]:
print(
" WARNING: The commit is not verified. Reason:",
commit.raw_data["commit"]["verification"]["reason"],
)
if pr_author_email != commit_email:
print(" WARNING: Commit email and GitHub PR author public email are differnt")
return wrong_commits
def main():
"""The main entry point function"""
arg_parser = ArgumentParser()
arg_parser.add_argument("--cfg-file", metavar="PATH", default=Config.default_cfg_path,
help=f"Path to json configuration file, e.g. {Config.default_cfg_path}")
arg_parser.add_argument("--pr", metavar="NUMBER",
help="Get GitHub pull request with the number")
arg_parser.add_argument("--pr-state", default="open", choices=["open", "closed"],
help="Set GitHub pull request state")
arg_parser.add_argument("--newer", metavar="MINUTES",
help="Get newly created GitHub pull request only")
arg_parser.add_argument(
"--cfg-file",
metavar="PATH",
default=Config.default_cfg_path,
help=f"Path to json configuration file, e.g. {Config.default_cfg_path}",
)
arg_parser.add_argument(
"--pr", metavar="NUMBER", help="Get GitHub pull request with the number"
)
arg_parser.add_argument(
"--pr-state",
default="open",
choices=["open", "closed"],
help="Set GitHub pull request state",
)
arg_parser.add_argument(
"--newer", metavar="MINUTES", help="Get newly created GitHub pull request only"
)
arg_parser.add_argument(
"--check-commits",
action="store_true",
help="Check and compare git commit email with GitHub account email",
)
args, unknown_args = arg_parser.parse_known_args()
Config(args.cfg_file, unknown_args)
@@ -100,50 +200,52 @@ def main():
pulls = [gh_api.repo.get_pull(int(args.pr))]
else:
pulls = gh_api.repo.get_pulls(state=args.pr_state)
print(f'\nPRs count ({args.pr_state}):', pulls.totalCount)
print(f"\nPRs count ({args.pr_state}):", pulls.totalCount)
if args.newer:
pr_created_after = datetime.datetime.now() - datetime.timedelta(minutes=int(args.newer))
print('PRs created after:', pr_created_after)
pr_created_after = (
datetime.datetime.now() - datetime.timedelta(minutes=int(args.newer))
).astimezone()
print("Checking PRs created after:", pr_created_after)
non_org_intel_pr_users = set()
non_org_pr_users = set()
wrong_pulls = {}
for pull in pulls:
if args.newer and pull.created_at <= pr_created_after:
print(f'\nIGNORE: {pull} - Created: {pull.created_at}')
pr_created_at = pull.created_at.replace(tzinfo=datetime.timezone.utc).astimezone()
if args.newer and pr_created_at <= pr_created_after:
print(f"\nIGNORE: {get_pr_info_str(pull)}")
continue
pr_lables = get_pr_labels(pull)
pr_type_by_labels = get_pr_type_by_labels(pull)
set_labels = []
print(f'\n{pull} - Created: {pull.created_at} - Labels: {pr_lables} -',
f'Type: {pr_type_by_labels}', end='')
# Checks PR source type
if gh_api.is_org_user(pull.user):
print(' - Org user')
elif github_api.is_intel_email(pull.user.email) or \
github_api.is_intel_company(pull.user.company):
print(' - Non org user with Intel email or company')
non_org_intel_pr_users.add(pull.user)
if pr_type_by_labels is not PrType.INTEL:
print(f'NO "{PrType.INTEL.value}" label: ', end='')
github_api.print_users(pull.user)
set_labels.append(PrType.INTEL.value)
print(f"\n{get_pr_info_str(pull)}")
if args.check_commits:
wrong_commits = get_wrong_commits(pull)
if wrong_commits:
wrong_pulls[pull.number] = wrong_commits
else:
print(f' - Non org user with NO Intel email or company')
non_org_pr_users.add(pull.user)
if pr_type_by_labels is not PrType.EXTERNAL:
print(f'NO "{PrType.EXTERNAL.value}" label: ', end='')
github_api.print_users(pull.user)
set_labels.append(PrType.EXTERNAL.value)
update_labels(gh_api, pull, non_org_intel_pr_users, non_org_pr_users)
set_labels += get_category_labels(pull)
set_pr_labels(pull, set_labels)
if wrong_pulls:
for pull_number, wrong_commits in wrong_pulls.items():
print(
f"\nERROR: Remove or replace wrong commits in the PR {pull_number}:\n ",
"\n ".join(wrong_commits),
)
print(
"\nAbout commit signature verification:\n ",
"https://docs.github.com/en/github/authenticating-to-github/"
"managing-commit-signature-verification/about-commit-signature-verification",
)
sys.exit(1)
print(f'\nNon org user with Intel email or company:')
github_api.print_users(non_org_intel_pr_users)
print(f'\nNon org user with NO Intel email or company:')
github_api.print_users(non_org_pr_users)
if non_org_intel_pr_users:
print("\nNon org user with Intel email or company:")
github_api.print_users(non_org_intel_pr_users)
if non_org_pr_users:
print("\nNon org user with NO Intel email or company:")
github_api.print_users(non_org_pr_users)
if __name__ == '__main__':
if __name__ == "__main__":
main()
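One detail worth calling out in the rewrite above: PyGithub returns naive datetimes that represent UTC (see the linked PyGithub issue 512), so both `created_at` and the `--newer` cutoff are normalized to timezone-aware local times before being compared. A small sketch of that conversion, with an invented timestamp:

```python
import datetime

# PyGithub's created_at is naive but holds UTC, so tag it as UTC first,
# then convert to the local timezone before comparing.
naive_utc = datetime.datetime(2021, 6, 8, 0, 49)  # invented example value
aware_local = naive_utc.replace(tzinfo=datetime.timezone.utc).astimezone()

cutoff = (datetime.datetime.now() - datetime.timedelta(minutes=60)).astimezone()
print(aware_local > cutoff)  # both sides are aware, so the comparison is valid
```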

View File

@@ -6,7 +6,9 @@
"openvino-ci",
"openvino-pushbot",
"lab-nerval",
"lab-nerval-onnx-ci"
"lab-nerval-onnx-ci",
"onnx-watchdog-agent",
"dependabot"
],
"EMAILS_FILE_PATH": "dev_emails-test.txt",
"PROXIES": {

View File

@@ -57,19 +57,19 @@ class Config:
for name, value in self._json_cfg.items():
if hasattr(self, name):
raise ConfigException(f'Duplicating property: {name}')
prosperity_value = self._args.get(name) or os.getenv(name)
if prosperity_value:
property_value = self._args.get(name) or os.getenv(name)
if property_value:
# Try to set property_value as Python literal structures, e.g. DRY_RUN=False
try:
prosperity_value = ast.literal_eval(prosperity_value)
property_value = ast.literal_eval(property_value)
except Exception:
pass
if not isinstance(prosperity_value, type(value)):
if not isinstance(property_value, type(value)):
raise ConfigException(f'Python type of {name} parameter must be {type(value)}')
else:
prosperity_value = value
setattr(self, name, prosperity_value)
Config.properties[name] = prosperity_value
property_value = value
setattr(self, name, property_value)
Config.properties[name] = property_value
self.set_proxy()
@@ -78,7 +78,7 @@ class Config:
try:
with open(self._file_path) as conf:
self._json_cfg = json.load(conf)
except:
except Exception:
print('Failed to load configuration from:', self._file_path)
raise
@@ -105,7 +105,7 @@ class Config:
def _test():
"""Test and debug"""
print('Config.default_cfg_path:', Config.default_cfg_path)
cfg = Config(cli_args=['DRY_RUN=True'])
cfg = Config(cli_args=['DRY_RUN', 'PROXIES={"NO_PROXY": "localhost"}'])
print('Config.properties:', cfg.get_properties())
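The renamed `property_value` path also explains the new `_test()` arguments: overrides arrive as strings from the CLI or environment, and `ast.literal_eval` turns them back into Python values before the type check against the json default. A standalone sketch of that coercion, under the same assumptions as the class above:

```python
import ast

def coerce(raw, default):
    """Parse a string override, keeping the default's Python type (sketch)."""
    try:
        value = ast.literal_eval(raw)  # e.g. "False" -> False, '{"A": 1}' -> dict
    except (ValueError, SyntaxError):
        value = raw                    # plain strings stay strings
    if not isinstance(value, type(default)):
        raise TypeError(f"type of override must be {type(default)}")
    return value

print(coerce("False", True))                    # False
print(coerce('{"NO_PROXY": "localhost"}', {}))  # {'NO_PROXY': 'localhost'}
```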

View File

@@ -11,7 +11,6 @@ import re
import time
from github import Github, GithubException, RateLimitExceededException, IncompletableObject
from github import UnknownObjectException
from github.PaginatedList import PaginatedList
from configs import Config
@@ -110,17 +109,13 @@ class GithubOrgApi:
def is_org_user(self, user):
"""Checks that user is a member of GitHub organization"""
if is_valid_user(user):
try:
membership = user.get_organization_membership(self.github_org)
# membership.role can be 'member' or 'admin'
if membership.state == 'active' and membership.role:
return True
except UnknownObjectException:
pass
# user.get_organization_membership(self.github_org) doesn't work with org members
# permissions, GITHUB_TOKEN must be org owner now
return self.github_org.has_in_members(user)
return False
def get_org_emails(self):
"""Gets and prints all emails of GitHub organization members"""
"""Gets and prints emails of all GitHub organization members"""
org_members = self.github_org.get_members()
org_emails = set()
org_members_fix = set()
@@ -146,7 +141,7 @@ class GithubOrgApi:
'; '.join(org_logins_fix_intel_email))
print(f'\nOrg members - no real name {len(org_emails_fix_name)}:',
'; '.join(org_emails_fix_name))
return org_emails
return (org_emails, org_logins_fix_intel_email)
def get_org_invitation_emails(self):
"""Gets GitHub organization teams prints info"""

236 .github/org_control/ldap_api.py vendored Normal file
View File

@@ -0,0 +1,236 @@
# Copyright (C) 2018-2021 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
"""
Gets info about users and groups via LDAP
"""
# pylint: disable=fixme,no-member
from enum import Enum
from ldap3 import Server, Connection, ALL, SUBTREE
from configs import Config
class LdapApiException(Exception):
"""Base LDAP API exception"""
class InfoLevel(Enum):
"""Constants for printing user info from LDAP"""
PDL = 'PDL' # Public Distribution List (group of e-mail addresses)
FULL = 'Full'
def print_user_info(info, info_level=None):
"""Pretty-print of a user info data structure (dict). info_level is the InfoLevel Enum"""
if not info or not info.get('mail'):
raise LdapApiException('ERROR: No info or absent mail')
def get_membership():
if info_level == InfoLevel.PDL:
membership_info = ' PDLs:'
elif info_level == InfoLevel.FULL:
membership_info = ' memberOf :'
else:
return ''
# Grouping groups by purpose
if info_level == InfoLevel.PDL:
sort_key = lambda i: i.split(',', 1)[0].lower()
else:
sort_key = lambda i: i.split(',', 1)[1] + i.split(',', 1)[0].lower()
for item in sorted(info['memberOf'], key=sort_key):
if info_level == InfoLevel.PDL and 'OU=Delegated' not in item:
continue
membership_info += f'\n {item}'
return membership_info
try:
text_info = \
f'\n{info["cn"]} <{info["mail"]}>; {info["sAMAccountName"]}; {info["employeeID"]}' \
f'\n Org group: {info["intelSuperGroupDescr"]} ({info["intelSuperGroupShortName"]}) /'\
f' {info["intelGroupDescr"]} ({info["intelGroupShortName"]}) /' \
f' {info["intelDivisionDescr"]} ({info["intelDivisionShortName"]}) /' \
f' {info["intelOrgUnitDescr"]}' \
f'\n Manager: {info["manager"]}' \
f'\n Location: {info["intelRegionCode"]} / {info["co"]} / {info["intelSiteCode"]} /' \
f' {info["intelBldgCode"]} ({info["intelSiteName"]}) /' \
f' {info["physicalDeliveryOfficeName"]}' \
f'\n Other: {info["employeeType"]} | {info["intelExportCountryGroup"]} |' \
f' {info["whenCreated"]} | {info["intelCostCenterDescr"]} | {info["jobDescription"]}'
except Exception as exc:
raise LdapApiException(f'ERROR: Failed to get info about "{info["mail"]}". ' \
f'Exception occurred:\n{repr(exc)}') from exc
print(text_info)
membership = get_membership()
if info_level == InfoLevel.PDL and membership:
print(membership)
elif info_level == InfoLevel.FULL:
for key in sorted(info):
if isinstance(info[key], list):
if key == 'memberOf':
print(membership)
else:
print(f' {key} :')
for item in info[key]:
print(' ', item)
else:
print(f' {key} : {info[key]}')
class LdapApi:
"""LDAP API for getting user info and emails"""
_binary_blobs = ['thumbnailPhoto', 'msExchUMSpokenName', 'msExchBlockedSendersHash']
_check_existing = [
'intelExportCountryGroup',
'physicalDeliveryOfficeName',
'intelSuperGroupShortName',
'intelGroupShortName',
'intelDivisionShortName',
]
null = '<null>'
def __init__(self):
self._cfg = Config()
self.server = Server(self._cfg.LDAP_SERVER, get_info=ALL)
self.connection = Connection(self.server,
user=self._cfg.LDAP_USER,
password=self._cfg.LDAP_PASSWORD,
auto_bind=True)
self.connection.bind()
def get_user_emails(self, groups=None):
"""Gets emails of LDAP groups and sub-groups"""
print('\nGet emails from LDAP groups:')
processed_ldap_members = {}
def process_group_members(member, parent_group):
if member in processed_ldap_members:
processed_ldap_members[member]['parent_groups'].append(parent_group)
print('\nWARNING: Ignore LDAP member to avoid duplication and recursive cycling '
f'of PDLs: {member}\n '
f'email: {processed_ldap_members[member].get("email")}\n parent_groups:')
for group in processed_ldap_members[member].get('parent_groups', []):
print(7 * ' ', group)
return
processed_ldap_members[member] = {'email': None, 'parent_groups': [parent_group]}
# AD moves terminated users to the boneyard OU in case the user returns,
# so it can be reactivated with little effort.
# After 30 days it is removed and the unix personality becomes unlinked.
if 'OU=Boneyard' in member:
return
self.connection.search(member, r'(objectClass=*)', SUBTREE,
attributes=['cn', 'member', 'mail'])
#print(self.connection.entries)
if not self.connection.response:
raise LdapApiException(f'ERROR: empty response. LDAP member: {member}')
# Check that the member is a worker.
# The response can contain several items, but only the first item is valid
if 'OU=Workers' in member:
if self.connection.response[0]['attributes']['mail']:
processed_ldap_members[member]['email'] = \
self.connection.response[0]['attributes']['mail'].lower()
return
raise LdapApiException(f'ERROR: no mail. LDAP worker: {member}\n'
f'{self.connection.entries}')
if len(self.connection.response) > 1:
raise LdapApiException(f'ERROR: multiple responses for {member}: '
f'{len(self.connection.response)}\n'
f'{self.connection.entries}')
if self.connection.response[0]['attributes']['member']:
for group_member in self.connection.response[0]['attributes']['member']:
process_group_members(group_member, member)
else:
print(f'\nERROR: no members in LDAP group: {member}\n{self.connection.entries}')
for group in groups or self._cfg.LDAP_PDLs:
print('\nProcess ROOT LDAP group:', group)
process_group_members(group, 'ROOT')
return {
member.get('email') for member in processed_ldap_members.values() if member.get('email')
}
def _get_user_info(self, query):
"""Gets user info from LDAP as dict matching key and values pairs from query"""
query_filter = ''.join(f'({key}={value})' for key, value in query.items())
for domain in self._cfg.LDAP_DOMAINS:
search_base = f'OU=Workers,DC={domain},DC=corp,DC=intel,DC=com'
self.connection.search(
search_base,
f'(&(objectcategory=person)(objectclass=user)(intelflags=1){query_filter})',
SUBTREE,
attributes=['*'])
if self.connection.response:
if len(self.connection.response) > 1:
raise LdapApiException(f'ERROR: multiple responses for {query_filter}: '
f'{len(self.connection.response)}\n'
f'{self.connection.entries}')
info = self.connection.response[0]['attributes']
# remove long binary blobs
for blob in LdapApi._binary_blobs:
info[blob] = b''
for key in LdapApi._check_existing:
if not info.get(key):
info[key] = LdapApi.null
return info
return {}
def get_user_info_by_idsid(self, idsid):
"""Gets user info from LDAP as dict using account name for searching"""
return self._get_user_info({'sAMAccountName': idsid})
def get_user_info_by_name(self, name):
"""Gets user info from LDAP as dict using common name for searching"""
return self._get_user_info({'cn': name})
def get_user_info_by_email(self, email):
"""Gets user info from LDAP as dict using emails for searching"""
return self._get_user_info({'mail': email})
def get_absent_emails(self, emails):
"""Checks users by email in LDAP and returns absent emails"""
absent_emails = set()
for email in emails:
if not self.get_user_info_by_email(email):
absent_emails.add(email)
return absent_emails
def _test():
"""Test and debug"""
ldap = LdapApi()
emails = ldap.get_user_emails()
print(f'\nLDAP emails count: {len(emails)}\n{"; ".join(emails)}')
emails = ['foo@intel.com']
for email in emails:
info = ldap.get_user_info_by_email(email)
if info:
print_user_info(info, InfoLevel.PDL)
else:
print(f'\n{email} - not found')
if __name__ == '__main__':
_test()
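The two `sort_key` lambdas in `get_membership()` above split an LDAP distinguished name on its first comma: the PDL view sorts by CN alone, while the FULL view groups entries by everything after the CN first. A tiny sketch with invented DNs:

```python
# Invented distinguished names, only to show what the sort keys see.
dns = [
    "CN=dev-list,OU=Delegated,DC=corp",
    "CN=Admins,OU=Groups,DC=corp",
]

pdl_key = lambda i: i.split(",", 1)[0].lower()                        # CN only
full_key = lambda i: i.split(",", 1)[1] + i.split(",", 1)[0].lower()  # OU/DC path first

print(sorted(dns, key=pdl_key)[0])    # CN=Admins,OU=Groups,DC=corp
print(sorted(dns, key=full_key)[0])   # CN=dev-list,OU=Delegated,DC=corp
```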

View File

@@ -0,0 +1 @@
pylint==2.5.3

View File

@@ -1 +1,2 @@
PyGithub==1.51
ldap3==2.7

View File

@@ -1 +0,0 @@
pylint==2.3.0

View File

@@ -3,6 +3,7 @@ on: [push, pull_request]
jobs:
Build_Doc:
if: github.repository == 'openvinotoolkit/openvino'
runs-on: ubuntu-20.04
steps:
- name: Clone OpenVINO
@@ -38,6 +39,7 @@ jobs:
working-directory: build
- name: 'Upload doc'
if: github.event_name == 'push'
uses: actions/upload-artifact@v2
with:
name: openvino_doc

17 .github/workflows/check_pr_commits.yml vendored Normal file
View File

@@ -0,0 +1,17 @@
name: PR Commits
on: [pull_request]
jobs:
Checks:
runs-on: ubuntu-20.04
steps:
- name: Clone OpenVINO
uses: actions/checkout@v2
- name: Install dependencies
run: python3 -m pip install -r ./.github/org_control/requirements.txt
- name: PR commits
run: python3 ./.github/org_control/check_pr.py --pr=${{ github.event.number }} --check-commits DRY_RUN
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

4 .gitmodules vendored
View File

@@ -18,3 +18,7 @@
path = thirdparty/xbyak
url = https://github.com/herumi/xbyak.git
ignore = dirty
[submodule "thirdparty/zlib/zlib"]
path = thirdparty/zlib/zlib
url = https://github.com/madler/zlib.git
ignore = dirty

View File

@@ -52,7 +52,6 @@ function(build_ngraph)
else ()
ngraph_set(NGRAPH_ADDRESS_SANITIZER OFF)
endif ()
ngraph_set(NGRAPH_PYTHON_BUILD_ENABLE OFF)
if(ENABLE_TESTS AND NOT ANDROID)
ngraph_set(NGRAPH_UNIT_TEST_ENABLE ON)
@@ -85,6 +84,12 @@ function(build_ngraph)
ngraph_set(NGRAPH_THREAD_SANITIZER_ENABLE OFF)
endif()
if(ENABLE_PYTHON)
ngraph_set(NGRAPH_PYTHON_BUILD_ENABLE ON)
else()
ngraph_set(NGRAPH_PYTHON_BUILD_ENABLE OFF)
endif()
if(CMAKE_CXX_COMPILER_ID MATCHES "^(Apple)?Clang$")
ie_add_compiler_flags(-Wno-error=uninitialized -Wno-error=literal-conversion)
elseif(UNIX)

View File

@@ -223,6 +223,7 @@ include(api_validator/api_validator)
include(vs_version/vs_version)
include(plugins/plugins)
include(add_ie_target)
include(CMakePackageConfigHelpers)
if(ENABLE_FUZZING)
enable_fuzzing()

View File

@@ -23,7 +23,7 @@ if (ENABLE_CLANG_FORMAT)
endif()
endif()
if(ENABLE_CLANG_FORMAT)
if(ENABLE_CLANG_FORMAT AND NOT TARGET clang_format_check_all)
add_custom_target(clang_format_check_all)
add_custom_target(clang_format_fix_all)
set_target_properties(clang_format_check_all clang_format_fix_all

View File

@@ -211,6 +211,16 @@ set(CMAKE_CXX_VISIBILITY_PRESET hidden)
set(CMAKE_C_VISIBILITY_PRESET hidden)
set(CMAKE_VISIBILITY_INLINES_HIDDEN ON)
function(ie_python_minimal_api target)
# pybind11 uses a lot of API which is not a part of minimal python API subset
# Ref 1: https://docs.python.org/3.11/c-api/stable.html
# Ref 2: https://github.com/pybind/pybind11/issues/1755
# target_compile_definitions(${target} PRIVATE Py_LIMITED_API=0x03090000)
# if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
# target_compile_options(${target} PRIVATE "-Wno-unused-variable")
# endif()
endfunction()
if(WIN32)
ie_add_compiler_flags(-D_CRT_SECURE_NO_WARNINGS -D_SCL_SECURE_NO_WARNINGS)
ie_add_compiler_flags(/EHsc) # no asynchronous structured exception handling

View File

@@ -3,15 +3,15 @@
#
if(ENABLE_CPPLINT)
find_package(Python3 COMPONENTS Interpreter)
find_package(PythonInterp 3 QUIET)
if(NOT Python3_Interpreter_FOUND)
if(NOT PYTHONINTERP_FOUND)
message(WARNING "Python3 interpreter was not found (required for cpplint check)")
set(ENABLE_CPPLINT OFF)
endif()
endif()
if(ENABLE_CPPLINT)
if(ENABLE_CPPLINT AND NOT TARGET cpplint_all)
add_custom_target(cpplint_all ALL)
set_target_properties(cpplint_all PROPERTIES FOLDER cpplint)
set(CPPLINT_ALL_OUTPUT_FILES "" CACHE INTERNAL "All cpplint output files")
@@ -68,6 +68,7 @@ function(add_cpplint_target TARGET_NAME)
"${output_file}"
COMMAND
"${CMAKE_COMMAND}"
-D "PYTHON_EXECUTABLE=${PYTHON_EXECUTABLE}"
-D "CPPLINT_SCRIPT=${IEDevScripts_DIR}/cpplint/cpplint.py"
-D "INPUT_FILE=${source_file}"
-D "OUTPUT_FILE=${output_file}"

View File

@@ -25,7 +25,7 @@ set(FILTER "${DEFAULT_FILTER}${CUSTOM_FILTER}")
execute_process(
COMMAND
python3
"${PYTHON_EXECUTABLE}"
"${CPPLINT_SCRIPT}"
"--linelength=160"
"--counting=detailed"

View File

@@ -75,11 +75,6 @@ macro(ie_parse_ci_build_number)
set(IE_VERSION "${IE_VERSION_MAJOR}.${IE_VERSION_MINOR}.${IE_VERSION_PATCH}")
endmacro()
# WA for DL Benchmark
if(DEFINED ENV{CI_BUILD_NUMBER} AND "$ENV{CI_BUILD_NUMBER}" STREQUAL "1")
unset(ENV{CI_BUILD_NUMBER})
endif()
if (DEFINED ENV{CI_BUILD_NUMBER})
set(CI_BUILD_NUMBER $ENV{CI_BUILD_NUMBER})
else()

View File

@@ -29,10 +29,14 @@ Usage: -DSELECTIVE_BUILD=ON -DSELECTIVE_BUILD_STAT=/path/*.csv" OFF
ie_option(ENABLE_ERROR_HIGHLIGHT "Highlight errors and warnings during compile time" OFF)
# Try to find python3
find_package(PythonLibs 3 QUIET)
ie_dependent_option (ENABLE_PYTHON "enables ie python bridge build" OFF "PYTHONLIBS_FOUND" OFF)
#
# enable or disable output from NGRAPH_DEBUG statements
#
if(NGRAPH_DEBUG_ENABLE)
add_definitions(-DNGRAPH_DEBUG_ENABLE)
endif()

View File

@@ -52,19 +52,11 @@ set(GST_DOCS_DIR "" CACHE PATH "Path to gst-video-analytics documentation")
function(build_docs)
find_package(Doxygen REQUIRED dot)
find_package(Python3 COMPONENTS Interpreter)
find_package(LATEX)
if(NOT DOXYGEN_FOUND)
message(FATAL_ERROR "Doxygen is required to build the documentation")
endif()
if(NOT Python3_FOUND)
message(FATAL_ERROR "Python3 is required to build the documentation")
endif()
find_package(PythonInterp 3 REQUIRED)
find_package(LATEX REQUIRED)
execute_process(
COMMAND ${Python3_EXECUTABLE} -m pip show lxml
COMMAND ${PYTHON_EXECUTABLE} -m pip show lxml
RESULT_VARIABLE PIP_EXIT_CODE
OUTPUT_QUIET
)
@@ -73,10 +65,6 @@ function(build_docs)
message(FATAL_ERROR "lxml package is not installed. Please use \"pip install lxml\".")
endif()
if(NOT LATEX_FOUND)
message(FATAL_ERROR "LATEX is required to build the documentation")
endif()
set(DOCS_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}")
set(DOXYGEN_DIR "${OpenVINO_MAIN_SOURCE_DIR}/docs/doxygen")
set(IE_SOURCE_DIR "${OpenVINO_MAIN_SOURCE_DIR}/inference-engine")
@@ -217,7 +205,7 @@ function(build_docs)
add_custom_command(TARGET py_api
PRE_BUILD
COMMAND ${Python3_EXECUTABLE} ${PYX_FILTER} ${PYTHON_API_OUT}
COMMAND ${PYTHON_EXECUTABLE} ${PYX_FILTER} ${PYTHON_API_OUT}
COMMENT "Pre-process Python API")
# Preprocess docs
@@ -317,8 +305,8 @@ function(build_docs)
add_custom_command(TARGET preprocess_docs
PRE_BUILD
${commands}
COMMAND ${Python3_EXECUTABLE} ${DOXY_LAYOUT_SCRIPT} --openvino ${OPENVINO_LAYOUT_BUILD}
COMMAND ${Python3_EXECUTABLE} ${DOXY_MD_FILTER} ${DOCS_BUILD_DIR}
COMMAND ${PYTHON_EXECUTABLE} ${DOXY_LAYOUT_SCRIPT} --openvino ${OPENVINO_LAYOUT_BUILD}
COMMAND ${PYTHON_EXECUTABLE} ${DOXY_MD_FILTER} ${DOCS_BUILD_DIR}
COMMENT "Pre-process markdown and image links")
# IE dev guide and C++ API
@@ -353,7 +341,7 @@ function(build_docs)
add_custom_command(TARGET openvino_docs
POST_BUILD
COMMAND ${Python3_EXECUTABLE} ${DOXY_LOG_SCRIPT} --log "${DOCS_BUILD_DIR}/ie_docs.log"
COMMAND ${PYTHON_EXECUTABLE} ${DOXY_LOG_SCRIPT} --log "${DOCS_BUILD_DIR}/ie_docs.log"
--include_omz $<BOOL:${OMZ_DOCS_DIR}>
--include_wb $<BOOL:${WORKBENCH_DOCS_DIR}>
--include_pot $<BOOL:${POT_DOCS_DIR}>
@@ -365,7 +353,7 @@ function(build_docs)
if(EXISTS "${LINKCHECKER_PY}")
add_custom_target(docs_check
COMMAND ${Python3_EXECUTABLE} "${LINKCHECKER_PY}" -v "${DOCS_BUILD_DIR}/html/"
COMMAND ${PYTHON_EXECUTABLE} "${LINKCHECKER_PY}" -v "${DOCS_BUILD_DIR}/html/"
COMMENT "Check links in generated documentation"
WORKING_DIRECTORY "${DOCS_BUILD_DIR}"
VERBATIM)

View File

@@ -51,7 +51,7 @@ Intel® Core™ i3-8121U Processor
Intel® GNA hardware requires a driver to be installed on the system.
* Linux\* OS:
[Download Intel® GNA driver for Ubuntu Linux 18.04.3 LTS (with HWE Kernel version 5.0+)](https://download.01.org/opencv/drivers/gna/)
[Download Intel® GNA driver for Ubuntu Linux 18.04.3 LTS (with HWE Kernel version 5.4+)](https://storage.openvinotoolkit.org/drivers/gna/)
* Windows\* OS:
Intel® GNA driver for Windows is available through Windows Update\*

View File

@@ -500,6 +500,7 @@ Standard ONNX\* operators:
| Sigmoid | No |
| Sign | No |
| Sin | No |
| Size | No |
| Slice | No |
| Softmax | No |
| Softplus | No |

View File

@@ -2,7 +2,7 @@
The Intel® Distribution of OpenVINO™ toolkit quickly deploys applications and solutions that emulate human vision. Based on Convolutional Neural Networks (CNN), the toolkit extends computer vision (CV) workloads across Intel® hardware, maximizing performance. The Intel® Distribution of OpenVINO™ toolkit includes the Intel® Deep Learning Deployment Toolkit.
This guide provides the steps for creating a Docker* image with Intel® Distribution of OpenVINO™ toolkit for Linux* and further installation.
This guide provides device specifics for a Docker* image creation with Intel® Distribution of OpenVINO™ toolkit for Linux* and its further usage.
## System Requirements
@@ -10,25 +10,31 @@ This guide provides the steps for creating a Docker* image with Intel® Distribu
- Ubuntu\* 18.04 long-term support (LTS), 64-bit
- Ubuntu\* 20.04 long-term support (LTS), 64-bit
- CentOS\* 7.6
- Red Hat* Enterprise Linux* 8.2 (64 bit)
- CentOS\* 7
- Red Hat\* Enterprise Linux* 8 (64 bit)
**Host Operating Systems**
- Linux with installed GPU driver and with Linux kernel supported by GPU driver
- Linux
## Prebuilt images
Prebuilt images are available on:
- [Docker Hub](https://hub.docker.com/u/openvino)
- [Red Hat* Quay.io](https://quay.io/organization/openvino)
- [Red Hat* Ecosystem Catalog](https://catalog.redhat.com/software/containers/intel/openvino-runtime/606ff4d7ecb5241699188fb3)
## Build a Docker* Image
You can use [available Dockerfiles](https://github.com/openvinotoolkit/docker_ci/tree/master/dockerfiles) or generate a Dockerfile with your setting via [DockerHub CI Framework](https://github.com/openvinotoolkit/docker_ci). The Framework can generate a Dockerfile, build, test, and deploy an image with the Intel® Distribution of OpenVINO™ toolkit.
You can also try our [Tutorials](https://github.com/openvinotoolkit/docker_ci/tree/master/docs/tutorials) which demonstrate the usage of OpenVINO™ Docker containers.
## Use Docker* Image for CPU
- The kernel reports the same information for all containers as for a native application, for example, CPU and memory information.
- All instructions available to the host process are also available to processes in the container, including, for example, AVX2 and AVX512. There are no restrictions.
- Docker* does not use virtualization or emulation. The process in Docker* is just a regular Linux process, but it is isolated from external world on kernel level. Performance penalty is small.
- Docker\* does not use virtualization or emulation. The process in Docker* is just a regular Linux process, but it is isolated from external world on kernel level. Performance penalty is small.
### <a name="building-for-cpu"></a>Build a Docker* Image for CPU
@@ -155,7 +161,6 @@ ARG BUILD_DEPENDENCIES="autoconf \
unzip \
udev"
# hadolint ignore=DL3031, DL3033
RUN yum update -y && yum install -y ${BUILD_DEPENDENCIES} && \
yum group install -y "Development Tools" && \
yum clean all && rm -rf /var/cache/yum
@@ -248,12 +253,14 @@ $HDDL_INSTALL_DIR/hddldaemon
```
### Run the Docker* Image for Intel® Vision Accelerator Design with Intel® Movidius™ VPUs
To run the built Docker* image for Intel® Vision Accelerator Design with Intel® Movidius™ VPUs, use the following command:
```sh
docker run -it --rm --device=/dev/ion:/dev/ion -v /var/tmp:/var/tmp <image_name>
```
> **NOTES**:
>
> - The device `/dev/ion` need to be shared to be able to use ion buffers among the plugin, `hddldaemon` and the kernel.
> - Since separate inference tasks share the same HDDL service communication interface (the service creates mutexes and a socket file in `/var/tmp`), `/var/tmp` needs to be mounted and shared among them.
@@ -262,6 +269,7 @@ In some cases, the ion driver is not enabled (for example, due to a newer kernel
docker run -it --rm --net=host -v /var/tmp:/var/tmp ipc=host <image_name>
```
> **NOTES**:
>
> - When building Docker images, create a user in the Dockerfile that has the same UID and GID as the user that runs hddldaemon on the host.
> - Run the application in the container as this user.
> - Alternatively, you can start hddldaemon with the root user on host, but this approach is not recommended.
@@ -310,10 +318,6 @@ If you got proxy issues, please setup proxy settings for Docker. See the Proxy s
* [DockerHub CI Framework](https://github.com/openvinotoolkit/docker_ci) for Intel® Distribution of OpenVINO™ toolkit. The Framework can generate a Dockerfile, build, test, and deploy an image with the Intel® Distribution of OpenVINO™ toolkit. You can reuse available Dockerfiles, add your layer and customize the image of OpenVINO™ for your needs.
* Intel® Distribution of OpenVINO™ toolkit home page: [https://software.intel.com/en-us/openvino-toolkit](https://software.intel.com/en-us/openvino-toolkit)
* OpenVINO™ toolkit documentation: [https://docs.openvinotoolkit.org](https://docs.openvinotoolkit.org)
* Intel® Neural Compute Stick 2 Get Started: [https://software.intel.com/en-us/neural-compute-stick/get-started](https://software.intel.com/en-us/neural-compute-stick/get-started)
* Intel® Distribution of OpenVINO™ toolkit Docker Hub* home page: [https://hub.docker.com/u/openvino](https://hub.docker.com/u/openvino)

View File

@@ -2,7 +2,7 @@
The Intel® Distribution of OpenVINO™ toolkit quickly deploys applications and solutions that emulate human vision. Based on Convolutional Neural Networks (CNN), the toolkit extends computer vision (CV) workloads across Intel® hardware, maximizing performance. The Intel® Distribution of OpenVINO™ toolkit includes the Intel® Deep Learning Deployment Toolkit.
This guide provides the steps for creating a Docker* image with Intel® Distribution of OpenVINO™ toolkit for Windows* and further installation.
This guide provides device specifics for a Docker* image creation with Intel® Distribution of OpenVINO™ toolkit for Windows* and its further usage.
## System Requirements
@@ -13,19 +13,22 @@ This guide provides the steps for creating a Docker* image with Intel® Distribu
**Host Operating Systems**
- Windows 10*, 64-bit Pro, Enterprise or Education (1607 Anniversary Update, Build 14393 or later) editions
- Windows Server* 2016 or higher
## Prebuilt Images
Prebuilt images are available on [Docker Hub](https://hub.docker.com/u/openvino).
## Build a Docker* Image for CPU
## Build a Docker* Image
You can use [available Dockerfiles](https://github.com/openvinotoolkit/docker_ci/tree/master/dockerfiles) or generate a Dockerfile with your setting via [DockerHub CI Framework](https://github.com/openvinotoolkit/docker_ci) for Intel® Distribution of OpenVINO™ toolkit.
The Framework can generate a Dockerfile, build, test, and deploy an image with the Intel® Distribution of OpenVINO™ toolkit.
You can use [available Dockerfiles](https://github.com/openvinotoolkit/docker_ci/tree/master/dockerfiles) or generate a Dockerfile with your setting via [DockerHub CI Framework](https://github.com/openvinotoolkit/docker_ci). The Framework can generate a Dockerfile, build, test, and deploy an image with the Intel® Distribution of OpenVINO™ toolkit.
## Build and Run the Docker* Image for CPU
## Install Additional Dependencies
### Install CMake
To add CMake to the image, add the following commands to the Dockerfile:
~~~
RUN powershell.exe -Command `
@@ -42,6 +45,7 @@ docker build . -t <image_name> `
~~~
### Install Microsoft Visual Studio* Build Tools
You can add Microsoft Visual Studio Build Tools* to a Windows* OS Docker image using either the offline installer for Build Tools
(follow the [Instruction for the offline installer](https://docs.microsoft.com/en-us/visualstudio/install/create-an-offline-installation-of-visual-studio?view=vs-2019)) or
the online installer for Build Tools (follow the [Instruction for the online installer](https://docs.microsoft.com/en-us/visualstudio/install/build-tools-container?view=vs-2019)).
@@ -79,6 +83,7 @@ docker run -itu ContainerAdministrator --rm <image_name> cmd /S /C "cd deploymen
## Build and Run the Docker* Image for GPU
The GPU acceleration feature in Windows containers requires that the Windows host, OpenVINO toolkit, and Docker* meet the following requirements:
* [Windows requirements](https://docs.microsoft.com/en-us/virtualization/windowscontainers/deploy-containers/gpu-acceleration):
* The container host must be running Windows Server 2019 or Windows 10 of version 1809 or higher.
* The container base image must be `mcr.microsoft.com/windows:1809` or higher. Windows Server Core and Nano Server container images are not currently supported.
@@ -142,8 +147,4 @@ If you got proxy issues, please setup proxy settings for Docker. See the Proxy s
* [DockerHub CI Framework](https://github.com/openvinotoolkit/docker_ci) for Intel® Distribution of OpenVINO™ toolkit. The Framework can generate a Dockerfile, build, test, and deploy an image with the Intel® Distribution of OpenVINO™ toolkit. You can reuse available Dockerfiles, add your layer and customize the image of OpenVINO™ for your needs.
* Intel® Distribution of OpenVINO™ toolkit home page: [https://software.intel.com/en-us/openvino-toolkit](https://software.intel.com/en-us/openvino-toolkit)
* OpenVINO™ toolkit documentation: [https://docs.openvinotoolkit.org](https://docs.openvinotoolkit.org)
* Intel® Distribution of OpenVINO™ toolkit Docker Hub* home page: [https://hub.docker.com/u/openvino](https://hub.docker.com/u/openvino)

View File

@@ -7,31 +7,35 @@
**Short description**: *NonZero* returns the indices of the non-zero elements of the input tensor.
**Detailed description**: *NonZero* returns the indices of the non-zero elements of the input tensor (in row-major order - by dimension).
The output tensor has shape `[rank(input), num_non_zero]`. For example, for the tensor `[[1, 0], [1, 1]]` the output will be `[[0, 1, 1], [0, 0, 1]]`.
* The output tensor has shape `[rank(input), num_non_zero]`.
* For example, for the tensor `[[1, 0], [1, 1]]` the output will be `[[0, 1, 1], [0, 0, 1]]`.
* The output is a collection of tuples, each tuple has `rank(input)` elements and contains indices for a single non-zero element.
* The `i`'th element of each output dimension is a part of the `i`'th tuple.
* In the given example the tuples would be: `[0, 0]`, `[1, 0]`, `[1, 1]` (see the NumPy sketch below).
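The example above can be reproduced with NumPy's `nonzero`, which uses the same row-major index layout (a sketch for illustration, not part of the operation's normative definition):

```python
import numpy as np

data = np.array([[1, 0], [1, 1]])
indices = np.nonzero(data)   # tuple with one index array per dimension
print(np.stack(indices))     # [[0 1 1]
                             #  [0 0 1]] -> shape [rank(input), num_non_zero]
```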
**Attributes**
* *output_type*
* **Description**: the output tensor type
* **Range of values**: "i64" or "i32"
* **Range of values**: `i64` or `i32`
* **Type**: string
* **Default value**: "i64"
* **Required**: *No*
**Inputs**:
* **1**: `data` tensor of arbitrary rank of type *T*. Required.
* **1**: A tensor of type *T* and arbitrary shape. **Required**.
**Outputs**:
* **1**: tensor with indices of non-zero elements of shape `[rank(data), num_non_zero]` of type *T_IND*.
* **1**: tensor with indices of non-zero elements of shape `[rank(data), num_non_zero]` of type *T_OUT*.
**Types**
* *T*: any type.
* *T_IND*: `int64` or `int32`.
* *T_OUT*: Depending on *output_type* attribute can be `int64` or `int32`.
**Example**
@@ -53,4 +57,4 @@ The output tensor has shape `[rank(input), num_non_zero]`. For example, for the
</port>
</output>
</layer>
```
```

View File

@@ -6,7 +6,7 @@
**Short description**: *RegionYolo* computes the coordinates of regions with probability for each class.
**Detailed description**: This operation is directly mapped to the original YOLO layer. [Reference](https://arxiv.org/pdf/1612.08242.pdf)
**Detailed description**: This operation is directly mapped to the [YOLO9000: Better, Faster, Stronger](https://arxiv.org/pdf/1612.08242.pdf) paper.
**Attributes**:
@@ -78,14 +78,17 @@
**Inputs**:
* **1**: `data` - 4D input tensor with floating point elements and shape `[N, C, H, W]`. Required.
* **1**: `data` - 4D tensor of type `T` and shape `[N, C, H, W]`. **Required.**
**Outputs**:
* **1**: output tensor of rank 4 or less that codes detected regions. Refer to the original YOLO paper to decode the output as boxes. `anchors` should be used to decode real box coordinates. If `do_softmax` is set to 0, then the output shape is `[N, (classes + coords + 1)*len(mask), H, W]`. If `do_softmax` is set to 1, then output shape is partially flattened and defined in the following way:
* **1**: tensor of type `T` and rank 4 or less that codes detected regions. Refer to the [YOLO9000: Better, Faster, Stronger](https://arxiv.org/pdf/1612.08242.pdf) paper to decode the output as boxes. `anchors` should be used to decode real box coordinates. If `do_softmax` is set to `0`, then the output shape is `[N, (classes + coords + 1) * len(mask), H, W]`. If `do_softmax` is set to `1`, then output shape is partially flattened and defined in the following way:
flat_dim = data.shape[axis] * data.shape[axis+1] * ... * data.shape[end_axis]
output.shape = [data.shape[0], ..., data.shape[axis-1], flat_dim, data.shape[end_axis + 1], ...]
`flat_dim = data.shape[axis] * data.shape[axis+1] * ... * data.shape[end_axis]`
`output.shape = [data.shape[0], ..., data.shape[axis-1], flat_dim, data.shape[end_axis + 1], ...]`
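As a worked example of the `do_softmax` = 1 flattening (invented shape and attribute values, shown only to illustrate the formula above):

```python
from functools import reduce

data_shape = [1, 255, 26, 26]   # invented N, C, H, W
axis, end_axis = 1, 3           # invented attribute values

flat_dim = reduce(lambda a, b: a * b, data_shape[axis:end_axis + 1])
output_shape = data_shape[:axis] + [flat_dim] + data_shape[end_axis + 1:]
print(output_shape)             # [1, 172380]
```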
**Types**
* *T*: any supported floating point type.
**Example**

View File

@@ -21,7 +21,7 @@ function(ie_developer_export)
set(all_dev_targets gflags ie_libraries)
foreach(component IN LISTS openvino_export_components)
export(TARGETS ${${component}} NAMESPACE IE::
APPEND FILE "${CMAKE_BINARY_DIR}/${component}_dev_targets.cmake")
list(APPEND all_dev_targets ${${component}})
endforeach()
@@ -72,6 +72,18 @@ endif()
ie_cpack_add_component(cpp_samples DEPENDS core)
install(DIRECTORY ../thirdparty/zlib
DESTINATION ${IE_CPACK_IE_DIR}/samples/cpp/thirdparty
COMPONENT cpp_samples
USE_SOURCE_PERMISSIONS
PATTERN .clang-format EXCLUDE)
install(DIRECTORY ../thirdparty/cnpy
DESTINATION ${IE_CPACK_IE_DIR}/samples/cpp/thirdparty
COMPONENT cpp_samples
USE_SOURCE_PERMISSIONS
PATTERN .clang-format EXCLUDE)
if(UNIX)
install(DIRECTORY samples/
DESTINATION ${IE_CPACK_IE_DIR}/samples/cpp
@@ -131,24 +143,30 @@ endif()
#
openvino_developer_export_targets(COMPONENT openvino_common TARGETS format_reader gflags ie_samples_utils)
openvino_developer_export_targets(COMPONENT ngraph TARGETS ${NGRAPH_LIBRARIES})
# for Template plugin
if(NGRAPH_INTERPRETER_ENABLE)
openvino_developer_export_targets(COMPONENT ngraph TARGETS ngraph_backend interpreter_backend)
endif()
ie_developer_export()
function(ie_generate_dev_package_config)
# dummy check that OpenCV is here
find_package(OpenCV QUIET)
configure_file(
"${IE_MAIN_SOURCE_DIR}/cmake/templates/InferenceEngineDeveloperPackageConfig.cmake.in"
"${CMAKE_BINARY_DIR}/InferenceEngineDeveloperPackageConfig.cmake"
@ONLY)
ie_developer_export()
configure_file(
"${IE_MAIN_SOURCE_DIR}/cmake/templates/InferenceEngineConfig-version.cmake.in"
"${CMAKE_BINARY_DIR}/InferenceEngineDeveloperPackageConfig-version.cmake"
@ONLY)
configure_package_config_file("${InferenceEngine_SOURCE_DIR}/cmake/templates/InferenceEngineDeveloperPackageConfig.cmake.in"
"${CMAKE_BINARY_DIR}/InferenceEngineDeveloperPackageConfig.cmake"
INSTALL_DESTINATION share # not used
PATH_VARS "OpenVINO_MAIN_SOURCE_DIR;IE_MAIN_SOURCE_DIR;gflags_BINARY_DIR"
NO_CHECK_REQUIRED_COMPONENTS_MACRO)
configure_file("${IE_MAIN_SOURCE_DIR}/cmake/templates/InferenceEngineConfig-version.cmake.in"
"${CMAKE_BINARY_DIR}/InferenceEngineDeveloperPackageConfig-version.cmake"
@ONLY)
endfunction()
ie_generate_dev_package_config()
#
# Coverage
@@ -163,6 +181,10 @@ endif()
#
function(register_extra_modules)
# post export
ie_developer_export_targets(inference_engine)
openvino_developer_export_targets(COMPONENT ngraph TARGETS ${NGRAPH_LIBRARIES})
set(InferenceEngineDeveloperPackage_DIR "${CMAKE_CURRENT_BINARY_DIR}/build-modules")
function(generate_fake_dev_package)

View File

@@ -66,8 +66,6 @@ ie_dependent_option (ENABLE_SPEECH_DEMO "enable speech demo integration" ON "NOT
ie_option (ENABLE_OPENCV "enables OpenCV" ON)
ie_option (ENABLE_PYTHON "enables ie python bridge build" OFF)
ie_option (ENABLE_V7_SERIALIZE "enables serialization to IR v7" OFF)
set(IE_EXTRA_MODULES "" CACHE STRING "Extra paths for extra modules to include into OpenVINO build")

View File

@@ -1,4 +1,4 @@
# Copyright (C) 2018-2020 Intel Corporation
# Copyright (C) 2018-2021 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
#

View File

@@ -2,9 +2,13 @@
# SPDX-License-Identifier: Apache-2.0
#
@PACKAGE_INIT@
include(CMakeFindDependencyMacro)
# TODO: remove after changing [private plugins]
set(OpenVINO_MAIN_SOURCE_DIR "@OpenVINO_MAIN_SOURCE_DIR@") # KMB
set(IE_MAIN_SOURCE_DIR "@IE_MAIN_SOURCE_DIR@") # HDDL
set_and_check(OpenVINO_MAIN_SOURCE_DIR "@OpenVINO_MAIN_SOURCE_DIR@") # KMB
set_and_check(IE_MAIN_SOURCE_DIR "@IE_MAIN_SOURCE_DIR@") # HDDL
# Variables to export in plugin's projects
@@ -22,32 +26,46 @@ endforeach()
message("")
# for samples in 3rd party projects
set(gflags_DIR "@gflags_BINARY_DIR@")
set_and_check(gflags_DIR "@gflags_BINARY_DIR@")
# Targets
#
# Content
#
if(USE_SYSTEM_PUGIXML)
find_package(PugiXML REQUIRED)
set_property(TARGET pugixml PROPERTY IMPORTED_GLOBAL TRUE)
endif()
find_dependency(IEDevScripts
PATHS "${OpenVINO_MAIN_SOURCE_DIR}/cmake/developer_package"
NO_CMAKE_FIND_ROOT_PATH
NO_DEFAULT_PATH)
find_dependency(InferenceEngine
PATHS "${CMAKE_CURRENT_LIST_DIR}"
NO_CMAKE_FIND_ROOT_PATH
NO_DEFAULT_PATH)
# WA for cmake: it exports ngraph as IE::ngraph in the IE export list
# while we already have ngraph export in its own export list as ngraph::ngraph
set_property(TARGET ngraph::ngraph PROPERTY IMPORTED_GLOBAL TRUE)
add_library(IE::ngraph ALIAS ngraph::ngraph)
foreach(component @openvino_export_components@)
include("${CMAKE_CURRENT_LIST_DIR}/${component}_dev_targets.cmake")
endforeach()
set(InferenceEngine_LIBRARIES IE::inference_engine)
if(USE_SYSTEM_PUGIXML)
find_dependency(PugiXML)
set_property(TARGET pugixml PROPERTY IMPORTED_GLOBAL TRUE)
endif()
# inherit OpenCV from main IE project if enabled
if ("@OpenCV_FOUND@")
load_cache("${cache_path}" READ_WITH_PREFIX "" OpenCV_DIR)
find_dependency(OpenCV)
endif()
#
# Common cmake includes
# Extra Compile Flags
#
# Inference Engine Developer Scripts package
find_package(IEDevScripts REQUIRED
PATHS "@OpenVINO_MAIN_SOURCE_DIR@/cmake/developer_package"
NO_CMAKE_FIND_ROOT_PATH
NO_DEFAULT_PATH)
if(NOT MSVC)
ie_add_compiler_flags(-Wno-error=unused-variable)
if(CMAKE_COMPILER_IS_GNUCXX)
@@ -57,15 +75,3 @@ endif()
# Don't treat deprecated API warnings as errors in 3rd party apps
ie_deprecated_no_errors()
# inherit OpenCV from main IE project if enabled
if (ENABLE_OPENCV)
load_cache("${cache_path}" READ_WITH_PREFIX "" OpenCV_DIR)
find_package(OpenCV)
endif()
# inherit TBB from main IE project if enabled
if (THREADING STREQUAL "TBB" OR THREADING STREQUAL "TBB_AUTO")
load_cache("${cache_path}" READ_WITH_PREFIX "" TBB_DIR)
find_package(TBB)
endif()

View File

@@ -8,18 +8,6 @@ cmake_minimum_required (VERSION 3.13)
# Set the project name
project (ie_python_api)
option(ENABLE_CONDA_FOLDER "Create output folder with conda python bindings" OFF)
option(ENABLE_WHEEL "Create wheel package" OFF)
set(PYTHON_BRIDGE_CPACK_PATH "python")
string(TOLOWER ${CMAKE_SYSTEM_PROCESSOR} ARCH)
if(ARCH STREQUAL "x86_64" OR ARCH STREQUAL "amd64") # Windows detects Intel's 64-bit CPU as AMD64
set(ARCH intel64)
elseif(ARCH STREQUAL "i386")
set(ARCH ia32)
endif()
if(DEFINED IE_MAIN_SOURCE_DIR)
set(InferenceEngine_LIBRARIES inference_engine)
else()
@@ -27,6 +15,12 @@ else()
set(InferenceEngine_LIBRARIES IE::inference_engine)
endif()
option(ENABLE_CONDA_FOLDER "Create output folder with conda python bindings" OFF)
cmake_dependent_option(ENABLE_WHEEL "Create wheel package" OFF
"PYTHONINTERP_FOUND;NOT CMAKE_SOURCE_DIR STREQUAL ie_python_api_SOURCE_DIR" OFF)
set(PYTHON_BRIDGE_CPACK_PATH "python")
if(UNIX)
# cython generated files require public visibility. Force the required visibility.
set(CMAKE_CXX_VISIBILITY_PRESET default)
@@ -35,10 +29,17 @@ endif()
include (cmake/UseCython.cmake)
if(PYTHONINTERP_FOUND)
set(PYTHON_VERSION python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR})
# Check Cython version
if(CYTHON_VERSION VERSION_LESS "0.29")
message(FATAL_ERROR "OpenVINO Python API needs at least Cython version 0.29, found version ${CYTHON_VERSION}")
else()
message(FATAL_ERROR "Python Interpretator was not found!")
message(STATUS "Found Cython version ${CYTHON_VERSION}")
endif()
if(PYTHONLIBS_VERSION_STRING MATCHES "^([0-9]+)\.([0-9]+).*")
set(PYTHON_VERSION python${CMAKE_MATCH_1}.${CMAKE_MATCH_2})
else()
message(FATAL_ERROR "Failed to extract python major.minor from ${PYTHONLIBS_VERSION_STRING}")
endif()
if(ENABLE_CONDA_FOLDER)
@@ -67,13 +68,6 @@ if(ENABLE_WHEEL)
add_subdirectory(wheel)
endif()
# Check Cython version
if(CYTHON_VERSION VERSION_LESS "0.29")
message(FATAL_ERROR "OpenVINO Python API needs at least Cython version 0.29, found version ${CYTHON_VERSION}")
else()
message(STATUS "Found Cython version ${CYTHON_VERSION}")
endif()
# install
ie_cpack_add_component(${PYTHON_VERSION})

View File

@@ -29,15 +29,16 @@
# See also UseCython.cmake
# Use the Cython executable that lives next to the Python executable
# if it is a local installation.
find_package( PythonInterp )
find_package(PythonInterp 3 QUIET)
if( PYTHONINTERP_FOUND )
get_filename_component( _python_path ${PYTHON_EXECUTABLE} PATH )
find_program( CYTHON_EXECUTABLE
find_host_program( CYTHON_EXECUTABLE
NAMES cython cython.bat cython3
HINTS ${_python_path} $ENV{HOME}/.local/bin
)
else()
find_program( CYTHON_EXECUTABLE
find_host_program( CYTHON_EXECUTABLE
NAMES cython cython.bat cython3
)
endif()

View File

@@ -13,10 +13,6 @@
#
# cython_add_module( <module_name> <src1> <src2> ... <srcN> )
#
# To create a standalone executable, the function
#
# cython_add_standalone_executable( <executable_name> [MAIN_MODULE src1] <src1> <src2> ... <srcN> )
#
# To avoid dependence on Python, set the PYTHON_LIBRARY cache variable to point
# to a static library. If a MAIN_MODULE source is specified,
# the "if __name__ == '__main__':" from that module is used as the C main() method
@@ -92,7 +88,7 @@ find_package( Cython REQUIRED
PATHS "${CMAKE_CURRENT_SOURCE_DIR}/cmake"
NO_CMAKE_FIND_ROOT_PATH
NO_DEFAULT_PATH )
find_package( PythonLibs REQUIRED )
find_package(PythonLibs 3 REQUIRED)
set( CYTHON_CXX_EXTENSION "cxx" )
set( CYTHON_C_EXTENSION "c" )
@@ -239,9 +235,7 @@ function( compile_pyx _name generated_file )
set( cython_debug_arg "--gdb" )
endif()
if( "${PYTHONLIBS_VERSION_STRING}" MATCHES "^2." )
set( version_arg "-2" )
elseif( "${PYTHONLIBS_VERSION_STRING}" MATCHES "^3." )
if( "${PYTHONLIBS_VERSION_STRING}" MATCHES "^3." )
set( version_arg "-3" )
else()
set( version_arg )
@@ -292,48 +286,12 @@ function( cython_add_module _name )
endif()
endforeach()
compile_pyx( ${_name} generated_file ${pyx_module_sources} )
include_directories( ${PYTHON_INCLUDE_DIRS} )
python_add_module( ${_name} ${generated_file} ${other_module_sources} )
python_add_module ( ${_name} ${generated_file} ${other_module_sources} )
target_include_directories( ${_name} PRIVATE ${PYTHON_INCLUDE_DIRS})
# set_target_properties(${_name} PROPERTIES PREFIX "" SUFFIX "${PYTHON_MODULE_EXTENSION}")
if( APPLE )
set_target_properties( ${_name} PROPERTIES LINK_FLAGS "-undefined dynamic_lookup" )
else()
target_link_libraries( ${_name} PRIVATE ${PYTHON_LIBRARIES} )
endif()
endfunction()
include( CMakeParseArguments )
# cython_add_standalone_executable( _name [MAIN_MODULE src3.py] src1 src2 ... srcN )
# Creates a standalone executable the given sources.
function( cython_add_standalone_executable _name )
set( pyx_module_sources "" )
set( other_module_sources "" )
set( main_module "" )
cmake_parse_arguments( cython_arguments "" "MAIN_MODULE" "" ${ARGN} )
include_directories( ${PYTHON_INCLUDE_DIRS} )
foreach( _file ${cython_arguments_UNPARSED_ARGUMENTS} )
if( ${_file} MATCHES ".*\\.py[x]?$" )
get_filename_component( _file_we ${_file} NAME_WE )
if( "${_file_we}" STREQUAL "${_name}" )
set( main_module "${_file}" )
elseif( NOT "${_file}" STREQUAL "${cython_arguments_MAIN_MODULE}" )
set( PYTHON_MODULE_${_file_we}_static_BUILD_SHARED OFF )
compile_pyx( "${_file_we}_static" generated_file "${_file}" )
list( APPEND pyx_module_sources "${generated_file}" )
endif()
else()
list( APPEND other_module_sources ${_file} )
endif()
endforeach()
if( cython_arguments_MAIN_MODULE )
set( main_module ${cython_arguments_MAIN_MODULE} )
endif()
if( NOT main_module )
message( FATAL_ERROR "main module not found." )
endif()
get_filename_component( main_module_we "${main_module}" NAME_WE )
set( CYTHON_FLAGS ${CYTHON_FLAGS} --embed )
compile_pyx( "${main_module_we}_static" generated_file ${main_module} )
add_executable( ${_name} ${generated_file} ${pyx_module_sources} ${other_module_sources} )
target_link_libraries( ${_name} PRIVATE ${PYTHON_LIBRARIES} ${pyx_module_libs} )
endfunction()

View File

@@ -1 +1 @@
numpy~=1.19.5
numpy>=1.16.6,<1.20
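As an aside, the widened pin accepts any NumPy release from 1.16.6 up to, but excluding, 1.20. A minimal sketch for checking an installed version against that range (assuming the `packaging` helper library is available):

```python
# Sketch: confirm the installed NumPy satisfies the requirement pinned above.
from packaging.specifiers import SpecifierSet
import numpy

spec = SpecifierSet(">=1.16.6,<1.20")
assert numpy.__version__ in spec, f"unsupported numpy {numpy.__version__}"
```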

View File

@@ -1,2 +1,2 @@
opencv-python==4.5.*
numpy~=1.19.5
numpy>=1.16.6,<1.20

View File

@@ -9,29 +9,29 @@ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PYTHON_BRIDGE_OUTPUT_DIRECTORY}/inference_e
set(CMAKE_COMPILE_PDB_OUTPUT_DIRECTORY ${PYTHON_BRIDGE_OUTPUT_DIRECTORY}/inference_engine)
set(CMAKE_PDB_OUTPUT_DIRECTORY ${PYTHON_BRIDGE_OUTPUT_DIRECTORY}/inference_engine)
file(GLOB SOURCE
${CMAKE_CURRENT_SOURCE_DIR}/ie_api.pyx
${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
${CMAKE_CURRENT_SOURCE_DIR}/*.hpp)
file(GLOB SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/ie_api.pyx
${CMAKE_CURRENT_SOURCE_DIR}/*.pxd
${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
${CMAKE_CURRENT_SOURCE_DIR}/*.hpp)
set_source_files_properties(${SOURCE} PROPERTIES CYTHON_IS_CXX ON)
file(GLOB PYX_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/*.pyx)
set_source_files_properties(${PYX_SOURCES} PROPERTIES CYTHON_IS_CXX ON)
# create target
cython_add_module(${TARGET_NAME} ${SOURCE})
cython_add_module(${TARGET_NAME} ${SOURCES})
set(INSTALLED_TARGETS ${TARGET_NAME})
file(GLOB OTHER_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/*.pyx)
list(REMOVE_ITEM OTHER_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/ie_api.pyx")
list(REMOVE_ITEM PYX_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/ie_api.pyx")
foreach(PYX_FILE ${OTHER_SOURCES})
foreach(PYX_FILE IN LISTS PYX_SOURCES)
get_filename_component(PYX_NAME "${PYX_FILE}" NAME_WE)
set_source_files_properties(${PYX_FILE} PROPERTIES CYTHON_IS_CXX ON)
cython_add_module(${PYX_NAME} ${PYX_FILE})
add_dependencies(${TARGET_NAME} ${PYX_NAME})
target_include_directories(${PYX_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}")
target_link_libraries(${PYX_NAME} PRIVATE ${InferenceEngine_LIBRARIES})
list(APPEND INSTALLED_TARGETS ${PYX_NAME})
ie_python_minimal_api(${PYX_NAME})
endforeach()
if(COMMAND ie_add_vs_version_file)
@@ -48,6 +48,7 @@ function(python_disable_deprecated_warnings)
endfunction()
python_disable_deprecated_warnings()
ie_python_minimal_api(${TARGET_NAME})
target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}")
target_link_libraries(${TARGET_NAME} PRIVATE ${InferenceEngine_LIBRARIES})
@@ -60,7 +61,7 @@ endif()
# perform copy
add_custom_command(TARGET ${TARGET_NAME}
POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy ${PYTHON_BRIDGE_SRC_ROOT}/src/openvino/inference_engine/__init__.py ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/__init__.py
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/__init__.py ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/__init__.py
COMMAND ${CMAKE_COMMAND} -E copy ${PYTHON_BRIDGE_SRC_ROOT}/requirements.txt ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/../../requirements.txt
COMMAND ${CMAKE_COMMAND} -E copy ${PYTHON_BRIDGE_SRC_ROOT}/requirements.txt ${PYTHON_BRIDGE_OUTPUT_DIRECTORY}/../../requirements.txt
COMMAND ${CMAKE_COMMAND} -E copy ${PYTHON_BRIDGE_SRC_ROOT}/src/openvino/__init__.py ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/../__init__.py

View File

@@ -9,24 +9,22 @@ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PYTHON_BRIDGE_OUTPUT_DIRECTORY}/offline_tra
set(CMAKE_COMPILE_PDB_OUTPUT_DIRECTORY ${PYTHON_BRIDGE_OUTPUT_DIRECTORY}/offline_transformations)
set(CMAKE_PDB_OUTPUT_DIRECTORY ${PYTHON_BRIDGE_OUTPUT_DIRECTORY}/offline_transformations)
file(GLOB SOURCE
${CMAKE_CURRENT_SOURCE_DIR}/offline_transformations_api.pyx
${CMAKE_CURRENT_SOURCE_DIR}/offline_transformations_api_impl.cpp
${CMAKE_CURRENT_SOURCE_DIR}/*.hpp)
set(SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/offline_transformations_api_impl_defs.pxd
${CMAKE_CURRENT_SOURCE_DIR}/offline_transformations_api.pyx
${CMAKE_CURRENT_SOURCE_DIR}/offline_transformations_api_impl.hpp
${CMAKE_CURRENT_SOURCE_DIR}/offline_transformations_api_impl.cpp)
set_source_files_properties(${SOURCE} PROPERTIES CYTHON_IS_CXX ON)
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/offline_transformations_api.pyx
PROPERTIES CYTHON_IS_CXX ON)
# create target
cython_add_module(${TARGET_NAME} ${SOURCE})
set(INSTALLED_TARGETS ${TARGET_NAME})
cython_add_module(${TARGET_NAME} ${SOURCES})
add_dependencies(${TARGET_NAME} ie_api)
if(COMMAND ie_add_vs_version_file)
foreach(target IN LISTS INSTALLED_TARGETS)
ie_add_vs_version_file(NAME ${target}
FILEDESCRIPTION "Offline Transformatoins Python library")
endforeach()
ie_add_vs_version_file(NAME ${TARGET_NAME}
FILEDESCRIPTION "Offline Transformatoins Python library")
endif()
if(TARGET offline_transformations)
@@ -44,6 +42,8 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
target_compile_options(${TARGET_NAME} PRIVATE "-Wno-error=register")
endif()
add_clang_format_target(${TARGET_NAME}_clang FOR_TARGETS ${TARGET_NAME})
# perform copy
add_custom_command(TARGET ${TARGET_NAME}
POST_BUILD
@@ -52,12 +52,13 @@ add_custom_command(TARGET ${TARGET_NAME}
# install
install(TARGETS ${INSTALLED_TARGETS}
# TODO: use ${PYTHON_VERSION}_dev component below
# ie_cpack_add_component(${PYTHON_VERSION}_dev DEPENDS ${PYTHON_VERSION})
install(TARGETS ${TARGET_NAME}
RUNTIME DESTINATION python/${PYTHON_VERSION}/openvino/offline_transformations COMPONENT ${PYTHON_VERSION}
LIBRARY DESTINATION python/${PYTHON_VERSION}/openvino/offline_transformations COMPONENT ${PYTHON_VERSION})
install(PROGRAMS __init__.py
DESTINATION python/${PYTHON_VERSION}/openvino/offline_transformations
COMPONENT ${PYTHON_VERSION})
add_clang_format_target(${TARGET_NAME}_clang FOR_TARGETS ${TARGET_NAME})

View File

@@ -17,8 +17,8 @@ def ApplyPOTTransformations(IENetwork network, string device):
C.ApplyPOTTransformations(network.impl, device)
def ApplyLowLatencyTransformation(IENetwork network, int64_t num_iterations=1):
C.ApplyLowLatencyTransformation(network.impl, num_iterations)
def ApplyLowLatencyTransformation(IENetwork network, bool use_const_initializer = True):
C.ApplyLowLatencyTransformation(network.impl, use_const_initializer)
def ApplyPruningTransformation(IENetwork network):
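For context, a minimal usage sketch of the changed signature; the import path follows this branch's package layout and the model file names are placeholders:

```python
# Hypothetical usage of the updated low-latency transformation; the
# num_iterations argument is gone, and the boolean now controls whether
# ReadValue operations get Constant initializers.
from openvino.inference_engine import IECore
from openvino.offline_transformations import ApplyLowLatencyTransformation

ie = IECore()
net = ie.read_network(model="model.xml", weights="model.bin")  # placeholder paths
ApplyLowLatencyTransformation(net, use_const_initializer=True)
```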

View File

@@ -26,16 +26,9 @@ void InferenceEnginePython::ApplyPOTTransformations(InferenceEnginePython::IENet
manager.run_passes(network.actual->getFunction());
}
void InferenceEnginePython::ApplyLowLatencyTransformation(InferenceEnginePython::IENetwork network, int64_t num_iterations) {
void InferenceEnginePython::ApplyLowLatencyTransformation(InferenceEnginePython::IENetwork network, bool use_const_initializer) {
ngraph::pass::Manager manager;
// TODO: pass num_iterations to LowLatency
manager.register_pass<ngraph::pass::LowLatency>();
manager.register_pass<ngraph::pass::UnrollTensorIterator>();
auto pass_config = manager.get_pass_config();
pass_config->set_callback<ngraph::pass::UnrollTensorIterator>([](const std::shared_ptr<const ngraph::Node>& node) -> bool {
return node->get_rt_info().count("UNROLL_TI") == 0;
});
manager.register_pass<ngraph::pass::LowLatency2>(use_const_initializer);
manager.run_passes(network.actual->getFunction());
}

View File

@@ -15,7 +15,7 @@ void ApplyMOCTransformations(InferenceEnginePython::IENetwork network, bool cf);
void ApplyPOTTransformations(InferenceEnginePython::IENetwork network, std::string device);
void ApplyLowLatencyTransformation(InferenceEnginePython::IENetwork network, int64_t num_iterations);
void ApplyLowLatencyTransformation(InferenceEnginePython::IENetwork network, bool use_const_initializer = true);
void ApplyPruningTransformation(InferenceEnginePython::IENetwork network);

View File

@@ -3,7 +3,6 @@
from libcpp cimport bool
from libcpp.string cimport string
from libc.stdint cimport int64_t
from ..inference_engine.ie_api_impl_defs cimport IENetwork
@@ -12,10 +11,10 @@ cdef extern from "offline_transformations_api_impl.hpp" namespace "InferenceEngi
cdef void ApplyPOTTransformations(IENetwork network, string device)
cdef void ApplyLowLatencyTransformation(IENetwork network, int64_t num_iterations)
cdef void ApplyLowLatencyTransformation(IENetwork network, bool use_const_initializer)
cdef void ApplyPruningTransformation(IENetwork network)
cdef void GenerateMappingFile(IENetwork network, string path, bool extract_names)
cdef void CheckAPI()
cdef void CheckAPI()

View File

@@ -9,24 +9,22 @@ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PYTHON_BRIDGE_OUTPUT_DIRECTORY}/test_utils)
set(CMAKE_COMPILE_PDB_OUTPUT_DIRECTORY ${PYTHON_BRIDGE_OUTPUT_DIRECTORY}/test_utils)
set(CMAKE_PDB_OUTPUT_DIRECTORY ${PYTHON_BRIDGE_OUTPUT_DIRECTORY}/test_utils)
file(GLOB SOURCE
${CMAKE_CURRENT_SOURCE_DIR}/test_utils_api.pyx
${CMAKE_CURRENT_SOURCE_DIR}/test_utils_api_impl.cpp
${CMAKE_CURRENT_SOURCE_DIR}/*.hpp)
file(GLOB SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/test_utils_api.pyx
${CMAKE_CURRENT_SOURCE_DIR}/test_utils_api_impl.cpp
${CMAKE_CURRENT_SOURCE_DIR}/test_utils_api_impl.hpp
${CMAKE_CURRENT_SOURCE_DIR}/test_utils_api_impl_defs.pxd)
set_source_files_properties(${SOURCE} PROPERTIES CYTHON_IS_CXX ON)
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/test_utils_api.pyx
PROPERTIES CYTHON_IS_CXX ON)
# create target
cython_add_module(${TARGET_NAME} ${SOURCE})
set(INSTALLED_TARGETS ${TARGET_NAME})
cython_add_module(${TARGET_NAME} ${SOURCES})
add_dependencies(${TARGET_NAME} ie_api)
if(COMMAND ie_add_vs_version_file)
foreach(target IN LISTS INSTALLED_TARGETS)
ie_add_vs_version_file(NAME ${target}
FILEDESCRIPTION "Test Utils Python library")
endforeach()
ie_add_vs_version_file(NAME ${TARGET_NAME}
FILEDESCRIPTION "Test Utils Python library")
endif()
if(TARGET commonTestUtils)

View File

@@ -49,4 +49,4 @@ def test_pruning_transformations():
f = ng.function_from_cnn(net)
assert f is not None
assert len(f.get_ops()) == 3
assert len(f.get_ops()) == 3

View File

@@ -1,16 +1,16 @@
WHEEL_PACKAGE_NAME=${WHEEL_PACKAGE_NAME}
WHEEL_VERSION=${WHEEL_VERSION}
WHEEL_LICENCE_TYPE=${WHEEL_LICENCE_TYPE}
WHEEL_AUTHOR=${WHEEL_AUTHOR}
WHEEL_AUTHOR_EMAIL=${WHEEL_AUTHOR_EMAIL}
WHEEL_DESC=${WHEEL_DESC}
WHEEL_LICENSE=${WHEEL_LICENSE}
WHEEL_REQUIREMENTS=${WHEEL_REQUIREMENTS}
WHEEL_OVERVIEW=${WHEEL_OVERVIEW}
WHEEL_PACKAGE_NAME=@WHEEL_PACKAGE_NAME@
WHEEL_VERSION=@WHEEL_VERSION@
WHEEL_LICENCE_TYPE=@WHEEL_LICENCE_TYPE@
WHEEL_AUTHOR=@WHEEL_AUTHOR@
WHEEL_AUTHOR_EMAIL=@WHEEL_AUTHOR_EMAIL@
WHEEL_DESC=@WHEEL_DESC@
WHEEL_LICENSE=@WHEEL_LICENSE@
WHEEL_REQUIREMENTS=@WHEEL_REQUIREMENTS@
WHEEL_OVERVIEW=@WHEEL_OVERVIEW@
CMAKE_BUILD_DIR=${CMAKE_BINARY_DIR}
CORE_LIBS_DIR=${IE_CPACK_RUNTIME_PATH}
PLUGINS_LIBS_DIR=${PLUGINS_LIBS_DIR}
NGRAPH_LIBS_DIR=${NGRAPH_LIBS_DIR}
TBB_LIBS_DIR=${TBB_LIBS_DIR}
PY_PACKAGES_DIR=${PY_PACKAGES_DIR}
CMAKE_BUILD_DIR=@CMAKE_BINARY_DIR@
CORE_LIBS_DIR=@IE_CPACK_RUNTIME_PATH@
PLUGINS_LIBS_DIR=@PLUGINS_LIBS_DIR@
NGRAPH_LIBS_DIR=@NGRAPH_LIBS_DIR@
TBB_LIBS_DIR=@TBB_LIBS_DIR@
PY_PACKAGES_DIR=@PY_PACKAGES_DIR@
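The switch to `@VAR@` placeholders pairs with `@ONLY` in `configure_file`, which keeps CMake from also expanding `${...}` occurrences in the template. A sketch of how the generated key=value file might be consumed on the Python side (the helper name is illustrative, not the actual setup.py code):

```python
# Illustrative reader for the generated .env file (plain KEY=VALUE lines).
def load_env(path=".env"):
    values = {}
    with open(path) as env_file:
        for line in env_file:
            line = line.strip()
            if line and not line.startswith("#"):
                key, _, value = line.partition("=")
                values[key] = value
    return values
```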

View File

@@ -16,7 +16,7 @@ set(WHEEL_REQUIREMENTS "${CMAKE_CURRENT_SOURCE_DIR}/meta/openvino.requirements.t
set(WHEEL_OVERVIEW "${CMAKE_CURRENT_SOURCE_DIR}/meta/pypi_overview.md" CACHE STRING "Detailed description")
set(SETUP_PY "${CMAKE_CURRENT_SOURCE_DIR}/setup.py")
set(SETUP_ENV "${CMAKE_CURRENT_SOURCE_DIR}/.env.in")
set(SETUP_ENV "${CMAKE_CURRENT_SOURCE_DIR}/.env.in")
set(CORE_LIBS_DIR ${IE_CPACK_RUNTIME_PATH})
set(PLUGINS_LIBS_DIR ${IE_CPACK_RUNTIME_PATH})
@@ -24,7 +24,6 @@ set(NGRAPH_LIBS_DIR deployment_tools/ngraph/lib)
set(PY_PACKAGES_DIR ${PYTHON_BRIDGE_CPACK_PATH}/${PYTHON_VERSION})
set(TBB_LIBS_DIR deployment_tools/inference_engine/external/tbb/lib)
if(APPLE)
set(WHEEL_PLATFORM macosx_10_15_x86_64)
elseif(UNIX)
@@ -36,28 +35,40 @@ else()
message(FATAL_ERROR "This platform is not supported")
endif()
configure_file(${SETUP_ENV} "${CMAKE_CURRENT_SOURCE_DIR}/.env")
configure_file(${SETUP_ENV} "${CMAKE_CURRENT_SOURCE_DIR}/.env" @ONLY)
add_custom_target(ie_wheel ALL DEPENDS ie_libraries ie_plugins ie_api)
add_custom_target(ie_wheel ALL DEPENDS ie_api offline_transformations_api)
if(TARGET _pyngraph)
add_dependencies(ie_wheel _pyngraph)
endif()
foreach(_target ie_libraries ie_plugins _pyngraph)
if(TARGET ${_target})
add_dependencies(ie_wheel ${_target})
endif()
endforeach()
if(LINUX)
find_host_program(patchelf_program NAMES patchelf)
find_host_program(patchelf_program
NAMES patchelf
DOC "Path to patchelf tool")
if(NOT patchelf_program)
message(FATAL_ERROR "patchelf is not found, which is needed to build ie_wheel")
endif()
endif()
add_custom_command(TARGET ie_wheel
PRE_BUILD
COMMAND ${CMAKE_COMMAND} -E rm -rf "${CMAKE_CURRENT_BINARY_DIR}/site-packages"
COMMAND ${PYTHON_EXECUTABLE} ${SETUP_PY} clean bdist_wheel
--dist-dir ${CMAKE_BINARY_DIR}/wheels
--build=${WHEEL_BUILD}
--plat-name=${WHEEL_PLATFORM}
POST_BUILD
COMMAND ${PYTHON_EXECUTABLE} ${SETUP_PY} bdist_wheel
--dist-dir ${CMAKE_ARCHIVE_OUTPUT_DIRECTORY}/wheels
--build=${WHEEL_BUILD}
--plat-name=${WHEEL_PLATFORM}
COMMAND ${CMAKE_COMMAND} -E rm "${CMAKE_CURRENT_SOURCE_DIR}/.env"
WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}"
COMMENT "Building Python wheel ${WHEEL_PACKAGE_NAME}"
VERBATIM
)
set_property(TARGET ie_wheel
APPEND
PROPERTY ADDITIONAL_CLEAN_FILES "${CMAKE_BINARY_DIR}/wheels"
)

View File

@@ -1,7 +1,7 @@
defusedxml>=0.7.1
scipy~=1.5.4
jstyleson~=0.0.2
numpy~=1.19.5
numpy>=1.16.6,<1.20
addict>=2.4.0
pandas~=1.1.5
hyperopt~=0.1.2

View File

@@ -1 +1 @@
numpy~=1.19.5
numpy>=1.16.6,<1.20

View File

@@ -8,9 +8,10 @@ import errno
import subprocess # nosec
import typing
from pathlib import Path
from shutil import copyfile
from shutil import copyfile, rmtree
from distutils.command.install import install
from distutils.command.build import build
from distutils.command.clean import clean
from distutils.errors import DistutilsSetupError
from distutils.file_util import copy_file
from distutils import log
@@ -160,6 +161,7 @@ class PrepareLibs(build_clib):
# additional blacklist filter, just to fix cmake install issues
blacklist = ['.lib', '.pdb', '_debug.dll', '_debug.dylib']
package_dir = os.path.join(get_package_dir(PY_INSTALL_CFG), WHEEL_LIBS_INSTALL_DIR)
for src_dir in src_dirs:
local_base_dir = Path(src_dir)
for file_path in local_base_dir.rglob('*'):
@@ -197,6 +199,22 @@ class CopyExt(build_ext):
copy_file(src, dst, verbose=self.verbose, dry_run=self.dry_run)
class CustomClean(clean):
"""Clean up staging directories"""
def clean(self, install_cfg):
for comp, comp_data in install_cfg.items():
install_prefix = comp_data.get('prefix')
self.announce(f'Cleaning {comp}: {install_prefix}', level=3)
if os.path.exists(install_prefix):
rmtree(install_prefix)
def run(self):
self.clean(LIB_INSTALL_CFG)
self.clean(PY_INSTALL_CFG)
clean.run(self)
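A short sketch of how the new command might be invoked manually; the earlier CMake custom command chained `clean bdist_wheel` in a single setup.py call, while the updated target removes the staging directory itself before building:

```python
# Sketch: drive the custom distutils commands registered in setup() below.
import subprocess
import sys

subprocess.run([sys.executable, "setup.py", "clean"], check=True)        # CustomClean
subprocess.run([sys.executable, "setup.py", "bdist_wheel"], check=True)  # build wheel
```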
def is_tool(name):
"""Check if the command-line tool is available"""
try:
@@ -330,6 +348,7 @@ package_license = config('WHEEL_LICENSE', '')
if os.path.exists(package_license):
copyfile(package_license, 'LICENSE')
packages = find_namespace_packages(','.join(get_dir_list(PY_INSTALL_CFG)))
package_data: typing.Dict[str, list] = {}
@@ -350,6 +369,7 @@ setup(
'install': CustomInstall,
'build_clib': PrepareLibs,
'build_ext': CopyExt,
'clean': CustomClean,
},
ext_modules=find_prebuilt_extensions(get_dir_list(PY_INSTALL_CFG)),
packages=packages,

View File

@@ -52,5 +52,41 @@ namespace InferenceEngine {
* @param network A network to apply LowLatency transformation
* *
*/
INFERENCE_ENGINE_DEPRECATED("This transformation will be removed in 2023.1. "
"Use InferenceEngine::lowLatency2 instead.")
INFERENCE_ENGINE_API_CPP(void) LowLatency(InferenceEngine::CNNNetwork& network);
/**
* @brief The transformation finds all TensorIterator/Loop layers in the network,
* processes all back edges that describe a connection between Result and Parameter
* of the TensorIterator/Loop bodies, and inserts ReadValue and Assign layers at the
* input and output corresponding to this back edge.
* Supported platforms: CPU, GNA.
*
* The example below describes the changes made by the transformation
* [] - TensorIterator body
* () - new layer
* BE - back-edge
*
* before applying the transformation:
* -> input1[BE_1 -> Parameter -> Layers ... -> Result -> BE_1 ]output1->
*
* after applying the transformation:
* ->(ReadValue)-> input1[BE_1 ->Parameter->Layers ...->Result->BE_1]output1 ->(Assign)
* \
* ->...
* After applying the transformation, the resulting network can be inferred
* step by step; the states are stored between inferences.
* @param network A network to apply LowLatency transformation
* @param use_const_initializer Changes the type of the initializing subgraph for ReadValue operations.
If "true", then the transformation inserts Constant before ReadValue operation.
If "false, then the transformation leaves existed initializing subgraph for ReadValue operation.
* Loop operation by a given number. Does not affect TensorIterators.
* *
*/
INFERENCE_ENGINE_API_CPP(void) lowLatency2(InferenceEngine::CNNNetwork& network,
bool use_const_initializer = true);
} // namespace InferenceEngine
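To illustrate the step-by-step inference the comment describes, a hedged Python sketch; the `query_state`/`reset` names follow the 2021.x Python API, and the input shape and blob name are placeholders:

```python
# Sketch: run a transformed stateful network frame by frame; ReadValue/Assign
# states persist across infer() calls until explicitly reset.
import numpy as np
from openvino.inference_engine import IECore

ie = IECore()
exec_net = ie.load_network(ie.read_network("model.xml", "model.bin"), "CPU")
request = exec_net.requests[0]

for state in request.query_state():      # clear states before a new sequence
    state.reset()

for frame in np.zeros((10, 1, 440), dtype=np.float32):   # placeholder utterance
    request.infer({"input": frame})                       # "input" is a placeholder name
```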

View File

@@ -129,6 +129,14 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/gflags")
add_gflags()
endif()
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/zlib")
add_subdirectory(thirdparty/zlib EXCLUDE_FROM_ALL)
endif()
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/cnpy")
add_subdirectory(thirdparty/cnpy EXCLUDE_FROM_ALL)
endif()
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall")
endif()

View File

@@ -10,4 +10,3 @@ ie_add_sample(NAME benchmark_app
HEADERS ${HDR}
DEPENDENCIES format_reader ie_samples_utils
OPENCV_DEPENDENCIES core)

View File

@@ -2,7 +2,11 @@
# SPDX-License-Identifier: Apache-2.0
#
file (GLOB SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
file (GLOB HDR ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp)
${CMAKE_CURRENT_SOURCE_DIR}/*.h)
ie_add_sample(NAME speech_sample
SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/main.cpp"
HEADERS "${CMAKE_CURRENT_SOURCE_DIR}/speech_sample.hpp"
DEPENDENCIES ie_samples_utils)
SOURCES ${SRC}
HEADERS ${HDR}
DEPENDENCIES cnpy ie_samples_utils)

View File

@@ -2,7 +2,7 @@
This sample demonstrates how to execute an Asynchronous Inference of acoustic model based on Kaldi\* neural networks and speech feature vectors.
The sample works with Kaldi ARK files only, so it does not cover an end-to-end speech recognition scenario (speech to text), requiring additional preprocessing (feature extraction) to get a feature vector from a speech signal, as well as postprocessing (decoding) to produce text from scores.
The sample works with Kaldi ARK or Numpy* uncompressed NPZ files, so it does not cover an end-to-end speech recognition scenario (speech to text), requiring additional preprocessing (feature extraction) to get a feature vector from a speech signal, as well as postprocessing (decoding) to produce text from scores.
Automatic Speech Recognition C++ sample application demonstrates how to use the following Inference Engine C++ API in applications:
@@ -27,8 +27,8 @@ Basic Inference Engine API is covered by [Hello Classification C++ sample](../he
## How It Works
Upon the start-up, the application reads command line parameters and loads a Kaldi-trained neural network along with Kaldi ARK speech feature vector file to the Inference Engine plugin. Then it performs inference on all speech utterances stored in the input ARK file. Context-windowed speech frames are processed in batches of 1-8
frames according to the `-bs` parameter. Batching across utterances is not supported by this sample. When inference is done, the application creates an output ARK file. If the `-r` option is given, error
Upon start-up, the application reads command line parameters, loads the specified model and input data to the Inference Engine plugin, and performs synchronous inference on all speech utterances stored in the input file. Context-windowed speech frames are processed in batches of 1-8
frames according to the `-bs` parameter. Batching across utterances is not supported by this sample. When inference is done, the application creates an output file. If the `-r` option is given, error
statistics are provided for each speech utterance as shown above.
You can see the explicit description of
@@ -43,7 +43,7 @@ Several parameters control neural network quantization. The `-q` flag determines
Three modes are supported:
- *static* - The first
utterance in the input ARK file is scanned for dynamic range. The scale factor (floating point scalar multiplier) required to scale the maximum input value of the first utterance to 16384 (15 bits) is used
utterance in the input file is scanned for dynamic range. The scale factor (floating point scalar multiplier) required to scale the maximum input value of the first utterance to 16384 (15 bits) is used
for all subsequent inputs. The neural network is quantized to accommodate the scaled input dynamic range.
- *dynamic* - The user may specify a scale factor via the `-sf` flag that will be used for static quantization.
- *user-defined* - The scale factor for each input batch is computed
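A small sketch of the *static* mode computation described above, where 16384 corresponds to the sample's `MAX_VAL_2B_FEAT` constant:

```python
# Sketch: scale factor that maps the first utterance's peak value to 16384 (15 bits).
import numpy as np

def static_scale_factor(first_utterance: np.ndarray, target: float = 16384.0) -> float:
    peak = float(np.max(np.abs(first_utterance)))
    return target / peak if peak > 0 else 1.0
```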
@@ -99,17 +99,17 @@ speech_sample [OPTION]
Options:
-h Print a usage message.
-i "<path>" Required. Paths to .ark files. Example of usage: <file1.ark,file2.ark> or <file.ark>.
-i "<path>" Required. Paths to input files. Example of usage: <file1.ark,file2.ark> or <file.ark> or <file.npz>.
-m "<path>" Required. Path to an .xml file with a trained model (required if -rg is missing).
-o "<path>" Optional. Output file name to save ark scores.
-o "<path>" Optional. Output file name to save scores. Example of usage: <output.ark> or <output.npz>
-d "<device>" Optional. Specify a target device to infer on. CPU, GPU, MYRIAD, GNA_AUTO, GNA_HW, GNA_SW_FP32, GNA_SW_EXACT and HETERO with combination of GNA
as the primary device and CPU as a secondary (e.g. HETERO:GNA,CPU) are supported. The list of available devices is shown below. The sample will look for a suitable plugin for device specified.
-pc Optional. Enables per-layer performance report.
-q "<mode>" Optional. Input quantization mode: "static" (default), "dynamic", or "user" (use with -sf).
-q "<mode>" Optional. Input quantization mode: static (default), dynamic, or user (use with -sf).
-qb "<integer>" Optional. Weight bits for quantization: 8 or 16 (default)
-sf "<double>" Optional. User-specified input scale factor for quantization (use with -q user). If the network contains multiple inputs, provide scale factors by separating them with commas.
-bs "<integer>" Optional. Batch size 1-8 (default 1)
-r "<path>" Optional. Read reference score .ark file and compare scores.
-r "<path>" Optional. Read referefile and compare scores. Example of usage: <reference.ark> or <reference.npz>
-rg "<path>" Read GNA model from file using path/filename provided (required if -m is missing).
-wg "<path>" Optional. Write GNA model to file using path/filename provided.
-we "<path>" Optional. Write GNA embedded model to file using path/filename provided.
@@ -118,10 +118,9 @@ Options:
If you use the cw_l or cw_r flag, then batch size and nthreads arguments are ignored.
-cw_r "<integer>" Optional. Number of frames for right context windows (default is 0). Works only with context window networks.
If you use the cw_r or cw_l flag, then batch size and nthreads arguments are ignored.
-oname "<outputs>" Optional. Layer names for output blobs. The names are separated with ",". Allows to change the order of output layers for -o flag.
Example: Output1:port,Output2:port.
-iname "<inputs>" Optional. Layer names for input blobs. The names are separated with ",". Allows to change the order of input layers for -i flag.
Example: Input1,Input2
-oname "<string>" Optional. Layer names for output blobs. The names are separated with "," Example: Output1:port,Output2:port
-iname "<string>" Optional. Layer names for input blobs. The names are separated with "," Example: Input1,Input2
-pwl_me "<double>" Optional. The maximum percent of error for PWL function.The value must be in <0, 100> range. The default value is 1.0.
Available target devices: <devices>
@@ -168,7 +167,7 @@ All of them can be downloaded from [https://storage.openvinotoolkit.org/models_c
## Sample Output
The acoustic log likelihood sequences for all utterances are stored in the Kaldi ARK file, `scores.ark`. If the `-r` option is used, a report on the statistical score error is generated for each utterance such as
The acoustic log likelihood sequences for all utterances are stored in the output file, e.g. `scores.ark` or `scores.npz`. If the `-r` option is used, a report on the statistical score error is generated for each utterance such as
the following:
```sh

View File

@@ -0,0 +1,144 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "fileutils.hpp"
void ArkFile::GetFileInfo(const char* fileName, uint32_t numArrayToFindSize, uint32_t* ptrNumArrays, uint32_t* ptrNumMemoryBytes) {
uint32_t numArrays = 0;
uint32_t numMemoryBytes = 0;
std::ifstream in_file(fileName, std::ios::binary);
if (in_file.good()) {
while (!in_file.eof()) {
std::string line;
uint32_t numRows = 0u, numCols = 0u, num_bytes = 0u;
std::getline(in_file, line, '\0'); // read variable length name followed by space and NUL
std::getline(in_file, line, '\4'); // read "BFM" followed by space and control-D
if (line.compare("BFM ") != 0) {
break;
}
in_file.read(reinterpret_cast<char*>(&numRows), sizeof(uint32_t)); // read number of rows
std::getline(in_file, line, '\4'); // read control-D
in_file.read(reinterpret_cast<char*>(&numCols), sizeof(uint32_t)); // read number of columns
num_bytes = numRows * numCols * sizeof(float);
in_file.seekg(num_bytes, in_file.cur); // read data
if (numArrays == numArrayToFindSize) {
numMemoryBytes += num_bytes;
}
numArrays++;
}
in_file.close();
} else {
throw std::runtime_error(std::string("Failed to open %s for reading in GetFileInfo()!\n") + fileName);
}
if (ptrNumArrays != NULL)
*ptrNumArrays = numArrays;
if (ptrNumMemoryBytes != NULL)
*ptrNumMemoryBytes = numMemoryBytes;
}
void ArkFile::LoadFile(const char* fileName, uint32_t arrayIndex, std::string& ptrName, std::vector<uint8_t>& memory, uint32_t* ptrNumRows,
uint32_t* ptrNumColumns, uint32_t* ptrNumBytesPerElement) {
std::ifstream in_file(fileName, std::ios::binary);
if (in_file.good()) {
uint32_t i = 0;
while (i < arrayIndex) {
std::string line;
uint32_t numRows = 0u, numCols = 0u;
std::getline(in_file, line, '\0'); // read variable length name followed by space and NUL
std::getline(in_file, line, '\4'); // read "BFM" followed by space and control-D
if (line.compare("BFM ") != 0) {
break;
}
in_file.read(reinterpret_cast<char*>(&numRows), sizeof(uint32_t)); // read number of rows
std::getline(in_file, line, '\4'); // read control-D
in_file.read(reinterpret_cast<char*>(&numCols), sizeof(uint32_t)); // read number of columns
in_file.seekg(numRows * numCols * sizeof(float), in_file.cur); // read data
i++;
}
if (!in_file.eof()) {
std::string line;
std::getline(in_file, ptrName, '\0'); // read variable length name followed by space and NUL
std::getline(in_file, line, '\4'); // read "BFM" followed by space and control-D
if (line.compare("BFM ") != 0) {
throw std::runtime_error(std::string("Cannot find array specifier in file %s in LoadFile()!\n") + fileName);
}
in_file.read(reinterpret_cast<char*>(ptrNumRows), sizeof(uint32_t)); // read number of rows
std::getline(in_file, line, '\4'); // read control-D
in_file.read(reinterpret_cast<char*>(ptrNumColumns), sizeof(uint32_t)); // read number of columns
in_file.read(reinterpret_cast<char*>(&memory.front()),
*ptrNumRows * *ptrNumColumns * sizeof(float)); // read array data
}
in_file.close();
} else {
throw std::runtime_error(std::string("Failed to open %s for reading in LoadFile()!\n") + fileName);
}
*ptrNumBytesPerElement = sizeof(float);
}
void ArkFile::SaveFile(const char* fileName, bool shouldAppend, std::string name, void* ptrMemory, uint32_t numRows, uint32_t numColumns) {
std::ios_base::openmode mode = std::ios::binary;
if (shouldAppend) {
mode |= std::ios::app;
}
std::ofstream out_file(fileName, mode);
if (out_file.good()) {
out_file.write(name.c_str(), name.length()); // write name
out_file.write("\0", 1);
out_file.write("BFM ", 4);
out_file.write("\4", 1);
out_file.write(reinterpret_cast<char*>(&numRows), sizeof(uint32_t));
out_file.write("\4", 1);
out_file.write(reinterpret_cast<char*>(&numColumns), sizeof(uint32_t));
out_file.write(reinterpret_cast<char*>(ptrMemory), numRows * numColumns * sizeof(float));
out_file.close();
} else {
throw std::runtime_error(std::string("Failed to open %s for writing in SaveFile()!\n") + fileName);
}
}
void NumpyFile::GetFileInfo(const char* fileName, uint32_t numArrayToFindSize, uint32_t* ptrNumArrays, uint32_t* ptrNumMemoryBytes) {
uint32_t numArrays = 0;
uint32_t numMemoryBytes = 0;
cnpy::npz_t my_npz1 = cnpy::npz_load(fileName);
auto it = my_npz1.begin();
std::advance(it, numArrayToFindSize);
numArrays = my_npz1.size();
cnpy::NpyArray my_npy = it->second;
numMemoryBytes = my_npy.data_holder->size();
if (ptrNumArrays != NULL)
*ptrNumArrays = numArrays;
if (ptrNumMemoryBytes != NULL)
*ptrNumMemoryBytes = numMemoryBytes;
}
void NumpyFile::LoadFile(const char* fileName, uint32_t arrayIndex, std::string& ptrName, std::vector<uint8_t>& memory, uint32_t* ptrNumRows,
uint32_t* ptrNumColumns, uint32_t* ptrNumBytesPerElement) {
cnpy::npz_t my_npz1 = cnpy::npz_load(fileName);
auto it = my_npz1.begin();
std::advance(it, arrayIndex);
ptrName = it->first;
cnpy::NpyArray my_npy = it->second;
*ptrNumRows = my_npy.shape[0];
*ptrNumColumns = my_npy.shape[1];
for (size_t i = 0; i < my_npy.data_holder->size(); i++) {
memory.at(i) = my_npy.data_holder->at(i);
}
*ptrNumBytesPerElement = sizeof(float);
}
void NumpyFile::SaveFile(const char* fileName, bool shouldAppend, std::string name, void* ptrMemory, uint32_t numRows, uint32_t numColumns) {
std::string mode = shouldAppend ? "a" : "w";
std::vector<size_t> shape {numRows, numColumns};
cnpy::npz_save(fileName, name, reinterpret_cast<float*>(ptrMemory), shape, mode);
}
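For reference, the ARK record layout handled above is `<name>\0` + `"BFM "` + `\x04` + row count + `\x04` + column count + raw float32 data. A Python sketch of a compatible reader, derived from this C++ code rather than any official parser (native little-endian byte order assumed):

```python
# Sketch: parse the uncompressed "BFM" matrix records written by ArkFile::SaveFile.
import struct
import numpy as np

def read_until(f, terminator):
    chunk = bytearray()
    while (b := f.read(1)) and b != terminator:
        chunk.extend(b)
    return bytes(chunk)

def read_ark(path):
    arrays = {}
    with open(path, "rb") as f:
        while True:
            name = read_until(f, b"\0")
            if read_until(f, b"\x04") != b"BFM ":
                break                      # EOF or unknown record type
            rows = struct.unpack("<I", f.read(4))[0]
            f.read(1)                      # skip the \x04 separator
            cols = struct.unpack("<I", f.read(4))[0]
            data = np.frombuffer(f.read(rows * cols * 4), dtype=np.float32)
            arrays[name.decode()] = data.reshape(rows, cols)
    return arrays
```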

View File

@@ -0,0 +1,100 @@
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <cnpy.h>
#include <samples/common.hpp>
#include <samples/slog.hpp>
/// @brief Interface for working with input and output files
class BaseFile {
public:
virtual void LoadFile(const char* fileName, uint32_t arrayIndex, std::string& ptrName, std::vector<uint8_t>& memory, uint32_t* ptrNumRows,
uint32_t* ptrNumColumns, uint32_t* ptrNumBytesPerElement) = 0;
virtual void SaveFile(const char* fileName, bool shouldAppend, std::string name, void* ptrMemory, uint32_t numRows, uint32_t numColumns) = 0;
virtual void GetFileInfo(const char* fileName, uint32_t numArrayToFindSize, uint32_t* ptrNumArrays, uint32_t* ptrNumMemoryBytes) = 0;
};
/// @brief Responsible for working with .ark files
class ArkFile : public BaseFile {
public:
/**
* @brief Get info from Kaldi ARK speech feature vector file
* @param fileName .ark file name
* @param numArrayToFindSize index of the speech feature vector whose size in bytes should be reported
* @param ptrNumArrays pointer that receives the total number of arrays in the file
* @param ptrNumMemoryBytes pointer that receives the size in bytes of the selected array
* @return none.
*/
virtual void GetFileInfo(const char* fileName, uint32_t numArrayToFindSize, uint32_t* ptrNumArrays, uint32_t* ptrNumMemoryBytes);
/**
* @brief Load Kaldi ARK speech feature vector file
* @param fileName .ark file name
* @param arrayIndex index of the speech feature vector in the file
* @param ptrName reference to variable length name
* @param memory reference to speech feature vector to save
* @param ptrNumRows pointer to number of rows to read
* @param ptrNumColumns pointer to number of columns to read
* @param ptrNumBytesPerElement pointer to number bytes per element (size of float by default)
* @return none.
*/
virtual void LoadFile(const char* fileName, uint32_t arrayIndex, std::string& ptrName, std::vector<uint8_t>& memory, uint32_t* ptrNumRows,
uint32_t* ptrNumColumns, uint32_t* ptrNumBytesPerElement);
/**
* @brief Save Kaldi ARK speech feature vector file
* @param fileName .ark file name
* @param shouldAppend bool flag to rewrite or add to the end of file
* @param name reference to variable length name
* @param ptrMemory pointer to speech feature vector to save
* @param numRows number of rows
* @param numColumns number of columns
* @return none.
*/
virtual void SaveFile(const char* fileName, bool shouldAppend, std::string name, void* ptrMemory, uint32_t numRows, uint32_t numColumns);
};
/// @brief Responsible for working with .npz files
class NumpyFile : public BaseFile {
public:
/**
* @brief Get info from Numpy* uncompressed NPZ speech feature vector file
* @param fileName .npz file name
* @param numArrayToFindSize index of the speech feature vector whose size in bytes should be reported
* @param ptrNumArrays pointer that receives the total number of arrays in the file
* @param ptrNumMemoryBytes pointer that receives the size in bytes of the selected array
* @return none.
*/
virtual void GetFileInfo(const char* fileName, uint32_t numArrayToFindSize, uint32_t* ptrNumArrays, uint32_t* ptrNumMemoryBytes);
/**
* @brief Load Numpy* uncompressed NPZ speech feature vector file
* @param fileName .npz file name
* @param arrayIndex index of the speech feature vector in the file
* @param ptrName reference to variable length name
* @param memory reference to speech feature vector to save
* @param ptrNumRows pointer to number of rows to read
* @param ptrNumColumns pointer to number of columns to read
* @param ptrNumBytesPerElement pointer to number bytes per element (size of float by default)
* @return none.
*/
virtual void LoadFile(const char* fileName, uint32_t arrayIndex, std::string& ptrName, std::vector<uint8_t>& memory, uint32_t* ptrNumRows,
uint32_t* ptrNumColumns, uint32_t* ptrNumBytesPerElement);
/**
* @brief Save Numpy* uncompressed NPZ speech feature vector file
* @param fileName .npz file name
* @param shouldAppend bool flag to rewrite or add to the end of file
* @param name reference to variable length name
* @param ptrMemory pointer to speech feature vector to save
* @param numRows number of rows
* @param numColumns number of columns
* @return none.
*/
virtual void SaveFile(const char* fileName, bool shouldAppend, std::string name, void* ptrMemory, uint32_t numRows, uint32_t numColumns);
};
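The NPZ side of the same interface maps onto plain NumPy calls; a sketch follows (note that, unlike cnpy, `np.savez` cannot append to an existing archive, so all arrays are written in one call):

```python
# Sketch: NumPy equivalents of NumpyFile::SaveFile / NumpyFile::LoadFile above.
import numpy as np

def save_npz(path, arrays):                  # arrays: {utterance_name: 2-D float32}
    np.savez(path, **arrays)                 # whole archive written at once

def load_npz(path, array_index):
    data = np.load(path)
    name = data.files[array_index]           # names in archive order
    arr = np.asarray(data[name], dtype=np.float32)
    return name, arr                         # arr.shape gives (rows, cols)
```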

View File

@@ -24,6 +24,7 @@
#include <utility>
#include <vector>
#include "fileutils.hpp"
#include "speech_sample.hpp"
#define MAX_SCORE_DIFFERENCE 0.0001f // max score difference for frame error threshold
@@ -63,144 +64,15 @@ struct InferRequestStruct {
/**
* @brief Check number of input files and model network inputs
* @param numInputs number model inputs
* @param numInputArkFiles number of input ARK files
* @param numInputFiles number of input files
* @return none.
*/
void CheckNumberOfInputs(size_t numInputs, size_t numInputArkFiles) {
if (numInputs != numInputArkFiles) {
void CheckNumberOfInputs(size_t numInputs, size_t numInputFiles) {
if (numInputs != numInputFiles) {
throw std::logic_error("Number of network inputs (" + std::to_string(numInputs) +
")"
" is not equal to number of ark files (" +
std::to_string(numInputArkFiles) + ")");
}
}
/**
* @brief Get info from Kaldi ARK speech feature vector file
* @param fileName .ark file name
* @param numArrayToFindSize number speech feature vectors in the file
* @param ptrNumArrays pointer to specific number array
* @param ptrNumMemoryBytes pointer to specific number of memory bytes
* @return none.
*/
void GetKaldiArkInfo(const char* fileName, uint32_t numArrayToFindSize, uint32_t* ptrNumArrays, uint32_t* ptrNumMemoryBytes) {
uint32_t numArrays = 0;
uint32_t numMemoryBytes = 0;
std::ifstream in_file(fileName, std::ios::binary);
if (in_file.good()) {
while (!in_file.eof()) {
std::string line;
uint32_t numRows = 0u, numCols = 0u, num_bytes = 0u;
std::getline(in_file, line, '\0'); // read variable length name followed by space and NUL
std::getline(in_file, line, '\4'); // read "BFM" followed by space and control-D
if (line.compare("BFM ") != 0) {
break;
}
in_file.read(reinterpret_cast<char*>(&numRows), sizeof(uint32_t)); // read number of rows
std::getline(in_file, line, '\4'); // read control-D
in_file.read(reinterpret_cast<char*>(&numCols), sizeof(uint32_t)); // read number of columns
num_bytes = numRows * numCols * sizeof(float);
in_file.seekg(num_bytes, in_file.cur); // read data
if (numArrays == numArrayToFindSize) {
numMemoryBytes += num_bytes;
}
numArrays++;
}
in_file.close();
} else {
fprintf(stderr, "Failed to open %s for reading in GetKaldiArkInfo()!\n", fileName);
exit(-1);
}
if (ptrNumArrays != NULL)
*ptrNumArrays = numArrays;
if (ptrNumMemoryBytes != NULL)
*ptrNumMemoryBytes = numMemoryBytes;
}
/**
* @brief Load Kaldi ARK speech feature vector file
* @param fileName .ark file name
* @param arrayIndex number speech feature vector in the file
* @param ptrName reference to variable length name
* @param memory reference to speech feature vector to save
* @param ptrNumRows pointer to number of rows to read
* @param ptrNumColumns pointer to number of columns to read
* @param ptrNumBytesPerElement pointer to number bytes per element (size of float by default)
* @return none.
*/
void LoadKaldiArkArray(const char* fileName, uint32_t arrayIndex, std::string& ptrName, std::vector<uint8_t>& memory, uint32_t* ptrNumRows,
uint32_t* ptrNumColumns, uint32_t* ptrNumBytesPerElement) {
std::ifstream in_file(fileName, std::ios::binary);
if (in_file.good()) {
uint32_t i = 0;
while (i < arrayIndex) {
std::string line;
uint32_t numRows = 0u, numCols = 0u;
std::getline(in_file, line, '\0'); // read variable length name followed by space and NUL
std::getline(in_file, line, '\4'); // read "BFM" followed by space and control-D
if (line.compare("BFM ") != 0) {
break;
}
in_file.read(reinterpret_cast<char*>(&numRows), sizeof(uint32_t)); // read number of rows
std::getline(in_file, line, '\4'); // read control-D
in_file.read(reinterpret_cast<char*>(&numCols), sizeof(uint32_t)); // read number of columns
in_file.seekg(numRows * numCols * sizeof(float), in_file.cur); // read data
i++;
}
if (!in_file.eof()) {
std::string line;
std::getline(in_file, ptrName, '\0'); // read variable length name followed by space and NUL
std::getline(in_file, line, '\4'); // read "BFM" followed by space and control-D
if (line.compare("BFM ") != 0) {
fprintf(stderr, "Cannot find array specifier in file %s in LoadKaldiArkArray()!\n", fileName);
exit(-1);
}
in_file.read(reinterpret_cast<char*>(ptrNumRows), sizeof(uint32_t)); // read number of rows
std::getline(in_file, line, '\4'); // read control-D
in_file.read(reinterpret_cast<char*>(ptrNumColumns), sizeof(uint32_t)); // read number of columns
in_file.read(reinterpret_cast<char*>(&memory.front()),
*ptrNumRows * *ptrNumColumns * sizeof(float)); // read array data
}
in_file.close();
} else {
fprintf(stderr, "Failed to open %s for reading in LoadKaldiArkArray()!\n", fileName);
exit(-1);
}
*ptrNumBytesPerElement = sizeof(float);
}
/**
* @brief Save Kaldi ARK speech feature vector file
* @param fileName .ark file name
* @param shouldAppend bool flag to rewrite or add to the end of file
* @param name reference to variable length name
* @param ptrMemory pointer to speech feature vector to save
* @param numRows number of rows
* @param numColumns number of columns
* @return none.
*/
void SaveKaldiArkArray(const char* fileName, bool shouldAppend, std::string name, void* ptrMemory, uint32_t numRows, uint32_t numColumns) {
std::ios_base::openmode mode = std::ios::binary;
if (shouldAppend) {
mode |= std::ios::app;
}
std::ofstream out_file(fileName, mode);
if (out_file.good()) {
out_file.write(name.c_str(), name.length()); // write name
out_file.write("\0", 1);
out_file.write("BFM ", 4);
out_file.write("\4", 1);
out_file.write(reinterpret_cast<char*>(&numRows), sizeof(uint32_t));
out_file.write("\4", 1);
out_file.write(reinterpret_cast<char*>(&numColumns), sizeof(uint32_t));
out_file.write(reinterpret_cast<char*>(ptrMemory), numRows * numColumns * sizeof(float));
out_file.close();
} else {
throw std::runtime_error(std::string("Failed to open %s for writing in SaveKaldiArkArray()!\n") + fileName);
" is not equal to number of input files (" +
std::to_string(numInputFiles) + ")");
}
}
@@ -637,7 +509,20 @@ int main(int argc, char* argv[]) {
return 0;
}
std::vector<std::string> inputArkFiles;
BaseFile* file;
BaseFile* fileOutput;
ArkFile arkFile;
NumpyFile numpyFile;
auto extInputFile = fileExt(FLAGS_i);
if (extInputFile == "ark") {
file = &arkFile;
} else if (extInputFile == "npz") {
file = &numpyFile;
} else {
throw std::logic_error("Invalid input file");
}
std::vector<std::string> inputFiles;
std::vector<uint32_t> numBytesThisUtterance;
uint32_t numUtterances(0);
if (!FLAGS_i.empty()) {
@@ -646,19 +531,19 @@ int main(int argc, char* argv[]) {
uint32_t currentNumUtterances(0), currentNumBytesThisUtterance(0);
while (getline(stream, outStr, ',')) {
std::string filename(fileNameNoExt(outStr) + ".ark");
inputArkFiles.push_back(filename);
std::string filename(fileNameNoExt(outStr) + "." + extInputFile);
inputFiles.push_back(filename);
GetKaldiArkInfo(filename.c_str(), 0, &currentNumUtterances, &currentNumBytesThisUtterance);
file->GetFileInfo(filename.c_str(), 0, &currentNumUtterances, &currentNumBytesThisUtterance);
if (numUtterances == 0) {
numUtterances = currentNumUtterances;
} else if (currentNumUtterances != numUtterances) {
throw std::logic_error("Incorrect input files. Number of utterance must be the same for all ark files");
throw std::logic_error("Incorrect input files. Number of utterance must be the same for all input files");
}
numBytesThisUtterance.push_back(currentNumBytesThisUtterance);
}
}
size_t numInputArkFiles(inputArkFiles.size());
size_t numInputFiles(inputFiles.size());
// -----------------------------------------------------------------------------------------------------
// --------------------------- Step 1. Initialize inference engine core -------------------------------------
@@ -689,7 +574,7 @@ int main(int argc, char* argv[]) {
if (!FLAGS_m.empty()) {
/** Read network model **/
network = ie.ReadNetwork(FLAGS_m);
CheckNumberOfInputs(network.getInputsInfo().size(), numInputArkFiles);
CheckNumberOfInputs(network.getInputsInfo().size(), numInputFiles);
// -------------------------------------------------------------------------------------------------
// --------------------------- Set batch size ---------------------------------------------------
@@ -718,9 +603,9 @@ int main(int argc, char* argv[]) {
slog::warn << "Custom scale factor will be ignored - using scale factor from provided imported gna model: " << FLAGS_rg << slog::endl;
} else {
auto scaleFactorInput = ParseScaleFactors(FLAGS_sf);
if (numInputArkFiles != scaleFactorInput.size()) {
if (numInputFiles != scaleFactorInput.size()) {
std::string errMessage("Incorrect command line for multiple inputs: " + std::to_string(scaleFactorInput.size()) +
" scale factors provided for " + std::to_string(numInputArkFiles) + " input files.");
" scale factors provided for " + std::to_string(numInputFiles) + " input files.");
throw std::logic_error(errMessage);
}
@@ -735,14 +620,14 @@ int main(int argc, char* argv[]) {
if (!FLAGS_rg.empty()) {
slog::info << "Using scale factor from provided imported gna model: " << FLAGS_rg << slog::endl;
} else {
for (size_t i = 0; i < numInputArkFiles; i++) {
auto inputArkName = inputArkFiles[i].c_str();
for (size_t i = 0; i < numInputFiles; i++) {
auto inputFileName = inputFiles[i].c_str();
std::string name;
std::vector<uint8_t> ptrFeatures;
uint32_t numArrays(0), numBytes(0), numFrames(0), numFrameElements(0), numBytesPerElement(0);
GetKaldiArkInfo(inputArkName, 0, &numArrays, &numBytes);
file->GetFileInfo(inputFileName, 0, &numArrays, &numBytes);
ptrFeatures.resize(numBytes);
LoadKaldiArkArray(inputArkName, 0, name, ptrFeatures, &numFrames, &numFrameElements, &numBytesPerElement);
file->LoadFile(inputFileName, 0, name, ptrFeatures, &numFrames, &numFrameElements, &numBytesPerElement);
auto floatScaleFactor = ScaleFactorForQuantization(ptrFeatures.data(), MAX_VAL_2B_FEAT, numFrames * numFrameElements);
slog::info << "Using scale factor of " << floatScaleFactor << " calculated from first utterance." << slog::endl;
std::string scaleFactorConfigKey = GNA_CONFIG_KEY(SCALE_FACTOR) + std::string("_") + std::to_string(i);
@@ -840,7 +725,7 @@ int main(int argc, char* argv[]) {
// --------------------------- Prepare input blobs -----------------------------------------------------
/** Taking information about all topology inputs **/
ConstInputsDataMap cInputInfo = executableNet.GetInputsInfo();
CheckNumberOfInputs(cInputInfo.size(), numInputArkFiles);
CheckNumberOfInputs(cInputInfo.size(), numInputFiles);
/** Stores all input blobs data **/
std::vector<Blob::Ptr> ptrInputBlobs;
@@ -934,7 +819,7 @@ int main(int argc, char* argv[]) {
std::vector<uint8_t> ptrReferenceScores;
score_error_t frameError, totalError;
ptrUtterances.resize(inputArkFiles.size());
ptrUtterances.resize(inputFiles.size());
// initialize memory state before starting
for (auto&& state : inferRequests.begin()->inferRequest.QueryState()) {
@@ -954,20 +839,20 @@ int main(int argc, char* argv[]) {
slog::info << "Number scores per frame : " << numScoresPerFrame << slog::endl;
/** Get information from ark file for current utterance **/
numFrameElementsInput.resize(numInputArkFiles);
for (size_t i = 0; i < inputArkFiles.size(); i++) {
/** Get information from input file for current utterance **/
numFrameElementsInput.resize(numInputFiles);
for (size_t i = 0; i < inputFiles.size(); i++) {
std::vector<uint8_t> ptrUtterance;
auto inputArkFilename = inputArkFiles[i].c_str();
auto inputFilename = inputFiles[i].c_str();
uint32_t currentNumFrames(0), currentNumFrameElementsInput(0), currentNumBytesPerElementInput(0);
GetKaldiArkInfo(inputArkFilename, utteranceIndex, &n, &numBytesThisUtterance[i]);
file->GetFileInfo(inputFilename, utteranceIndex, &n, &numBytesThisUtterance[i]);
ptrUtterance.resize(numBytesThisUtterance[i]);
LoadKaldiArkArray(inputArkFilename, utteranceIndex, uttName, ptrUtterance, &currentNumFrames, &currentNumFrameElementsInput,
&currentNumBytesPerElementInput);
file->LoadFile(inputFilename, utteranceIndex, uttName, ptrUtterance, &currentNumFrames, &currentNumFrameElementsInput,
&currentNumBytesPerElementInput);
if (numFrames == 0) {
numFrames = currentNumFrames;
} else if (numFrames != currentNumFrames) {
std::string errMessage("Number of frames in ark files is different: " + std::to_string(numFrames) + " and " +
std::string errMessage("Number of frames in input files is different: " + std::to_string(numFrames) + " and " +
std::to_string(currentNumFrames));
throw std::logic_error(errMessage);
}
@@ -979,19 +864,28 @@ int main(int argc, char* argv[]) {
int i = 0;
for (auto& ptrInputBlob : ptrInputBlobs) {
if (ptrInputBlob->size() != numFrameElementsInput[i++] * batchSize) {
throw std::logic_error("network input size(" + std::to_string(ptrInputBlob->size()) + ") mismatch to ark file size (" +
throw std::logic_error("network input size(" + std::to_string(ptrInputBlob->size()) + ") mismatch to input file size (" +
std::to_string(numFrameElementsInput[i - 1] * batchSize) + ")");
}
}
ptrScores.resize(numFrames * numScoresPerFrame * sizeof(float));
if (!FLAGS_r.empty()) {
/** Read ark file with reference scores **/
/** Read file with reference scores **/
BaseFile* fileReferenceScores;
auto exReferenceScoresFile = fileExt(FLAGS_r);
if (exReferenceScoresFile == "ark") {
fileReferenceScores = &arkFile;
} else if (exReferenceScoresFile == "npz") {
fileReferenceScores = &numpyFile;
} else {
throw std::logic_error("Invalid Reference Scores file");
}
std::string refUtteranceName;
GetKaldiArkInfo(reference_name_files[next_output].c_str(), utteranceIndex, &n, &numBytesReferenceScoreThisUtterance);
fileReferenceScores->GetFileInfo(reference_name_files[next_output].c_str(), utteranceIndex, &n, &numBytesReferenceScoreThisUtterance);
ptrReferenceScores.resize(numBytesReferenceScoreThisUtterance);
LoadKaldiArkArray(reference_name_files[next_output].c_str(), utteranceIndex, refUtteranceName, ptrReferenceScores, &numFramesReference,
&numFrameElementsReference, &numBytesPerElementReference);
fileReferenceScores->LoadFile(reference_name_files[next_output].c_str(), utteranceIndex, refUtteranceName, ptrReferenceScores,
&numFramesReference, &numFrameElementsReference, &numBytesPerElementReference);
}
double totalTime = 0.0;
@@ -1009,7 +903,7 @@ int main(int argc, char* argv[]) {
std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> callPerfMap;
size_t frameIndex = 0;
uint32_t numFramesArkFile = numFrames;
uint32_t numFramesFile = numFrames;
numFrames += FLAGS_cw_l + FLAGS_cw_r;
uint32_t numFramesThisBatch {batchSize};
@@ -1120,7 +1014,7 @@ int main(int argc, char* argv[]) {
}
/** Iterate over all the input blobs **/
for (size_t i = 0; i < numInputArkFiles; ++i) {
for (size_t i = 0; i < numInputFiles; ++i) {
MemoryBlob::Ptr minput = as<MemoryBlob>(ptrInputBlobs[i]);
if (!minput) {
std::string errMessage("We expect ptrInputBlobs[" + std::to_string(i) + "] to be inherited from MemoryBlob, " +
@@ -1141,14 +1035,14 @@ int main(int argc, char* argv[]) {
inferRequest.numFramesThisBatch = numFramesThisBatch;
frameIndex += numFramesThisBatch;
for (size_t j = 0; j < inputArkFiles.size(); j++) {
for (size_t j = 0; j < inputFiles.size(); j++) {
if (FLAGS_cw_l > 0 || FLAGS_cw_r > 0) {
int idx = frameIndex - FLAGS_cw_l;
if (idx > 0 && idx < static_cast<int>(numFramesArkFile)) {
if (idx > 0 && idx < static_cast<int>(numFramesFile)) {
inputFrame[j] += sizeof(float) * numFrameElementsInput[j] * numFramesThisBatch;
} else if (idx >= static_cast<int>(numFramesArkFile)) {
} else if (idx >= static_cast<int>(numFramesFile)) {
inputFrame[j] =
&ptrUtterances[j].front() + (numFramesArkFile - 1) * sizeof(float) * numFrameElementsInput[j] * numFramesThisBatch;
&ptrUtterances[j].front() + (numFramesFile - 1) * sizeof(float) * numFrameElementsInput[j] * numFramesThisBatch;
} else if (idx <= 0) {
inputFrame[j] = &ptrUtterances[j].front();
}
@@ -1179,9 +1073,17 @@ int main(int argc, char* argv[]) {
// --------------------------- Step 8. Process output part 2 -------------------------------------------------------
if (!FLAGS_o.empty()) {
auto exOutputScoresFile = fileExt(FLAGS_o);
if (exOutputScoresFile == "ark") {
fileOutput = &arkFile;
} else if (exOutputScoresFile == "npz") {
fileOutput = &numpyFile;
} else {
throw std::logic_error("Invalid Reference Scores file");
}
/* Save output data to file */
bool shouldAppend = (utteranceIndex == 0) ? false : true;
SaveKaldiArkArray(output_name_files[next_output].c_str(), shouldAppend, uttName, &ptrScores.front(), numFramesArkFile, numScoresPerFrame);
fileOutput->SaveFile(output_name_files[next_output].c_str(), shouldAppend, uttName, &ptrScores.front(), numFramesFile, numScoresPerFrame);
}
/** Show performance results **/

View File

@@ -14,7 +14,7 @@
static const char help_message[] = "Print a usage message.";
/// @brief message for images argument
static const char input_message[] = "Required. Paths to .ark files. Example of usage: <file1.ark,file2.ark> or <file.ark>.";
static const char input_message[] = "Required. Paths to input files. Example of usage: <file1.ark,file2.ark> or <file.ark> or <file.npz>.";
/// @brief message for model argument
static const char model_message[] = "Required. Path to an .xml file with a trained model (required if -rg is missing).";
@@ -49,10 +49,10 @@ static const char custom_cpu_library_message[] = "Required for CPU plugin custom
"Absolute path to a shared library with the kernels implementations.";
/// @brief message for score output argument
static const char output_message[] = "Optional. Output file name to save ark scores.";
static const char output_message[] = "Optional. Output file name to save scores. Example of usage: <output.ark> or <output.npz>";
/// @brief message for reference score file argument
static const char reference_score_message[] = "Optional. Read reference score .ark file and compare scores.";
static const char reference_score_message[] = "Optional. Read reference score file and compare scores. Example of usage: <reference.ark> or <reference.npz>";
/// @brief message for read GNA model argument
static const char read_gna_model_message[] = "Read GNA model from file using path/filename provided (required if -m is missing).";

View File

@@ -425,6 +425,11 @@ InferenceEngine::CNNNetwork clDNNEngine::CloneAndTransformNetwork(const Inferenc
auto pass_config = manager.get_pass_config();
pass_config->set_callback<ngraph::pass::UnrollTensorIterator>(
[config](const std::shared_ptr<const ngraph::Node> &node) -> bool {
auto sub_graph_op = std::dynamic_pointer_cast<const ngraph::op::util::SubGraphOp>(node);
if (!sub_graph_op)
return false;  // not a sub-graph op; fall back to the default behaviour
int64_t num_iter = sub_graph_op->get_num_iterations();
if (num_iter == 1) {
return false;
}
return !config.enable_loop_unrolling;
});

View File

@@ -83,10 +83,11 @@ void CreateMatMulOp(Program& p, const std::shared_ptr<ngraph::op::v0::MatMul>& o
for (auto o = transpose_order.size(); o < 4; o++)
transpose_order.push_back((uint16_t)o);
std::vector<uint16_t> cldnn_permute_order = ConvertPermuteOrder(transpose_order);
auto permuteName = op->get_friendly_name() + "/transpose_b";
auto permutePrim = cldnn::permute(permuteName,
weightsName,
transpose_order);
cldnn_permute_order);
p.AddPrimitive(permutePrim);
p.AddInnerPrimitiveToProfiler(permuteName, layerName, op);
weightsName = permuteName;
@@ -102,10 +103,11 @@ void CreateMatMulOp(Program& p, const std::shared_ptr<ngraph::op::v0::MatMul>& o
for (auto o = transpose_order.size(); o < 4; o++)
transpose_order.push_back((uint16_t)o);
std::vector<uint16_t> cldnn_permute_order = ConvertPermuteOrder(transpose_order);
auto permuteName = op->get_friendly_name() + "/transpose_a";
auto permutePrim = cldnn::permute(permuteName,
inputName,
transpose_order);
cldnn_permute_order);
p.AddPrimitive(permutePrim);
p.AddInnerPrimitiveToProfiler(permuteName, layerName, op);
inputName = permuteName;

View File

@@ -18,6 +18,7 @@
#include "api/reduce.hpp"
#include "api/reorder.hpp"
#include "api/reshape.hpp"
namespace CLDNNPlugin {
@@ -78,6 +79,28 @@ void CreateReduceOp(Program& p, const std::shared_ptr<ngraph::Node>& op, cldnn::
p.AddPrimitive(reducePrim);
auto resultLayerName = layerName;
auto out_dims = op->get_output_shape(0).size();
if (out_dims == 3 && !keep_dims && rank >= 4) {
resultLayerName = layerName + "_reshape";
auto out_shape = op->get_output_shape(0);
cldnn::tensor outTensor;
switch (rank) {
case 6:
outTensor = cldnn::tensor(TensorValue(out_shape[0]), TensorValue(out_shape[1]),
1, TensorValue(out_shape[2]), 1, 1);
break;
case 5:
outTensor = cldnn::tensor(TensorValue(out_shape[0]), TensorValue(out_shape[1]),
1, TensorValue(out_shape[2]), 1);
break;
case 4:
outTensor = cldnn::tensor(TensorValue(out_shape[0]), TensorValue(out_shape[1]),
1, TensorValue(out_shape[2]));
break;
}
auto reshape_prim = cldnn::reshape(resultLayerName, layerName, outTensor);
p.AddPrimitive(reshape_prim);
p.AddPrimitiveToProfiler(op, resultLayerName);
}
auto reorderLayerName = layerName + "_reorder";
cldnn::format out_format = cldnn::format::any;
auto out_dt = DataTypeFromPrecision(op->get_output_element_type(0));
@@ -89,7 +112,7 @@ void CreateReduceOp(Program& p, const std::shared_ptr<ngraph::Node>& op, cldnn::
else if (rank - rawAxes.size() <= 4)
out_format = cldnn::format::bfyx;
auto reorder_prim = cldnn::reorder(reorderLayerName, layerName, out_format, out_dt);
auto reorder_prim = cldnn::reorder(reorderLayerName, resultLayerName, out_format, out_dt);
p.AddPrimitive(reorder_prim);
p.AddPrimitiveToProfiler(op, reorderLayerName);
} else {

View File

@@ -71,7 +71,7 @@ struct DnnActivation {
return type;
}
static DnnActivation fromType(DnnActivationType type) {
DnnActivation activation;
DnnActivation activation{};
activation.type = type;
activation.args = {};
return activation;

View File

@@ -26,7 +26,7 @@ class GNAFakeQuantizeLayer {
* @brief convert FQ layer directly to gna-pwl activation layer
*/
DnnActivation parseAsActivation() const {
DnnActivation fqActivation;
DnnActivation fqActivation{};
fqActivation.fqParams.levels = fqLayer->GetParamAsSizeT("levels");
auto inputShape = getShapeForRange(fqLayer, 1);

View File

@@ -2091,6 +2091,7 @@ void MoveFakeQuantizeLayerIntoQuantParamsPass :: run() {
};
auto quantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);
IE_ASSERT(quantParams != nullptr);
// Find all output layers connected to FQ
auto nextLayers = CNNNetGetAllNextLayersSkipCertain(layer.get(), -1, donotSkip);
@@ -2304,7 +2305,7 @@ void TransposeWeightsFromNCHWToNHWCPass::run() {
}
}
// Find a convolution in next layers to rotate weights columns
if (!l->outData.empty() && !getInputTo(l->outData[0]).empty() && !l->outData.empty() && !getInputTo(l->outData[0]).empty()) {
if (!l->outData.empty() && !getInputTo(l->outData[0]).empty()) {
std::vector<TranspositionInfo> transpositionInfo;
auto nextLayer = getInputTo(l->outData[0]).begin()->second;
transpositionInfo = FindTranspositionInfoFromNextLayers(nextLayer);
@@ -2345,7 +2346,7 @@ void TransposeWeightsFromNCHWToNHWCPass::run() {
}
// Find a convolution in previous or next layers
auto transpositionInfo = FindTranspositionInfoFromPrevLayers(firstInput);
if (!FoundPartToTranspose(transpositionInfo)) {
if (!FoundPartToTranspose(transpositionInfo) && !l->outData.empty() && !getInputTo(l->outData[0]).empty()) {
transpositionInfo = FindTranspositionInfoFromNextLayers(getInputTo(l->outData[0]).begin()->second);
}
if (FoundPartToTranspose(transpositionInfo)) {

View File

@@ -219,7 +219,7 @@ export(TARGETS ${TARGET_NAME} NAMESPACE IE::
# Export for developer package
ie_developer_export_targets(${TARGET_NAME} ${TARGET_NAME}_plugin_api)
ie_developer_export_targets(${TARGET_NAME}_plugin_api)
# install TBB
@@ -281,8 +281,6 @@ install(EXPORT InferenceEngineTargets
DESTINATION ${IE_CPACK_IE_DIR}/share
COMPONENT core_dev)
include(CMakePackageConfigHelpers)
set(IE_NGRAPH_DIR "${CMAKE_BINARY_DIR}/ngraph")
set(IE_INCLUDE_DIR "${PUBLIC_HEADERS_DIR}")
set(IE_PARALLEL_CMAKE "${InferenceEngine_SOURCE_DIR}/cmake/ie_parallel.cmake")

View File

@@ -11,6 +11,16 @@ using namespace InferenceEngine;
void InferenceEngine::LowLatency(InferenceEngine::CNNNetwork &network) {
auto function = network.getFunction();
ngraph::pass::Manager manager;
NGRAPH_SUPPRESS_DEPRECATED_START
manager.register_pass<ngraph::pass::LowLatency>();
NGRAPH_SUPPRESS_DEPRECATED_END
manager.run_passes(function);
}
void InferenceEngine::lowLatency2(InferenceEngine::CNNNetwork &network,
bool use_const_initializer) {
auto function = network.getFunction();
ngraph::pass::Manager manager;
manager.register_pass<ngraph::pass::LowLatency2>(use_const_initializer);
manager.run_passes(function);
}
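A minimal usage sketch for the new entry point, assuming a model already on disk (the path and the Core setup are illustrative):
InferenceEngine::Core core;
InferenceEngine::CNNNetwork network = core.ReadNetwork("model.xml");
// Apply the second LowLatency version; with use_const_initializer = true the created
// ReadValue states get constant zero initial values (cf. create_init_subgraph in the tests below)
InferenceEngine::lowLatency2(network, true);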

View File

@@ -32,6 +32,7 @@ public:
LowPrecisionTransformations() {}
LowPrecisionTransformations(
const std::map<std::string, LayerTransformationPtr>& branchSpecificTransformations,
const std::map<std::string, LayerTransformationPtr>& decompositionTransformations,
const std::map<std::string, LayerTransformationPtr>& transformations,
const std::map<std::string, std::vector<std::pair<std::string, LayerTransformationPtr>>>& cleanupTransformations,
const std::vector<StandaloneCleanup>& standaloneCleanupTransformations);

View File

@@ -24,6 +24,10 @@ void ConvertTransformation::registerMatcherIn(GraphRewrite &pass, Transformation
bool ConvertTransformation::transform(TransformationContext& context, ngraph::pattern::Matcher &m) const {
std::shared_ptr<opset1::Convert> convert = as_type_ptr<opset1::Convert>(m.get_match_root());
if (!convert) {
return false;
}
if (!canBeTransformed(context, convert)) {
return false;
}

View File

@@ -76,10 +76,12 @@ namespace low_precision {
LowPrecisionTransformations::LowPrecisionTransformations(
const std::map<std::string, LayerTransformationPtr>& branchSpecificTransformations,
const std::map<std::string, LayerTransformationPtr>& decompositionTransformations,
const std::map<std::string, LayerTransformationPtr>& transformations,
const std::map<std::string, std::vector<std::pair<std::string, LayerTransformationPtr>>>& cleanupTransformations,
const std::vector<StandaloneCleanup>& standaloneCleanupTransformations) :
branchSpecificTransformations(branchSpecificTransformations),
decompositionTransformations(decompositionTransformations),
transformations(transformations),
cleanupTransformations(cleanupTransformations),
standaloneCleanupTransformations(standaloneCleanupTransformations) {}

View File

@@ -158,12 +158,11 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndBias(MKLDNNGraph &graph) {
};
auto isSutableChildNode = [&](MKLDNNNodePtr parentNode, MKLDNNNodePtr childNode) {
if ((parentNode->isConstant() && !childNode->isConstant()) || childNode->getAlgorithm() != EltwiseAdd || !childNode->getFusedWith().empty() ||
childNode->getParentEdges().size() != 2)
if (childNode->getAlgorithm() != EltwiseAdd || !childNode->getFusedWith().empty() || childNode->getParentEdges().size() != 2)
return false;
auto biasNode = childNode->getParentEdgesAtPort(1)[0]->getParent();
if (biasNode->getChildEdges().size() != 1)
if (biasNode->getType() != Input || !biasNode->isConstant() || biasNode->getChildEdges().size() != 1)
return false;
auto convOutDims = parentNode->getChildEdgesAtPort(0)[0]->getDims().ToSizeVector();
@@ -265,7 +264,7 @@ void MKLDNNGraphOptimizer::FuseDeconvolutionAndSimpleOperation(MKLDNNGraph &grap
auto& graphNodes = graph.GetNodes();
auto isSuitableParentNode = [](MKLDNNNodePtr node) {
return node->getType() == Deconvolution && node->getChildEdges().size() == 1 && node->getFusedWith().empty();
return node->getType() == Deconvolution && node->getChildEdges().size() == 1;
};
auto parent = graphNodes.begin();
@@ -277,8 +276,7 @@ void MKLDNNGraphOptimizer::FuseDeconvolutionAndSimpleOperation(MKLDNNGraph &grap
}
auto childNode = parentNode->getChildEdgeAt(0)->getChild();
// at this moment deconvolution supports only depthwise as post op
if (!childNode->canBePerformedAsScaleShift(parentNode.get())) {
if (!parentNode->canFuse(childNode)) {
parent++;
continue;
}
@@ -302,6 +300,8 @@ void MKLDNNGraphOptimizer::FuseMultiplyAndAdd(MKLDNNGraph &graph) {
auto& graphNodes = graph.GetNodes();
auto isSutableSecondInput = [](MKLDNNNodePtr node, MKLDNNDims dataDims) {
if (node->getType() != Input || !node->isConstant())
return false;
auto secondInputDims = node->outDims[0];
if (secondInputDims.ndims() != dataDims.ndims() || secondInputDims.ndims() < 2)
return false;
@@ -326,8 +326,7 @@ void MKLDNNGraphOptimizer::FuseMultiplyAndAdd(MKLDNNGraph &graph) {
};
auto isSutableChildNode = [&](MKLDNNNodePtr parentNode, MKLDNNNodePtr childNode) {
if ((parentNode->isConstant() && !childNode->isConstant()) || childNode->getAlgorithm() != EltwiseAdd || !childNode->getFusedWith().empty() ||
childNode->getParentEdges().size() != 2)
if (childNode->getAlgorithm() != EltwiseAdd || !childNode->getFusedWith().empty() || childNode->getParentEdges().size() != 2)
return false;
return isSutableSecondInput(childNode->getParentEdgesAtPort(1)[0]->getParent(), childNode->getParentEdgesAtPort(0)[0]->getDims());
@@ -1518,9 +1517,9 @@ void MKLDNNGraphOptimizer::FusePerformedAsScaleShiftAndFakeQuantize(MKLDNNGraph
auto& graphNodes = graph.GetNodes();
auto getConstPort = [](const MKLDNNNodePtr node) -> int {
if (node->getParentEdgeAt(0)->getParent()->isConstant() && node->getParentEdgeAt(0)->getParent()->getType() == Input) {
if (node->getParentEdgeAt(0)->getParent()->getType() == Input && node->getParentEdgeAt(0)->getParent()->isConstant()) {
return 0;
} else if (node->getParentEdgeAt(1)->getParent()->isConstant() && node->getParentEdgeAt(1)->getParent()->getType() == Input) {
} else if (node->getParentEdgeAt(1)->getParent()->getType() == Input && node->getParentEdgeAt(1)->getParent()->isConstant()) {
return 1;
} else {
return -1;

View File

@@ -1296,7 +1296,7 @@ bool MKLDNNNode::canBePerformedAsScaleShift(const MKLDNNNode *parentNode) const
fusingPort = i;
continue;
}
if (!node->isConstant() || node->getType() != Input) {
if (node->getType() != Input || !node->isConstant()) {
return false;
}
}

View File

@@ -590,8 +590,9 @@ public:
isInQuantizedGraph = flag;
}
protected:
bool canBePerformedAsScaleShift(const MKLDNNNode *parentNode = nullptr) const;
protected:
bool canFuseSimpleOperation(const MKLDNNNodePtr& node) const;
// TODO [mandrono]: place outside of the node API
void fillScalesAndShifts(const MKLDNNNode *parentNode, std::vector<float> &scales, std::vector<float> &shifts, const int align = -1);

View File

@@ -4,6 +4,7 @@
#include "mkldnn_deconv_node.h"
#include "mkldnn_eltwise_node.h"
#include "mkldnn_fake_quantize_node.h"
#include "mkldnn_input_node.h"
#include <mkldnn.hpp>
#include <string>
@@ -143,19 +144,23 @@ InferenceEngine::Blob::Ptr MKLDNNDeconvolutionNode::createWeiBlobAsIO(InferenceE
return internalBlob;
}
bool MKLDNNDeconvolutionNode::canBeExecutedInInt8() {
if (!impl::cpu::x64::mayiuse(impl::cpu::x64::avx512_common))
return false;
bool MKLDNNDeconvolutionNode::canBeExecutedInInt8() const {
// todo: [antonvor] added these checks to fix performance problems
if (kernel.size() == 3)
return false;
if (!withGroups && IC % 4 != 0 && OC % 4 != 0)
return false;
// todo: [antonvor] fusing is not supported yet for int8
if (!fusedWith.empty())
if (!withGroups && stride.back() > 3)
return false;
if (!impl::cpu::x64::mayiuse(impl::cpu::x64::avx512_common)) {
auto inDims = getChildEdgeAt(0)->getDims().ToSizeVector();
// heuristicConst = 2^26
// heuristicParam = IC^2 * SP
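// e.g. IC = 128 with a 64x64 spatial output gives 128^2 * 64 * 64 = 2^26, exactly the
// threshold; any larger channel count or spatial size disables int8 without avx512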
auto heuristicConst = 67108864;
auto heuristicParam = IC * IC;
for (int i = 2; i < inDims.size(); i++)
heuristicParam *= inDims[i];
if (heuristicParam > heuristicConst)
return false;
}
for (int i = 0; i < kernel.size(); i++) {
if (kernel[i] < stride[i])
@@ -163,7 +168,11 @@ bool MKLDNNDeconvolutionNode::canBeExecutedInInt8() {
}
// not supported in oneDNN
if (withGroups && !isDW && (IC % 16 != 0 || OC % 16 != 0))
int channelBlock = impl::cpu::x64::mayiuse(impl::cpu::x64::avx512_common) ? 16
: impl::cpu::x64::mayiuse(impl::cpu::x64::avx2) ? 8 : 4;
if (withGroups && !isDW && (IC % channelBlock != 0 || OC % channelBlock != 0))
return false;
if (!impl::cpu::x64::mayiuse(impl::cpu::x64::avx512_common) && stride.back() > 3)
return false;
InferenceEngine::Precision inPrecision = getOriginalInputPrecisionAtPort(0);
@@ -178,6 +187,13 @@ bool MKLDNNDeconvolutionNode::canBeExecutedInInt8() {
return (inputDataType == dnnl_s8 || inputDataType == dnnl_u8) && weightsDataType == dnnl_s8;
}
bool MKLDNNDeconvolutionNode::canFuse(const MKLDNNNodePtr& node) const {
if (canBeExecutedInInt8())
return canFuseSimpleOperation(node);
return (fusedWith.empty() && node->canBePerformedAsScaleShift(this));
}
void MKLDNNDeconvolutionNode::getSupportedDescriptors() {
if (!descs_fwd.empty() && !descs_bwd.empty())
return;
@@ -196,6 +212,9 @@ void MKLDNNDeconvolutionNode::getSupportedDescriptors() {
auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(outPrecision);
if (inputDataType == memory::data_type::bf16 || outputDataType == memory::data_type::bf16)
inputDataType = outputDataType = memory::data_type::bf16;
if (!fusedWith.empty()) {
outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(fusedWith[fusedWith.size() - 1]->getOriginalOutputPrecisionAtPort(0));
}
if (getParentEdges().size() != 2 && getParentEdges().size() != 3)
IE_THROW() << errorPrefix << " has incorrect number of input edges";
@@ -240,6 +259,11 @@ void MKLDNNDeconvolutionNode::setPostOps(mkldnn::primitive_attr &attr) {
eltwiseNode->appendPostOps(ops);
continue;
}
auto* fakeQuantizeNode = dynamic_cast<MKLDNNFakeQuantizeNode *>(node.get());
if (fakeQuantizeNode) {
fakeQuantizeNode->appendPostOps(ops);
continue;
}
IE_THROW() << "Fusing of " << NameFromType(node->getType()) << " operation to " << NameFromType(this->getType()) << " node is not implemented";
}

View File

@@ -37,6 +37,7 @@ public:
InferenceEngine::Precision getRuntimePrecision() const override;
static bool isSupportedOperation(const std::shared_ptr<ngraph::Node>& op, std::string& errorMessage) noexcept;
bool canFuse(const MKLDNNNodePtr& node) const override;
private:
bool withGroups = false;
@@ -60,7 +61,7 @@ private:
std::string errorPrefix;
bool canBeExecutedInInt8();
bool canBeExecutedInInt8() const;
InferenceEngine::Blob::Ptr createWeiBlobAsIO(InferenceEngine::SizeVector dims);
};

View File

@@ -604,9 +604,13 @@ private:
bool MKLDNNMVNNode::isSupportedOperation(const std::shared_ptr<const ngraph::Node>& op, std::string& errorMessage) noexcept {
try {
const auto& inDataShapeSize = op->input_value(0).get_shape().size();
if (inDataShapeSize < 1 || inDataShapeSize > 5) {
errorMessage = "First input accepts ranks from 1 to 5. Actual: " + std::to_string(inDataShapeSize);
if (op->get_output_partial_shape(0).rank().is_dynamic()) {
errorMessage = "Unsupported dynamic input rank.";
return false;
}
const auto& inDataRank = op->get_output_partial_shape(0).rank().get_length();
if (inDataRank < 1 || inDataRank > 5) {
errorMessage = "First input accepts ranks from 1 to 5. Actual: " + std::to_string(inDataRank);
return false;
}
@@ -632,21 +636,20 @@ bool MKLDNNMVNNode::isSupportedOperation(const std::shared_ptr<const ngraph::Nod
// 4D: axes: [1,2,3], [2,3]
// 5D: axes: [1,2,3,4], [2,3,4]
auto axesVal = axesOp->cast_vector<int>();
auto& mvnShape = mvnOp->get_output_shape(0);
for (int& axe : axesVal)
axe = axe < 0 ? axe + mvnShape.size() : axe;
axe = axe < 0 ? axe + inDataRank : axe;
std::sort(axesVal.begin(), axesVal.end());
if (mvnShape.size() == 1) {
if (inDataRank == 1) {
if (axesVal.size() != 1 || axesVal[0] != 0) {
errorMessage = "Unsupported axes.";
return false;
}
} else {
if (mvnShape.size() > 5 || (mvnShape.size() != axesVal.size() + 1 && mvnShape.size() != axesVal.size() + 2)) {
if (inDataRank > 5 || (inDataRank != axesVal.size() + 1 && inDataRank != axesVal.size() + 2)) {
errorMessage = "Unsupported axes.";
return false;
}
int value = mvnShape.size() - 1;
int value = inDataRank - 1;
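// e.g. for a 4D input, axes {2, 3} pass (3 and 2 match the countdown from rank - 1),
// while axes {1, 3} fail because the axes must be the innermost, contiguous ones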
for (int i = axesVal.size() - 1; i >= 0; i--, value--) {
if (axesVal[i] != value) {
errorMessage = "Unsupported axes.";

View File

@@ -0,0 +1,33 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <vector>
#include <memory>
#include <transformations_visibility.hpp>
#include <ngraph/ngraph.hpp>
#include <ngraph/pass/graph_rewrite.hpp>
#include "ngraph/pattern/matcher.hpp"
namespace ngraph {
namespace pass {
class TRANSFORMATIONS_API SplitSqueezeConcatFusion;
} // namespace pass
} // namespace ngraph
/**
* @ingroup ie_transformation_common_api
 * @brief SplitSqueezeConcatFusion transformation replaces a group of
 * operations Split -> Squeeze (multiple) -> Concat with a Transpose -> Reshape pair.
*/
class ngraph::pass::SplitSqueezeConcatFusion : public ngraph::pass::MatcherPass {
public:
NGRAPH_RTTI_DECLARATION;
SplitSqueezeConcatFusion();
};
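A minimal sketch of a subgraph this matcher targets (the shapes and the opset are illustrative): splitting a {3, 2, 4} tensor along axis 0, squeezing that axis away, and concatenating along axis 1 is equivalent to Transpose {1, 0, 2} followed by Reshape to {2, 12}, which is the replacement the pass builds.
auto input = std::make_shared<ngraph::opset7::Parameter>(ngraph::element::f32, ngraph::Shape{3, 2, 4});
auto split_axis = ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape{}, {0});
auto split = std::make_shared<ngraph::opset7::Split>(input, split_axis, 3);
auto squeeze_axis = ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape{1}, {0});
ngraph::OutputVector squeezed;
for (size_t i = 0; i < 3; ++i)
    squeezed.push_back(std::make_shared<ngraph::opset7::Squeeze>(split->output(i), squeeze_axis));
auto concat = std::make_shared<ngraph::opset7::Concat>(squeezed, 1);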

View File

@@ -17,23 +17,13 @@ namespace ngraph {
namespace pass {
class TRANSFORMATIONS_API TransposeSinking;
class TRANSFORMATIONS_API TransposeOptimization;
class TRANSFORMATIONS_API TransposeReduction;
class TRANSFORMATIONS_API TransposeFQReduction;
class TRANSFORMATIONS_API TransposeFuse;
} // namespace pass
} // namespace ngraph
/**
* @ingroup ie_transformation_common_api
* @brief TransposeOptimization transformation replaces suitable Transposes with Reshape operation or optimises them out
*/
class ngraph::pass::TransposeOptimization : public ngraph::pass::MatcherPass {
public:
NGRAPH_RTTI_DECLARATION;
TransposeOptimization();
};
/**
* @ingroup ie_transformation_common_api
* @brief TransposeReduction transformation sinks Transpose through Reduce operations
@@ -54,6 +44,17 @@ public:
TransposeFQReduction();
};
/**
* @ingroup ie_transformation_common_api
 * @brief TransposeFuse transformation eliminates two consecutive Transposes if their combined order leaves the input unchanged,
 * or fuses them into a single Transpose otherwise
*/
class ngraph::pass::TransposeFuse : public ngraph::pass::MatcherPass {
public:
NGRAPH_RTTI_DECLARATION;
TransposeFuse();
};
/**
* @ingroup ie_transformation_common_api
* @brief TransposeSinking transformation sinks Transposes through known operations
@@ -64,6 +65,6 @@ public:
TransposeSinking() {
add_matcher<ngraph::pass::TransposeFQReduction>();
add_matcher<ngraph::pass::TransposeReduction>();
add_matcher<ngraph::pass::TransposeOptimization>();
add_matcher<ngraph::pass::TransposeFuse>();
}
};

View File

@@ -0,0 +1,32 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <vector>
#include <memory>
#include <transformations_visibility.hpp>
#include <ngraph/ngraph.hpp>
#include <ngraph/pass/graph_rewrite.hpp>
#include "ngraph/pattern/matcher.hpp"
namespace ngraph {
namespace pass {
class TRANSFORMATIONS_API TransposeToReshape;
} // namespace pass
} // namespace ngraph
/**
* @ingroup ie_transformation_common_api
* @brief TransposeToReshape transformation replaces suitable Transposes with Reshape operation or optimizes them out
*/
class ngraph::pass::TransposeToReshape : public ngraph::pass::MatcherPass {
public:
NGRAPH_RTTI_DECLARATION;
TransposeToReshape();
};

View File

@@ -93,7 +93,7 @@ static bool simplify_gather_shapeof(shared_ptr<Node> node) {
auto zero_axis = opset3::Constant::create<int64_t>(element::i64, Shape{}, {0});
NodeVector new_ops;
auto new_shapeof = make_shared<opset3::ShapeOf>(gather->input_value(0));
auto new_shapeof = make_shared<opset3::ShapeOf>(gather->input_value(0), node->get_output_element_type(0));
new_ops.push_back(new_shapeof);
std::shared_ptr<Node> replace_op;
if (indices_rank.get_length() == 0) {
@@ -113,7 +113,7 @@ static bool simplify_gather_shapeof(shared_ptr<Node> node) {
new_ops.push_back(gather);
concat_inputs.push_back(gather);
}
auto shapeof_indices = make_shared<opset3::ShapeOf>(gather->input_value(1));
auto shapeof_indices = make_shared<opset3::ShapeOf>(gather->input_value(1), node->get_output_element_type(0));
new_ops.push_back(shapeof_indices);
concat_inputs.push_back(shapeof_indices);
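The change relies on opset3::ShapeOf taking an optional output element type, so the rewritten subgraph keeps the element type of the ShapeOf it replaces instead of silently switching to i64. A minimal sketch (the input is illustrative):
auto data = std::make_shared<ngraph::opset3::Parameter>(ngraph::element::f32, ngraph::Shape{2, 3, 4});
auto shape_i32 = std::make_shared<ngraph::opset3::ShapeOf>(data, ngraph::element::i32); // 1-D output of type i32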

View File

@@ -41,6 +41,8 @@
#include "transformations/common_optimizations/batch_to_space_fusion.hpp"
#include "transformations/common_optimizations/dilated_convolution_converter.hpp"
#include "transformations/common_optimizations/transpose_sinking.hpp"
#include "transformations/common_optimizations/split_squeeze_concat_fusion.hpp"
#include "transformations/common_optimizations/transpose_to_reshape.hpp"
#include "transformations/op_conversions/bidirectional_sequences_decomposition.hpp"
#include "transformations/op_conversions/convert_pad_to_group_conv.hpp"
#include "transformations/op_conversions/convert_divide.hpp"
@@ -91,7 +93,13 @@ bool ngraph::pass::CommonOptimizations::run_on_function(std::shared_ptr<ngraph::
manager.register_pass<ngraph::pass::ConstantFolding>();
manager.register_pass<ngraph::pass::StridedSliceOptimization>(); // depends on CF
manager.register_pass<ngraph::pass::BroadcastElementwiseFusion>();
manager.register_pass<ngraph::pass::TransposeSinking>();
auto transpose_sinking = manager.register_pass<ngraph::pass::GraphRewrite>();
transpose_sinking->add_matcher<ngraph::pass::TransposeSinking>();
// SplitSqueezeConcatFusion should work in the same GraphRewrite as TransposeSinking,
// because it replaces a pattern that may contain Transposes which must be optimized before
// the transformation, and it also inserts a Transpose that can be optimized by TransposeSinking
transpose_sinking->add_matcher<ngraph::pass::SplitSqueezeConcatFusion>();
auto eliminations = manager.register_pass<ngraph::pass::GraphRewrite>();
eliminations->add_matcher<ngraph::pass::EliminateUnsqueezeGather>();
@@ -119,6 +127,7 @@ bool ngraph::pass::CommonOptimizations::run_on_function(std::shared_ptr<ngraph::
common_fusions->add_matcher<ngraph::pass::BatchToSpaceFusion>();
common_fusions->add_matcher<ngraph::pass::DilatedConvolutionConverter>();
common_fusions->add_matcher<ngraph::pass::GeluFusion>();
common_fusions->add_matcher<ngraph::pass::TransposeToReshape>();
common_fusions->set_name("ngraph::pass::CommonFusions");
manager.register_pass<ngraph::pass::ConvertPadToGroupConvolution, false>();

View File

@@ -0,0 +1,95 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "itt.hpp"
#include "transformations/common_optimizations/split_squeeze_concat_fusion.hpp"
#include <memory>
#include <vector>
#include <numeric>
#include <ngraph/opsets/opset7.hpp>
#include <ngraph/rt_info.hpp>
#include <ngraph/pattern/op/wrap_type.hpp>
NGRAPH_RTTI_DEFINITION(ngraph::pass::SplitSqueezeConcatFusion, "SplitSqueezeConcatFusion", 0);
ngraph::pass::SplitSqueezeConcatFusion::SplitSqueezeConcatFusion() {
MATCHER_SCOPE(SplitSqueezeConcatFusion);
// Match only the Concat, because we don't know in advance how many inputs will go into it
auto concat_pattern = ngraph::pattern::wrap_type<ngraph::opset7::Concat>();
ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher& m) {
const auto& pattern_to_output = m.get_pattern_value_map();
auto concat = std::dynamic_pointer_cast<ngraph::opset7::Concat>(pattern_to_output.at(concat_pattern).get_node_shared_ptr());
if (!concat) return false;
NodeVector nodes_to_delete{ concat };
int64_t axis_value = 0;
std::shared_ptr<ngraph::opset7::Split> split;
const auto& concat_inputs = concat->input_values();
if (concat_inputs.empty()) return false;
for (size_t i = 0; i < concat_inputs.size(); i++) {
auto squeeze = std::dynamic_pointer_cast<ngraph::opset7::Squeeze>(concat_inputs[i].get_node_shared_ptr());
if (!squeeze) return false;
nodes_to_delete.push_back(squeeze);
auto split_to_check = std::dynamic_pointer_cast<ngraph::opset7::Split>(squeeze->input_value(0).get_node_shared_ptr());
auto squeeze_axes = std::dynamic_pointer_cast<ngraph::opset7::Constant>(squeeze->input_value(1).get_node_shared_ptr());
if (!squeeze_axes || !split_to_check) return false;
auto squeeze_axes_vec = squeeze_axes->cast_vector<int64_t>();
if (squeeze_axes_vec.size() != 1) return false;
if (i == 0) {
axis_value = squeeze_axes_vec[0];
nodes_to_delete.push_back(split_to_check);
split = split_to_check;
} else if (axis_value != squeeze_axes_vec[0] || split_to_check != split) {
return false;
}
auto split_output = squeeze->input_value(0);
if (split_output.get_target_inputs().size() != 1 ||
split_output.get_index() != i)
return false;
}
if (split->get_num_splits() != concat_inputs.size()) return false;
auto split_axis = std::dynamic_pointer_cast<ngraph::opset7::Constant>(split->input_value(1).get_node_shared_ptr());
if (!split_axis) return false;
auto axis_vec = split_axis->cast_vector<int64_t>();
if (axis_vec.size() != 1 || axis_value != axis_vec[0])
return false;
auto input = split->input_value(0);
auto concat_axis = concat->get_axis();
auto rank = input.get_partial_shape().rank();
if (!rank.is_static())
return false;
std::vector<int64_t> order(rank.get_length());
std::iota(order.begin(), order.end(), 0);
order.erase(order.begin() + axis_value);
order.insert(order.begin() + concat_axis, axis_value);
auto transpose_order = ngraph::opset7::Constant::create(element::i64, { (size_t)rank.get_length() }, order);
auto transpose = register_new_node<ngraph::opset7::Transpose>(input, transpose_order);
auto shape_after = ngraph::opset7::Constant::create(element::i64, { (size_t)rank.get_length() - 1 }, concat->get_output_shape(0));
auto reshape = std::make_shared<ngraph::opset7::Reshape>(transpose, shape_after, false);
reshape->set_friendly_name(m.get_match_root()->get_friendly_name());
ngraph::copy_runtime_info(nodes_to_delete, { transpose, reshape });
ngraph::replace_node(m.get_match_root(), reshape);
return true;
};
auto m = std::make_shared<ngraph::pattern::Matcher>(concat_pattern, matcher_name);
register_matcher(m, callback);
}
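Concretely, for a rank-4 input split along axis 0 and concatenated along axis 2, order starts as {0, 1, 2, 3}; erasing the split axis leaves {1, 2, 3}, and re-inserting 0 at the concat axis gives the transpose order {1, 2, 0, 3}. The trailing Reshape then flattens the transposed result into the original Concat output shape.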

View File

@@ -10,14 +10,15 @@
#include <vector>
#include <ngraph/opsets/opset6.hpp>
#include <ngraph/opsets/opset7.hpp>
#include <ngraph/rt_info.hpp>
#include <ngraph/pattern/op/wrap_type.hpp>
#include <numeric>
NGRAPH_RTTI_DEFINITION(ngraph::pass::TransposeSinking, "TransposeSinking", 0);
NGRAPH_RTTI_DEFINITION(ngraph::pass::TransposeOptimization, "TransposeOptimization", 0);
NGRAPH_RTTI_DEFINITION(ngraph::pass::TransposeReduction, "TransposeReduction", 0);
NGRAPH_RTTI_DEFINITION(ngraph::pass::TransposeFQReduction, "TransposeFQReduction", 0);
NGRAPH_RTTI_DEFINITION(ngraph::pass::TransposeFuse, "TransposeFuse", 0);
using namespace ngraph;
@@ -55,103 +56,6 @@ std::shared_ptr<ngraph::opset6::Constant> get_reversed_order_constant(const std:
ngraph::element::i64, ngraph::Shape{reverse_order.size()}, reverse_order);
}
bool replace_transpose_with_reshape(const std::shared_ptr<Node>& transpose) {
auto data = transpose->input_value(0);
const auto input_shape = transpose->input(0).get_partial_shape();
const size_t input_shape_rank = input_shape.rank().get_length();
auto order = as_type_ptr<opset6::Constant>(transpose->input_value(1).get_node_shared_ptr());
if (!order || !ngraph::shape_size(order->get_shape())) {
return false;
}
const auto order_value = order->cast_vector<int64_t>();
// Check that the transpose order, ignoring dimensions equal to 1, is ascending
int64_t last_dim(-1);
for (size_t i = 0; i < input_shape_rank; ++i) {
if (input_shape[order_value[i]].is_dynamic() || input_shape[order_value[i]] != 1) {
if (order_value[i] < last_dim) {
return false;
}
last_dim = order_value[i];
}
}
// The Transpose operation can be removed if the original transpose order is sorted
// or if every dimension that changes its place is equal to 1
using DimensionToPosition = struct {
Dimension dim;
size_t pos;
};
std::vector<DimensionToPosition> dims;
for (size_t i = 0; i < input_shape_rank; ++i) {
if (order_value[i] != static_cast<int64_t>(i)) {
dims.push_back({input_shape[order_value[i]], i});
}
}
// If none of the moved dimensions differs from 1, we can remove this Transpose
if (count_if(dims.begin(), dims.end(), [](const DimensionToPosition& item) {
return !(item.dim.is_static() && item.dim.get_length() == 1);
}) == 0) {
return replace_output_update_name(transpose->output(0), transpose->input_value(0));
}
// Transpose can be replaced with Reshape in two ways:
// 1. Reshape with dims as Constant
// 2. Reshape with dims as input (ShapeOf->Gather)
//
// The first case is possible only if at most one dynamic dimension changes its position
// For example: input_shape {?, 3, 1, ?} and order {0, 1, 3, 2} can be replaced with Reshape
// with Constant {0, 3, -1, 1}, but if input_shape is {?, 1, 1, ?} and order is {1, 0, 3, 2} the transpose
// cannot be replaced in the same way; in this case it is only possible to use Gather(ShapeOf,
// order)
Output<Node> reshape_dim;
NodeVector new_ops;
if (count_if(dims.begin(), dims.end(), [](const DimensionToPosition& item) {
return item.dim.is_dynamic();
}) < 2) {
std::vector<int64_t> reshape_value(input_shape_rank, 0);
for (const auto& item : dims) {
reshape_value[item.pos] = item.dim.is_dynamic() ? -1 : item.dim.get_length();
}
reshape_dim =
opset3::Constant::create(element::i64, Shape{reshape_value.size()}, reshape_value);
} else {
auto shape_of = std::make_shared<opset3::ShapeOf>(data);
new_ops.push_back(shape_of);
reshape_dim = std::make_shared<opset3::Gather>(
shape_of, order, opset3::Constant::create(element::i64, Shape{1}, {0}));
new_ops.push_back(reshape_dim.get_node_shared_ptr());
}
auto reshape_op = std::make_shared<opset3::Reshape>(data, reshape_dim, true);
new_ops.push_back(reshape_op);
reshape_op->set_friendly_name(transpose->get_friendly_name());
copy_runtime_info(transpose, new_ops);
replace_node(transpose, reshape_op);
return true;
}
ngraph::pass::TransposeOptimization::TransposeOptimization() {
MATCHER_SCOPE(TransposeOptimization);
auto transpose_label = pattern::wrap_type<opset6::Transpose>(
{pattern::any_input(pattern::has_static_rank()), pattern::wrap_type<opset6::Constant>()});
ngraph::matcher_pass_callback matcher_pass_callback = [=](ngraph::pattern::Matcher &m) {
return replace_transpose_with_reshape(m.get_match_root());
};
auto m = std::make_shared<ngraph::pattern::Matcher>(transpose_label, matcher_name);
register_matcher(m, matcher_pass_callback);
}
ngraph::pass::TransposeReduction::TransposeReduction() {
MATCHER_SCOPE(TransposeReduction);
@@ -271,3 +175,50 @@ ngraph::pass::TransposeFQReduction::TransposeFQReduction() {
auto m = std::make_shared<ngraph::pattern::Matcher>(reduce_or_squeeze_label, matcher_name);
register_matcher(m, matcher_pass_callback);
}
ngraph::pass::TransposeFuse::TransposeFuse() {
MATCHER_SCOPE(TransposeFuse);
auto transpose_1 = pattern::wrap_type<opset7::Transpose>({ pattern::any_input(), pattern::wrap_type<opset7::Constant>() }, pattern::consumers_count(1));
auto transpose_2 = pattern::wrap_type<opset7::Transpose>({ transpose_1, pattern::wrap_type<opset7::Constant>() });
ngraph::matcher_pass_callback matcher_pass_callback = [=](ngraph::pattern::Matcher& m) {
const auto& pattern_to_output = m.get_pattern_value_map();
auto transpose1 = pattern_to_output.at(transpose_1).get_node_shared_ptr();
auto transpose2 = pattern_to_output.at(transpose_2).get_node_shared_ptr();
auto input = transpose1->input_value(0);
auto transpose1_order = std::dynamic_pointer_cast<ngraph::opset7::Constant>(transpose1->get_input_node_shared_ptr(1));
auto transpose2_order = std::dynamic_pointer_cast<ngraph::opset7::Constant>(transpose2->get_input_node_shared_ptr(1));
if (!transpose1_order || !transpose2_order)
return false;
auto order1 = transpose1_order->cast_vector<int64_t>();
auto order2 = transpose2_order->cast_vector<int64_t>();
if (order1.size() != order2.size())
return false;
bool is_ordered = true;
for (size_t i = 0; i < order1.size(); i++) {
order2[i] = order1[order2[i]];
if (order2[i] != (int64_t)i)
is_ordered = false;
}
if (is_ordered) {
return ngraph::replace_output_update_name(transpose2->output(0), input);
} else {
auto new_order = ngraph::opset7::Constant::create(element::i64, {order2.size()}, order2);
auto new_transpose = register_new_node<ngraph::opset7::Transpose>(input, new_order);
ngraph::copy_runtime_info({ transpose1, transpose2 }, new_transpose);
ngraph::replace_node(transpose2, new_transpose);
}
return true;
};
auto m = std::make_shared<ngraph::pattern::Matcher>(transpose_2, matcher_name);
register_matcher(m, matcher_pass_callback);
}
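For example, composing order1 = {0, 2, 1} with order2 = {2, 1, 0} gives order2[i] = order1[order2[i]] = {1, 2, 0}, so the pair is replaced by a single Transpose with that order; had the two orders composed to the identity {0, 1, 2}, both Transposes would be removed instead.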

View File

@@ -0,0 +1,115 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "itt.hpp"
#include "transformations/common_optimizations/transpose_to_reshape.hpp"
#include "transformations/utils/utils.hpp"
#include <memory>
#include <vector>
#include <ngraph/opsets/opset6.hpp>
#include <ngraph/rt_info.hpp>
#include <ngraph/pattern/op/wrap_type.hpp>
#include <numeric>
NGRAPH_RTTI_DEFINITION(ngraph::pass::TransposeToReshape, "TransposeToReshape", 0);
using namespace ngraph;
bool replace_transpose_with_reshape(const std::shared_ptr<Node>& transpose) {
auto data = transpose->input_value(0);
const auto input_shape = transpose->input(0).get_partial_shape();
const size_t input_shape_rank = input_shape.rank().get_length();
auto order = as_type_ptr<opset6::Constant>(transpose->input_value(1).get_node_shared_ptr());
if (!order || !ngraph::shape_size(order->get_shape())) {
return false;
}
const auto order_value = order->cast_vector<int64_t>();
// Check that the transpose order, ignoring dimensions equal to 1, is ascending
int64_t last_dim(-1);
for (size_t i = 0; i < input_shape_rank; ++i) {
if (input_shape[order_value[i]].is_dynamic() || input_shape[order_value[i]] != 1) {
if (order_value[i] < last_dim) {
return false;
}
last_dim = order_value[i];
}
}
// The Transpose operation can be removed if the original transpose order is sorted
// or if every dimension that changes its place is equal to 1
using DimensionToPosition = struct {
Dimension dim;
size_t pos;
};
std::vector<DimensionToPosition> dims;
for (size_t i = 0; i < input_shape_rank; ++i) {
if (order_value[i] != static_cast<int64_t>(i)) {
dims.push_back({ input_shape[order_value[i]], i });
}
}
// If none of the moved dimensions differs from 1, we can remove this Transpose
if (count_if(dims.begin(), dims.end(), [](const DimensionToPosition& item) {
return !(item.dim.is_static() && item.dim.get_length() == 1);
}) == 0) {
return replace_output_update_name(transpose->output(0), transpose->input_value(0));
}
// Transpose can be replaced with Reshape in two ways:
// 1. Reshape with dims as Constant
// 2. Reshape with dims as input (ShapeOf->Gather)
//
// The first case is possible only if at most one dynamic dimension changes its position
// For example: input_shape {?, 3, 1, ?} and order {0, 1, 3, 2} can be replaced with Reshape
// with Constant {0, 3, -1, 1}, but if input_shape is {?, 1, 1, ?} and order is {1, 0, 3, 2} the transpose
// cannot be replaced in the same way; in this case it is only possible to use Gather(ShapeOf,
// order)
Output<Node> reshape_dim;
NodeVector new_ops;
if (count_if(dims.begin(), dims.end(), [](const DimensionToPosition& item) {
return item.dim.is_dynamic();
}) < 2) {
std::vector<int64_t> reshape_value(input_shape_rank, 0);
for (const auto& item : dims) {
reshape_value[item.pos] = item.dim.is_dynamic() ? -1 : item.dim.get_length();
}
reshape_dim =
opset3::Constant::create(element::i64, Shape{ reshape_value.size() }, reshape_value);
} else {
auto shape_of = std::make_shared<opset3::ShapeOf>(data);
new_ops.push_back(shape_of);
reshape_dim = std::make_shared<opset3::Gather>(
shape_of, order, opset3::Constant::create(element::i64, Shape{ 1 }, { 0 }));
new_ops.push_back(reshape_dim.get_node_shared_ptr());
}
auto reshape_op = std::make_shared<opset3::Reshape>(data, reshape_dim, true);
new_ops.push_back(reshape_op);
reshape_op->set_friendly_name(transpose->get_friendly_name());
copy_runtime_info(transpose, new_ops);
replace_node(transpose, reshape_op);
return true;
}
ngraph::pass::TransposeToReshape::TransposeToReshape() {
MATCHER_SCOPE(TransposeToReshape);
auto transpose_label = pattern::wrap_type<opset6::Transpose>(
{ pattern::any_input(pattern::has_static_rank()), pattern::wrap_type<opset6::Constant>() });
ngraph::matcher_pass_callback matcher_pass_callback = [=](ngraph::pattern::Matcher& m) {
return replace_transpose_with_reshape(m.get_match_root());
};
auto m = std::make_shared<ngraph::pattern::Matcher>(transpose_label, matcher_name);
register_matcher(m, matcher_pass_callback);
}
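A minimal sketch of running the extracted pass, mirroring the updated test usage further below (f is an illustrative std::shared_ptr<ngraph::Function>):
ngraph::pass::Manager manager;
manager.register_pass<ngraph::pass::Validate>();
manager.register_pass<ngraph::pass::TransposeToReshape>();
manager.run_passes(f);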

View File

@@ -26,7 +26,7 @@ ngraph::pass::ConvertDivide::ConvertDivide() {
}
auto pow = std::make_shared<ngraph::opset1::Power>(div->input(1).get_source_output(),
op::Constant::create(div->get_input_element_type(1), Shape{1}, {-1}));
op::Constant::create(div->get_input_element_type(1), Shape{}, {-1}));
auto mul = std::make_shared<ngraph::opset1::Multiply>(div->input(0).get_source_output(), pow);
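A Shape{1} exponent constant broadcasts the Power result to rank 1, so dividing a scalar (0-d) tensor used to yield a rank-1 output; with a true scalar Shape{} constant the output rank is preserved, which the new ConvertDivideScalar test below pins down. The same Shape{1} to Shape{} change is applied to ConvertMinimum, ConvertNegative, and ConvertSubtract in the following files.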

View File

@@ -30,14 +30,14 @@ ngraph::pass::ConvertMinimum::ConvertMinimum() {
*/
auto neg_0 = std::make_shared<ngraph::opset1::Multiply>(minimum->input(0).get_source_output(),
opset1::Constant::create(minimum->get_input_element_type(0), Shape{1}, {-1}));
opset1::Constant::create(minimum->get_input_element_type(0), Shape{}, {-1}));
auto neg_1 = std::make_shared<ngraph::opset1::Multiply>(minimum->input(1).get_source_output(),
opset1::Constant::create(minimum->get_input_element_type(1), Shape{1}, {-1}));
opset1::Constant::create(minimum->get_input_element_type(1), Shape{}, {-1}));
auto max = std::make_shared<ngraph::opset1::Maximum>(neg_0, neg_1);
auto neg_2 = std::make_shared<ngraph::opset1::Multiply>(max, opset1::Constant::create(max->get_element_type(), Shape{1}, {-1}));
auto neg_2 = std::make_shared<ngraph::opset1::Multiply>(max, opset1::Constant::create(max->get_element_type(), Shape{}, {-1}));
neg_2->set_friendly_name(minimum->get_friendly_name());
ngraph::copy_runtime_info(minimum, {neg_0, neg_1, max, neg_2});

View File

@@ -25,7 +25,7 @@ ngraph::pass::ConvertNegative::ConvertNegative() {
}
auto mul = std::make_shared<ngraph::opset1::Multiply>(neg->input(0).get_source_output(),
opset1::Constant::create(neg->get_element_type(), Shape{1}, {-1}));
opset1::Constant::create(neg->get_element_type(), Shape{}, {-1}));
mul->set_friendly_name(neg->get_friendly_name());
ngraph::copy_runtime_info(neg, mul);
ngraph::replace_node(neg, mul);

View File

@@ -61,7 +61,7 @@ ngraph::pass::ConvertSubtract::ConvertSubtract() {
}
auto neg = std::make_shared<ngraph::opset1::Multiply>(sub->input(1).get_source_output(),
opset1::Constant::create(sub->get_input_element_type(1), Shape{1}, {-1}));
opset1::Constant::create(sub->get_input_element_type(1), Shape{}, {-1}));
auto add = std::make_shared<ngraph::opset1::Add>(sub->input(0).get_source_output(), neg);

View File

@@ -80,7 +80,7 @@ class ConstantWriter {
public:
using FilePosition = int64_t;
using HashValue = size_t;
using ConstWritePositions = std::unordered_map<HashValue, FilePosition>;
using ConstWritePositions = std::unordered_map<HashValue, std::pair<FilePosition, void const *>>;
ConstantWriter(std::ostream& bin_data, bool enable_compression = true)
: m_binary_output(bin_data)
@@ -93,18 +93,19 @@ public:
m_binary_output.write(ptr, size);
return offset;
}
// The biggest supported models have at most 1-2 thousand constant nodes, and
// with a 64-bit hash that gives a probability of around 1 in 10 trillion that a
// hash collision will appear. Because of this, a choice has been made to
// not perform collision detection and keep the hashing quick and seamless.
// This hash is weak (but efficient) and should be replaced with a more
// stable hash algorithm, since the current one gives
// the same hash for the {2, 2} and {0, 128} arrays. So we have to compare
// values when finding a match in the hash map.
const HashValue hash = hash_combine(ptr, size);
const auto found = m_hash_to_file_positions.find(hash);
if (found != end(m_hash_to_file_positions)) {
return found->second;
if (found != end(m_hash_to_file_positions) &&
memcmp(static_cast<void const*>(ptr), found->second.second, size) == 0) {
return found->second.first;
}
m_binary_output.write(ptr, size);
m_hash_to_file_positions.insert({hash, offset});
m_hash_to_file_positions.insert({hash, {offset, static_cast<void const *>(ptr)}});
return offset;
}

View File

@@ -117,6 +117,25 @@ TEST_F(SerializatioConstantCompressionTest, IdenticalConstantsFP32) {
ASSERT_TRUE(file_size(bin_1) == unique_const_count * ngraph::shape_size(shape) * sizeof(float));
}
TEST_F(SerializatioConstantCompressionTest, NonIdenticalConstantsI64) {
constexpr int unique_const_count = 2;
const ngraph::Shape shape{2};
// hash_combine returns the same hash for these two constants, so we also check the contents of the arrays
auto A = ngraph::op::Constant::create(ngraph::element::i64, shape, {2, 2});
auto B = ngraph::op::Constant::create(ngraph::element::i64, shape, {0, 128});
auto ngraph_a = std::make_shared<ngraph::Function>(ngraph::NodeVector{A, B},
ngraph::ParameterVector{});
ngraph::pass::Serialize(m_out_xml_path_1, m_out_bin_path_1).run_on_function(ngraph_a);
std::ifstream xml_1(m_out_xml_path_1, std::ios::binary);
std::ifstream bin_1(m_out_bin_path_1, std::ios::binary);
ASSERT_TRUE(file_size(bin_1) == unique_const_count * ngraph::shape_size(shape) * sizeof(int64_t));
}
TEST_F(SerializatioConstantCompressionTest, IdenticalConstantsTimesTwo) {
constexpr int unique_const_count = 2;
const ngraph::Shape shape{2, 2, 2};

View File

@@ -130,7 +130,6 @@ TEST_F(NGraphReaderTests, ReadHSigmoidNetwork) {
<layer name="Multiply_744" type="Const" precision="FP32" id="4">
<output>
<port id="0" precision="FP32">
<dim>1</dim>
</port>
</output>
<blobs>
@@ -147,7 +146,6 @@ TEST_F(NGraphReaderTests, ReadHSigmoidNetwork) {
<dim>22</dim>
</port>
<port id="1">
<dim>1</dim>
</port>
</input>
<output>

View File

@@ -19,7 +19,7 @@
#include <transformations/common_optimizations/algebraic_simplification.hpp>
#include <transformations/utils/utils.hpp>
#include <transformations/init_node_info.hpp>
#include <transformations/common_optimizations/transpose_sinking.hpp>
#include <transformations/common_optimizations/transpose_to_reshape.hpp>
#include "common_test_utils/ngraph_test_utils.hpp"
@@ -312,7 +312,7 @@ TEST(algebraic_simplification, replace_transpose_with_reshape) {
pass::Manager pass_manager;
pass_manager.register_pass<pass::Validate>();
pass_manager.register_pass<pass::TransposeSinking>();
pass_manager.register_pass<pass::TransposeToReshape>();
pass_manager.run_passes(optimized_f);
auto ps = baseline_f->get_results()[0]->get_output_partial_shape(0);

View File

@@ -39,7 +39,7 @@ TEST(TransformationTests, ConvertDivide) {
auto data = std::make_shared<ngraph::opset1::Parameter>(ngraph::element::f32, ngraph::Shape{3, 1, 2});
auto divide_constant = ngraph::opset1::Constant::create(ngraph::element::f32, ngraph::Shape{1}, {1.5});
auto pow = std::make_shared<ngraph::opset1::Power>(divide_constant,
ngraph::opset1::Constant::create(ngraph::element::f32, ngraph::Shape{1}, {-1}));
ngraph::opset1::Constant::create(ngraph::element::f32, ngraph::Shape{}, {-1}));
auto mul = std::make_shared<ngraph::opset1::Multiply>(data, pow);
f_ref = std::make_shared<ngraph::Function>(ngraph::NodeVector{mul}, ngraph::ParameterVector{data});
@@ -75,4 +75,38 @@ TEST(TransformationTests, ConvertDivideNegative) {
auto res = compare_functions(f, f_ref);
ASSERT_TRUE(res.first) << res.second;
}
}
TEST(TransformationTests, ConvertDivideScalar) {
std::shared_ptr<ngraph::Function> f(nullptr), f_ref(nullptr);
{
auto data = std::make_shared<ngraph::opset1::Parameter>(ngraph::element::f32, ngraph::Shape{});
auto divide_constant = ngraph::opset1::Constant::create(ngraph::element::f32, ngraph::Shape{}, {1.5});
auto divide = std::make_shared<ngraph::opset1::Divide>(data, divide_constant);
f = std::make_shared<ngraph::Function>(ngraph::NodeVector{divide}, ngraph::ParameterVector{data});
NGRAPH_CHECK(divide->get_output_partial_shape(0).rank().get_length() == 0);
ngraph::pass::Manager m;
m.register_pass<ngraph::pass::InitNodeInfo>();
m.register_pass<ngraph::pass::ConvertDivide>();
m.run_passes(f);
ASSERT_NO_THROW(check_rt_info(f));
}
{
auto data = std::make_shared<ngraph::opset1::Parameter>(ngraph::element::f32, ngraph::Shape{});
auto divide_constant = ngraph::opset1::Constant::create(ngraph::element::f32, ngraph::Shape{}, {1.5});
auto pow = std::make_shared<ngraph::opset1::Power>(divide_constant,
ngraph::opset1::Constant::create(ngraph::element::f32, ngraph::Shape{}, {-1}));
auto mul = std::make_shared<ngraph::opset1::Multiply>(data, pow);
f_ref = std::make_shared<ngraph::Function>(ngraph::NodeVector{mul}, ngraph::ParameterVector{data});
NGRAPH_CHECK(mul->get_output_partial_shape(0).rank().get_length() == 0);
}
auto res = compare_functions(f, f_ref);
ASSERT_TRUE(res.first) << res.second;
}

View File

@@ -119,6 +119,29 @@ TEST(TransformationTests, ConvertPrecision_ShapeOf) {
ASSERT_FALSE(has_type<ngraph::element::Type_t::f16>(f));
}
TEST(TransformationTests, ConvertPrecision_ConstantRelu) {
std::shared_ptr<Function> f(nullptr);
{
auto input = opset4::Constant::create(element::f16, Shape{1, 1000, 4}, {0});
auto relu1 = std::make_shared<opset4::Relu>(input);
auto relu2 = std::make_shared<opset4::Relu>(relu1);
f = std::make_shared<Function>(NodeVector{relu2}, ParameterVector{});
pass::Manager manager;
static const precisions_array precisions = {
{ ngraph::element::f16, ngraph::element::f32 }
};
manager.register_pass<ngraph::pass::ConvertPrecision>(precisions);
manager.run_passes(f);
}
ASSERT_FALSE(has_type<ngraph::element::Type_t::i64>(f));
ASSERT_FALSE(has_type<ngraph::element::Type_t::f16>(f));
}
TEST(TransformationTests, ConvertPrecision_Convert) {
std::shared_ptr<Function> f(nullptr);
{

View File

@@ -68,7 +68,9 @@ TEST(TransformationTests, LowLatencyLSTM) {
ngraph::pass::Manager manager;
manager.register_pass<ngraph::pass::InitNodeInfo>();
NGRAPH_SUPPRESS_DEPRECATED_START
manager.register_pass<ngraph::pass::LowLatency>();
NGRAPH_SUPPRESS_DEPRECATED_END
manager.register_pass<ngraph::pass::UnrollTensorIterator>();
manager.run_passes(f);
}
@@ -149,7 +151,9 @@ TEST(TransformationTests, LowLatencyGRU) {
ngraph::pass::Manager manager;
manager.register_pass<ngraph::pass::InitNodeInfo>();
NGRAPH_SUPPRESS_DEPRECATED_START
manager.register_pass<ngraph::pass::LowLatency>();
NGRAPH_SUPPRESS_DEPRECATED_END
manager.register_pass<ngraph::pass::UnrollTensorIterator>();
manager.run_passes(f);
@@ -227,7 +231,9 @@ TEST(TransformationTests, LowLatencyRNN) {
ngraph::pass::Manager manager;
manager.register_pass<ngraph::pass::InitNodeInfo>();
NGRAPH_SUPPRESS_DEPRECATED_START
manager.register_pass<ngraph::pass::LowLatency>();
NGRAPH_SUPPRESS_DEPRECATED_END
manager.register_pass<ngraph::pass::UnrollTensorIterator>();
manager.run_passes(f);
@@ -317,7 +323,9 @@ TEST(TransformationTests, LowLatencyLSTMReshape) {
ngraph::pass::Manager manager;
manager.register_pass<ngraph::pass::InitNodeInfo>();
NGRAPH_SUPPRESS_DEPRECATED_START
manager.register_pass<ngraph::pass::LowLatency>();
NGRAPH_SUPPRESS_DEPRECATED_END
manager.register_pass<ngraph::pass::UnrollTensorIterator>();
manager.run_passes(f);
}
@@ -413,7 +421,9 @@ TEST(TransformationTests, LowLatencyLSTM_Loop) {
ngraph::pass::Manager manager;
manager.register_pass<ngraph::pass::InitNodeInfo>();
NGRAPH_SUPPRESS_DEPRECATED_START
manager.register_pass<ngraph::pass::LowLatency>();
NGRAPH_SUPPRESS_DEPRECATED_END
manager.register_pass<ngraph::pass::UnrollTensorIterator>();
manager.run_passes(f);
}

View File

@@ -0,0 +1,829 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <gtest/gtest.h>
#include <string>
#include <memory>
#include <queue>
#include <ngraph/function.hpp>
#include <ngraph/opsets/opset7.hpp>
#include <ngraph/pass/manager.hpp>
#include <transformations/control_flow/unroll_tensor_iterator.hpp>
#include <transformations/init_node_info.hpp>
#include <transformations/common_optimizations/low_latency.hpp>
#include <transformations/serialize.hpp>
#include "common_test_utils/ngraph_test_utils.hpp"
using namespace testing;
using namespace ngraph;
using namespace opset7;
using namespace std;
Output<Node> create_init_subgraph(const Output<Node>& in_node) {
auto const_zero = make_shared<Constant>(in_node.get_element_type(), Shape{1}, 0);
auto shape_of = make_shared<ShapeOf>(in_node);
auto broadcast = make_shared<Broadcast>(const_zero, shape_of);
return broadcast->output(0);
}
Output<Node> insert_identity(const Output<Node>& in_node) {
auto axis_1 = Constant::create(element::i64, Shape{1}, {1});
auto identity_1 = std::make_shared<Unsqueeze>(in_node, axis_1);
return std::make_shared<Squeeze>(identity_1, axis_1);
}
}
std::shared_ptr<Function> createLSTMBody(const std::shared_ptr<Parameter>& Xi,
const std::shared_ptr<Parameter>& H_t,
const std::shared_ptr<Parameter>& C_t,
bool is_loop = false) {
// Body
auto axis = Constant::create(element::i64, Shape{}, {0});
auto squeeze = std::make_shared<Squeeze>(Xi, axis);
auto w_val = std::vector<float>(512 * 16, 0);
auto r_val = std::vector<float>(512 * 128, 0);
auto b_val = std::vector<float>(512, 0);
auto W = Constant::create(element::f32, Shape{512, 16}, w_val);
auto R = Constant::create(element::f32, Shape{512, 128}, r_val);
auto B = Constant::create(element::f32, Shape{512}, b_val);
auto lstm_cell = std::make_shared<LSTMCell>(squeeze, H_t, C_t, W, R, B, 128);
auto res_1 = std::make_shared<Result>(lstm_cell->output(0));
auto unsqueeze = std::make_shared<Unsqueeze>(lstm_cell->output(0), axis);
auto res_2 = std::make_shared<Result>(unsqueeze);
auto res_3 = std::make_shared<Result>(lstm_cell->output(1));
auto func = std::make_shared<Function>(OutputVector{res_1, res_2, res_3},
ParameterVector{Xi, H_t, C_t});
if (is_loop) {
auto body_condition = std::make_shared<Constant>(
element::boolean, Shape{1}, true);
auto cond_res = std::make_shared<Result>(body_condition);
func->add_results({cond_res});
}
return func;
}
TEST(TransformationTests, LowLatency2_LSTM) {
std::shared_ptr<Function> f(nullptr), f_ref(nullptr);
{
auto X = std::make_shared<Parameter>(element::f32, Shape{1, 1, 16});
auto H_init = std::make_shared<Parameter>(element::f32, Shape{1, 128});
auto C_init = std::make_shared<Parameter>(element::f32, Shape{1, 128});
auto Xi = std::make_shared<Parameter>(element::f32, Shape{1, 1, 16});
auto H_t = std::make_shared<Parameter>(element::f32, Shape{1, 128});
auto C_t = std::make_shared<Parameter>(element::f32, Shape{1, 128});
// Body
auto body = createLSTMBody(Xi, H_t, C_t);
auto results = body->get_results();
auto tensor_iterator = std::make_shared<TensorIterator>();
tensor_iterator->set_body(body);
tensor_iterator->set_friendly_name("LSTMTensorIterator");
tensor_iterator->set_merged_input(C_t, C_init, results[2]);
tensor_iterator->set_sliced_input(Xi, X, 0, 1, 1, -1, 0);
tensor_iterator->set_merged_input(H_t, H_init, results[0]);
tensor_iterator->get_iter_value(results[0], -1);
tensor_iterator->get_concatenated_slices(results[1], 0, 1, 1, -1, 0);
auto res_ti_1 = std::make_shared<Result>(tensor_iterator->output(1));
auto res_ti_2 = std::make_shared<Result>(tensor_iterator->output(0));
f = std::make_shared<Function>(NodeVector{res_ti_1, res_ti_2},
ParameterVector{X, H_init, C_init});
pass::Manager manager;
manager.register_pass<pass::InitNodeInfo>();
manager.register_pass<pass::LowLatency2>();
manager.run_passes(f);
ASSERT_NO_THROW(check_rt_info(f));
}
{
auto Xi = std::make_shared<Parameter>(element::f32, Shape{1, 1, 16});
auto H_t = std::make_shared<Parameter>(element::f32, Shape{1, 128});
auto C_t = std::make_shared<Parameter>(element::f32, Shape{1, 128});
const std::string variable_name_H("LSTMTensorIterator/variable0");
const std::string variable_name_C("LSTMTensorIterator/variable1");
auto variable_H = std::make_shared<Variable>(VariableInfo{PartialShape::dynamic(), element::dynamic, variable_name_H});
auto variable_C = std::make_shared<Variable>(VariableInfo{PartialShape::dynamic(), element::dynamic, variable_name_C});
auto read_value_H = std::make_shared<ReadValue>(create_init_subgraph(H_t), variable_H);
auto read_value_C = std::make_shared<ReadValue>(create_init_subgraph(C_t), variable_C);
// Body
auto axis = Constant::create(element::i64, Shape{}, {0});
auto squeeze = std::make_shared<Squeeze>(Xi, axis);
auto w_val = std::vector<float>(512 * 16, 0);
auto r_val = std::vector<float>(512 * 128, 0);
auto b_val = std::vector<float>(512, 0);
auto W = Constant::create(element::f32, Shape{512, 16}, w_val);
auto R = Constant::create(element::f32, Shape{512, 128}, r_val);
auto B = Constant::create(element::f32, Shape{512}, b_val);
auto lstm_cell = std::make_shared<LSTMCell>(squeeze, read_value_H, read_value_C, W, R, B, 128);
auto assign_H = std::make_shared<Assign>(lstm_cell->output(0), variable_H);
auto assign_C = std::make_shared<Assign>(lstm_cell->output(1), variable_C);
auto unsqueeze = std::make_shared<Unsqueeze>(lstm_cell->output(0), axis);
auto res_2 = std::make_shared<Result>(insert_identity(unsqueeze));
auto res_1 = std::make_shared<Result>(insert_identity(lstm_cell->output(0)));
f_ref = std::make_shared<Function>(OutputVector{res_1, res_2}, ParameterVector{Xi, H_t, C_t});
f_ref->add_sinks({assign_C, assign_H});
assign_H->add_control_dependency(read_value_H);
assign_C->add_control_dependency(read_value_C);
}
auto res = compare_functions(f, f_ref);
ASSERT_TRUE(res.first) << res.second;
}
TEST(TransformationTests, LowLatency2_GRU) {
std::shared_ptr<Function> f(nullptr), f_ref(nullptr);
{
auto X = std::make_shared<Parameter>(element::f32, Shape{1, 1, 16});
auto Y = std::make_shared<Parameter>(element::f32, Shape{1, 128});
auto Xi = std::make_shared<Parameter>(element::f32, Shape{1, 1, 16});
auto Yi = std::make_shared<Parameter>(element::f32, Shape{1, 128});
// Body
auto axis = Constant::create(element::i64, Shape{}, {0});
auto squeeze = std::make_shared<Squeeze>(Xi, axis);
auto w_val = std::vector<float>(384 * 16, 0);
auto r_val = std::vector<float>(384 * 128, 0);
auto b_val = std::vector<float>(384, 0);
auto W = Constant::create(element::f32, Shape{384, 16}, w_val);
auto R = Constant::create(element::f32, Shape{384, 128}, r_val);
auto B = Constant::create(element::f32, Shape{384}, b_val);
auto gru_cell = std::make_shared<GRUCell>(squeeze, Yi, W, R, B, 128);
auto res_1 = std::make_shared<Result>(gru_cell);
auto unsqueeze = std::make_shared<Unsqueeze>(gru_cell, axis);
auto res_2 = std::make_shared<Result>(unsqueeze);
auto body = std::make_shared<Function>(OutputVector{res_1, res_2}, ParameterVector{Xi, Yi});
auto tensor_iterator = std::make_shared<TensorIterator>();
tensor_iterator->set_body(body);
tensor_iterator->set_sliced_input(Xi, X, 0, 1, 1, -1, 0);
tensor_iterator->set_merged_input(Yi, Y, res_1);
auto out0 = tensor_iterator->get_iter_value(res_1, -1);
auto out1 = tensor_iterator->get_concatenated_slices(res_2, 0, 1, 1, -1, 0);
auto res_ti_1 = std::make_shared<Result>(tensor_iterator->output(1));
f = std::make_shared<Function>(NodeVector{res_ti_1}, ParameterVector{X, Y});
pass::Manager manager;
manager.register_pass<pass::InitNodeInfo>();
manager.register_pass<pass::LowLatency2>();
manager.run_passes(f);
ASSERT_NO_THROW(check_rt_info(f));
}
{
auto Xi = std::make_shared<Parameter>(element::f32, Shape{1, 1, 16});
auto H_t = std::make_shared<Parameter>(element::f32, Shape{1, 128});
const std::string variable_name_H("GRUTensorIterator/variable0");
auto variable_H = std::make_shared<Variable>(VariableInfo{PartialShape::dynamic(), element::dynamic, variable_name_H});
auto read_value_H = std::make_shared<ReadValue>(create_init_subgraph(H_t), variable_H);
// Body
auto axis = Constant::create(element::i64, Shape{}, {0});
auto squeeze = std::make_shared<Squeeze>(Xi, axis);
auto w_val = std::vector<float>(384 * 16, 0);
auto r_val = std::vector<float>(384 * 128, 0);
auto b_val = std::vector<float>(384, 0);
auto W = Constant::create(element::f32, Shape{384, 16}, w_val);
auto R = Constant::create(element::f32, Shape{384, 128}, r_val);
auto B = Constant::create(element::f32, Shape{384}, b_val);
auto rnn_cell = std::make_shared<GRUCell>(squeeze, read_value_H, W, R, B, 128);
auto assign_H = std::make_shared<Assign>(rnn_cell->output(0), variable_H);
auto res_1 = std::make_shared<Result>(assign_H);
auto unsqueeze = std::make_shared<Unsqueeze>(rnn_cell->output(0), axis);
auto res_2 = std::make_shared<Result>(insert_identity(unsqueeze));
f_ref = std::make_shared<Function>(ResultVector {res_2}, ParameterVector{Xi, H_t});
f_ref->add_sinks({assign_H});
assign_H->add_control_dependency(read_value_H);
}
auto res = compare_functions(f, f_ref);
ASSERT_TRUE(res.first) << res.second;
}
TEST(TransformationTests, LowLatency2_RNN) {
std::shared_ptr<Function> f(nullptr), f_ref(nullptr);
{
auto X = std::make_shared<Parameter>(element::f32, Shape{1, 1, 16});
auto Y = std::make_shared<Parameter>(element::f32, Shape{1, 128});
auto Xi = std::make_shared<Parameter>(element::f32, Shape{1, 1, 16});
auto Yi = std::make_shared<Parameter>(element::f32, Shape{1, 128});
// Body
auto axis = Constant::create(element::i64, Shape{}, {0});
auto squeeze = std::make_shared<Squeeze>(Xi, axis);
auto w_val = std::vector<float>(128 * 16, 0);
auto r_val = std::vector<float>(128 * 128, 0);
auto b_val = std::vector<float>(128, 0);
auto W = Constant::create(element::f32, Shape{128, 16}, w_val);
auto R = Constant::create(element::f32, Shape{128, 128}, r_val);
auto B = Constant::create(element::f32, Shape{128}, b_val);
auto rnn_cell = std::make_shared<RNNCell>(squeeze, Yi, W, R, B, 128);
auto res_1 = std::make_shared<Result>(rnn_cell);
auto unsqueeze = std::make_shared<Unsqueeze>(rnn_cell, axis);
auto res_2 = std::make_shared<Result>(unsqueeze);
auto body = std::make_shared<Function>(OutputVector{res_1, res_2}, ParameterVector{Xi,
Yi});
auto tensor_iterator = std::make_shared<TensorIterator>();
tensor_iterator->set_body(body);
tensor_iterator->set_sliced_input(Xi, X, 0, 1, 1, -1, 0);
tensor_iterator->set_merged_input(Yi, Y, res_1);
auto out0 = tensor_iterator->get_iter_value(res_1, -1);
auto out1 = tensor_iterator->get_concatenated_slices(res_2, 0, 1, 1, -1, 0);
auto res_ti_1 = std::make_shared<Result>(tensor_iterator->output(1));
f = std::make_shared<Function>(NodeVector{res_ti_1}, ParameterVector{X, Y});
pass::Manager manager;
manager.register_pass<pass::InitNodeInfo>();
manager.register_pass<pass::LowLatency2>();
manager.run_passes(f);
ASSERT_NO_THROW(check_rt_info(f));
}
{
auto Xi = std::make_shared<Parameter>(element::f32, Shape{1, 1, 16});
auto H_t = std::make_shared<Parameter>(element::f32, Shape{1, 128});
const std::string variable_name_H("RNNTensorIterator/variable0");
auto variable_H = std::make_shared<Variable>(VariableInfo{PartialShape::dynamic(), element::dynamic, variable_name_H});
auto read_value_H = std::make_shared<ReadValue>(create_init_subgraph(H_t), variable_H);
// Body
auto axis = Constant::create(element::i64, Shape{}, {0});
auto squeeze = std::make_shared<Squeeze>(Xi, axis);
auto w_val = std::vector<float>(128 * 16, 0);
auto r_val = std::vector<float>(128 * 128, 0);
auto b_val = std::vector<float>(128, 0);
auto W = Constant::create(element::f32, Shape{128, 16}, w_val);
auto R = Constant::create(element::f32, Shape{128, 128}, r_val);
auto B = Constant::create(element::f32, Shape{128}, b_val);
auto rnn_cell = std::make_shared<RNNCell>(squeeze, read_value_H, W, R, B, 128);
auto assign_H = std::make_shared<Assign>(rnn_cell->output(0), variable_H);
auto res_1 = std::make_shared<Result>(assign_H);
auto unsqueeze = std::make_shared<Unsqueeze>(rnn_cell->output(0), axis);
auto res_2 = std::make_shared<Result>(insert_identity(unsqueeze));
f_ref = std::make_shared<Function>(ResultVector{res_2}, ParameterVector{Xi, H_t});
f_ref->add_sinks({assign_H});
assign_H->add_control_dependency(read_value_H);
}
auto res = compare_functions(f, f_ref);
ASSERT_TRUE(res.first) << res.second;
}
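
// The TI is first built for two iterations (X: {2, 1, 16}), then reshaped to a single
// iteration; LowLatency2 can then unroll it into a bare LSTMCell with H/C state variables.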
TEST(TransformationTests, LowLatency2_LSTMReshape) {
std::shared_ptr<Function> f(nullptr), f_ref(nullptr);
{
auto X = std::make_shared<Parameter>(element::f32, Shape{2, 1, 16});
auto H = std::make_shared<Parameter>(element::f32, Shape{1, 128});
auto C = std::make_shared<Parameter>(element::f32, Shape{1, 128});
auto Xi = std::make_shared<Parameter>(element::f32, Shape{1, 1, 16});
auto H_t = std::make_shared<Parameter>(element::f32, Shape{1, 128});
auto C_t = std::make_shared<Parameter>(element::f32, Shape{1, 128});
// Body
auto body = createLSTMBody(Xi, H_t, C_t);
auto results = body->get_results();
auto tensor_iterator = std::make_shared<TensorIterator>();
tensor_iterator->set_body(body);
tensor_iterator->set_merged_input(C_t, C, results[2]);
tensor_iterator->set_sliced_input(Xi, X, 0, 1, 1, -1, 0);
tensor_iterator->set_merged_input(H_t, H, results[0]);
auto out0 = tensor_iterator->get_iter_value(results[0], -1);
auto out1 = tensor_iterator->get_concatenated_slices(results[1], 0, 1, 1, -1, 0);
auto res_ti_1 = std::make_shared<Result>(tensor_iterator->output(1));
auto res_ti_2 = std::make_shared<Result>(tensor_iterator->output(0));
f = std::make_shared<Function>(NodeVector{res_ti_1, res_ti_2}, ParameterVector{X, H, C});
// Reshape
// change the number of iteration of TI. 2 -> 1
auto new_X = std::make_shared<Parameter>(element::f32, Shape{1, 1, 16});
f->replace_parameter(0, new_X);
f->validate_nodes_and_infer_types();
pass::Manager manager;
manager.register_pass<pass::InitNodeInfo>();
manager.register_pass<pass::LowLatency2>();
manager.run_passes(f);
ASSERT_NO_THROW(check_rt_info(f));
}
{
auto Xi = std::make_shared<Parameter>(element::f32, Shape{1, 1, 16});
auto H_t = std::make_shared<Parameter>(element::f32, Shape{1, 128});
auto C_t = std::make_shared<Parameter>(element::f32, Shape{1, 128});
const std::string variable_name_H("LSTMTensorIterator/variable0");
const std::string variable_name_C("LSTMTensorIterator/variable1");
auto variable_H = std::make_shared<Variable>(VariableInfo{PartialShape::dynamic(), element::dynamic, variable_name_H});
auto variable_C = std::make_shared<Variable>(VariableInfo{PartialShape::dynamic(), element::dynamic, variable_name_C});
auto read_value_H = std::make_shared<ReadValue>(create_init_subgraph(H_t), variable_H);
auto read_value_C = std::make_shared<ReadValue>(create_init_subgraph(C_t), variable_C);
// Body
auto axis = Constant::create(element::i64, Shape{}, {0});
auto squeeze = std::make_shared<Squeeze>(Xi, axis);
auto w_val = std::vector<float>(512 * 16, 0);
auto r_val = std::vector<float>(512 * 128, 0);
auto b_val = std::vector<float>(512, 0);
auto W = Constant::create(element::f32, Shape{512, 16}, w_val);
auto R = Constant::create(element::f32, Shape{512, 128}, r_val);
auto B = Constant::create(element::f32, Shape{512}, b_val);
auto lstm_cell = std::make_shared<LSTMCell>(squeeze, read_value_H, read_value_C, W, R, B, 128);
auto assign_H = std::make_shared<Assign>(lstm_cell->output(0), variable_H);
auto assign_C = std::make_shared<Assign>(lstm_cell->output(1), variable_C);
auto unsqueeze = std::make_shared<Unsqueeze>(lstm_cell->output(0), axis);
auto res_2 = std::make_shared<Result>(insert_identity(unsqueeze));
auto res_1 = std::make_shared<Result>(insert_identity(lstm_cell->output(0)));
f_ref = std::make_shared<Function>(OutputVector{res_1, res_2}, ParameterVector{Xi, H_t, C_t});
f_ref->add_sinks({assign_C, assign_H});
assign_H->add_control_dependency(read_value_H);
assign_C->add_control_dependency(read_value_C);
}
auto res = compare_functions(f, f_ref);
ASSERT_TRUE(res.first) << res.second;
}
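
// Same scenario as above, but with a single-iteration Loop (trip_count = 1) instead of
// a TensorIterator.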
TEST(TransformationTests, LowLatency2_LSTM_Loop) {
std::shared_ptr<Function> f(nullptr), f_ref(nullptr);
{
auto X = std::make_shared<Parameter>(element::f32, Shape{1, 1, 16});
auto H_init = std::make_shared<Parameter>(element::f32, Shape{1, 128});
auto C_init = std::make_shared<Parameter>(element::f32, Shape{1, 128});
auto Xi = std::make_shared<Parameter>(element::f32, Shape{1, 1, 16});
auto H_t = std::make_shared<Parameter>(element::f32, Shape{1, 128});
auto C_t = std::make_shared<Parameter>(element::f32, Shape{1, 128});
// Body
auto body = createLSTMBody(Xi, H_t, C_t, true);
auto results = body->get_results();
auto trip_count =
std::make_shared<Constant>(element::i64, Shape{}, 1);
auto exec_condition =
std::make_shared<Constant>(element::boolean, Shape{}, true);
auto loop = std::make_shared<Loop>(trip_count, exec_condition);
loop->set_special_body_ports({-1, 3});
loop->set_function(body);
loop->set_friendly_name("LSTMLoop");
loop->set_merged_input(C_t, C_init, results[2]);
loop->set_sliced_input(Xi, X, 0, 1, 1, -1, 0);
loop->set_merged_input(H_t, H_init, results[0]);
auto out0 = loop->get_iter_value(results[0], -1);
auto out1 = loop->get_concatenated_slices(results[1], 0, 1, 1, -1, 0);
auto res_ti_1 = std::make_shared<Result>(loop->output(1));
auto res_ti_2 = std::make_shared<Result>(loop->output(0));
f = std::make_shared<Function>(NodeVector{res_ti_1, res_ti_2},
ParameterVector{X, H_init, C_init});
pass::Manager manager;
manager.register_pass<pass::InitNodeInfo>();
manager.register_pass<pass::LowLatency2>();
manager.run_passes(f);
ASSERT_NO_THROW(check_rt_info(f));
}
{
auto Xi = std::make_shared<Parameter>(element::f32, Shape{1, 1, 16});
auto H_t = std::make_shared<Parameter>(element::f32, Shape{1, 128});
auto C_t = std::make_shared<Parameter>(element::f32, Shape{1, 128});
const std::string variable_name_H("LSTMTensorIterator/variable0");
const std::string variable_name_C("LSTMTensorIterator/variable1");
auto variable_H = std::make_shared<Variable>(VariableInfo{PartialShape::dynamic(), element::dynamic, variable_name_H});
auto variable_C = std::make_shared<Variable>(VariableInfo{PartialShape::dynamic(), element::dynamic, variable_name_C});
auto read_value_H = std::make_shared<ReadValue>(create_init_subgraph(H_t), variable_H);
auto read_value_C = std::make_shared<ReadValue>(create_init_subgraph(C_t), variable_C);
// Body
auto axis = Constant::create(element::i64, Shape{}, {0});
auto squeeze = std::make_shared<Squeeze>(Xi, axis);
auto w_val = std::vector<float>(512 * 16, 0);
auto r_val = std::vector<float>(512 * 128, 0);
auto b_val = std::vector<float>(512, 0);
auto W = Constant::create(element::f32, Shape{512, 16}, w_val);
auto R = Constant::create(element::f32, Shape{512, 128}, r_val);
auto B = Constant::create(element::f32, Shape{512}, b_val);
auto lstm_cell = std::make_shared<LSTMCell>(squeeze, read_value_H, read_value_C, W, R, B, 128);
auto assign_H = std::make_shared<Assign>(lstm_cell->output(0), variable_H);
auto assign_C = std::make_shared<Assign>(lstm_cell->output(1), variable_C);
auto unsqueeze = std::make_shared<Unsqueeze>(lstm_cell->output(0), axis);
auto res_2 = std::make_shared<Result>(insert_identity(unsqueeze));
auto res_1 = std::make_shared<Result>(insert_identity(lstm_cell->output(0)));
f_ref = std::make_shared<Function>(OutputVector{res_1, res_2}, ParameterVector{Xi, H_t, C_t});
f_ref->add_sinks({assign_C, assign_H});
assign_H->add_control_dependency(read_value_H);
assign_C->add_control_dependency(read_value_C);
}
auto res = compare_functions(f, f_ref);
ASSERT_TRUE(res.first) << res.second;
}
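
// With ITER_CNT > 1 the TensorIterator cannot be unrolled; LowLatency2 keeps it and
// attaches ReadValue/Assign to the merged H/C inputs and the corresponding iter values.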
TEST(TransformationTests, LowLatency2_LSTM_several_iterations) {
constexpr int ITER_CNT = 5;
std::shared_ptr<Function> f(nullptr), f_ref(nullptr);
{
auto X = std::make_shared<Parameter>(element::f32, Shape{ITER_CNT, 1, 16});
auto H = std::make_shared<Parameter>(element::f32, Shape{1, 128});
auto C = std::make_shared<Parameter>(element::f32, Shape{1, 128});
auto Xi = std::make_shared<Parameter>(element::f32, Shape{1, 1, 16});
auto H_t = std::make_shared<Parameter>(element::f32, Shape{1, 128});
auto C_t = std::make_shared<Parameter>(element::f32, Shape{1, 128});
// Body
auto body = createLSTMBody(Xi, H_t, C_t);
auto results = body->get_results();
auto tensor_iterator = std::make_shared<TensorIterator>();
tensor_iterator->set_body(body);
tensor_iterator->set_merged_input(C_t, C, results[2]);
tensor_iterator->set_sliced_input(Xi, X, 0, 1, 1, -1, 0);
tensor_iterator->set_merged_input(H_t, H, results[0]);
auto out0 = tensor_iterator->get_iter_value(results[0], -1);
auto out1 = tensor_iterator->get_concatenated_slices(results[1], 0, 1, 1, -1, 0);
auto res_ti_1 = std::make_shared<Result>(tensor_iterator->output(1));
auto res_ti_2 = std::make_shared<Result>(tensor_iterator->output(0));
f = std::make_shared<Function>(NodeVector{res_ti_1, res_ti_2}, ParameterVector{X, H, C});
pass::Manager manager;
manager.register_pass<pass::InitNodeInfo>();
manager.register_pass<pass::LowLatency2>();
manager.run_passes(f);
ASSERT_NO_THROW(check_rt_info(f));
}
// Reference function: the TensorIterator is not unrolled; the state is carried by
// ReadValue/Assign wired around it.
{
auto X = std::make_shared<Parameter>(element::f32, Shape{ITER_CNT, 1, 16});
auto H = std::make_shared<Parameter>(element::f32, Shape{1, 128});
auto C = std::make_shared<Parameter>(element::f32, Shape{1, 128});
const std::string variable_name_H("LSTMTensorIterator/variable0");
const std::string variable_name_C("LSTMTensorIterator/variable1");
auto variable_H = std::make_shared<Variable>(VariableInfo{PartialShape::dynamic(), element::dynamic, variable_name_H});
auto variable_C = std::make_shared<Variable>(VariableInfo{PartialShape::dynamic(), element::dynamic, variable_name_C});
auto read_value_H = std::make_shared<ReadValue>(create_init_subgraph(H), variable_H);
auto read_value_C = std::make_shared<ReadValue>(create_init_subgraph(C), variable_C);
// Body parameters
auto Xi = std::make_shared<Parameter>(element::f32, Shape{1, 1, 16});
auto H_t = std::make_shared<Parameter>(element::f32, Shape{1, 128});
auto C_t = std::make_shared<Parameter>(element::f32, Shape{1, 128});
// Body
auto axis = Constant::create(element::i64, Shape{}, {0});
auto squeeze = std::make_shared<Squeeze>(Xi, axis);
auto w_val = std::vector<float>(512 * 16, 0);
auto r_val = std::vector<float>(512 * 128, 0);
auto b_val = std::vector<float>(512, 0);
auto W = Constant::create(element::f32, Shape{512, 16}, w_val);
auto R = Constant::create(element::f32, Shape{512, 128}, r_val);
auto B = Constant::create(element::f32, Shape{512}, b_val);
auto lstm_cell = std::make_shared<LSTMCell>(squeeze, H_t, C_t, W, R, B, 128);
auto res_1 = std::make_shared<Result>(lstm_cell->output(0));
auto unsqueeze = std::make_shared<Unsqueeze>(lstm_cell, axis);
auto res_2 = std::make_shared<Result>(unsqueeze);
auto res_3 = std::make_shared<Result>(lstm_cell->output(1));
auto body = std::make_shared<Function>(OutputVector{res_1, res_2, res_3},
ParameterVector{Xi, H_t, C_t});
auto tensor_iterator = std::make_shared<TensorIterator>();
tensor_iterator->set_body(body);
tensor_iterator->set_merged_input(C_t, read_value_C, res_3);
tensor_iterator->set_sliced_input(Xi, X, 0, 1, 1, -1, 0);
tensor_iterator->set_merged_input(H_t, read_value_H, res_1);
auto out0 = tensor_iterator->get_iter_value(res_1, -1);
auto out1 = tensor_iterator->get_concatenated_slices(res_2, 0, 1, 1, -1, 0);
auto out2 = tensor_iterator->get_iter_value(res_3, -1);
auto assign_H = std::make_shared<Assign>(out0, variable_H);
auto assign_C = std::make_shared<Assign>(out2, variable_C);
auto outer_res_2 = std::make_shared<Result>(out1);
auto outer_res_1 = std::make_shared<Result>(out0);
f_ref = std::make_shared<Function>(OutputVector{outer_res_1, outer_res_2}, ParameterVector{X, H, C});
f_ref->add_sinks({assign_C, assign_H});
assign_H->add_control_dependency(read_value_H);
assign_C->add_control_dependency(read_value_C);
}
auto res = compare_functions(f, f_ref);
ASSERT_TRUE(res.first) << res.second;
}
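
// Loop variant with a data-dependent trip count (ShapeOf/Gather over X); reshaping X
// from 10 to 1 iteration lets LowLatency2 unroll the Loop into a bare LSTMCell.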
TEST(TransformationTests, LowLatency2_LSTM_Loop_Reshape) {
std::shared_ptr<Function> f(nullptr), f_ref(nullptr);
{
auto X = std::make_shared<Parameter>(element::f32, Shape{10, 1, 16});
auto H_init = std::make_shared<Parameter>(element::f32, Shape{1, 128});
auto C_init = std::make_shared<Parameter>(element::f32, Shape{1, 128});
auto Xi = std::make_shared<Parameter>(element::f32, Shape{1, 1, 16});
auto H_t = std::make_shared<Parameter>(element::f32, Shape{1, 128});
auto C_t = std::make_shared<Parameter>(element::f32, Shape{1, 128});
// Body
auto body = createLSTMBody(Xi, H_t, C_t, true);
auto results = body->get_results();
auto shape_of = std::make_shared<ShapeOf>(X);
const auto trip_count = std::make_shared<Gather>(shape_of, Constant::create(ngraph::element::i64, {1}, {0}),
Constant::create(ngraph::element::i64, {1}, {0}));
auto exec_condition =
std::make_shared<Constant>(element::boolean, Shape{}, true);
auto loop = std::make_shared<Loop>(trip_count, exec_condition);
loop->set_special_body_ports({-1, 3});
loop->set_function(body);
loop->set_friendly_name("LSTMLoop");
loop->set_merged_input(C_t, C_init, results[2]);
loop->set_sliced_input(Xi, X, 0, 1, 1, -1, 0);
loop->set_merged_input(H_t, H_init, results[0]);
auto out0 = loop->get_iter_value(results[0], -1);
auto out1 = loop->get_concatenated_slices(results[1], 0, 1, 1, -1, 0);
auto res_ti_1 = std::make_shared<Result>(loop->output(1));
auto res_ti_2 = std::make_shared<Result>(loop->output(0));
f = std::make_shared<Function>(NodeVector{res_ti_1, res_ti_2},
ParameterVector{X, H_init, C_init});
// Reshape
// change the number of iteration of Loop. 10 -> 1
auto new_X = std::make_shared<Parameter>(element::f32, Shape{1, 1, 16});
f->replace_parameter(0, new_X);
f->validate_nodes_and_infer_types();
pass::Manager manager;
manager.register_pass<pass::InitNodeInfo>();
manager.register_pass<pass::LowLatency2>();
manager.run_passes(f);
ASSERT_NO_THROW(check_rt_info(f));
}
{
auto Xi = std::make_shared<Parameter>(element::f32, Shape{1, 1, 16});
auto H_t = std::make_shared<Parameter>(element::f32, Shape{1, 128});
auto C_t = std::make_shared<Parameter>(element::f32, Shape{1, 128});
const std::string variable_name_H("LSTMTensorIterator/variable0");
const std::string variable_name_C("LSTMTensorIterator/variable1");
auto variable_H = std::make_shared<Variable>(VariableInfo{PartialShape::dynamic(), element::dynamic, variable_name_H});
auto variable_C = std::make_shared<Variable>(VariableInfo{PartialShape::dynamic(), element::dynamic, variable_name_C});
auto read_value_H = std::make_shared<ReadValue>(create_init_subgraph(H_t), variable_H);
auto read_value_C = std::make_shared<ReadValue>(create_init_subgraph(C_t), variable_C);
// Body
auto axis = Constant::create(element::i64, Shape{}, {0});
auto squeeze = std::make_shared<Squeeze>(Xi, axis);
auto w_val = std::vector<float>(512 * 16, 0);
auto r_val = std::vector<float>(512 * 128, 0);
auto b_val = std::vector<float>(512, 0);
auto W = Constant::create(element::f32, Shape{512, 16}, w_val);
auto R = Constant::create(element::f32, Shape{512, 128}, r_val);
auto B = Constant::create(element::f32, Shape{512}, b_val);
auto lstm_cell = std::make_shared<LSTMCell>(squeeze, read_value_H, read_value_C, W, R, B, 128);
auto assign_H = std::make_shared<Assign>(lstm_cell->output(0), variable_H);
auto assign_C = std::make_shared<Assign>(lstm_cell->output(1), variable_C);
auto unsqueeze = std::make_shared<Unsqueeze>(lstm_cell->output(0), axis);
auto res_2 = std::make_shared<Result>(insert_identity(unsqueeze));
auto res_1 = std::make_shared<Result>(insert_identity(lstm_cell->output(0)));
f_ref = std::make_shared<Function>(OutputVector{res_1, res_2}, ParameterVector{Xi, H_t, C_t});
f_ref->add_sinks({assign_C, assign_H});
assign_H->add_control_dependency(read_value_H);
assign_C->add_control_dependency(read_value_C);
}
auto res = compare_functions(f, f_ref);
ASSERT_TRUE(res.first) << res.second;
}
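
// Multi-iteration Loop (trip_count = 10): the Loop is preserved and the state is wired
// through ReadValue/Assign outside of it. The `true` argument to LowLatency2 is assumed
// to be use_const_initializer, selecting a constant initial value for the state.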
TEST(TransformationTests, LowLatency2_LSTM_Loop_several_iterations) {
std::shared_ptr<Function> f(nullptr), f_ref(nullptr);
{
auto X = std::make_shared<Parameter>(element::f32, Shape{10, 1, 16});
auto H_init = std::make_shared<Parameter>(element::f32, Shape{1, 128});
auto C_init = std::make_shared<Parameter>(element::f32, Shape{1, 128});
auto Xi = std::make_shared<Parameter>(element::f32, Shape{1, 1, 16});
auto H_t = std::make_shared<Parameter>(element::f32, Shape{1, 128});
auto C_t = std::make_shared<Parameter>(element::f32, Shape{1, 128});
// Body
auto body = createLSTMBody(Xi, H_t, C_t, true);
auto results = body->get_results();
auto trip_count =
std::make_shared<Constant>(element::i64, Shape{}, 10);
auto exec_condition =
std::make_shared<Constant>(element::boolean, Shape{}, true);
auto loop = std::make_shared<Loop>(trip_count, exec_condition);
loop->set_special_body_ports({-1, 3});
loop->set_function(body);
loop->set_friendly_name("LSTMLoop");
loop->set_merged_input(C_t, C_init, results[2]);
loop->set_sliced_input(Xi, X, 0, 1, 1, -1, 0);
loop->set_merged_input(H_t, H_init, results[0]);
auto out0 = loop->get_iter_value(results[0], -1);
auto out1 = loop->get_concatenated_slices(results[1], 0, 1, 1, -1, 0);
auto res_ti_1 = std::make_shared<Result>(loop->output(1));
auto res_ti_2 = std::make_shared<Result>(loop->output(0));
f = std::make_shared<Function>(NodeVector{res_ti_1, res_ti_2},
ParameterVector{X, H_init, C_init});
pass::Manager manager;
manager.register_pass<pass::InitNodeInfo>();
manager.register_pass<pass::LowLatency2>(true);
manager.run_passes(f);
ASSERT_NO_THROW(check_rt_info(f));
}
{
auto X = std::make_shared<Parameter>(element::f32, Shape{10, 1, 16});
auto H = std::make_shared<Parameter>(element::f32, Shape{1, 128});
auto C = std::make_shared<Parameter>(element::f32, Shape{1, 128});
const std::string variable_name_H("LSTMTensorIterator/variable0");
const std::string variable_name_C("LSTMTensorIterator/variable1");
auto variable_H = std::make_shared<Variable>(VariableInfo{PartialShape::dynamic(), element::dynamic, variable_name_H});
auto variable_C = std::make_shared<Variable>(VariableInfo{PartialShape::dynamic(), element::dynamic, variable_name_C});
auto read_value_H = std::make_shared<ReadValue>(create_init_subgraph(H), variable_H);
auto read_value_C = std::make_shared<ReadValue>(create_init_subgraph(C), variable_C);
// Body parameters
auto Xi = std::make_shared<Parameter>(element::f32, Shape{1, 1, 16});
auto H_t = std::make_shared<Parameter>(element::f32, Shape{1, 128});
auto C_t = std::make_shared<Parameter>(element::f32, Shape{1, 128});
// Body
auto axis = Constant::create(element::i64, Shape{}, {0});
auto squeeze = std::make_shared<Squeeze>(Xi, axis);
auto w_val = std::vector<float>(512 * 16, 0);
auto r_val = std::vector<float>(512 * 128, 0);
auto b_val = std::vector<float>(512, 0);
auto W = Constant::create(element::f32, Shape{512, 16}, w_val);
auto R = Constant::create(element::f32, Shape{512, 128}, r_val);
auto B = Constant::create(element::f32, Shape{512}, b_val);
auto lstm_cell = std::make_shared<LSTMCell>(squeeze, H_t, C_t, W, R, B, 128);
auto res_1 = std::make_shared<Result>(lstm_cell->output(0));
auto unsqueeze = std::make_shared<Unsqueeze>(lstm_cell->output(0), axis);
auto res_2 = std::make_shared<Result>(unsqueeze);
auto res_3 = std::make_shared<Result>(lstm_cell->output(1));
auto body_condition = std::make_shared<Constant>(
element::boolean, Shape{1}, true);
auto body = std::make_shared<Function>(OutputVector{res_1, res_2, res_3, body_condition},
ParameterVector{Xi, H_t, C_t});
auto trip_count =
std::make_shared<Constant>(element::i64, Shape{}, 10);
auto exec_condition =
std::make_shared<Constant>(element::boolean, Shape{}, true);
auto loop = std::make_shared<Loop>(trip_count, exec_condition);
loop->set_special_body_ports({-1, 3});
loop->set_function(body);
loop->set_friendly_name("LSTMLoop");
loop->set_merged_input(C_t, read_value_C, res_3);
loop->set_sliced_input(Xi, X, 0, 1, 1, -1, 0);
loop->set_merged_input(H_t, read_value_H, res_1);
auto out0 = loop->get_iter_value(res_1, -1);
auto out1 = loop->get_concatenated_slices(res_2, 0, 1, 1, -1, 0);
auto out3 = loop->get_iter_value(res_3, -1);
auto assign_H = std::make_shared<Assign>(out0, variable_H);
auto assign_C = std::make_shared<Assign>(out3, variable_C);
auto outer_res_2 = std::make_shared<Result>(out1);
auto outer_res_1 = std::make_shared<Result>(out0);
f_ref = std::make_shared<Function>(OutputVector{outer_res_1, outer_res_2}, ParameterVector{X, H, C});
f_ref->add_sinks({assign_C, assign_H});
assign_H->add_control_dependency(read_value_H);
assign_C->add_control_dependency(read_value_C);
}
auto res = compare_functions(f, f_ref);
ASSERT_TRUE(res.first) << res.second;
}
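
// Compatibility check: applying the deprecated LowLatency (v1) pass and then
// LowLatency2 on the same network must run without throwing.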
TEST(TransformationTests, LowLatencyLSTM_LLTv1_LLTv2) {
std::shared_ptr<Function> f(nullptr), f_ref(nullptr);
{
auto X = std::make_shared<Parameter>(element::f32, Shape{1, 1, 16});
auto H_init = std::make_shared<Parameter>(element::f32, Shape{1, 128});
auto C_init = std::make_shared<Parameter>(element::f32, Shape{1, 128});
auto Xi = std::make_shared<Parameter>(element::f32, Shape{1, 1, 16});
auto H_t = std::make_shared<Parameter>(element::f32, Shape{1, 128});
auto C_t = std::make_shared<Parameter>(element::f32, Shape{1, 128});
// Body
auto axis = Constant::create(element::i64, Shape{}, {0});
auto squeeze = std::make_shared<Squeeze>(Xi, axis);
auto w_val = std::vector<float>(512 * 16, 0);
auto r_val = std::vector<float>(512 * 128, 0);
auto b_val = std::vector<float>(512, 0);
auto W = Constant::create(element::f32, Shape{512, 16}, w_val);
auto R = Constant::create(element::f32, Shape{512, 128}, r_val);
auto B = Constant::create(element::f32, Shape{512}, b_val);
auto lstm_cell = std::make_shared<LSTMCell>(squeeze, H_t, C_t, W, R, B, 128);
auto res_1 = std::make_shared<Result>(lstm_cell->output(0));
auto unsqueeze = std::make_shared<Unsqueeze>(lstm_cell->output(0), axis);
auto res_2 = std::make_shared<Result>(unsqueeze);
auto res_3 = std::make_shared<Result>(lstm_cell->output(1));
auto body = std::make_shared<Function>(OutputVector{res_1, res_2, res_3}, ParameterVector{Xi, H_t, C_t});
auto tensor_iterator = std::make_shared<TensorIterator>();
tensor_iterator->set_body(body);
tensor_iterator->set_friendly_name("LSTMTensorIterator");
tensor_iterator->set_merged_input(C_t, C_init, res_3);
tensor_iterator->set_sliced_input(Xi, X, 0, 1, 1, -1, 0);
tensor_iterator->set_merged_input(H_t, H_init, res_1);
auto out0 = tensor_iterator->get_iter_value(res_1, -1);
auto out1 = tensor_iterator->get_concatenated_slices(res_2, 0, 1, 1, -1, 0);
auto res_ti_1 = std::make_shared<Result>(tensor_iterator->output(1));
auto res_ti_2 = std::make_shared<Result>(tensor_iterator->output(0));
f = std::make_shared<Function>(NodeVector{res_ti_1, res_ti_2},
ParameterVector{X, H_init, C_init});
auto f_2 = ngraph::clone_function(*f);
pass::Manager manager_2;
manager_2.register_pass<pass::InitNodeInfo>();
NGRAPH_SUPPRESS_DEPRECATED_START
manager_2.register_pass<ngraph::pass::LowLatency>();
NGRAPH_SUPPRESS_DEPRECATED_END
EXPECT_NO_THROW(manager_2.run_passes(f_2));
pass::Manager manager;
manager.register_pass<pass::InitNodeInfo>();
NGRAPH_SUPPRESS_DEPRECATED_START
manager.register_pass<ngraph::pass::LowLatency>();
NGRAPH_SUPPRESS_DEPRECATED_END
// LLT v2 doesn't insert Assign/ReadValue ops here (they were already inserted by v1),
// it only unrolls the TI/Loop
manager.register_pass<pass::LowLatency2>();
EXPECT_NO_THROW(manager.run_passes(f));
}
}

View File

@@ -0,0 +1,205 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <gtest/gtest.h>
#include <string>
#include <memory>
#include <ngraph/function.hpp>
#include <ngraph/opsets/opset7.hpp>
#include <ngraph/pass/manager.hpp>
#include <transformations/common_optimizations/split_squeeze_concat_fusion.hpp>
#include <transformations/init_node_info.hpp>
#include <transformations/utils/utils.hpp>
#include "common_test_utils/ngraph_test_utils.hpp"
using namespace testing;
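
// Split(axis=2, 4 parts) + per-output Squeeze(axis=2) + Concat(axis=4) is equivalent to
// moving the split axis next to the concat axis and folding both with a single Reshape:
// Transpose{0,1,3,4,2,5} turns {1,2,4,640,20,2} into {1,2,640,20,4,2}, which is then
// reshaped to {1,2,640,20,8} -- exactly what the fused reference function builds.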
TEST(TransformationTests, SplitSqueezeConcatFusion) {
size_t num_splits = 4;
std::shared_ptr<ngraph::Function> f(nullptr), f_ref(nullptr);
{
auto input = std::make_shared<ngraph::opset7::Parameter>(ngraph::element::f32, ngraph::Shape{ 1, 2, num_splits, 640, 20, 2 });
auto split_axis = ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape{}, { 2 });
auto split = std::make_shared<ngraph::opset7::Split>(input, split_axis, num_splits);
ngraph::OutputVector squeeze_vec(num_splits);
for (size_t i = 0; i < squeeze_vec.size(); i++) {
auto squeeze_axis = ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape{ 1 }, { 2 });
squeeze_vec[i] = std::make_shared<ngraph::opset7::Squeeze>(split->output(i), squeeze_axis)->output(0);
}
auto concat = std::make_shared<ngraph::opset7::Concat>(squeeze_vec, 4);
f = std::make_shared<ngraph::Function>(ngraph::NodeVector{ concat }, ngraph::ParameterVector{ input });
ngraph::pass::Manager manager;
manager.register_pass<ngraph::pass::InitNodeInfo>();
manager.register_pass<ngraph::pass::SplitSqueezeConcatFusion>();
manager.run_passes(f);
ASSERT_NO_THROW(check_rt_info(f));
}
{
auto input = std::make_shared<ngraph::opset7::Parameter>(ngraph::element::f32, ngraph::Shape{ 1, 2, num_splits, 640, 20, 2 });
auto transpose_order = ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape{ 6 }, { 0, 1, 3, 4, 2, 5 });
auto transpose = std::make_shared<ngraph::opset7::Transpose>(input, transpose_order);
auto reshape_shape = ngraph::opset7::Constant::create<int64_t>(ngraph::element::i64, ngraph::Shape{ 5 },
{ 1, 2, 640, 20, 2 * (int64_t)num_splits });
auto reshape = std::make_shared<ngraph::opset7::Reshape>(transpose, reshape_shape, false);
f_ref = std::make_shared<ngraph::Function>(ngraph::NodeVector{ reshape }, ngraph::ParameterVector{ input });
}
auto res = compare_functions(f, f_ref);
ASSERT_TRUE(res.first) << res.second;
}
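
// Negative case: the Concat consumes only num_splits - 1 of the Split outputs,
// so the fusion must not apply and the function stays unchanged.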
TEST(TransformationTests, SplitSqueezeConcatFusionNegativeCaseNotAllSplitOutputsGoToSqueeze) {
size_t num_splits = 4;
std::shared_ptr<ngraph::Function> f(nullptr), f_ref(nullptr);
{
auto input = std::make_shared<ngraph::opset7::Parameter>(ngraph::element::f32, ngraph::Shape{ 1, 2, num_splits, 640, 20, 2 });
auto split_axis = ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape{}, { 2 });
auto split = std::make_shared<ngraph::opset7::Split>(input, split_axis, num_splits);
ngraph::OutputVector squeeze_vec(num_splits - 1);
for (size_t i = 0; i < squeeze_vec.size(); i++) {
auto squeeze_axis = ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape{ 1 }, { 2 });
squeeze_vec[i] = std::make_shared<ngraph::opset7::Squeeze>(split->output(i), squeeze_axis)->output(0);
}
auto concat = std::make_shared<ngraph::opset7::Concat>(squeeze_vec, 4);
f = std::make_shared<ngraph::Function>(ngraph::NodeVector{ concat }, ngraph::ParameterVector{ input });
f_ref = std::make_shared<ngraph::Function>(ngraph::NodeVector{ concat }, ngraph::ParameterVector{ input });
ngraph::pass::Manager manager;
manager.register_pass<ngraph::pass::InitNodeInfo>();
manager.register_pass<ngraph::pass::SplitSqueezeConcatFusion>();
manager.run_passes(f);
ASSERT_NO_THROW(check_rt_info(f));
}
{
auto input = std::make_shared<ngraph::opset7::Parameter>(ngraph::element::f32, ngraph::Shape{ 1, 2, num_splits, 640, 20, 2 });
auto split_axis = ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape{}, { 2 });
auto split = std::make_shared<ngraph::opset7::Split>(input, split_axis, num_splits);
ngraph::OutputVector squeeze_vec(num_splits - 1);
for (size_t i = 0; i < squeeze_vec.size(); i++) {
auto squeeze_axis = ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape{ 1 }, { 2 });
squeeze_vec[i] = std::make_shared<ngraph::opset7::Squeeze>(split->output(i), squeeze_axis)->output(0);
}
auto concat = std::make_shared<ngraph::opset7::Concat>(squeeze_vec, 4);
f_ref = std::make_shared<ngraph::Function>(ngraph::NodeVector{ concat }, ngraph::ParameterVector{ input });
}
auto res = compare_functions(f, f_ref);
ASSERT_TRUE(res.first) << res.second;
}
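
// Negative case: the Split outputs reach the Concat out of order (outputs 1 and 2
// are swapped), so the pattern does not match and the function stays unchanged.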
TEST(TransformationTests, SplitSqueezeConcatFusionNegativeCaseSplitOutputsGoInDifferentOrder) {
size_t num_splits = 4;
std::shared_ptr<ngraph::Function> f(nullptr), f_ref(nullptr);
{
auto input = std::make_shared<ngraph::opset7::Parameter>(ngraph::element::f32, ngraph::Shape{ 1, 2, num_splits, 640, 20, 2 });
auto split_axis = ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape{}, { 2 });
auto split = std::make_shared<ngraph::opset7::Split>(input, split_axis, num_splits);
ngraph::OutputVector squeeze_vec(num_splits);
for (size_t i = 0; i < squeeze_vec.size(); i++) {
auto squeeze_axis = ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape{ 1 }, { 2 });
squeeze_vec[i] = std::make_shared<ngraph::opset7::Squeeze>(split->output(i), squeeze_axis)->output(0);
}
std::swap(squeeze_vec[1], squeeze_vec[2]);
auto concat = std::make_shared<ngraph::opset7::Concat>(squeeze_vec, 4);
f = std::make_shared<ngraph::Function>(ngraph::NodeVector{ concat }, ngraph::ParameterVector{ input });
f_ref = std::make_shared<ngraph::Function>(ngraph::NodeVector{ concat }, ngraph::ParameterVector{ input });
ngraph::pass::Manager manager;
manager.register_pass<ngraph::pass::InitNodeInfo>();
manager.register_pass<ngraph::pass::SplitSqueezeConcatFusion>();
manager.run_passes(f);
ASSERT_NO_THROW(check_rt_info(f));
}
{
auto input = std::make_shared<ngraph::opset7::Parameter>(ngraph::element::f32, ngraph::Shape{ 1, 2, num_splits, 640, 20, 2 });
auto split_axis = ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape{}, { 2 });
auto split = std::make_shared<ngraph::opset7::Split>(input, split_axis, num_splits);
ngraph::OutputVector squeeze_vec(num_splits);
for (size_t i = 0; i < squeeze_vec.size(); i++) {
auto squeeze_axis = ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape{ 1 }, { 2 });
squeeze_vec[i] = std::make_shared<ngraph::opset7::Squeeze>(split->output(i), squeeze_axis)->output(0);
}
std::swap(squeeze_vec[1], squeeze_vec[2]);
auto concat = std::make_shared<ngraph::opset7::Concat>(squeeze_vec, 4);
f_ref = std::make_shared<ngraph::Function>(ngraph::NodeVector{ concat }, ngraph::ParameterVector{ input });
}
auto res = compare_functions(f, f_ref);
ASSERT_TRUE(res.first) << res.second;
}
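
// Negative case: the Squeeze axis (0) differs from the Split axis (2), so the
// fusion must not apply and the function stays unchanged.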
TEST(TransformationTests, SplitSqueezeConcatFusionNegativeCaseSplitAxisDifferentFromSqueezeAxis) {
size_t num_splits = 4;
std::shared_ptr<ngraph::Function> f(nullptr), f_ref(nullptr);
{
auto input = std::make_shared<ngraph::opset7::Parameter>(ngraph::element::f32, ngraph::Shape{ 1, 2, num_splits, 640, 20, 2 });
auto split_axis = ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape{}, { 2 });
auto split = std::make_shared<ngraph::opset7::Split>(input, split_axis, num_splits);
ngraph::OutputVector squeeze_vec(num_splits);
for (size_t i = 0; i < squeeze_vec.size(); i++) {
auto squeeze_axis = ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape{ 1 }, { 0 });
squeeze_vec[i] = std::make_shared<ngraph::opset7::Squeeze>(split->output(i), squeeze_axis)->output(0);
}
auto concat = std::make_shared<ngraph::opset7::Concat>(squeeze_vec, 4);
f = std::make_shared<ngraph::Function>(ngraph::NodeVector{ concat }, ngraph::ParameterVector{ input });
f_ref = std::make_shared<ngraph::Function>(ngraph::NodeVector{ concat }, ngraph::ParameterVector{ input });
ngraph::pass::Manager manager;
manager.register_pass<ngraph::pass::InitNodeInfo>();
manager.register_pass<ngraph::pass::SplitSqueezeConcatFusion>();
manager.run_passes(f);
ASSERT_NO_THROW(check_rt_info(f));
}
{
auto input = std::make_shared<ngraph::opset7::Parameter>(ngraph::element::f32, ngraph::Shape{ 1, 2, num_splits, 640, 20, 2 });
auto split_axis = ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape{}, { 2 });
auto split = std::make_shared<ngraph::opset7::Split>(input, split_axis, num_splits);
ngraph::OutputVector squeeze_vec(num_splits);
for (size_t i = 0; i < squeeze_vec.size(); i++) {
auto squeeze_axis = ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape{ 1 }, { 0 });
squeeze_vec[i] = std::make_shared<ngraph::opset7::Squeeze>(split->output(i), squeeze_axis)->output(0);
}
auto concat = std::make_shared<ngraph::opset7::Concat>(squeeze_vec, 4);
f_ref = std::make_shared<ngraph::Function>(ngraph::NodeVector{ concat }, ngraph::ParameterVector{ input });
}
auto res = compare_functions(f, f_ref);
ASSERT_TRUE(res.first) << res.second;
}

View File

@@ -201,3 +201,68 @@ INSTANTIATE_TEST_CASE_P(TransposeSinkingSqueeze, TransposeSinking, testing::Comb
testing::Values(
ngraph::opset6::Squeeze::type_info)));
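
// Two consecutive Transposes whose orders compose to the identity
// ({0,2,3,4,1} followed by {0,4,1,2,3}) are eliminated entirely.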
TEST(TransformationTests, TransposeFuseEliminatesTranspose) {
std::shared_ptr<ngraph::Function> f(nullptr), f_ref(nullptr);
{
auto input = std::make_shared<ngraph::opset6::Parameter>(ngraph::element::f32, ngraph::Shape{ 1, 2, 640, 20, 2 });
auto tr1_order = ngraph::opset6::Constant::create(ngraph::element::i64, ngraph::Shape{ 5 }, { 0, 2, 3, 4, 1 });
auto transpose1 = std::make_shared<ngraph::opset6::Transpose>(input, tr1_order);
auto tr2_order = ngraph::opset6::Constant::create(ngraph::element::i64, ngraph::Shape{ 5 }, { 0, 4, 1, 2, 3 });
auto transpose2 = std::make_shared<ngraph::opset6::Transpose>(transpose1, tr2_order);
auto add_const = ngraph::opset6::Constant::create(ngraph::element::f32, ngraph::Shape{ 1 }, { 1 });
auto add = std::make_shared<ngraph::opset6::Add>(transpose2, add_const);
f = std::make_shared<ngraph::Function>(ngraph::NodeVector{ add }, ngraph::ParameterVector{ input });
ngraph::pass::Manager manager;
manager.register_pass<ngraph::pass::InitNodeInfo>();
manager.register_pass<ngraph::pass::TransposeFuse>();
manager.run_passes(f);
ASSERT_NO_THROW(check_rt_info(f));
}
{
auto input = std::make_shared<ngraph::opset6::Parameter>(ngraph::element::f32, ngraph::Shape{ 1, 2, 640, 20, 2 });
auto add_const = ngraph::opset6::Constant::create(ngraph::element::f32, ngraph::Shape{ 1 }, { 1 });
auto add = std::make_shared<ngraph::opset6::Add>(input, add_const);
f_ref = std::make_shared<ngraph::Function>(ngraph::NodeVector{ add }, ngraph::ParameterVector{ input });
}
auto res = compare_functions(f, f_ref);
ASSERT_TRUE(res.first) << res.second;
}
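
// Two consecutive Transposes fuse into one whose order is the composition
// fused[i] = tr1_order[tr2_order[i]], here {0,5,2,3,1,4}.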
TEST(TransformationTests, TransposeFuses) {
std::shared_ptr<ngraph::Function> f(nullptr), f_ref(nullptr);
{
auto input = std::make_shared<ngraph::opset6::Parameter>(ngraph::element::f32, ngraph::Shape{ 1, 2, 640, 20, 2, 2 });
auto tr1_order = ngraph::opset6::Constant::create(ngraph::element::i64, ngraph::Shape{ 6 }, { 0, 5, 1, 2, 3, 4 });
auto transpose1 = std::make_shared<ngraph::opset6::Transpose>(input, tr1_order);
auto tr2_order = ngraph::opset6::Constant::create(ngraph::element::i64, ngraph::Shape{ 6 }, { 0, 1, 3, 4, 2, 5 });
auto transpose2 = std::make_shared<ngraph::opset6::Transpose>(transpose1, tr2_order);
auto add_const = ngraph::opset6::Constant::create(ngraph::element::f32, ngraph::Shape{ 1 }, { 1 });
auto add = std::make_shared<ngraph::opset6::Add>(transpose2, add_const);
f = std::make_shared<ngraph::Function>(ngraph::NodeVector{ add }, ngraph::ParameterVector{ input });
ngraph::pass::Manager manager;
manager.register_pass<ngraph::pass::InitNodeInfo>();
manager.register_pass<ngraph::pass::TransposeFuse>();
manager.run_passes(f);
ASSERT_NO_THROW(check_rt_info(f));
}
{
auto input = std::make_shared<ngraph::opset6::Parameter>(ngraph::element::f32, ngraph::Shape{ 1, 2, 640, 20, 2, 2 });
auto tr_order = ngraph::opset6::Constant::create(ngraph::element::i64, ngraph::Shape{ 6 }, { 0, 5, 2, 3, 1, 4 });
auto transpose = std::make_shared<ngraph::opset6::Transpose>(input, tr_order);
auto add_const = ngraph::opset6::Constant::create(ngraph::element::f32, ngraph::Shape{ 1 }, { 1 });
auto add = std::make_shared<ngraph::opset6::Add>(transpose, add_const);
f_ref = std::make_shared<ngraph::Function>(ngraph::NodeVector{ add }, ngraph::ParameterVector{ input });
}
auto res = compare_functions(f, f_ref);
ASSERT_TRUE(res.first) << res.second;
}

Some files were not shown because too many files have changed in this diff