Merge remote-tracking branch 'upstream/master' into revise_negative
This commit is contained in:
commit
05226050eb
@ -112,6 +112,7 @@ jobs:
|
||||
-DNGRAPH_ONNX_IMPORT_ENABLE=ON
|
||||
-DNGRAPH_ONNX_EDITOR_ENABLE=ON
|
||||
-DENABLE_FASTER_BUILD=ON
|
||||
-DENABLE_STRICT_DEPENDENCIES=OFF
|
||||
-DIE_EXTRA_MODULES=$(OPENVINO_CONTRIB_REPO_DIR)/modules
|
||||
$(REPO_DIR)
|
||||
workingDirectory: $(BUILD_DIR)
|
||||
|
@ -90,7 +90,7 @@ jobs:
|
||||
# Disable errors with Ninja
|
||||
export CXXFLAGS="-Wno-error=unused-command-line-argument"
|
||||
export CFLAGS="-Wno-error=unused-command-line-argument"
|
||||
cmake -GNinja -DVERBOSE_BUILD=ON -DCMAKE_BUILD_TYPE=$(BUILD_TYPE) -DENABLE_PYTHON=ON -DENABLE_TESTS=ON -DIE_EXTRA_MODULES=$(OPENVINO_CONTRIB_REPO_DIR)/modules $(REPO_DIR)
|
||||
cmake -GNinja -DVERBOSE_BUILD=ON -DCMAKE_BUILD_TYPE=$(BUILD_TYPE) -DENABLE_PYTHON=ON -DENABLE_TESTS=ON -DENABLE_STRICT_DEPENDENCIES=OFF -DIE_EXTRA_MODULES=$(OPENVINO_CONTRIB_REPO_DIR)/modules $(REPO_DIR)
|
||||
workingDirectory: $(BUILD_DIR)
|
||||
displayName: 'CMake'
|
||||
|
||||
|
@ -92,7 +92,7 @@ jobs:
|
||||
|
||||
- script: |
|
||||
set PATH=$(WORK_DIR)\ninja-win;%PATH%
|
||||
call "$(MSVS_VARS_PATH)" && cmake -GNinja -DENABLE_FASTER_BUILD=ON -DENABLE_TEMPLATE_PLUGIN=ON -DCMAKE_BUILD_TYPE=$(BUILD_TYPE) -DENABLE_TESTS=ON -DIE_EXTRA_MODULES=$(OPENVINO_CONTRIB_REPO_DIR)\modules -DCMAKE_C_COMPILER:PATH="$(MSVC_COMPILER_PATH)" -DCMAKE_CXX_COMPILER:PATH="$(MSVC_COMPILER_PATH)" $(REPO_DIR)
|
||||
call "$(MSVS_VARS_PATH)" && cmake -GNinja -DENABLE_FASTER_BUILD=ON -DENABLE_TEMPLATE_PLUGIN=ON -DCMAKE_BUILD_TYPE=$(BUILD_TYPE) -DENABLE_TESTS=ON -DENABLE_STRICT_DEPENDENCIES=OFF -DIE_EXTRA_MODULES=$(OPENVINO_CONTRIB_REPO_DIR)\modules -DCMAKE_C_COMPILER:PATH="$(MSVC_COMPILER_PATH)" -DCMAKE_CXX_COMPILER:PATH="$(MSVC_COMPILER_PATH)" $(REPO_DIR)
|
||||
workingDirectory: $(BUILD_DIR)
|
||||
displayName: 'CMake'
|
||||
|
||||
|
@ -6,7 +6,7 @@ ie_dependent_option (ENABLE_MKL_DNN "MKL-DNN plugin for inference engine" ON "X8
|
||||
|
||||
ie_option (ENABLE_TESTS "unit, behavior and functional tests" OFF)
|
||||
|
||||
ie_option (ENABLE_STRICT_DEPENDENCIES "Skip configuring \"convinient\" dependencies for efficient parallel builds" OFF)
|
||||
ie_option (ENABLE_STRICT_DEPENDENCIES "Skip configuring \"convinient\" dependencies for efficient parallel builds" ON)
|
||||
|
||||
ie_dependent_option (ENABLE_CLDNN "clDnn based plugin for inference engine" ON "X86_64;NOT APPLE;NOT MINGW;NOT WINDOWS_STORE;NOT WINDOWS_PHONE" OFF)
|
||||
|
||||
|
@ -22,7 +22,8 @@ $ benchmark_app -m <model.xml> -enforcebf16=false
|
||||
Notice that for quantized (e.g. INT8) models the bfloat16 calculations (of the layers that remain in FP32) is disabled by default.
|
||||
Refer to the [CPU Plugin documentation](supported_plugins/CPU.md) for more details.
|
||||
|
||||
Similarly, the GPU device has a dedicated config key to enable FP16 execution of the layers that remain in FP32 in the quantized models (as the quantization is typically performed on the FP32 models), refer to the ENABLE_FP16_FOR_QUANTIZED_MODELS key in the [GPU Plugin documentation](supported_plugins/GPU.md)
|
||||
Similarly, the GPU device automatically executes FP16 for the layers that remain in FP16 in the quantized models (assuming that the FP16 model was quantized).
|
||||
Refer to the ENABLE_FP16_FOR_QUANTIZED_MODELS key in the [GPU Plugin documentation](supported_plugins/GPU.md).
|
||||
|
||||
## Latency vs. Throughput
|
||||
One way to increase computational efficiency is batching, which combines many (potentially tens) of
|
||||
|
@ -20,8 +20,8 @@
|
||||
* @brief Defines Inference Engine patch version
|
||||
*/
|
||||
|
||||
#define IE_VERSION_MAJOR 2021
|
||||
#define IE_VERSION_MINOR 4
|
||||
#define IE_VERSION_MAJOR 2022
|
||||
#define IE_VERSION_MINOR 1
|
||||
#define IE_VERSION_PATCH 0
|
||||
|
||||
#include "ie_api.h"
|
||||
|
@ -10,13 +10,18 @@
|
||||
namespace GNAPluginNS {
|
||||
namespace GNALimitations {
|
||||
|
||||
constexpr uint32_t bufferMaxSize = 65528;
|
||||
|
||||
constexpr uint32_t convMinFiltersNum = 4;
|
||||
constexpr uint32_t convMaxFiltersNum = 65532;
|
||||
constexpr uint32_t convFiltersNumDivider = 4;
|
||||
constexpr uint32_t convFilterMaxSize = 768;
|
||||
constexpr uint32_t convEachKernelByteAlignment = 16;
|
||||
constexpr uint32_t noOfInputsDivisor = 8;
|
||||
constexpr uint32_t noOfInputsLowPrecDivisor = 16;
|
||||
|
||||
constexpr uint32_t affineMaxBatchSize = 8;
|
||||
|
||||
namespace Cnn2D {
|
||||
struct RangeLimit {
|
||||
uint32_t min;
|
||||
|
@ -370,14 +370,8 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
|
||||
auto minOutValue = quantizedParams->_dst_quant.GetMinValues().front();
|
||||
auto maxOutValue = quantizedParams->_dst_quant.GetMaxValues().front();
|
||||
auto absMax = std::max(std::abs(minOutValue), std::abs(maxOutValue));
|
||||
auto absMin = std::min(std::abs(minOutValue), std::abs(maxOutValue));
|
||||
|
||||
result = (quantizedParams->_dst_quant.GetLevels() - 1) / (maxOutValue - minOutValue);
|
||||
if (0 && fp32eq(absMin, 0.0f) && !fp32eq(absMax, 0.0f)) {
|
||||
result = (quantizedParams->_dst_quant.GetLevels() - 1) / (2 * absMax);
|
||||
}
|
||||
//
|
||||
//result = MAX_VAL_2B_FEAT / absMax;
|
||||
if (std::isinf(result) || fp32eq(absMax, 0.0f)) {
|
||||
result = max_activation_scale_factor;
|
||||
}
|
||||
@ -401,6 +395,7 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
|
||||
(layer.isIdentity() || layer.isFakeQuantize()) && LayerInfo(prevLayer).isWeightableIdentity()) {
|
||||
auto prevLayerQuant = InferenceEngine::getInjectedData<QuantizedLayerParams>(*prevLayer);
|
||||
if (!fp32eq(prevLayerQuant->_src_quant.GetScale(), 1.0f) &&
|
||||
prevLayerQuant->_src_quant.IsStatsSet() &&
|
||||
(prevLayer2 == nullptr || LayerInfo(prevLayer2).has8BOr16BOutput())) {
|
||||
result = prevLayerQuant->_src_quant.GetScale();
|
||||
usePrevScaleFactor = true;
|
||||
|
@ -158,25 +158,27 @@ void GNAGraphCompiler::fillSplitConnections(InferenceEngine::CNNLayerPtr layer)
|
||||
THROW_GNA_LAYER_EXCEPTION(layer) << " outData["<< i << "]" << " connected by " << j <<" connection doesnt connect to functional layer";
|
||||
}
|
||||
|
||||
auto dataOutput = outFunctionalLayer.first->insData[outFunctionalLayer.second].lock();
|
||||
for (int idx : outFunctionalLayer.second) {
|
||||
auto dataOutput = outFunctionalLayer.first->insData[idx].lock();
|
||||
|
||||
padding = std::max(padding, LayerInfo(outFunctionalLayer.first).paddingSize())
|
||||
* dataOutput->getPrecision().size();
|
||||
output_layer_size =
|
||||
InferenceEngine::details::product(begin(dataOutput->getDims()),
|
||||
end(dataOutput->getDims())) * dataOutput->getPrecision().size();
|
||||
padding = std::max(padding, LayerInfo(outFunctionalLayer.first).paddingSize())
|
||||
* dataOutput->getPrecision().size();
|
||||
output_layer_size =
|
||||
InferenceEngine::details::product(begin(dataOutput->getDims()),
|
||||
end(dataOutput->getDims())) * dataOutput->getPrecision().size();
|
||||
|
||||
if (LayerInfo(outFunctionalLayer.first).isAffineFilter()) {
|
||||
size_t aligned64_offset = outFunctionalLayer.first->GetParamAsInt("offset");
|
||||
layerInfoItem.splitOutputLayers.emplace_back(
|
||||
outFunctionalLayer.first,
|
||||
outFunctionalLayer.second,
|
||||
aligned64_offset * dataOutput->getPrecision().size(),
|
||||
output_layer_size);
|
||||
} else {
|
||||
layerInfoItem.splitOutputLayers.emplace_back(
|
||||
outFunctionalLayer.first, outFunctionalLayer.second, split_size, output_layer_size);
|
||||
}
|
||||
if (LayerInfo(outFunctionalLayer.first).isAffineFilter()) {
|
||||
size_t aligned64_offset = outFunctionalLayer.first->GetParamAsInt("offset");
|
||||
layerInfoItem.splitOutputLayers.emplace_back(
|
||||
outFunctionalLayer.first,
|
||||
idx,
|
||||
aligned64_offset * dataOutput->getPrecision().size(),
|
||||
output_layer_size);
|
||||
} else {
|
||||
layerInfoItem.splitOutputLayers.emplace_back(
|
||||
outFunctionalLayer.first, idx, split_size, output_layer_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// in case of unconnected split - we need properly increment size
|
||||
|
@ -155,14 +155,14 @@ inline InferenceEngine::CNNLayerPtr CNNNetPrevLayerSkipCertain(Layer layer, int
|
||||
*/
|
||||
|
||||
template <class Layer>
|
||||
inline std::pair<InferenceEngine::CNNLayerPtr, int> CNNNetCheckNextLayerSkipCertain(Layer layer, int oidx, int iidx, bool bOnlyCheck,
|
||||
inline std::pair<InferenceEngine::CNNLayerPtr, std::vector<int>> CNNNetCheckNextLayerSkipCertain(Layer layer, int oidx, int iidx, bool bOnlyCheck,
|
||||
const std::function<bool(CNNLayerPtr)> &shouldSkip) {
|
||||
if (oidx >= layer->outData.size()) {
|
||||
if (bOnlyCheck) return {nullptr, 0};
|
||||
if (bOnlyCheck) return {nullptr, {}};
|
||||
THROW_GNA_LAYER_EXCEPTION(layer) << " no next output layer for outdata: " << oidx;
|
||||
}
|
||||
if (getInputTo(layer->outData[oidx]).empty() || iidx >= getInputTo(layer->outData[oidx]).size()) {
|
||||
if (bOnlyCheck) return {nullptr, 0};
|
||||
if (bOnlyCheck) return {nullptr, {}};
|
||||
THROW_GNA_LAYER_EXCEPTION(layer) << " no next output layer for outdata: " << oidx << " and inputTo index: " << iidx;
|
||||
}
|
||||
|
||||
@ -174,12 +174,12 @@ inline std::pair<InferenceEngine::CNNLayerPtr, int> CNNNetCheckNextLayerSkipCer
|
||||
|
||||
while (shouldSkip(outLayer->second)) {
|
||||
if (outLayer->second->outData.size() <= new_oidx) {
|
||||
if (bOnlyCheck) return { nullptr, 0 };
|
||||
if (bOnlyCheck) return { nullptr, {} };
|
||||
THROW_GNA_LAYER_EXCEPTION(outLayer->second) << " no next output layer for outdata: " << new_oidx;
|
||||
}
|
||||
|
||||
if (getInputTo(outLayer->second->outData[new_oidx]).size() <= new_iidx) {
|
||||
if (bOnlyCheck) return { nullptr, 0 };
|
||||
if (bOnlyCheck) return { nullptr, {} };
|
||||
THROW_GNA_LAYER_EXCEPTION(outLayer->second) << " no next output layer for outdata: " << new_oidx << " and inputTo index: " << new_iidx;
|
||||
}
|
||||
|
||||
@ -188,11 +188,7 @@ inline std::pair<InferenceEngine::CNNLayerPtr, int> CNNNetCheckNextLayerSkipCer
|
||||
}
|
||||
|
||||
auto insDataIdx = CNNLayerFindInsDataIdxes(layer->outData[new_oidx], outLayer->second);
|
||||
if (insDataIdx.size() != 1) {
|
||||
if (bOnlyCheck) return { nullptr, 0 };
|
||||
THROW_GNA_LAYER_EXCEPTION(layer) << " has multiple connection to " << new_oidx << " outData";
|
||||
}
|
||||
return { outLayer->second, insDataIdx.front() };
|
||||
return { outLayer->second, insDataIdx };
|
||||
}
|
||||
|
||||
/**
|
||||
@ -256,7 +252,7 @@ inline std::pair<InferenceEngine::CNNLayerPtr, int> CNNNetCheckNextLayerSkipCer
|
||||
|
||||
/// @brief alias for strict checkNextLayer (false)
|
||||
template <class Layer>
|
||||
inline std::pair<InferenceEngine::CNNLayerPtr, int> CNNNetGetNextLayerSkipCertain(Layer layer, int oidx, int iidx,
|
||||
inline std::pair<InferenceEngine::CNNLayerPtr, std::vector<int>> CNNNetGetNextLayerSkipCertain(Layer layer, int oidx, int iidx,
|
||||
const std::function<bool(CNNLayerPtr)> &shouldSkip) {
|
||||
return CNNNetCheckNextLayerSkipCertain(layer, oidx, iidx, false, shouldSkip);
|
||||
}
|
||||
|
@ -46,14 +46,10 @@ inline InferenceEngine::DataPtr Get2DReshapedData(InferenceEngine::DataPtr input
|
||||
* @param layer
|
||||
*/
|
||||
inline bool HasTo2DReshapeData(InferenceEngine::CNNLayerPtr layer) {
|
||||
if (GNAPluginNS::LayerInfo(layer).isPower())
|
||||
if (GNAPluginNS::LayerInfo(layer).isPower() || GNAPluginNS::LayerInfo(layer).isCopy())
|
||||
return true;
|
||||
|
||||
if (!GNAPluginNS::LayerInfo(layer).isScaleShift())
|
||||
return false;
|
||||
|
||||
// Don't reshape user-defined ScaleShift layers
|
||||
if (layer->name.rfind("SyntheticScaleShift", 0) == std::string::npos)
|
||||
if (!GNAPluginNS::LayerInfo(layer).isSyntheticScaleShift())
|
||||
return false;
|
||||
|
||||
// Don't reshape the first dnn layer since it breaks groups recognition
|
||||
@ -61,8 +57,7 @@ inline bool HasTo2DReshapeData(InferenceEngine::CNNLayerPtr layer) {
|
||||
return LayerInfo(ptr).isNonValuesChangable();
|
||||
});
|
||||
IE_ASSERT(prevLayer != nullptr);
|
||||
if (LayerInfo(prevLayer).isInput())
|
||||
return false;
|
||||
if (LayerInfo(prevLayer).isInput()) return false;
|
||||
|
||||
// Don't reshape diagonallayers with bias connection
|
||||
return !GNAPluginNS::LayerInfo(getCreatorLayer(layer->insData.front().lock()).lock()).has32BOutput();
|
||||
|
@ -17,6 +17,7 @@
|
||||
#include <mm_malloc.h>
|
||||
#include <serial/headers/2dot2/gna_model_header.hpp>
|
||||
#include <serial/headers/2dot5/gna_model_header.hpp>
|
||||
#include <serial/headers/2dot6/gna_model_header.hpp>
|
||||
|
||||
#endif
|
||||
|
||||
@ -133,10 +134,11 @@ GNAPluginNS::HeaderLatest::ModelHeader GNAModelSerial::ReadHeader(std::istream &
|
||||
}
|
||||
case 5:
|
||||
case 6:
|
||||
case 7:
|
||||
readNBytes(&header, sizeof(HeaderLatest::ModelHeader), is);
|
||||
break;
|
||||
default:
|
||||
THROW_GNA_EXCEPTION << "Imported file unsupported. minor version should have values in range 1 to 4 and is: " << header.version.minor;
|
||||
THROW_GNA_EXCEPTION << "Imported file unsupported. minor version should have values in range 1 to 7 and is: " << header.version.minor;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
@ -154,6 +156,40 @@ GNAPluginNS::HeaderLatest::ModelHeader GNAModelSerial::ReadHeader(std::istream &
|
||||
return header;
|
||||
}
|
||||
|
||||
GNAPluginNS::HeaderLatest::RuntimeEndPoint GNAModelSerial::ReadEndPoint(std::istream &is) {
|
||||
is.exceptions(std::istream::failbit);
|
||||
|
||||
HeaderLatest::RuntimeEndPoint endPoint;
|
||||
switch (modelHeader.version.major) {
|
||||
case 2:
|
||||
switch (modelHeader.version.minor) {
|
||||
case 1:
|
||||
case 2:
|
||||
case 3:
|
||||
case 4:
|
||||
case 5:
|
||||
case 6:
|
||||
{
|
||||
Header2dot6::RuntimeEndPoint tempEndPoint2dot6;
|
||||
readBits(tempEndPoint2dot6, is);
|
||||
endPoint = HeaderLatest::RuntimeEndPoint(tempEndPoint2dot6, modelHeader.nGroup);
|
||||
break;
|
||||
}
|
||||
case 7:
|
||||
readNBytes(&endPoint, sizeof(HeaderLatest::RuntimeEndPoint), is);
|
||||
break;
|
||||
default:
|
||||
THROW_GNA_EXCEPTION << "Imported file unsupported. minor version should have values in range 1 to 7 and is: " << modelHeader.version.minor;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
THROW_GNA_EXCEPTION << "Imported file unsupported. Import for files with major version equal to: "
|
||||
<< modelHeader.version.major << " is not implemented";
|
||||
}
|
||||
|
||||
return endPoint;
|
||||
}
|
||||
|
||||
#define offsetFromBase(field)\
|
||||
getOffsetFromBase(field, #field)
|
||||
|
||||
@ -324,18 +360,6 @@ void GNAModelSerial::Import(void *basePointer,
|
||||
is.read(reinterpret_cast<char*>(basePointer), gnaGraphSize);
|
||||
}
|
||||
|
||||
|
||||
uint32_t guessGrouping(Gna2Model const& model) {
|
||||
if (model.NumberOfOperations == 0 ||
|
||||
model.Operations == nullptr ||
|
||||
model.Operations[0].Operands == nullptr ||
|
||||
model.Operations[0].NumberOfOperands == 0 ||
|
||||
model.Operations[0].Operands[0]->Shape.NumberOfDimensions < 2) {
|
||||
THROW_GNA_EXCEPTION << "Can not guess grouping";
|
||||
}
|
||||
return (std::min)(model.Operations[0].Operands[0]->Shape.Dimensions[0], model.Operations[0].Operands[0]->Shape.Dimensions[1]);
|
||||
}
|
||||
|
||||
void GNAModelSerial::Export(void * basePointer, size_t gnaGraphSize, std::ostream & os) const {
|
||||
os.exceptions(std::ostream::failbit);
|
||||
|
||||
@ -366,6 +390,9 @@ void GNAModelSerial::Export(void * basePointer, size_t gnaGraphSize, std::ostrea
|
||||
out.descriptor_offset = offsetFromBase(ep.descriptor_ptr);
|
||||
out.scaleFactor = ep.scaleFactor;
|
||||
out.element_size = ep.element_size;
|
||||
out.shape = ep.shape;
|
||||
out.layout = ep.layout;
|
||||
out.precision = ep.precision;
|
||||
out.orientation = ep.orientation;
|
||||
return out;
|
||||
};
|
||||
@ -381,7 +408,7 @@ void GNAModelSerial::Export(void * basePointer, size_t gnaGraphSize, std::ostrea
|
||||
header.headerSize = sizeof(HeaderLatest::ModelHeader);
|
||||
header.gnaMemSize = gnaGraphSize;
|
||||
header.layersCount = layers.size();
|
||||
header.nGroup = guessGrouping(*gna2Model);
|
||||
header.nGroup = 1; // just to support the old models
|
||||
header.nInputs = inputs.size();
|
||||
header.nOutputs = outputs.size();
|
||||
header.nTransposeInputs = transposeInputsInfo.size();
|
||||
@ -796,13 +823,22 @@ std::vector<HeaderLatest::RuntimeEndPoint> GNAModelSerial::serializeOutputs(cons
|
||||
std::size_t outputIndex = 0;
|
||||
for (auto const &output : outputsDataMap) {
|
||||
auto outputName = output.first;
|
||||
auto inputDims = output.second->getTensorDesc().getDims();
|
||||
uint32_t elementsCount = static_cast<uint32_t>(InferenceEngine::details::product(inputDims.begin(), inputDims.end()));
|
||||
|
||||
auto outputDims = output.second->getTensorDesc().getDims();
|
||||
HeaderLatest::RuntimeEndPoint::Shape outputShape;
|
||||
outputShape.NumberOfDimensions = outputDims.size();
|
||||
for (size_t i=0; i < outputShape.NumberOfDimensions; ++i) {
|
||||
outputShape.Dimensions[i] = static_cast<uint32_t>(outputDims[i]);
|
||||
}
|
||||
uint32_t elementsCount = static_cast<uint32_t>(InferenceEngine::details::product(outputDims.begin(), outputDims.end()));
|
||||
InferenceEngine::Layout outputLayout = output.second->getLayout();
|
||||
InferenceEngine::Precision::ePrecision outputPrecision = InferenceEngine::Precision::FP32;
|
||||
HeaderLatest::RuntimeEndPoint endPoint(outputsDesc[outputIndex].scale_factor,
|
||||
outputsDesc[outputIndex].ptrs[0],
|
||||
outputsDesc[outputIndex].num_bytes_per_element,
|
||||
elementsCount,
|
||||
outputShape,
|
||||
outputLayout,
|
||||
outputPrecision,
|
||||
outputsDesc[outputIndex].orientation);
|
||||
endPoints.push_back(endPoint);
|
||||
outputIndex++;
|
||||
@ -818,18 +854,26 @@ std::vector<HeaderLatest::RuntimeEndPoint> GNAModelSerial::serializeInputs(const
|
||||
for (auto const& input : inputsDataMap) {
|
||||
auto inputName = input.first;
|
||||
auto inputDims = input.second->getTensorDesc().getDims();
|
||||
|
||||
HeaderLatest::RuntimeEndPoint::Shape inputShape;
|
||||
inputShape.NumberOfDimensions = inputDims.size();
|
||||
for (size_t i=0; i < inputShape.NumberOfDimensions; ++i) {
|
||||
inputShape.Dimensions[i] = static_cast<uint32_t>(inputDims[i]);
|
||||
}
|
||||
double scaleFactor = inputDesc->getScaleFactor(inputIndex);
|
||||
std::vector<void *> descriptor_ptr = inputDesc->getPtrInputsGlobal(inputName);
|
||||
IE_ASSERT(descriptor_ptr.size() > 0);
|
||||
uint32_t element_size = 2u;
|
||||
uint32_t elementsCount = static_cast<uint32_t>(InferenceEngine::details::product(inputDims.begin(), inputDims.end()));
|
||||
intel_dnn_orientation_t orientation = inputDesc->getOrientation(inputName);
|
||||
|
||||
InferenceEngine::Layout inputLayout = input.second->getLayout();
|
||||
InferenceEngine::Precision::ePrecision inputPrecision = InferenceEngine::Precision::FP32;
|
||||
HeaderLatest::RuntimeEndPoint endPoint(scaleFactor,
|
||||
descriptor_ptr[0],
|
||||
element_size,
|
||||
elementsCount,
|
||||
inputShape,
|
||||
inputLayout,
|
||||
inputPrecision,
|
||||
orientation);
|
||||
endPoints.push_back(endPoint);
|
||||
inputIndex++;
|
||||
@ -846,20 +890,24 @@ void GNAModelSerial::ImportInputs(std::istream &is,
|
||||
for (uint32_t inputIndex = 0; inputIndex < modelHeader.nInputs; inputIndex++) {
|
||||
const std::string& name = (modelHeader.version.major == 2 && modelHeader.version.minor >= 3)
|
||||
? inputNames.at(inputIndex) : std::string("input" + std::to_string(inputIndex));
|
||||
HeaderLatest::RuntimeEndPoint input;
|
||||
is.read(reinterpret_cast<char *>(&input), sizeof(input));
|
||||
|
||||
HeaderLatest::RuntimeEndPoint input = ReadEndPoint(is);
|
||||
inputsDesc->getPtrInputsGlobal(name).push_back(reinterpret_cast<float*>(reinterpret_cast<uint8_t *> (basePtr) + input.descriptor_offset));
|
||||
inputsDesc->orientation_in[name] = input.orientation;
|
||||
inputsDesc->bytes_allocated_for_input[name] = input.element_size * input.elements_count;
|
||||
|
||||
auto inputDims = InferenceEngine::SizeVector({modelHeader.nGroup, input.elements_count / modelHeader.nGroup});
|
||||
|
||||
auto inputDims = InferenceEngine::SizeVector();
|
||||
for (auto i = 0; i < input.shape.NumberOfDimensions; ++i) {
|
||||
inputDims.push_back(input.shape.Dimensions[i]);
|
||||
}
|
||||
InferenceEngine::Layout inputLayout = static_cast<InferenceEngine::Layout>(input.layout);
|
||||
InferenceEngine::Precision inputPresicion = InferenceEngine::Precision(static_cast<InferenceEngine::Precision::ePrecision>(input.precision));
|
||||
dataMap[name] = std::make_shared<InferenceEngine::InputInfo>();
|
||||
dataMap[name]->setInputData(std::make_shared<InferenceEngine::Data>(name,
|
||||
InferenceEngine::TensorDesc(
|
||||
InferenceEngine::Precision::FP32,
|
||||
inputPresicion,
|
||||
inputDims,
|
||||
InferenceEngine::Layout::NC)));
|
||||
inputLayout)));
|
||||
inputsDesc->inputScaleFactors.push_back(input.scaleFactor);
|
||||
}
|
||||
}
|
||||
@ -875,8 +923,8 @@ void GNAModelSerial::ImportOutputs(std::istream &is,
|
||||
for (uint32_t outputIndex = 0; outputIndex < modelHeader.nOutputs; outputIndex++) {
|
||||
const std::string& name = (modelHeader.version.major == 2 && modelHeader.version.minor >= 3)
|
||||
? outputNames.at(outputIndex) : std::string("output" + std::to_string(outputIndex));
|
||||
HeaderLatest::RuntimeEndPoint output;
|
||||
is.read(reinterpret_cast<char *>(&output), sizeof(output));
|
||||
|
||||
HeaderLatest::RuntimeEndPoint output = ReadEndPoint(is);
|
||||
OutputDesc description;
|
||||
description.ptrs.push_back(reinterpret_cast<float*>(reinterpret_cast<uint8_t *> (basePtr) + output.descriptor_offset));
|
||||
description.orientation = kDnnInterleavedOrientation;
|
||||
@ -884,12 +932,17 @@ void GNAModelSerial::ImportOutputs(std::istream &is,
|
||||
description.num_bytes_per_element = output.element_size;
|
||||
description.scale_factor = output.scaleFactor;
|
||||
|
||||
auto outputDims = InferenceEngine::SizeVector({modelHeader.nGroup, output.elements_count / modelHeader.nGroup});
|
||||
auto outputDims = InferenceEngine::SizeVector();
|
||||
for (auto i = 0; i < output.shape.NumberOfDimensions; ++i) {
|
||||
outputDims.push_back(output.shape.Dimensions[i]);
|
||||
}
|
||||
InferenceEngine::Layout outputLayout = static_cast<InferenceEngine::Layout>(output.layout);
|
||||
InferenceEngine::Precision outputPresicion = InferenceEngine::Precision(static_cast<InferenceEngine::Precision::ePrecision>(output.precision));
|
||||
dataMap[name] = std::make_shared<InferenceEngine::Data>(name,
|
||||
InferenceEngine::TensorDesc(
|
||||
InferenceEngine::Precision::FP32,
|
||||
outputPresicion,
|
||||
outputDims,
|
||||
InferenceEngine::Layout::NC));
|
||||
outputLayout));
|
||||
desc.at(outputIndex) = description;
|
||||
}
|
||||
}
|
||||
|
@ -138,6 +138,8 @@ private:
|
||||
*/
|
||||
static GNAPluginNS::HeaderLatest::ModelHeader ReadHeader(std::istream &is);
|
||||
|
||||
GNAPluginNS::HeaderLatest::RuntimeEndPoint ReadEndPoint(std::istream &is);
|
||||
|
||||
/**
|
||||
* @brief Import model from FS into preallocated buffer,
|
||||
* buffers for pLayers, and pStructs are allocated here and required manual deallocation using mm_free
|
||||
|
@ -54,12 +54,17 @@
|
||||
#include <transformations/common_optimizations/pull_transpose_through_fq.hpp>
|
||||
#include <transformations/common_optimizations/relu_fake_quantize_fusion.hpp>
|
||||
#include <transformations/common_optimizations/add_fake_quantize_fusion.hpp>
|
||||
#include <transformations/utils/utils.hpp>
|
||||
|
||||
#include "transformations/remove_extra_reshapes.hpp"
|
||||
#include "transformations/insert_transpose_after_convolution_or_pooling.hpp"
|
||||
#include "transformations/insert_transpose_before_matmul.hpp"
|
||||
#include "transformations/reorder_activation_and_pooling.hpp"
|
||||
#include "transformations/swap_input_matmul_gna.hpp"
|
||||
#include "transformations/convert_matmul_to_pointwise_convolution.hpp"
|
||||
#include "transformations/split_convolution_with_large_buffer_size.hpp"
|
||||
|
||||
#include <ngraph/opsets/opset7.hpp>
|
||||
|
||||
#if GNA_LIB_VER == 2
|
||||
#include <gna2-model-api.h>
|
||||
@ -667,6 +672,15 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) {
|
||||
// WA: ConvertPriorBox must be executed before the 1st ConstantFolding pass
|
||||
manager.register_pass<ngraph::pass::ConvertPriorBox>();
|
||||
manager.register_pass<ngraph::pass::CommonOptimizations>();
|
||||
// TODO enable this transformation for networks with convolutions
|
||||
if (!ngraph::op::util::has_op_with_type<ngraph::opset7::Convolution>(graph)) {
|
||||
manager.register_pass<ConvertMatmulWithFqToPointWiseConvolution>();
|
||||
manager.register_pass<ConvertMatmulWithBiasToPointWiseConvolution>();
|
||||
manager.register_pass<ConvertMatmulToPointWiseConvolution>();
|
||||
}
|
||||
manager.register_pass<SplitConvolutionWithFq>();
|
||||
manager.register_pass<SplitConvolutionWithBias>();
|
||||
manager.register_pass<SplitConvolution>();
|
||||
manager.register_pass<InsertTransposeBeforeMatmul>();
|
||||
manager.register_pass<SwapInputMatMul>();
|
||||
manager.register_pass<InsertTransposeAfterConvOrPool>();
|
||||
@ -735,6 +749,7 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) {
|
||||
passes->registerPass<SubstitutePReluPass>();
|
||||
passes->registerPass<SubstituteSoftSignPass>();
|
||||
|
||||
passes->registerPass<BroadcastConstPass>();
|
||||
passes->registerPass<ReorderMaxPoolPass>();
|
||||
passes->registerPass<EltwiseSplitOverChannelsPass>();
|
||||
passes->registerPass<InsertSplitAligningFilterPass>();
|
||||
@ -753,7 +768,6 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) {
|
||||
|
||||
passes->registerPass<InsertIdentityLayerPass>();
|
||||
passes->registerPass<BreakFusingOfOutputLayersPass>();
|
||||
passes->registerPass<BroadcastConstPass>();
|
||||
passes->registerPass<InsertDiagonalLayerPass>();
|
||||
passes->registerPass<HandleMultipleActivationsForTheLayerPass>();
|
||||
#if GNA_LIB_VER == 2
|
||||
@ -1465,7 +1479,11 @@ static InferenceEngine::Layout GetLayoutForDims(const InferenceEngine::SizeVecto
|
||||
Blob::Ptr GNAPlugin::GetOutputBlob(const std::string& name, InferenceEngine::Precision precision) {
|
||||
// need to have intermediate blob for interleave conversion
|
||||
InferenceEngine::Blob::Ptr outputBlob;
|
||||
auto outputDims = outputsDataMap[name]->getTensorDesc().getDims();
|
||||
auto outputDataIt = outputsDataMap.find(name);
|
||||
if (outputDataIt == std::end(outputsDataMap)) {
|
||||
THROW_GNA_EXCEPTION << "Output " << name << " isn't found";
|
||||
}
|
||||
auto outputDims = outputDataIt->second->getTensorDesc().getDims();
|
||||
outputBlob = make_blob_with_precision(TensorDesc(precision, outputDims, GetLayoutForDims(outputDims)));
|
||||
outputBlob->allocate();
|
||||
return outputBlob;
|
||||
@ -1475,7 +1493,11 @@ Blob::Ptr GNAPlugin::GetInputBlob(const std::string& name, InferenceEngine::Prec
|
||||
InferenceEngine::Blob::Ptr inputBlob;
|
||||
// need to have intermediate blob for interleave conversion
|
||||
// TODO: NCHW format support is experimental = c++ MO did insert reshape, while TF mo - not
|
||||
auto inputDims = inputsDataMap[name]->getTensorDesc().getDims();
|
||||
auto inputDataIt = inputsDataMap.find(name);
|
||||
if (inputDataIt == std::end(inputsDataMap)) {
|
||||
THROW_GNA_EXCEPTION << "Input " << name << " isn't found";
|
||||
}
|
||||
auto inputDims = inputDataIt->second->getTensorDesc().getDims();
|
||||
inputBlob = make_blob_with_precision(TensorDesc(precision, inputDims, GetLayoutForDims(inputDims)));
|
||||
inputBlob->allocate();
|
||||
return inputBlob;
|
||||
|
@ -86,7 +86,7 @@ static void insertDiagonalLayerBetween(InferenceEngine::CNNLayerPtr prevLayer,
|
||||
});
|
||||
IE_ASSERT(inputLayer != nullptr);
|
||||
size_t weightsSize = (LayerInfo(prevLayer).has32BOutput() || LayerInfo(inputLayer).isInput()) ?
|
||||
weightsSize = nextLayer->outData[0]->getDims().back() :
|
||||
nextLayer->outData[0]->getDims().back() :
|
||||
Get2DReshapedData(nextLayer->outData[0], 8)->getDims()[1];
|
||||
std::vector<float> weightsValues(weightsSize, fillValue);
|
||||
IE_ASSERT(diagLayer != nullptr);
|
||||
@ -314,6 +314,7 @@ void HandleMultipleActivationsForTheLayerPass::run() {
|
||||
LayerInfo info(inputTo.second);
|
||||
|
||||
if (info.isActivation()) {
|
||||
if (odata->getDims().empty()) continue;
|
||||
if (!activations.empty() && odata->getDims()[0] != 1) {
|
||||
THROW_GNA_EXCEPTION << "Unsupported batch size " << odata->getDims()[0]
|
||||
<< " for diagonal layer insertion";
|
||||
@ -741,12 +742,17 @@ void RemovePermutationsNHWCToNCHWPass::run() {
|
||||
IE_ASSERT(!input_to.empty());
|
||||
auto current_layer = input_to.begin()->second;
|
||||
setNHWCOrder(current_layer->input());
|
||||
while (current_layer != pattern_end) {
|
||||
setNHWCOrder(current_layer->outData[0]);
|
||||
input_to = getInputTo(current_layer->outData[0]);
|
||||
IE_ASSERT(!input_to.empty());
|
||||
current_layer = input_to.begin()->second;
|
||||
}
|
||||
std::function<void(CNNLayerPtr)> propogateNHWCOrderRecursive =
|
||||
[pattern_end, &propogateNHWCOrderRecursive, &setNHWCOrder](CNNLayerPtr current_layer) {
|
||||
if (current_layer == pattern_end) return;
|
||||
for (size_t i = 0; i < current_layer->outData.size(); ++i) {
|
||||
setNHWCOrder(current_layer->outData[i]);
|
||||
auto input_to = getInputTo(current_layer->outData[i]);
|
||||
IE_ASSERT(!input_to.empty());
|
||||
propogateNHWCOrderRecursive(input_to.begin()->second);
|
||||
}
|
||||
};
|
||||
propogateNHWCOrderRecursive(current_layer);
|
||||
|
||||
if (LayerInfo(pattern_start).isPermute() && !getInputTo(pattern_start->outData.front()).empty()) {
|
||||
auto layer_before_permute = CNNNetPrevLayer(pattern_start);
|
||||
@ -1447,21 +1453,19 @@ void EltwiseSplitOverChannelsPass::run() {
|
||||
THROW_GNA_LAYER_EXCEPTION(l) << "number of outputs expected to be 1";
|
||||
}
|
||||
auto oData = l->outData.front();
|
||||
auto out_width = GetDataDimSize(oData, DataDimName::W);
|
||||
auto totalElementsForOutput = details::product(oData->getDims().begin(), oData->getDims().end());
|
||||
auto maxAffineElements = getPassManager()->getPolicy().GNAAffineDiagonalPolicy.limitedTo;
|
||||
if (totalElementsForOutput <= maxAffineElements) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// TODO: for now lets put split of 2 elements as restrictions
|
||||
auto totalSplits = 1 + totalElementsForOutput / maxAffineElements;
|
||||
if (totalSplits > 2) {
|
||||
THROW_GNA_LAYER_EXCEPTION(l) << "split layer over output channels on more than 2 layers unsupported";
|
||||
}
|
||||
|
||||
pass_trace() << "transforming " << LAYER_NAME(l) << " by splitting it to multiple eltwise operations\n";
|
||||
auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(l);
|
||||
|
||||
bool sameInputs = l->insData[0].lock() == l->insData[1].lock();
|
||||
std::vector<CNNLayerPtr> splitLayers(2);
|
||||
for (size_t kThEltwiseInput = 0; kThEltwiseInput != 2; kThEltwiseInput++) {
|
||||
// create split layer
|
||||
@ -1472,31 +1476,38 @@ void EltwiseSplitOverChannelsPass::run() {
|
||||
|
||||
split->insData.push_back(l->insData[kThEltwiseInput]);
|
||||
auto inputDesc = l->insData[kThEltwiseInput].lock()->getTensorDesc();
|
||||
// need to split this desc
|
||||
if (inputDesc.getLayout() != Layout::NC) {
|
||||
THROW_GNA_LAYER_EXCEPTION(l)
|
||||
<< "cannot split over channel: input " << std::to_string(kThEltwiseInput)
|
||||
<< " layout need to be NC";
|
||||
}
|
||||
|
||||
// create split layer outputs
|
||||
for (size_t i = 0;; i++) {
|
||||
auto elements_num = std::min(totalElementsForOutput - i * maxAffineElements,
|
||||
size_t usedElements = 0;
|
||||
for (size_t i = 0; i < totalSplits; i++) {
|
||||
SizeVector newDims;
|
||||
size_t elements_num = std::min(totalElementsForOutput - usedElements,
|
||||
static_cast<size_t>(maxAffineElements));
|
||||
if (inputDesc.getDims().size() == 2) {
|
||||
newDims = SizeVector{1, elements_num};
|
||||
} else {
|
||||
elements_num = elements_num - elements_num % out_width;
|
||||
newDims = SizeVector{1, elements_num / out_width, out_width};
|
||||
}
|
||||
|
||||
SizeVector newDims = {1, elements_num};
|
||||
auto newDesc = TensorDesc(inputDesc.getPrecision(), newDims, inputDesc.getLayout());
|
||||
auto data = std::make_shared<Data>(l->name + "/" + std::to_string(kThEltwiseInput) + "/1", newDesc);
|
||||
getCreatorLayer(data) = split;
|
||||
split->outData.push_back(data);
|
||||
|
||||
if (elements_num != maxAffineElements) {
|
||||
usedElements += elements_num;
|
||||
if (usedElements == totalElementsForOutput) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
// replacing connection X->eltwise to X->split
|
||||
auto oData = CNNLayerFindOutData(l, kThEltwiseInput);
|
||||
oData.second->second = split;
|
||||
|
||||
if (sameInputs) {
|
||||
splitLayers[1] = splitLayers[0];
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// create concatlayer
|
||||
@ -1507,8 +1518,6 @@ void EltwiseSplitOverChannelsPass::run() {
|
||||
concat->outData.push_back(masterEltwise->outData.front());
|
||||
getCreatorLayer(masterEltwise->outData.front()) = concat;
|
||||
|
||||
|
||||
// create new eltwise layers - here 2 hardcode
|
||||
for (size_t k = 0; k != totalSplits; k++) {
|
||||
auto eltwiseRaw = std::make_shared<EltwiseLayer>(
|
||||
LayerParams{l->name + "/eltwise/" + std::to_string(k), "Eltwise", Precision::FP32});
|
||||
@ -1517,7 +1526,6 @@ void EltwiseSplitOverChannelsPass::run() {
|
||||
eltwiseRaw->coeff = masterEltwise->coeff;
|
||||
auto eltwise = quantized ? InferenceEngine::injectData<QuantizedLayerParams>(eltwiseRaw) : eltwiseRaw;
|
||||
|
||||
|
||||
eltwise->insData.push_back(splitLayers[0]->outData[k]);
|
||||
eltwise->insData.push_back(splitLayers[1]->outData[k]);
|
||||
getInputTo(splitLayers[0]->outData[k])[eltwise->name] = eltwise;
|
||||
@ -1529,6 +1537,15 @@ void EltwiseSplitOverChannelsPass::run() {
|
||||
auto data = std::make_shared<Data>(l->name + "/elwise/out/" + std::to_string(k), newDesc);
|
||||
getCreatorLayer(data) = eltwise;
|
||||
eltwise->outData.push_back(data);
|
||||
if (quantized) {
|
||||
auto eltwiseQuant = InferenceEngine::getInjectedData<QuantizedLayerParams>(eltwise);
|
||||
if (quantized->_src_quant.IsStatsSet()) {
|
||||
eltwiseQuant->_src_quant.CopyStats(quantized->_src_quant);
|
||||
}
|
||||
if (quantized->_dst_quant.IsStatsSet()) {
|
||||
eltwiseQuant->_dst_quant.CopyStats(quantized->_dst_quant);
|
||||
}
|
||||
}
|
||||
getInputTo(data)[concat->name] = concat;
|
||||
concat->insData.push_back(data);
|
||||
}
|
||||
@ -1919,13 +1936,20 @@ void FuseFQIntoWeightsPass::run() {
|
||||
}
|
||||
|
||||
GNAFakeQuantizeLayer gnaFakeQuantizeLayer(fqLayer);
|
||||
size_t layers_connected_to_fq_count = getInputTo(fqLayer->outData[0]).size();
|
||||
auto inputTo = getInputTo(fqLayer->outData[0]);
|
||||
size_t layers_connected_to_fq_count = inputTo.size();
|
||||
auto layerBeforeWeightable = fqLayer;
|
||||
while (layers_connected_to_fq_count == 1 && LayerInfo(inputTo.begin()->second).isNonFunctional()) {
|
||||
layerBeforeWeightable = inputTo.begin()->second;
|
||||
inputTo = getInputTo(layerBeforeWeightable->outData[0]);
|
||||
layers_connected_to_fq_count = inputTo.size();
|
||||
}
|
||||
for (int index = 0; index < layers_connected_to_fq_count; index++) {
|
||||
auto weightableLayer = CNNNetGetNextLayerSkipCertain(fqLayer, 0, index, isNonFunctional).first;
|
||||
auto weightableLayer = CNNNetGetNextLayerSkipCertain(layerBeforeWeightable, 0, index, isNonFunctional).first;
|
||||
if (!LayerInfo(weightableLayer).isWeightable()) {
|
||||
continue;
|
||||
}
|
||||
if (weightableLayer->insData.size() != 3) {
|
||||
if (weightableLayer->insData.size() < 2) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -1942,7 +1966,8 @@ void FuseFQIntoWeightsPass::run() {
|
||||
pass_trace() << "found " << LAYER_NAME(fqLayer) << " that will be converted to weights of "
|
||||
<< LAYER_NAME(weightableLayer) << "\n";
|
||||
|
||||
auto biases = LayerUtils::getParamFromInputAsBlob(weightableLayer, biasesIdx);
|
||||
auto biases = weightableLayer->insData.size() == 3 ?
|
||||
LayerUtils::getParamFromInputAsBlob(weightableLayer, biasesIdx) : nullptr;
|
||||
auto quantizedWeights = gnaFakeQuantizeLayer.getConstInputData();
|
||||
|
||||
// 1. broke existing connections - by detaching fq subgraph from rest of graph
|
||||
@ -2149,8 +2174,11 @@ void MoveFakeQuantizeLayerIntoQuantParamsPass :: run() {
|
||||
}
|
||||
GNAFakeQuantizeLayer fqLayer(l);
|
||||
auto prevLayer = CNNNetPrevLayerSkipCertain(*fqLayer, 0, donotSkip);
|
||||
if (prevLayer->outData.size() != 1) {
|
||||
THROW_GNA_LAYER_EXCEPTION(prevLayer) << " fake quantize input that connected to something else not supported";
|
||||
auto prevDataIt = std::find_if(std::begin(prevLayer->outData), std::end(prevLayer->outData), [l](DataPtr data) {
|
||||
return getInputTo(data).find(l->name) != std::end(getInputTo(data));
|
||||
});
|
||||
if (prevDataIt == std::end(prevLayer->outData)) {
|
||||
THROW_GNA_LAYER_EXCEPTION(fqLayer) << "Invalid connection between " << prevLayer->name << " and " << l->name;
|
||||
}
|
||||
|
||||
auto inputRange = fqLayer.getInputRange();
|
||||
@ -2181,8 +2209,18 @@ void MoveFakeQuantizeLayerIntoQuantParamsPass :: run() {
|
||||
quantParamsPrevLayer->_dst_quant.SetMinValues({ outputRange.first[0] }, false);
|
||||
quantParamsPrevLayer->_dst_quant.SetMaxValues({ outputRange.second[0] }, false);
|
||||
|
||||
// Propogate destination statistics to multiply layer if it's set for the next sum/sub layer (is considered as bias)
|
||||
if (LayerInfo(prevLayer).isEltwiseSum() || LayerInfo(prevLayer).isEltwiseSub()) {
|
||||
auto eltwPrevLayer = CNNNetPrevLayerSkipCertain(prevLayer, 0, donotSkip);
|
||||
auto constLayer = CNNNetPrevLayerSkipCertain(prevLayer, 1, donotSkip);
|
||||
if (LayerInfo(eltwPrevLayer).isEltwise() && LayerInfo(constLayer).isConst()) {
|
||||
auto quantParamsEltwLayer = InferenceEngine::getInjectedData<QuantizedLayerParams>(eltwPrevLayer);
|
||||
quantParamsEltwLayer->_dst_quant.CopyStats(quantParamsPrevLayer->_dst_quant);
|
||||
}
|
||||
}
|
||||
|
||||
auto fqQauntParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(l);
|
||||
fqQauntParams->_dst_quant.SetLevels(fqLevels);
|
||||
fqQauntParams->_dst_quant.SetLevels(UINT16_MAX);
|
||||
fqQauntParams->_dst_quant.SetMinValues({ inputRange.first[0] }, true);
|
||||
fqQauntParams->_dst_quant.SetMaxValues({ inputRange.second[0] }, true);
|
||||
fqQauntParams->_dst_quant.SetMinValues({ outputRange.first[0] }, false);
|
||||
@ -2198,7 +2236,7 @@ void MoveFakeQuantizeLayerIntoQuantParamsPass :: run() {
|
||||
// FQ Layer is fused only when previous layer is const, memory or activation layer
|
||||
// or a next layer is activation layer.
|
||||
bool isFQFuseAllowed = allowFQFuse(l);
|
||||
auto prevData = prevLayer->outData.front();
|
||||
auto prevData = *prevDataIt;
|
||||
|
||||
// Find all output layers connected to FQ
|
||||
auto nextLayers = CNNNetGetAllNextLayersSkipCertain(*fqLayer, -1, donotSkip);
|
||||
@ -2207,7 +2245,7 @@ void MoveFakeQuantizeLayerIntoQuantParamsPass :: run() {
|
||||
}
|
||||
|
||||
if (isFQFuseAllowed) {
|
||||
getInputTo(prevLayer->outData.front()).clear();
|
||||
getInputTo(prevData).clear();
|
||||
}
|
||||
|
||||
// Connect all next layers after FQ to the layer that is before FQ
|
||||
@ -2222,7 +2260,7 @@ void MoveFakeQuantizeLayerIntoQuantParamsPass :: run() {
|
||||
for (int insDataIdx : insDatas) {
|
||||
nextLayers[i]->insData[insDataIdx] = prevData;
|
||||
}
|
||||
getInputTo(prevLayer->outData.front())[nextLayers[i]->name] = nextLayers[i];
|
||||
getInputTo(prevData)[nextLayers[i]->name] = nextLayers[i];
|
||||
}
|
||||
|
||||
propagateStatistics(quantParamsPrevLayer, nextLayers[i]);
|
||||
|
@ -0,0 +1,197 @@
|
||||
// Copyright (C) 2021 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <map>
|
||||
#include "backend/dnn_types.h"
|
||||
#include "serial/headers/2dot4/gna_model_header.hpp"
|
||||
#include "serial/headers/2dot6/gna_model_header.hpp"
|
||||
#include "serial/headers/latest/gna_model_header.hpp"
|
||||
#include "gna_data_types.hpp"
|
||||
|
||||
#pragma pack(push, 1)
|
||||
|
||||
namespace GNAPluginNS {
|
||||
namespace Header2dot7 {
|
||||
|
||||
/**
|
||||
Maximal number of supported shape dimensions.
|
||||
*/
|
||||
#define GNA_SHAPE_MAXIMUM_NUMBER_OF_DIMENSIONS 8
|
||||
|
||||
/**
|
||||
* @brief Header version 2.7
|
||||
*/
|
||||
struct ModelHeader {
|
||||
/**
|
||||
*@brief MagicNumber – GNAM in ascii table, equals to hex 0x474e414d
|
||||
*/
|
||||
char gnam[4] = {};
|
||||
/**
|
||||
* @brief if header size is not equal to sizeof ModelHeader - some reserved data append in the end of header
|
||||
* usually it is an indicator of working with version of model different that is current export function produce
|
||||
*/
|
||||
uint32_t headerSize = 0u;
|
||||
struct Version {
|
||||
/**
|
||||
* @details Version of format Major – unsigned int, ex: 0x0001
|
||||
* every change in the header or in the layers definition should be reflected in version change
|
||||
* for backward compatibility new parsers can read old versions of model with certain restrictions
|
||||
*/
|
||||
uint16_t major = 2u;
|
||||
/**
|
||||
* @details Version of Format Minor – unsigned int, corresponding to build revision for example
|
||||
* changes in minor version are not affected layout of model
|
||||
*/
|
||||
uint32_t minor = 7u;
|
||||
} version;
|
||||
/**
|
||||
* @brief Memory required to be allocated using GNAAlloc()
|
||||
*/
|
||||
uint64_t gnaMemSize = 0ull;
|
||||
/**
|
||||
* @brief Number of GNA Layers
|
||||
*/
|
||||
uint64_t layersCount = 0ull;
|
||||
/**
|
||||
* @brief Grouping level
|
||||
* This is depricted field and used for old models only (<=2.6)
|
||||
*/
|
||||
uint32_t nGroup = 0u;
|
||||
|
||||
/**
|
||||
* Convolution related setting - they are affecting input transformation
|
||||
*/
|
||||
uint32_t nRotateRows = 0u;
|
||||
uint32_t nRotateColumns = 0u;
|
||||
bool doRotateInput = false;
|
||||
|
||||
uint32_t nInputs = 0u;
|
||||
uint32_t nOutputs = 0u;
|
||||
|
||||
/**
|
||||
* Convolution related setting - they are affecting output transformation
|
||||
*/
|
||||
uint32_t nRotateOutputRows = 0u;
|
||||
uint32_t nRotateOutputColumns = 0u;
|
||||
bool doRotateOutput = false;
|
||||
|
||||
uint32_t nTransposeInputs = 0u;
|
||||
uint32_t nTransposeOutputs = 0u;
|
||||
|
||||
/**
|
||||
* Reserved Data might be here
|
||||
*/
|
||||
ModelHeader() = default;
|
||||
ModelHeader(GNAPluginNS::Header2dot1::ModelHeader const &old) {
|
||||
gnaMemSize = old.gnaMemSize;
|
||||
layersCount = old.layersCount;
|
||||
nGroup = old.nGroup;
|
||||
nRotateRows = old.nRotateRows;
|
||||
nRotateColumns = old.nRotateColumns;
|
||||
nInputs = old.nInputs;
|
||||
nOutputs = old.nOutputs;
|
||||
version.minor = old.version.minor;
|
||||
}
|
||||
ModelHeader(GNAPluginNS::Header2dot4::ModelHeader const &old) {
|
||||
gnaMemSize = old.gnaMemSize;
|
||||
layersCount = old.layersCount;
|
||||
nGroup = old.nGroup;
|
||||
nRotateRows = old.nRotateRows;
|
||||
nRotateColumns = old.nRotateColumns;
|
||||
nInputs = old.nInputs;
|
||||
nOutputs = old.nOutputs;
|
||||
nRotateOutputRows = old.nRotateOutputRows;
|
||||
nRotateOutputColumns = old.nRotateOutputColumns;
|
||||
doRotateOutput = old.doRotateOutput;
|
||||
version.minor = old.version.minor;
|
||||
}
|
||||
};
|
||||
#pragma pack(pop)
|
||||
|
||||
/*
|
||||
* In runtime endpoint mostly same as in serial version, except of descriptor field
|
||||
*/
|
||||
struct RuntimeEndPoint {
|
||||
/**
|
||||
* if scale factor is different then pased into infer , network might need to be requantized
|
||||
*/
|
||||
float scaleFactor = 0;
|
||||
/**
|
||||
* Pointer descriptor
|
||||
*/
|
||||
void* descriptor_ptr = nullptr;
|
||||
/**
|
||||
* Endpoint resolution in bytes.
|
||||
*/
|
||||
uint32_t element_size = 0;
|
||||
/**
|
||||
* Number of elements
|
||||
*/
|
||||
uint32_t elements_count = 0;
|
||||
/**
|
||||
* Offset in bytes of pointer descriptor
|
||||
*/
|
||||
uint64_t descriptor_offset = 0ull;
|
||||
/**
|
||||
Shape specifying dimension values.
|
||||
*/
|
||||
struct Shape {
|
||||
/**
|
||||
Number of dimensions or rank or order.
|
||||
*/
|
||||
uint32_t NumberOfDimensions = 0;
|
||||
/**
|
||||
array specifying value of each dimension.
|
||||
Set all zeros for scalars.
|
||||
*/
|
||||
uint32_t Dimensions[GNA_SHAPE_MAXIMUM_NUMBER_OF_DIMENSIONS] = {0};
|
||||
} shape;
|
||||
/**
|
||||
* Blob layout
|
||||
*/
|
||||
uint8_t layout = InferenceEngine::Layout::NC;
|
||||
/**
|
||||
* Blob precision
|
||||
*/
|
||||
uint8_t precision = InferenceEngine::Precision::FP32;
|
||||
|
||||
intel_dnn_orientation_t orientation = kDnnUnknownOrientation;
|
||||
|
||||
RuntimeEndPoint() = default;
|
||||
RuntimeEndPoint(const GNAPluginNS::Header2dot6::RuntimeEndPoint &old, uint32_t ngroup) {
|
||||
scaleFactor = old.scaleFactor;
|
||||
descriptor_ptr = old.descriptor_ptr;
|
||||
element_size = old.element_size;
|
||||
elements_count = old.elements_count;
|
||||
orientation = old.orientation;
|
||||
layout = InferenceEngine::Layout::NC;
|
||||
precision = InferenceEngine::Precision::FP32;
|
||||
descriptor_offset = old.descriptor_offset;
|
||||
InferenceEngine::SizeVector dims = {ngroup, elements_count / ngroup};
|
||||
shape.NumberOfDimensions = static_cast<uint32_t>(dims.size());
|
||||
for (auto i = 0; i < dims.size(); i++) {
|
||||
shape.Dimensions[i] = dims[i];
|
||||
}
|
||||
}
|
||||
RuntimeEndPoint(double scaleFactor,
|
||||
void* descriptor_ptr,
|
||||
uint32_t element_size,
|
||||
uint32_t elements_count,
|
||||
Shape shape,
|
||||
uint8_t layout,
|
||||
uint8_t precision,
|
||||
intel_dnn_orientation_t orientation) : scaleFactor(scaleFactor),
|
||||
descriptor_ptr(descriptor_ptr),
|
||||
element_size(element_size),
|
||||
elements_count(elements_count),
|
||||
shape(shape),
|
||||
layout(layout),
|
||||
precision(precision),
|
||||
orientation(orientation) { }
|
||||
};
|
||||
} // namespace Header2dot7
|
||||
} // namespace GNAPluginNS
|
@ -4,11 +4,11 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "serial/headers/2dot6/gna_model_header.hpp"
|
||||
#include "serial/headers/2dot7/gna_model_header.hpp"
|
||||
|
||||
namespace GNAPluginNS {
|
||||
namespace HeaderLatest {
|
||||
using ModelHeader = GNAPluginNS::Header2dot6::ModelHeader;
|
||||
using RuntimeEndPoint = GNAPluginNS::Header2dot6::RuntimeEndPoint;
|
||||
using ModelHeader = GNAPluginNS::Header2dot7::ModelHeader;
|
||||
using RuntimeEndPoint = GNAPluginNS::Header2dot7::RuntimeEndPoint;
|
||||
}
|
||||
}
|
||||
|
@ -0,0 +1,180 @@
|
||||
// Copyright (C) 2021 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "transformations/convert_matmul_to_pointwise_convolution.hpp"
|
||||
|
||||
#include <ngraph/opsets/opset7.hpp>
|
||||
#include <ngraph/pattern/op/or.hpp>
|
||||
#include <ngraph/pattern/op/wrap_type.hpp>
|
||||
|
||||
#include "layers/gna_permute.hpp"
|
||||
#include "backend/gna_limitations.hpp"
|
||||
|
||||
using namespace GNAPluginNS;
|
||||
|
||||
NGRAPH_RTTI_DEFINITION(ConvertMatmulToPointWiseConvolution, "ConvertMatmulToPointWiseConvolution", 0);
|
||||
NGRAPH_RTTI_DEFINITION(ConvertMatmulWithBiasToPointWiseConvolution, "ConvertMatmulWithBiasToPointWiseConvolution", 0);
|
||||
NGRAPH_RTTI_DEFINITION(ConvertMatmulWithFqToPointWiseConvolution, "ConvertMatmulWithFqToPointWiseConvolution", 0);
|
||||
|
||||
static std::tuple<bool, uint32_t, uint32_t, uint32_t> VerifyAndGetConvParams(std::shared_ptr<ngraph::Node> matmul_node) {
|
||||
auto input1_shape = matmul_node->get_input_shape(0);
|
||||
auto input2_shape = matmul_node->get_input_shape(1);
|
||||
auto output_shape = matmul_node->get_output_shape(0);
|
||||
if (input1_shape.size() == 3 && input1_shape.front() == 1) {
|
||||
input1_shape.erase(std::begin(input1_shape));
|
||||
}
|
||||
|
||||
if (input1_shape.size() != 2 || input2_shape.size() != 2 || output_shape.size() < 2) {
|
||||
return std::make_tuple(false, 0, 0, 0);
|
||||
}
|
||||
|
||||
// Check if MatMul or corresponding pointwise convolution are supported by GNA
|
||||
const uint32_t width = input1_shape.front();
|
||||
const uint32_t in_channels = input2_shape.back();
|
||||
const uint32_t out_channels = input2_shape.front();
|
||||
if (input1_shape.front() <= GNALimitations::affineMaxBatchSize ||
|
||||
out_channels % GNALimitations::convFiltersNumDivider != 0 ||
|
||||
out_channels > GNALimitations::convMaxFiltersNum ||
|
||||
in_channels > GNALimitations::convFilterMaxSize) {
|
||||
return std::make_tuple(false, 0, 0, 0);
|
||||
}
|
||||
|
||||
return std::make_tuple(true, width, in_channels, out_channels);
|
||||
}
|
||||
|
||||
static bool Convert(std::shared_ptr<ngraph::Node> matmul_node,
|
||||
std::shared_ptr<ngraph::Node> add,
|
||||
std::shared_ptr<ngraph::Node> bias,
|
||||
std::shared_ptr<ngraph::Node> fq) {
|
||||
bool supported;
|
||||
uint32_t width, in_channels, out_channels;
|
||||
std::tie(supported, width, in_channels, out_channels) = VerifyAndGetConvParams(matmul_node);
|
||||
if (!supported) return false;
|
||||
|
||||
auto input_node = matmul_node->input_value(0).get_node_shared_ptr();
|
||||
auto weights_node = matmul_node->input_value(1).get_node_shared_ptr();
|
||||
auto base_name = matmul_node->get_friendly_name();
|
||||
|
||||
auto reshape_const_before = std::make_shared<ngraph::opset7::Constant>(ngraph::element::Type_t::i64,
|
||||
ngraph::Shape{4},
|
||||
ngraph::Shape{1, 1, width, in_channels});
|
||||
auto reshape_before = std::make_shared<ngraph::opset7::Reshape>(input_node, reshape_const_before, false);
|
||||
reshape_before->set_friendly_name(base_name + "/reshape_in");
|
||||
|
||||
auto transpose_before = std::make_shared<ngraph::opset7::Transpose>(reshape_before,
|
||||
ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape{4},
|
||||
GetPermuteOrder(InferenceEngine::Layout::NHWC, InferenceEngine::Layout::NCHW)));
|
||||
transpose_before->set_friendly_name(base_name + "/transpose_in");
|
||||
|
||||
auto weights_reshape_const = std::make_shared<ngraph::opset7::Constant>(ngraph::element::Type_t::i64,
|
||||
ngraph::Shape{4}, ngraph::Shape{out_channels, in_channels, 1, 1});
|
||||
auto weights_reshaped = std::make_shared<ngraph::opset7::Reshape>(weights_node, weights_reshape_const, false);
|
||||
|
||||
std::shared_ptr<ngraph::Node> conv_node = std::make_shared<ngraph::opset7::Convolution>(transpose_before, weights_reshaped,
|
||||
ngraph::Strides{1, 1}, ngraph::CoordinateDiff{0, 0}, ngraph::CoordinateDiff{0, 0},
|
||||
ngraph::Strides{1, 1}, ngraph::op::PadType::VALID);
|
||||
conv_node->set_friendly_name(base_name + "/conv");
|
||||
|
||||
std::shared_ptr<ngraph::Node> root_node = matmul_node;
|
||||
if (bias != nullptr) {
|
||||
conv_node = std::make_shared<ngraph::opset7::Add>(conv_node, bias);
|
||||
root_node = add;
|
||||
}
|
||||
|
||||
if (fq != nullptr) {
|
||||
conv_node = fq->clone_with_new_inputs({conv_node, fq->input_value(1), fq->input_value(2),
|
||||
fq->input_value(3), fq->input_value(4)});
|
||||
root_node = fq;
|
||||
}
|
||||
|
||||
auto transpose_after = std::make_shared<ngraph::opset7::Transpose>(conv_node,
|
||||
ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape{4},
|
||||
GetPermuteOrder(InferenceEngine::Layout::NCHW, InferenceEngine::Layout::NHWC)));
|
||||
transpose_after->set_friendly_name(base_name + "/transpose_out");
|
||||
|
||||
auto output_shape = matmul_node->get_output_shape(0);
|
||||
output_shape[output_shape.size() - 1] = out_channels;
|
||||
output_shape[output_shape.size() - 2] = width;
|
||||
auto reshape_const_after = std::make_shared<ngraph::opset7::Constant>(ngraph::element::Type_t::i64,
|
||||
ngraph::Shape{output_shape.size()},
|
||||
output_shape);
|
||||
auto reshape_after = std::make_shared<ngraph::opset7::Reshape>(transpose_after, reshape_const_after, false);
|
||||
reshape_after->set_friendly_name(base_name);
|
||||
|
||||
ngraph::replace_node(root_node, reshape_after);
|
||||
return true;
|
||||
}
|
||||
|
||||
ConvertMatmulToPointWiseConvolution::ConvertMatmulToPointWiseConvolution() {
|
||||
auto const_input = ngraph::pattern::wrap_type<ngraph::opset7::Constant>();
|
||||
auto const_fq = ngraph::pattern::wrap_type<ngraph::opset7::FakeQuantize>({const_input,
|
||||
ngraph::pattern::wrap_type<ngraph::opset7::Constant>(),
|
||||
ngraph::pattern::wrap_type<ngraph::opset7::Constant>(),
|
||||
ngraph::pattern::wrap_type<ngraph::opset7::Constant>(),
|
||||
ngraph::pattern::wrap_type<ngraph::opset7::Constant>()});
|
||||
auto second_input = std::make_shared<ngraph::pattern::op::Or>(ngraph::OutputVector{const_input, const_fq});
|
||||
auto matmul = ngraph::pattern::wrap_type<ngraph::opset7::MatMul>({ngraph::pattern::any_input(), second_input});
|
||||
|
||||
ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher &m) {
|
||||
const auto& pattern_map = m.get_pattern_value_map();
|
||||
return Convert(pattern_map.at(matmul).get_node_shared_ptr(), nullptr, nullptr, nullptr);
|
||||
};
|
||||
|
||||
auto m = std::make_shared<ngraph::pattern::Matcher>(matmul, "ConvertMatmulToPointWiseConvolution");
|
||||
this->register_matcher(m, callback);
|
||||
}
|
||||
|
||||
ConvertMatmulWithBiasToPointWiseConvolution::ConvertMatmulWithBiasToPointWiseConvolution() {
|
||||
auto const_input = ngraph::pattern::wrap_type<ngraph::opset7::Constant>();
|
||||
auto const_fq = ngraph::pattern::wrap_type<ngraph::opset7::FakeQuantize>({const_input,
|
||||
ngraph::pattern::wrap_type<ngraph::opset7::Constant>(),
|
||||
ngraph::pattern::wrap_type<ngraph::opset7::Constant>(),
|
||||
ngraph::pattern::wrap_type<ngraph::opset7::Constant>(),
|
||||
ngraph::pattern::wrap_type<ngraph::opset7::Constant>()});
|
||||
auto second_input = std::make_shared<ngraph::pattern::op::Or>(ngraph::OutputVector{const_input, const_fq});
|
||||
auto matmul = ngraph::pattern::wrap_type<ngraph::opset7::MatMul>({ngraph::pattern::any_input(), second_input});
|
||||
auto bias = ngraph::pattern::wrap_type<ngraph::opset7::Constant>();
|
||||
auto add = ngraph::pattern::wrap_type<ngraph::opset7::Add>({matmul, bias});
|
||||
|
||||
ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher &m) {
|
||||
const auto& pattern_map = m.get_pattern_value_map();
|
||||
return Convert(pattern_map.at(matmul).get_node_shared_ptr(), pattern_map.at(add).get_node_shared_ptr(),
|
||||
pattern_map.at(bias).get_node_shared_ptr(), nullptr);
|
||||
};
|
||||
|
||||
auto m = std::make_shared<ngraph::pattern::Matcher>(add, "ConvertMatmulWithBiasToPointWiseConvolution");
|
||||
this->register_matcher(m, callback);
|
||||
}
|
||||
|
||||
ConvertMatmulWithFqToPointWiseConvolution::ConvertMatmulWithFqToPointWiseConvolution() {
|
||||
auto const_input = ngraph::pattern::wrap_type<ngraph::opset7::Constant>();
|
||||
auto const_fq = ngraph::pattern::wrap_type<ngraph::opset7::FakeQuantize>({const_input,
|
||||
ngraph::pattern::wrap_type<ngraph::opset7::Constant>(),
|
||||
ngraph::pattern::wrap_type<ngraph::opset7::Constant>(),
|
||||
ngraph::pattern::wrap_type<ngraph::opset7::Constant>(),
|
||||
ngraph::pattern::wrap_type<ngraph::opset7::Constant>()});
|
||||
auto second_input = std::make_shared<ngraph::pattern::op::Or>(ngraph::OutputVector{const_input, const_fq});
|
||||
auto matmul = ngraph::pattern::wrap_type<ngraph::opset7::MatMul>({ngraph::pattern::any_input(), second_input});
|
||||
auto bias = ngraph::pattern::wrap_type<ngraph::opset7::Constant>();
|
||||
auto add = ngraph::pattern::wrap_type<ngraph::opset7::Add>({matmul, bias});
|
||||
auto matmul_out = std::make_shared<ngraph::pattern::op::Or>(ngraph::OutputVector{add, matmul});
|
||||
auto out_fq = ngraph::pattern::wrap_type<ngraph::opset7::FakeQuantize>({matmul_out,
|
||||
ngraph::pattern::wrap_type<ngraph::opset7::Constant>(),
|
||||
ngraph::pattern::wrap_type<ngraph::opset7::Constant>(),
|
||||
ngraph::pattern::wrap_type<ngraph::opset7::Constant>(),
|
||||
ngraph::pattern::wrap_type<ngraph::opset7::Constant>()});
|
||||
|
||||
ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher &m) {
|
||||
const auto& pattern_map = m.get_pattern_value_map();
|
||||
auto add_it = pattern_map.find(add);
|
||||
auto add_node = (add_it == std::end(pattern_map) ? nullptr : add_it->second.get_node_shared_ptr());
|
||||
auto bias_it = pattern_map.find(bias);
|
||||
auto bias_node = (bias_it == std::end(pattern_map) ? nullptr : bias_it->second.get_node_shared_ptr());
|
||||
return Convert(pattern_map.at(matmul).get_node_shared_ptr(), add_node, bias_node,
|
||||
pattern_map.at(out_fq).get_node_shared_ptr());
|
||||
};
|
||||
|
||||
auto m = std::make_shared<ngraph::pattern::Matcher>(out_fq, "ConvertMatmulWithFqToPointWiseConvolution");
|
||||
this->register_matcher(m, callback);
|
||||
}
|
@ -0,0 +1,71 @@
|
||||
// Copyright (C) 2021 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <ngraph/pass/graph_rewrite.hpp>
|
||||
|
||||
namespace GNAPluginNS {
|
||||
|
||||
/**
|
||||
* @brief Convert a MatMul with batch size unsupported by GNA to a point-wise convolution with NHWC layout
|
||||
* with transposes around it:
|
||||
* Transose (NHWC -> NCHW)
|
||||
* |
|
||||
* Matmul Convolution in NHWC layout
|
||||
* Input1: [A, B] B > 8 -------> Input: [1, 1, A, B]
|
||||
* Input2: [B, C] Kernel: [C, B, 1, 1]
|
||||
* Output: [A, C] Output: [1, 1, A, C]
|
||||
* |
|
||||
* Transose (NCHW -> NHWC)
|
||||
*/
|
||||
class ConvertMatmulToPointWiseConvolution : public ngraph::pass::MatcherPass {
|
||||
public:
|
||||
NGRAPH_RTTI_DECLARATION;
|
||||
ConvertMatmulToPointWiseConvolution();
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Convert a MatMul with batch size unsupported by GNA to a point-wise convolution with NHWC layout
|
||||
* with transposes around it, moved add with bias before the last transpose:
|
||||
* Transose (NHWC -> NCHW)
|
||||
* |
|
||||
* Matmul Convolution in NHWC layout
|
||||
* Input1: [A, B] B > 8 -------> Input: [1, 1, A, B]
|
||||
* Input2: [B, C] Kernel: [C, B, 1, 1]
|
||||
* Output: [A, C] Output: [1, 1, A, C]
|
||||
* | |
|
||||
* Add (const) Add (const)
|
||||
* |
|
||||
* Transose (NCHW -> NHWC)
|
||||
*/
|
||||
class ConvertMatmulWithBiasToPointWiseConvolution : public ngraph::pass::MatcherPass {
|
||||
public:
|
||||
NGRAPH_RTTI_DECLARATION;
|
||||
ConvertMatmulWithBiasToPointWiseConvolution();
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Convert a MatMul with batch size unsupported by GNA to a point-wise convolution with NHWC layout
|
||||
* with transposes around it, moved add with bias and/or fake quantize before the last transpose:
|
||||
* Transose (NHWC -> NCHW)
|
||||
* |
|
||||
* Matmul Convolution in NHWC layout
|
||||
* Input1: [A, B] B > 8 -------> Input: [1, 1, A, B]
|
||||
* Input2: [B, C] Kernel: [C, B, 1, 1]
|
||||
* Output: [A, C] Output: [1, 1, A, C]
|
||||
* | |
|
||||
* Add (const) Add (const)
|
||||
* | |
|
||||
* FakeQuantize FakeQuantize
|
||||
* |
|
||||
* Transose (NCHW -> NHWC)
|
||||
*/
|
||||
class ConvertMatmulWithFqToPointWiseConvolution : public ngraph::pass::MatcherPass {
|
||||
public:
|
||||
NGRAPH_RTTI_DECLARATION;
|
||||
ConvertMatmulWithFqToPointWiseConvolution();
|
||||
};
|
||||
|
||||
} // namespace GNAPluginNS
|
@ -0,0 +1,131 @@
|
||||
// Copyright (C) 2021 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "transformations/split_convolution_with_large_buffer_size.hpp"
|
||||
|
||||
#include <numeric>
|
||||
|
||||
#include <ngraph/opsets/opset7.hpp>
|
||||
#include <ngraph/pattern/op/or.hpp>
|
||||
#include <ngraph/pattern/op/wrap_type.hpp>
|
||||
|
||||
#include "backend/gna_limitations.hpp"
|
||||
|
||||
using namespace GNAPluginNS;
|
||||
|
||||
NGRAPH_RTTI_DEFINITION(SplitConvolution, "SplitConvolution", 0);
|
||||
NGRAPH_RTTI_DEFINITION(SplitConvolutionWithBias, "SplitConvolutionWithBias", 0);
|
||||
NGRAPH_RTTI_DEFINITION(SplitConvolutionWithFq, "SplitConvolutionWithFq", 0);
|
||||
|
||||
static std::vector<int64_t> GetConvSplitSizes(std::shared_ptr<ngraph::Node> conv) {
|
||||
uint32_t width = conv->get_input_shape(0).back();
|
||||
uint32_t in_channels = conv->get_input_shape(0).at(1);
|
||||
uint32_t usedWidth = 0;
|
||||
std::vector<int64_t> split_sizes;
|
||||
uint32_t width_max_size = GNALimitations::bufferMaxSize / in_channels;
|
||||
width_max_size = width_max_size - width_max_size % 64;
|
||||
while (usedWidth < width) {
|
||||
uint32_t width_part = std::min(width - usedWidth, width_max_size);
|
||||
split_sizes.push_back(width_part);
|
||||
usedWidth += width_part;
|
||||
}
|
||||
IE_ASSERT(usedWidth == width);
|
||||
return split_sizes;
|
||||
}
|
||||
|
||||
static bool Convert(std::shared_ptr<ngraph::Node> conv,
|
||||
std::shared_ptr<ngraph::Node> add,
|
||||
std::shared_ptr<ngraph::Node> bias,
|
||||
std::shared_ptr<ngraph::Node> fq) {
|
||||
auto input_size = std::accumulate(std::begin(conv->get_input_shape(0)),
|
||||
std::end(conv->get_input_shape(0)), 1, std::multiplies<size_t>());
|
||||
if (input_size <= GNALimitations::bufferMaxSize) {
|
||||
return false;
|
||||
}
|
||||
|
||||
auto split_sizes = GetConvSplitSizes(conv);
|
||||
IE_ASSERT(split_sizes.size() > 1);
|
||||
|
||||
/* TODO check if it's NHWC convolution wrapped with transposes or all input dimensions except of width == 1,
|
||||
otherwise this split axis isn't supported */
|
||||
const int64_t width_axis = conv->get_input_shape(0).size() - 1;
|
||||
auto split_node = std::make_shared<ngraph::opset7::VariadicSplit>(conv->input_value(0),
|
||||
ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape({1}), std::vector<int64_t>{width_axis}),
|
||||
ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape({split_sizes.size()}), split_sizes));
|
||||
split_node->set_friendly_name(conv->get_friendly_name() + "/split");
|
||||
ngraph::OutputVector convOutputs;
|
||||
std::shared_ptr<ngraph::Node> root_node = fq ? fq : (add ? add : conv);
|
||||
for (int i = 0; i < split_sizes.size(); ++i) {
|
||||
std::shared_ptr<ngraph::Node> output = conv->clone_with_new_inputs({split_node->output(i), conv->input_value(1)});
|
||||
output->set_friendly_name(conv->get_friendly_name() + "_" + std::to_string(i));
|
||||
if (bias) {
|
||||
output = std::make_shared<ngraph::opset7::Add>(output, bias);
|
||||
}
|
||||
|
||||
if (fq) {
|
||||
output = fq->clone_with_new_inputs({output, fq->input_value(1), fq->input_value(2),
|
||||
fq->input_value(3), fq->input_value(4)});
|
||||
}
|
||||
convOutputs.push_back(output);
|
||||
}
|
||||
|
||||
auto concat = std::make_shared<ngraph::opset7::Concat>(convOutputs, width_axis);
|
||||
concat->set_friendly_name(conv->get_friendly_name());
|
||||
ngraph::replace_node(root_node, concat);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Matches a bare two-input Convolution (no bias, no fake-quantize) and
// delegates to Convert(), which splits it over the width axis when the
// input buffer is too large.
SplitConvolution::SplitConvolution() {
    auto conv_pattern = ngraph::pattern::wrap_type<ngraph::opset7::Convolution>(
        {ngraph::pattern::any_input(), ngraph::pattern::any_input()});

    ngraph::matcher_pass_callback handler = [=](ngraph::pattern::Matcher &m) {
        const auto& nodes = m.get_pattern_value_map();
        // This variant has neither a bias Add nor a FakeQuantize.
        return Convert(nodes.at(conv_pattern).get_node_shared_ptr(), nullptr, nullptr, nullptr);
    };

    auto matcher = std::make_shared<ngraph::pattern::Matcher>(conv_pattern, "SplitConvolution");
    this->register_matcher(matcher, handler);
}
|
||||
|
||||
// Matches Convolution -> Add(constant bias) and delegates to Convert(),
// which splits the convolution and re-applies the bias on every slice.
SplitConvolutionWithBias::SplitConvolutionWithBias() {
    auto conv_pattern = ngraph::pattern::wrap_type<ngraph::opset7::Convolution>(
        {ngraph::pattern::any_input(), ngraph::pattern::any_input()});
    auto bias_pattern = ngraph::pattern::wrap_type<ngraph::opset7::Constant>();
    auto add_pattern = ngraph::pattern::wrap_type<ngraph::opset7::Add>({conv_pattern, bias_pattern});

    ngraph::matcher_pass_callback handler = [=](ngraph::pattern::Matcher &m) {
        const auto& nodes = m.get_pattern_value_map();
        return Convert(nodes.at(conv_pattern).get_node_shared_ptr(),
                       nodes.at(add_pattern).get_node_shared_ptr(),
                       nodes.at(bias_pattern).get_node_shared_ptr(),
                       nullptr);
    };

    auto matcher = std::make_shared<ngraph::pattern::Matcher>(add_pattern, "SplitConvolutionWithBias");
    this->register_matcher(matcher, handler);
}
|
||||
|
||||
// Matches Convolution [-> Add(constant bias)] -> FakeQuantize and delegates to
// Convert(). The bias branch is optional: the Or pattern accepts either the
// convolution output or the bias add as the FakeQuantize data input.
SplitConvolutionWithFq::SplitConvolutionWithFq() {
    auto conv_pattern = ngraph::pattern::wrap_type<ngraph::opset7::Convolution>(
        {ngraph::pattern::any_input(), ngraph::pattern::any_input()});
    auto bias_pattern = ngraph::pattern::wrap_type<ngraph::opset7::Constant>();
    auto add_pattern = ngraph::pattern::wrap_type<ngraph::opset7::Add>({conv_pattern, bias_pattern});
    auto fq_input = std::make_shared<ngraph::pattern::op::Or>(ngraph::OutputVector{conv_pattern, add_pattern});
    auto fq_pattern = ngraph::pattern::wrap_type<ngraph::opset7::FakeQuantize>({fq_input,
        ngraph::pattern::wrap_type<ngraph::opset7::Constant>(),
        ngraph::pattern::wrap_type<ngraph::opset7::Constant>(),
        ngraph::pattern::wrap_type<ngraph::opset7::Constant>(),
        ngraph::pattern::wrap_type<ngraph::opset7::Constant>()});

    ngraph::matcher_pass_callback handler = [=](ngraph::pattern::Matcher &m) {
        const auto& nodes = m.get_pattern_value_map();
        // Bias/add may be absent when the Or matched the bare convolution.
        auto get_or_null = [&nodes](const auto& key) -> std::shared_ptr<ngraph::Node> {
            auto it = nodes.find(key);
            return it == std::end(nodes) ? nullptr : it->second.get_node_shared_ptr();
        };
        return Convert(nodes.at(conv_pattern).get_node_shared_ptr(),
                       get_or_null(add_pattern),
                       get_or_null(bias_pattern),
                       nodes.at(fq_pattern).get_node_shared_ptr());
    };

    auto matcher = std::make_shared<ngraph::pattern::Matcher>(fq_pattern, "SplitConvolutionWithFq");
    this->register_matcher(matcher, handler);
}
|
@ -0,0 +1,34 @@
|
||||
// Copyright (C) 2021 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <ngraph/pass/graph_rewrite.hpp>
|
||||
|
||||
namespace GNAPluginNS {

/**
 * @brief Splits a convolution with a large input buffer into several
 *        convolutions over slices of the input, concatenated back together.
 */
class SplitConvolution : public ngraph::pass::MatcherPass {
public:
  NGRAPH_RTTI_DECLARATION;
  SplitConvolution();
};

/**
 * @brief Splits a convolution with a large input buffer; the bias Add is
 *        moved onto each split convolution so it is applied before the Concat.
 */
class SplitConvolutionWithBias : public ngraph::pass::MatcherPass {
public:
  NGRAPH_RTTI_DECLARATION;
  SplitConvolutionWithBias();
};

/**
 * @brief Splits a convolution with a large input buffer; the bias Add (if any)
 *        and/or the FakeQuantize are moved onto each split convolution so they
 *        are applied before the Concat.
 */
class SplitConvolutionWithFq : public ngraph::pass::MatcherPass {
public:
  NGRAPH_RTTI_DECLARATION;
  SplitConvolutionWithFq();
};

} // namespace GNAPluginNS
|
@ -312,6 +312,7 @@ HeteroExecutableNetwork::HeteroExecutableNetwork(const InferenceEngine::CNNNetwo
|
||||
// Accumulates the pieces of one device-affine subgraph while the network
// is being partitioned by affinity.
struct Subgraph {
    ngraph::ResultVector _results;        // Result nodes that belong to this subgraph
    ngraph::ParameterVector _parameters;  // Parameter nodes that belong to this subgraph
    ngraph::SinkVector _sinks;            // nodes collected via ngraph::op::is_sink()
    std::string _affinity;                // device this subgraph is assigned to
};
|
||||
std::unordered_map<int, Subgraph> subgraphs;
|
||||
@ -325,6 +326,9 @@ HeteroExecutableNetwork::HeteroExecutableNetwork(const InferenceEngine::CNNNetwo
|
||||
} else if (ngraph::op::is_parameter(node)) {
|
||||
subgraph._parameters.emplace_back(
|
||||
std::dynamic_pointer_cast<ngraph::op::v0::Parameter>(node->shared_from_this()));
|
||||
} else if (ngraph::op::is_sink(node)) {
|
||||
subgraph._sinks.emplace_back(
|
||||
std::dynamic_pointer_cast<ngraph::op::Sink>(node->shared_from_this()));
|
||||
}
|
||||
auto itAffinity = affinities.find(node);
|
||||
if (itAffinity != affinities.end()) {
|
||||
@ -373,7 +377,7 @@ HeteroExecutableNetwork::HeteroExecutableNetwork(const InferenceEngine::CNNNetwo
|
||||
for (auto&& subgraph : orderedSubgraphs) {
|
||||
_networks[id]._device = subgraph._affinity;
|
||||
subFunctions[id] =
|
||||
std::make_shared<ngraph::Function>(subgraph._results, subgraph._parameters,
|
||||
std::make_shared<ngraph::Function>(subgraph._results, subgraph._sinks, subgraph._parameters,
|
||||
_name + '_' + std::to_string(id));
|
||||
_networks[id]._clonedNetwork = CNNNetwork{subFunctions[id]};
|
||||
// update of pre-processing info
|
||||
|
@ -0,0 +1,29 @@
|
||||
// Copyright (C) 2021 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <transformations_visibility.hpp>
|
||||
#include <ngraph/pass/graph_rewrite.hpp>
|
||||
|
||||
namespace ngraph {
namespace pass {

class TRANSFORMATIONS_API GatherNegativeConstIndicesNormalize;

}  // namespace pass
}  // namespace ngraph

/**
 * @ingroup ie_transformation_common_api
 * @brief GatherNegativeConstIndicesNormalize checks if indices value is negative scalar and
 * normalizes it using ShapeOf->Add->Cast subgraph.
 * We need to remove this transformation after adding support of negative indices in
 * future version of Gather operation.
 */
class ngraph::pass::GatherNegativeConstIndicesNormalize : public ngraph::pass::MatcherPass {
public:
    NGRAPH_RTTI_DECLARATION;
    GatherNegativeConstIndicesNormalize();
};
|
@ -70,6 +70,7 @@
|
||||
#include "transformations/op_conversions/log_softmax_decomposition.hpp"
|
||||
#include "transformations/op_conversions/mvn6_decomposition.hpp"
|
||||
#include "transformations/op_conversions/simplify_ctc_greedy_decoder_seq_len.hpp"
|
||||
#include "transformations/op_conversions/gather_normalize_negative_indices.hpp"
|
||||
|
||||
#include <ngraph/pass/manager.hpp>
|
||||
#include <ngraph/pass/constant_folding.hpp>
|
||||
@ -157,6 +158,7 @@ bool ngraph::pass::CommonOptimizations::run_on_function(std::shared_ptr<ngraph::
|
||||
decomp->add_matcher<ngraph::pass::MVN6Decomposition>();
|
||||
decomp->add_matcher<ngraph::pass::SimplifyCTCGreedyDecoderSeqLen>();
|
||||
decomp->add_matcher<ngraph::pass::EinsumDecomposition>();
|
||||
decomp->add_matcher<ngraph::pass::GatherNegativeConstIndicesNormalize>();
|
||||
decomp->set_name("ngraph::pass::CommonDecompositions");
|
||||
|
||||
// CF is required after all decompositions
|
||||
|
@ -0,0 +1,77 @@
|
||||
// Copyright (C) 2021 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "transformations/op_conversions/gather_normalize_negative_indices.hpp"
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include <ngraph/opsets/opset7.hpp>
|
||||
#include <ngraph/rt_info.hpp>
|
||||
#include <ngraph/pattern/op/wrap_type.hpp>
|
||||
#include "itt.hpp"
|
||||
|
||||
NGRAPH_RTTI_DEFINITION(ngraph::pass::GatherNegativeConstIndicesNormalize, "GatherNegativeConstIndicesNormalize", 0);
|
||||
|
||||
ngraph::pass::GatherNegativeConstIndicesNormalize::GatherNegativeConstIndicesNormalize() {
|
||||
MATCHER_SCOPE(GatherNegativeConstIndicesNormalize);
|
||||
auto data_input = ngraph::pattern::any_input(pattern::has_static_rank());
|
||||
auto axis_input = ngraph::pattern::wrap_type<ngraph::opset7::Constant>();
|
||||
auto indices_input = ngraph::pattern::wrap_type<ngraph::opset7::Constant>();
|
||||
auto gather_node = std::make_shared<ngraph::opset7::Gather>(data_input, indices_input, axis_input);
|
||||
|
||||
ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher& m) {
|
||||
auto& pattern_to_output = m.get_pattern_value_map();
|
||||
auto gather = std::dynamic_pointer_cast<ngraph::opset7::Gather>(pattern_to_output.at(gather_node).get_node_shared_ptr());
|
||||
auto data = pattern_to_output.at(data_input);
|
||||
auto axis_constant = std::dynamic_pointer_cast<ngraph::opset7::Constant>(pattern_to_output.at(axis_input).get_node_shared_ptr());
|
||||
auto indices_constant = std::dynamic_pointer_cast<ngraph::opset7::Constant>(pattern_to_output.at(indices_input).get_node_shared_ptr());
|
||||
|
||||
if (!gather || !axis_constant || !indices_constant) {
|
||||
return false;
|
||||
}
|
||||
|
||||
auto indices = indices_constant->cast_vector<int64_t>();
|
||||
if (indices.size() != 1 || indices[0] >= 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
auto axis = axis_constant->cast_vector<int64_t>();
|
||||
if (axis.size() != 1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
auto axis_value = axis[0];
|
||||
|
||||
// normalize `axis` value if it is negative
|
||||
if (axis_value < 0) {
|
||||
axis_value = axis_value + data.get_partial_shape().rank().get_length();
|
||||
}
|
||||
|
||||
if (data.get_partial_shape().rank().get_length() < axis_value) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// check `axis` dimension of data tensor is static
|
||||
if (!data.get_partial_shape()[axis_value].is_static()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
auto input_type = indices_constant->get_element_type();
|
||||
auto shape_of = std::make_shared<ngraph::opset7::ShapeOf>(data, input_type);
|
||||
auto input_gather = std::make_shared<ngraph::opset7::Gather>(shape_of,
|
||||
ngraph::opset7::Constant::create(input_type, Shape{}, {axis_value}), ngraph::opset7::Constant::create(input_type, Shape{}, {0}));
|
||||
|
||||
auto add = std::make_shared<ngraph::opset7::Add>(input_gather, indices_constant);
|
||||
auto gather_new = gather_node->copy_with_new_inputs({data, add, axis_constant});
|
||||
gather_new->set_friendly_name(gather->get_friendly_name());
|
||||
|
||||
ngraph::copy_runtime_info(gather, {shape_of, input_gather, add, gather_new});
|
||||
ngraph::replace_node(gather, gather_new);
|
||||
|
||||
return true;
|
||||
};
|
||||
|
||||
auto m = std::make_shared<ngraph::pattern::Matcher>(gather_node, matcher_name);
|
||||
register_matcher(m, callback);
|
||||
}
|
@ -92,7 +92,7 @@ bool checkGrowingOutput(const Model& model) {
|
||||
return false;
|
||||
}
|
||||
|
||||
static const float SCALE_THRESHOLD = 0.125f;
|
||||
static const float SCALE_THRESHOLD = 0.1f;
|
||||
|
||||
for (const auto& stage : model->getStages()) {
|
||||
if (stage->type() != StageType::Power &&
|
||||
@ -248,14 +248,13 @@ void PassImpl::run(const Model& model) {
|
||||
if (firstStage && shift < 4 && isGrowingOutput && weights->desc().dim(Dim::C) > 1) {
|
||||
normalVal = 5;
|
||||
}
|
||||
|
||||
shift = correctShift(shift, firstStage, stage->origLayer()->type);
|
||||
shift -= normalVal;
|
||||
}
|
||||
|
||||
firstStage = false;
|
||||
scale = 1;
|
||||
if (shift > scaleThreshold) {
|
||||
if (shift >= scaleThreshold) {
|
||||
scale = static_cast<float>(1ULL << static_cast<std::uint32_t>(shift));
|
||||
}
|
||||
|
||||
|
@ -0,0 +1,306 @@
|
||||
// Copyright (C) 2021 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <string>
|
||||
#include <memory>
|
||||
|
||||
#include <ngraph/function.hpp>
|
||||
#include <ngraph/opsets/opset7.hpp>
|
||||
#include <ngraph/pass/manager.hpp>
|
||||
#include <transformations/op_conversions/gather_normalize_negative_indices.hpp>
|
||||
#include <transformations/init_node_info.hpp>
|
||||
|
||||
#include "common_test_utils/ngraph_test_utils.hpp"
|
||||
|
||||
using namespace testing;
|
||||
|
||||
// Negative scalar index on a fully static shape: Gather must be rewritten to
// ShapeOf -> Gather(dim at axis) -> Add(negative index) feeding a new Gather.
TEST(TransformationTests, GatherNegativeIndicesNormalize) {
    std::shared_ptr<ngraph::Function> f(nullptr), f_ref(nullptr);
    {
        auto data = std::make_shared<ngraph::opset7::Parameter>(ngraph::element::f32, ngraph::Shape{1, 15, 128});
        auto indices = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {-1});
        auto axis = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {1});

        auto gather = std::make_shared<ngraph::opset7::Gather>(data, indices, axis, 0);

        f = std::make_shared<ngraph::Function>(ngraph::NodeVector{gather}, ngraph::ParameterVector{data});

        ngraph::pass::Manager manager;
        manager.register_pass<ngraph::pass::InitNodeInfo>();
        manager.register_pass<ngraph::pass::GatherNegativeConstIndicesNormalize>();
        manager.run_passes(f);
        ASSERT_NO_THROW(check_rt_info(f));
    }

    {
        auto indices_type = ngraph::element::i32;

        auto data = std::make_shared<ngraph::opset7::Parameter>(ngraph::element::f32, ngraph::Shape{1, 15, 128});
        auto indices = ngraph::opset7::Constant::create(indices_type, ngraph::Shape{}, {-1});
        auto axis = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {1});

        // Reference: the normalized index is dim(axis) + index, computed in-graph.
        auto shape_of = std::make_shared<ngraph::opset7::ShapeOf>(data, indices_type);
        auto input_gather = std::make_shared<ngraph::opset7::Gather>(shape_of,
            ngraph::opset7::Constant::create(indices_type, ngraph::Shape{}, {1}), ngraph::opset7::Constant::create(indices_type, ngraph::Shape{}, {0}));
        auto add = std::make_shared<ngraph::opset7::Add>(input_gather, indices);
        auto gather = std::make_shared<ngraph::opset7::Gather>(data, add, axis);

        f_ref = std::make_shared<ngraph::Function>(ngraph::NodeVector{gather}, ngraph::ParameterVector{data});
    }

    auto res = compare_functions(f, f_ref);
    ASSERT_TRUE(res.first) << res.second;
}
|
||||
|
||||
// Same rewrite with a negative axis (-2 on rank 3 => axis 1); the axis constant
// itself is kept negative in the replacement graph.
TEST(TransformationTests, GatherNegativeIndicesNormalize_neg_axis) {
    std::shared_ptr<ngraph::Function> f(nullptr), f_ref(nullptr);
    {
        auto data = std::make_shared<ngraph::opset7::Parameter>(ngraph::element::f32, ngraph::Shape{1, 15, 128});
        auto indices = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {-1});
        auto axis = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {-2});

        auto gather = std::make_shared<ngraph::opset7::Gather>(data, indices, axis, 0);

        f = std::make_shared<ngraph::Function>(ngraph::NodeVector{gather}, ngraph::ParameterVector{data});

        ngraph::pass::Manager manager;
        manager.register_pass<ngraph::pass::InitNodeInfo>();
        manager.register_pass<ngraph::pass::GatherNegativeConstIndicesNormalize>();
        manager.run_passes(f);
        ASSERT_NO_THROW(check_rt_info(f));
    }

    {
        auto indices_type = ngraph::element::i32;

        auto data = std::make_shared<ngraph::opset7::Parameter>(ngraph::element::f32, ngraph::Shape{1, 15, 128});
        auto indices = ngraph::opset7::Constant::create(indices_type, ngraph::Shape{}, {-1});
        auto axis = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {-2});

        // The ShapeOf lookup uses the normalized (non-negative) axis value {1}.
        auto shape_of = std::make_shared<ngraph::opset7::ShapeOf>(data, indices_type);
        auto input_gather = std::make_shared<ngraph::opset7::Gather>(shape_of,
            ngraph::opset7::Constant::create(indices_type, ngraph::Shape{}, {1}), ngraph::opset7::Constant::create(indices_type, ngraph::Shape{}, {0}));
        auto add = std::make_shared<ngraph::opset7::Add>(input_gather, indices);
        auto gather = std::make_shared<ngraph::opset7::Gather>(data, add, axis);

        f_ref = std::make_shared<ngraph::Function>(ngraph::NodeVector{gather}, ngraph::ParameterVector{data});
    }

    auto res = compare_functions(f, f_ref);
    ASSERT_TRUE(res.first) << res.second;
}
|
||||
|
||||
// Axis type (i64) differs from indices type (i32); the normalization subgraph
// is built in the indices' element type.
TEST(TransformationTests, GatherNegativeIndicesNormalize_dif_input_types) {
    std::shared_ptr<ngraph::Function> f(nullptr), f_ref(nullptr);
    {
        auto data = std::make_shared<ngraph::opset7::Parameter>(ngraph::element::f32, ngraph::Shape{1, 15, 128});
        auto indices = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {-1});
        auto axis = ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape{}, {1});

        auto gather = std::make_shared<ngraph::opset7::Gather>(data, indices, axis, 0);

        f = std::make_shared<ngraph::Function>(ngraph::NodeVector{gather}, ngraph::ParameterVector{data});

        ngraph::pass::Manager manager;
        manager.register_pass<ngraph::pass::InitNodeInfo>();
        manager.register_pass<ngraph::pass::GatherNegativeConstIndicesNormalize>();
        manager.run_passes(f);
        ASSERT_NO_THROW(check_rt_info(f));
    }

    {
        auto indices_type = ngraph::element::i32;

        auto data = std::make_shared<ngraph::opset7::Parameter>(ngraph::element::f32, ngraph::Shape{1, 15, 128});
        auto indices = ngraph::opset7::Constant::create(indices_type, ngraph::Shape{}, {-1});
        auto axis = ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape{}, {1});

        auto shape_of = std::make_shared<ngraph::opset7::ShapeOf>(data, indices_type);
        auto input_gather = std::make_shared<ngraph::opset7::Gather>(shape_of,
            ngraph::opset7::Constant::create(indices_type, ngraph::Shape{}, {1}), ngraph::opset7::Constant::create(indices_type, ngraph::Shape{}, {0}));
        auto add = std::make_shared<ngraph::opset7::Add>(input_gather, indices);
        auto gather = std::make_shared<ngraph::opset7::Gather>(data, add, axis);

        f_ref = std::make_shared<ngraph::Function>(ngraph::NodeVector{gather}, ngraph::ParameterVector{data});
    }

    auto res = compare_functions(f, f_ref);
    ASSERT_TRUE(res.first) << res.second;
}
|
||||
|
||||
// Partially dynamic shape: the rewrite still applies because the dimension
// addressed by `axis` (15) is static.
TEST(TransformationTests, GatherNegativeIndicesNormalize_static_axis_dim) {
    std::shared_ptr<ngraph::Function> f(nullptr), f_ref(nullptr);
    {
        auto data = std::make_shared<ngraph::opset7::Parameter>(ngraph::element::f32, ngraph::PartialShape{DYN, 15, DYN});
        auto indices = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {-1});
        auto axis = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {1});

        auto gather = std::make_shared<ngraph::opset7::Gather>(data, indices, axis, 0);

        f = std::make_shared<ngraph::Function>(ngraph::NodeVector{gather}, ngraph::ParameterVector{data});

        ngraph::pass::Manager manager;
        manager.register_pass<ngraph::pass::InitNodeInfo>();
        manager.register_pass<ngraph::pass::GatherNegativeConstIndicesNormalize>();
        manager.run_passes(f);
        ASSERT_NO_THROW(check_rt_info(f));
    }

    {
        auto indices_type = ngraph::element::i32;

        auto data = std::make_shared<ngraph::opset7::Parameter>(ngraph::element::f32, ngraph::PartialShape{DYN, 15, DYN});
        auto indices = ngraph::opset7::Constant::create(indices_type, ngraph::Shape{}, {-1});
        auto axis = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {1});

        auto shape_of = std::make_shared<ngraph::opset7::ShapeOf>(data, indices_type);
        auto input_gather = std::make_shared<ngraph::opset7::Gather>(shape_of,
            ngraph::opset7::Constant::create(indices_type, ngraph::Shape{}, {1}), ngraph::opset7::Constant::create(indices_type, ngraph::Shape{}, {0}));
        auto add = std::make_shared<ngraph::opset7::Add>(input_gather, indices);
        auto gather = std::make_shared<ngraph::opset7::Gather>(data, add, axis);

        f_ref = std::make_shared<ngraph::Function>(ngraph::NodeVector{gather}, ngraph::ParameterVector{data});
    }

    auto res = compare_functions(f, f_ref);
    ASSERT_TRUE(res.first) << res.second;
}
|
||||
|
||||
// Partially dynamic shape combined with a negative axis (-2 => axis 1, which
// is static): the rewrite must still apply.
TEST(TransformationTests, GatherNegativeIndicesNormalize_static_axis_dim_neg_axis) {
    std::shared_ptr<ngraph::Function> f(nullptr), f_ref(nullptr);
    {
        auto data = std::make_shared<ngraph::opset7::Parameter>(ngraph::element::f32, ngraph::PartialShape{DYN, 15, DYN});
        auto indices = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {-1});
        auto axis = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {-2});

        auto gather = std::make_shared<ngraph::opset7::Gather>(data, indices, axis, 0);

        f = std::make_shared<ngraph::Function>(ngraph::NodeVector{gather}, ngraph::ParameterVector{data});

        ngraph::pass::Manager manager;
        manager.register_pass<ngraph::pass::InitNodeInfo>();
        manager.register_pass<ngraph::pass::GatherNegativeConstIndicesNormalize>();
        manager.run_passes(f);
        ASSERT_NO_THROW(check_rt_info(f));
    }

    {
        auto indices_type = ngraph::element::i32;

        auto data = std::make_shared<ngraph::opset7::Parameter>(ngraph::element::f32, ngraph::PartialShape{DYN, 15, DYN});
        auto indices = ngraph::opset7::Constant::create(indices_type, ngraph::Shape{}, {-1});
        auto axis = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {-2});

        auto shape_of = std::make_shared<ngraph::opset7::ShapeOf>(data, indices_type);
        auto input_gather = std::make_shared<ngraph::opset7::Gather>(shape_of,
            ngraph::opset7::Constant::create(indices_type, ngraph::Shape{}, {1}), ngraph::opset7::Constant::create(indices_type, ngraph::Shape{}, {0}));
        auto add = std::make_shared<ngraph::opset7::Add>(input_gather, indices);
        auto gather = std::make_shared<ngraph::opset7::Gather>(data, add, axis);

        f_ref = std::make_shared<ngraph::Function>(ngraph::NodeVector{gather}, ngraph::ParameterVector{data});
    }

    auto res = compare_functions(f, f_ref);
    ASSERT_TRUE(res.first) << res.second;
}
|
||||
|
||||
// Negative case: the dimension addressed by `axis` is dynamic, so the
// transformation must NOT fire; the reference graph is unchanged.
TEST(TransformationTests, GatherNegativeIndicesNormalize_non_static_axis_dim) {
    std::shared_ptr<ngraph::Function> f(nullptr), f_ref(nullptr);
    {
        auto data = std::make_shared<ngraph::opset7::Parameter>(ngraph::element::f32, ngraph::PartialShape{DYN, DYN, DYN});
        auto indices = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {-1});
        auto axis = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {1});

        auto gather = std::make_shared<ngraph::opset7::Gather>(data, indices, axis, 0);

        f = std::make_shared<ngraph::Function>(ngraph::NodeVector{gather}, ngraph::ParameterVector{data});

        ngraph::pass::Manager manager;
        manager.register_pass<ngraph::pass::InitNodeInfo>();
        manager.register_pass<ngraph::pass::GatherNegativeConstIndicesNormalize>();
        manager.run_passes(f);
        ASSERT_NO_THROW(check_rt_info(f));
    }

    {
        auto indices_type = ngraph::element::i32;

        auto data = std::make_shared<ngraph::opset7::Parameter>(ngraph::element::f32, ngraph::PartialShape{DYN, DYN, DYN});
        auto indices = ngraph::opset7::Constant::create(indices_type, ngraph::Shape{}, {-1});
        auto axis = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {1});

        auto gather = std::make_shared<ngraph::opset7::Gather>(data, indices, axis);

        f_ref = std::make_shared<ngraph::Function>(ngraph::NodeVector{gather}, ngraph::ParameterVector{data});
    }

    auto res = compare_functions(f, f_ref);
    ASSERT_TRUE(res.first) << res.second;
}
|
||||
|
||||
// Negative case: the index is non-negative, so the transformation must NOT
// fire; the reference graph is unchanged.
TEST(TransformationTests, GatherNegativeIndicesNormalize_positive_ind) {
    std::shared_ptr<ngraph::Function> f(nullptr), f_ref(nullptr);
    {
        auto data = std::make_shared<ngraph::opset7::Parameter>(ngraph::element::f32, ngraph::Shape{2, 3});
        auto indices = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {1});
        auto axis = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {0});

        auto gather = std::make_shared<ngraph::opset7::Gather>(data, indices, axis, 0);

        f = std::make_shared<ngraph::Function>(ngraph::NodeVector{gather}, ngraph::ParameterVector{data});

        ngraph::pass::Manager manager;
        manager.register_pass<ngraph::pass::InitNodeInfo>();
        manager.register_pass<ngraph::pass::GatherNegativeConstIndicesNormalize>();
        manager.run_passes(f);
        ASSERT_NO_THROW(check_rt_info(f));
    }

    {
        auto data = std::make_shared<ngraph::opset7::Parameter>(ngraph::element::f32, ngraph::Shape{2, 3});
        auto indices = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {1});
        auto axis = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {0});

        auto gather = std::make_shared<ngraph::opset7::Gather>(data, indices, axis);

        f_ref = std::make_shared<ngraph::Function>(ngraph::NodeVector{gather}, ngraph::ParameterVector{data});
    }

    auto res = compare_functions(f, f_ref);
    ASSERT_TRUE(res.first) << res.second;
}
|
||||
|
||||
// Negative case: data rank is dynamic (pattern requires a static rank), so the
// transformation must NOT fire; the reference graph is unchanged.
TEST(TransformationTests, GatherNegativeIndicesNormalize_non_static_rank) {
    std::shared_ptr<ngraph::Function> f(nullptr), f_ref(nullptr);
    {
        auto data = std::make_shared<ngraph::opset7::Parameter>(ngraph::element::f32, ngraph::PartialShape::dynamic(ngraph::Rank::dynamic()));
        auto indices = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {-1});
        auto axis = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {0});

        auto gather = std::make_shared<ngraph::opset7::Gather>(data, indices, axis, 0);

        f = std::make_shared<ngraph::Function>(ngraph::NodeVector{gather}, ngraph::ParameterVector{data});

        ngraph::pass::Manager manager;
        manager.register_pass<ngraph::pass::InitNodeInfo>();
        manager.register_pass<ngraph::pass::GatherNegativeConstIndicesNormalize>();
        manager.run_passes(f);
        ASSERT_NO_THROW(check_rt_info(f));
    }

    {
        auto data = std::make_shared<ngraph::opset7::Parameter>(ngraph::element::f32, ngraph::PartialShape::dynamic());
        auto indices = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {-1});
        auto axis = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {0});

        auto gather = std::make_shared<ngraph::opset7::Gather>(data, indices, axis);

        f_ref = std::make_shared<ngraph::Function>(ngraph::NodeVector{gather}, ngraph::ParameterVector{data});
    }

    auto res = compare_functions(f, f_ref);
    ASSERT_TRUE(res.first) << res.second;
}
|
@ -38,7 +38,7 @@ INSTANTIATE_TEST_CASE_P(smoke_MemoryTest, MemoryTest,
|
||||
::testing::ValuesIn(iterationCount),
|
||||
::testing::ValuesIn(inShapes),
|
||||
::testing::ValuesIn(inputPrecisions),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU)),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU, "HETERO:CPU")),
|
||||
MemoryTest::getTestCaseName);
|
||||
|
||||
} // namespace
|
||||
|
@ -0,0 +1,230 @@
|
||||
// Copyright (C) 2021 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include <tuple>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#include <ie_core.hpp>
|
||||
|
||||
#include "common_test_utils/common_utils.hpp"
|
||||
#include "functional_test_utils/plugin_cache.hpp"
|
||||
#include "shared_test_classes/base/layer_test_utils.hpp"
|
||||
#include "functional_test_utils/blob_utils.hpp"
|
||||
#include "ngraph_functions/utils/ngraph_helpers.hpp"
|
||||
#include "ngraph_functions/builders.hpp"
|
||||
|
||||
#include "ngraph_functions/pass/convert_prc.hpp"
|
||||
|
||||
// Parameter pack for the basic ConvertMatmulToPointwiseConv test.
typedef std::tuple<
    InferenceEngine::Precision,         // Network Precision
    std::string,                        // Target Device
    std::map<std::string, std::string>, // Configuration
    std::vector<size_t>                 // Input Shape
> convertMatmulToPointwiseConvParams;

// Parameter pack for the FakeQuantize variant; adds the input value range.
typedef std::tuple<
    InferenceEngine::Precision,         // Network Precision
    std::string,                        // Target Device
    std::map<std::string, std::string>, // Configuration
    std::vector<size_t>,                // Input Shape
    std::pair<float, float>             // Input Min and Max
> convertMatmulToPointwiseConvWithFqParams;
|
||||
|
||||
namespace LayerTestsDefinitions {
|
||||
|
||||
// Functional test fixture: builds a MatMul + bias graph that the plugin is
// expected to convert to a pointwise convolution.
class ConvertMatmulToPointwiseConv : public testing::WithParamInterface<convertMatmulToPointwiseConvParams>,
                                     public LayerTestsUtils::LayerTestsCommon {
public:
    // Encodes precision, device, config and input shape into the test name.
    static std::string getTestCaseName(testing::TestParamInfo<convertMatmulToPointwiseConvParams> obj) {
        InferenceEngine::Precision netPrecision;
        std::string targetDevice;
        std::map<std::string, std::string> configuration;
        std::vector<size_t> inputShape;
        std::tie(netPrecision, targetDevice, configuration, inputShape) = obj.param;

        std::ostringstream result;
        result << "netPRC=" << netPrecision.name() << "_";
        result << "targetDevice=" << targetDevice << "_";
        for (auto const& configItem : configuration) {
            result << "_configItem=" << configItem.first << "_" << configItem.second;
        }
        result << "_inputShape=" << CommonTestUtils::vec2str(inputShape);
        return result.str();
    }

    // Fills the input blob with deterministic floats in [-0.2, 0.2].
    // NOTE(review): not marked `override` -- confirm it actually shadows the
    // base-class GenerateInput and is picked up by the test framework.
    InferenceEngine::Blob::Ptr GenerateInput(const InferenceEngine::InputInfo& info) const {
        InferenceEngine::Blob::Ptr blob = make_blob_with_precision(info.getTensorDesc());
        blob->allocate();

        auto* rawBlobDataPtr = blob->buffer().as<float*>();
        std::vector<float> values = CommonTestUtils::generate_float_numbers(blob->size(), -0.2f, 0.2f);
        for (size_t i = 0; i < blob->size(); i++) {
            rawBlobDataPtr[i] = values[i];
        }
        return blob;
    }

protected:
    // Builds: Parameter -> MatMul(const weights, transposed) -> Reshape -> Relu.
    void SetUp() override {
        InferenceEngine::Precision netPrecision;
        std::vector<size_t> inputShape;
        std::tie(netPrecision, targetDevice, configuration, inputShape) = this->GetParam();
        auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);

        auto params = ngraph::builder::makeParams(ngPrc, {inputShape});

        size_t batch = inputShape[inputShape.size() - 2];
        size_t elemNum = inputShape[inputShape.size() - 1];
        std::vector<float> weights = CommonTestUtils::generate_float_numbers(elemNum * elemNum, -0.1f, 0.1f);
        auto weightsNode = std::make_shared<ngraph::opset7::Constant>(ngPrc, ngraph::Shape{elemNum, elemNum}, weights);
        auto matmul = ngraph::builder::makeMatMul(params[0], weightsNode, false, true);

        // NOTE(review): `add` (MatMul + bias) is built but never consumed --
        // the Reshape below takes `matmul` directly, so the bias branch is
        // dead code. Confirm whether Reshape was meant to consume `add`.
        auto bias = ngraph::builder::makeConstant(ngPrc, std::vector<size_t>{1, batch, 1}, std::vector<float>{1.0f});
        auto add = ngraph::builder::makeEltwise(matmul, bias, ngraph::helpers::EltwiseTypes::ADD);

        auto pattern = std::make_shared<ngraph::opset7::Constant>(ngraph::element::Type_t::i64,
            ngraph::Shape{ inputShape.size() }, inputShape);
        auto reshape = std::make_shared<ngraph::opset7::Reshape>(matmul, pattern, false);
        auto relu = std::make_shared<ngraph::opset7::Relu>(reshape);

        ngraph::ResultVector results{ std::make_shared<ngraph::opset7::Result>(relu)};
        function = std::make_shared<ngraph::Function>(results, params, "ConvertMatmulToPointwiseConv");
    }
};
|
||||
|
||||
class ConvertMatmulToPointwiseConvWithFq : public testing::WithParamInterface<convertMatmulToPointwiseConvWithFqParams>,
|
||||
public LayerTestsUtils::LayerTestsCommon {
|
||||
float inputDataMin = -10.0f;
|
||||
float inputDataMax = 10.0f;
|
||||
float inputDataResolution = 1.0f;
|
||||
|
||||
public:
|
||||
static std::string getTestCaseName(testing::TestParamInfo<convertMatmulToPointwiseConvWithFqParams> obj) {
|
||||
InferenceEngine::Precision netPrecision;
|
||||
std::string targetDevice;
|
||||
std::map<std::string, std::string> configuration;
|
||||
std::vector<size_t> inputShape;
|
||||
std::pair<float, float> inputMinMax;
|
||||
std::tie(netPrecision, targetDevice, configuration, inputShape, inputMinMax) = obj.param;
|
||||
|
||||
std::ostringstream result;
|
||||
result << "netPRC=" << netPrecision.name() << "_";
|
||||
result << "targetDevice=" << targetDevice << "_";
|
||||
for (auto const& configItem : configuration) {
|
||||
result << "_configItem=" << configItem.first << "_" << configItem.second;
|
||||
}
|
||||
result << "_inputShape=" << CommonTestUtils::vec2str(inputShape);
|
||||
result << "_inputMinMax=(" << inputMinMax.first << ".." << inputMinMax.second << ")";
|
||||
return result.str();
|
||||
}
|
||||
|
||||
InferenceEngine::Blob::Ptr GenerateInput(const InferenceEngine::InputInfo& info) const {
|
||||
return FuncTestUtils::createAndFillBlob(info.getTensorDesc(), inputDataMax - inputDataMin, inputDataMin,
|
||||
1 / inputDataResolution);
|
||||
}
|
||||
|
||||
protected:
|
||||
void SetUp() override {
|
||||
InferenceEngine::Precision netPrecision;
|
||||
std::vector<size_t> inputShape;
|
||||
std::pair<float, float> inputMinMax;
|
||||
std::tie(netPrecision, targetDevice, configuration, inputShape, inputMinMax) = this->GetParam();
|
||||
std::tie(inputDataMin, inputDataMax) = inputMinMax;
|
||||
auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
|
||||
|
||||
auto params = ngraph::builder::makeParams(ngPrc, {inputShape});
|
||||
|
||||
auto inputLowNode = ngraph::builder::makeConstant(ngPrc, std::vector<size_t>{ 1 },
|
||||
std::vector<float>{ inputDataMin });
|
||||
auto inputHighNode = ngraph::builder::makeConstant(ngPrc, std::vector<size_t>{ 1 },
|
||||
std::vector<float>{ inputDataMax });
|
||||
auto inputFQ = std::make_shared<ngraph::opset7::FakeQuantize>(params[0],
|
||||
inputLowNode, inputHighNode, inputLowNode, inputHighNode, UINT16_MAX);
|
||||
|
||||
size_t elemNum = inputShape[inputShape.size() - 1];
|
||||
|
||||
const float weightsMin = -0.2f;
|
||||
const float weightsMax = 0.2f;
|
||||
std::vector<float> weights = CommonTestUtils::generate_float_numbers(elemNum * elemNum, weightsMin, weightsMax);
|
||||
auto weightsNode = std::make_shared<ngraph::opset7::Constant>(ngPrc, ngraph::Shape{elemNum, elemNum}, weights);
|
||||
auto weightsLowNode = ngraph::builder::makeConstant(ngPrc, std::vector<size_t>{ 1 },
|
||||
std::vector<float>{ weightsMin });
|
||||
auto weightsHighNode = ngraph::builder::makeConstant(ngPrc, std::vector<size_t>{ 1 },
|
||||
std::vector<float>{ weightsMax });
|
||||
auto weightsFQNode = std::make_shared<ngraph::opset7::FakeQuantize>(weightsNode,
|
||||
weightsLowNode, weightsHighNode, weightsLowNode, weightsHighNode, UINT16_MAX);
|
||||
auto matmul = ngraph::builder::makeMatMul(inputFQ, weightsFQNode, false, true);
|
||||
|
||||
auto bias = ngraph::builder::makeConstant(ngPrc, std::vector<size_t>{1, 1, 1}, std::vector<float>{1.0f});
|
||||
auto add = ngraph::builder::makeEltwise(matmul, bias, ngraph::helpers::EltwiseTypes::ADD);
|
||||
|
||||
auto outputLowNode = ngraph::builder::makeConstant(ngPrc, std::vector<size_t>{ 1 },
|
||||
std::vector<float>{ -inputDataMax * weightsMax * elemNum });
|
||||
auto outputHighNode = ngraph::builder::makeConstant(ngPrc, std::vector<size_t>{ 1 },
|
||||
std::vector<float>{ inputDataMax * weightsMax * elemNum });
|
||||
auto outputFQ = std::make_shared<ngraph::opset7::FakeQuantize>(add,
|
||||
outputLowNode, outputHighNode, outputLowNode, outputHighNode, UINT16_MAX);
|
||||
|
||||
auto pattern = std::make_shared<ngraph::opset7::Constant>(ngraph::element::Type_t::i64,
|
||||
ngraph::Shape{ inputShape.size() }, inputShape);
|
||||
auto reshape = std::make_shared<ngraph::opset7::Reshape>(outputFQ, pattern, false);
|
||||
|
||||
auto relu = std::make_shared<ngraph::opset7::Relu>(reshape);
|
||||
|
||||
ngraph::ResultVector results{ std::make_shared<ngraph::opset7::Result>(relu)};
|
||||
function = std::make_shared<ngraph::Function>(results, params, "ConvertMatmulToPointwiseConv");
|
||||
}
|
||||
};
|
||||
|
||||
TEST_P(ConvertMatmulToPointwiseConv, CompareWithRefImpl) {
|
||||
Run();
|
||||
};
|
||||
|
||||
TEST_P(ConvertMatmulToPointwiseConvWithFq, CompareWithRefImpl) {
|
||||
Run();
|
||||
};
|
||||
|
||||
const std::vector<InferenceEngine::Precision> netPrecisions = {
|
||||
InferenceEngine::Precision::FP32,
|
||||
InferenceEngine::Precision::FP16
|
||||
};
|
||||
|
||||
const std::vector<std::map<std::string, std::string>> configs = {
|
||||
{
|
||||
{"GNA_DEVICE_MODE", "GNA_SW_EXACT"},
|
||||
}
|
||||
};
|
||||
|
||||
const std::vector<std::vector<size_t>> inputShape = {
|
||||
{1, 64, 64},
|
||||
{1, 256, 128},
|
||||
{1, 512, 128}
|
||||
};
|
||||
|
||||
const std::vector<std::pair<float, float>> fqStats = {
|
||||
{-0.5, 0.5}
|
||||
};
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(smoke_ConvertMatmulToPointwiseConvTest, ConvertMatmulToPointwiseConv,
|
||||
::testing::Combine(
|
||||
::testing::ValuesIn(netPrecisions),
|
||||
::testing::Values(CommonTestUtils::DEVICE_GNA),
|
||||
::testing::ValuesIn(configs),
|
||||
::testing::ValuesIn(inputShape)),
|
||||
ConvertMatmulToPointwiseConv::getTestCaseName);
|
||||
|
||||
// Issue 55662
|
||||
INSTANTIATE_TEST_CASE_P(DISABLED_smoke_ConvertMatmulToPointwiseConvTest, ConvertMatmulToPointwiseConvWithFq,
|
||||
::testing::Combine(
|
||||
::testing::ValuesIn(netPrecisions),
|
||||
::testing::Values(CommonTestUtils::DEVICE_GNA),
|
||||
::testing::ValuesIn(configs),
|
||||
::testing::ValuesIn(inputShape),
|
||||
::testing::ValuesIn(fqStats)),
|
||||
ConvertMatmulToPointwiseConvWithFq::getTestCaseName);
|
||||
|
||||
} // namespace LayerTestsDefinitions
|
@ -18,19 +18,21 @@
|
||||
typedef std::tuple<
|
||||
InferenceEngine::Precision, // Network Precision
|
||||
std::string, // Target Device
|
||||
std::map<std::string, std::string> //Configuration
|
||||
std::map<std::string, std::string>, // Configuration
|
||||
std::vector<size_t> // Input Shape
|
||||
> EltwiseSplitOverChannelsPassParams;
|
||||
|
||||
namespace LayerTestsDefinitions {
|
||||
|
||||
class EltwiseSplitOverChannelsPassTest : public testing::WithParamInterface<EltwiseSplitOverChannelsPassParams>,
|
||||
public LayerTestsUtils::LayerTestsCommon {
|
||||
public LayerTestsUtils::LayerTestsCommon {
|
||||
public:
|
||||
static std::string getTestCaseName(testing::TestParamInfo<EltwiseSplitOverChannelsPassParams> obj) {
|
||||
InferenceEngine::Precision netPrecision;
|
||||
std::string targetDevice;
|
||||
std::map<std::string, std::string> configuration;
|
||||
std::tie(netPrecision, targetDevice, configuration) = obj.param;
|
||||
std::vector<size_t> inputShape;
|
||||
std::tie(netPrecision, targetDevice, configuration, inputShape) = obj.param;
|
||||
|
||||
std::ostringstream result;
|
||||
result << "netPRC=" << netPrecision.name() << "_";
|
||||
@ -38,20 +40,22 @@ public:
|
||||
for (auto const& configItem : configuration) {
|
||||
result << "_configItem=" << configItem.first << "_" << configItem.second;
|
||||
}
|
||||
result << "_inputShape=" << CommonTestUtils::vec2str(inputShape);
|
||||
return result.str();
|
||||
}
|
||||
|
||||
protected:
|
||||
void SetUp() override {
|
||||
InferenceEngine::Precision netPrecision;
|
||||
std::tie(netPrecision, targetDevice, configuration) = this->GetParam();
|
||||
std::vector<size_t> inputShape;
|
||||
std::tie(netPrecision, targetDevice, configuration, inputShape) = this->GetParam();
|
||||
auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
|
||||
|
||||
auto params = ngraph::builder::makeParams(ngPrc, { {1, 67000} });
|
||||
auto const_mult2 = ngraph::builder::makeConstant<float>(ngPrc, {1, 67000}, {-1.0f});
|
||||
auto params = ngraph::builder::makeParams(ngPrc, { inputShape });
|
||||
auto const_mult2 = ngraph::builder::makeConstant<float>(ngPrc, inputShape, {-1.0f});
|
||||
|
||||
auto sum = ngraph::builder::makeEltwise(params[0], const_mult2, ngraph::helpers::EltwiseTypes::MULTIPLY);
|
||||
function = std::make_shared<ngraph::Function>(sum, params, "RemovePermutationPass");
|
||||
function = std::make_shared<ngraph::Function>(sum, params, "EltwiseSplitOverChannelsPassTest");
|
||||
}
|
||||
};
|
||||
|
||||
@ -71,11 +75,17 @@ const std::vector<std::map<std::string, std::string>> configs = {
|
||||
}
|
||||
};
|
||||
|
||||
const std::vector<std::vector<size_t>> inputShape = {
|
||||
{1, 67000},
|
||||
{1, 500000}
|
||||
};
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(smoke_EltwiseSplitOverChennels, EltwiseSplitOverChannelsPassTest,
|
||||
::testing::Combine(
|
||||
::testing::ValuesIn(netPrecisions),
|
||||
::testing::Values(CommonTestUtils::DEVICE_GNA),
|
||||
::testing::ValuesIn(configs)),
|
||||
::testing::ValuesIn(configs),
|
||||
::testing::ValuesIn(inputShape)),
|
||||
EltwiseSplitOverChannelsPassTest::getTestCaseName);
|
||||
|
||||
} // namespace LayerTestsDefinitions
|
||||
|
@ -60,8 +60,6 @@ std::vector<std::string> disabledTestPatterns() {
|
||||
R"(.*ConstantResultSubgraphTest.*inPrc=(U8|I8|I32|U64|I64|BOOL).*)",
|
||||
// TODO: Issue 51528
|
||||
R"(.*CachingSupport.*_(u8|i16)_.*)",
|
||||
// TODO: Issue 51527
|
||||
R"(.*CachingSupport.*_batch2_.*)",
|
||||
// TODO: Issue 51525
|
||||
R"(.*CachingSupport.*KSOFunction.*)",
|
||||
// TODO: Issue 57363 (Param -> Result subgraphs)
|
||||
|
@ -69,13 +69,16 @@ void ImportNetworkTestBase::Run() {
|
||||
|
||||
for (const auto& next_input : importedExecNetwork.GetInputsInfo()) {
|
||||
ASSERT_NO_THROW(compiledExecNetwork.GetInputsInfo()[next_input.first]);
|
||||
Compare(next_input.second->getTensorDesc(), compiledExecNetwork.GetInputsInfo()[next_input.first]->getTensorDesc());
|
||||
}
|
||||
for (const auto& next_output : importedExecNetwork.GetOutputsInfo()) {
|
||||
ASSERT_NO_THROW(compiledExecNetwork.GetOutputsInfo()[next_output.first]);
|
||||
}
|
||||
auto importedOutputs = GetOutputs();
|
||||
ASSERT_EQ(actualOutputs.size(), importedOutputs.size());
|
||||
|
||||
for (size_t i = 0; i < actualOutputs.size(); i++) {
|
||||
Compare(actualOutputs[i]->getTensorDesc(), importedOutputs[i]->getTensorDesc());
|
||||
Compare(actualOutputs[i], importedOutputs[i]);
|
||||
}
|
||||
}
|
||||
|
@ -72,6 +72,8 @@ public:
|
||||
|
||||
virtual void Compare(const InferenceEngine::Blob::Ptr &expected, const InferenceEngine::Blob::Ptr &actual);
|
||||
|
||||
virtual void Compare(const InferenceEngine::TensorDesc &actualDesc, const InferenceEngine::TensorDesc &expectedDesc);
|
||||
|
||||
virtual void SetRefMode(RefMode mode);
|
||||
|
||||
std::shared_ptr<ngraph::Function> GetFunction();
|
||||
|
@ -274,6 +274,17 @@ void LayerTestsCommon::Compare(const InferenceEngine::Blob::Ptr &expected, const
|
||||
}
|
||||
}
|
||||
|
||||
void LayerTestsCommon::Compare(const InferenceEngine::TensorDesc &actualDesc, const InferenceEngine::TensorDesc &expectedDesc) {
|
||||
auto expectedDims = actualDesc.getDims();
|
||||
auto actualDims = expectedDesc.getDims();
|
||||
ASSERT_EQ(actualDims.size(), expectedDims.size());
|
||||
for (size_t j = 0; j < actualDims.size(); ++j) {
|
||||
ASSERT_EQ(actualDims.at(j), expectedDims.at(j));
|
||||
}
|
||||
ASSERT_EQ(actualDesc.getLayout(), expectedDesc.getLayout());
|
||||
ASSERT_EQ(actualDesc.getPrecision(), expectedDesc.getPrecision());
|
||||
}
|
||||
|
||||
void LayerTestsCommon::ConfigureNetwork() {
|
||||
for (const auto &in : cnnNetwork.getInputsInfo()) {
|
||||
if (inLayout != InferenceEngine::Layout::ANY) {
|
||||
|
@ -1176,9 +1176,6 @@ void program_impl::set_layout_optimizer_attributes(layout_optimizer& lo) {
|
||||
size_t opt_deconv_layers_b_fs_zyx_fsv16 = 0;
|
||||
size_t total_crop_layers = 0;
|
||||
|
||||
size_t weighted_sum_feature_size = 0;
|
||||
size_t weight_sum = 0;
|
||||
|
||||
for (auto& node : get_processing_order()) {
|
||||
auto &prim = *node;
|
||||
if (prim.type() == cldnn::convolution::type_id()) {
|
||||
@ -1324,35 +1321,4 @@ void program_impl::set_layout_optimizer_attributes(layout_optimizer& lo) {
|
||||
|
||||
if (should_use_bs_fs_yx_bsv16_fsv16)
|
||||
lo.set_optimization_attribute(layout_optimizer::optimization_attributes_type::bs_fs_yx_bsv16_fsv16_network, 1);
|
||||
|
||||
|
||||
// This is to avoid using fsv16 for shallow-feature networks.
|
||||
// This may not be exactly same as real execution graph as layer fusing is not done yet,
|
||||
// but it is a reasonable approximation.
|
||||
// Check the expected network efficiency after setting layer optimization attributes.
|
||||
// If network depth is shallow, it is faster with fsv4.
|
||||
for (auto& node : get_processing_order()) {
|
||||
auto &prim = *node;
|
||||
|
||||
if (prim.is_in_data_flow() && prim.type() == cldnn::convolution::type_id()) {
|
||||
size_t num_feature = prim.get_output_layout().size.feature.vector()[0];
|
||||
size_t num_spatial = 1;
|
||||
for (auto s : prim.get_output_layout().size.spatial.vector())
|
||||
num_spatial *= s;
|
||||
|
||||
if (lo.get_preferred_format(prim) != format::b_fs_yx_fsv4) {
|
||||
weight_sum += num_spatial;
|
||||
weighted_sum_feature_size += num_spatial * num_feature;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
size_t weighted_average_feature_depth = weighted_sum_feature_size / std::max(weight_sum, static_cast<size_t>(1));
|
||||
|
||||
// Need to confirm that weighted_average_feature_depth > 1 to keep unittest behavior.
|
||||
if (is_quantized_int8_model && weighted_average_feature_depth < 8 && weighted_average_feature_depth > 1) {
|
||||
lo.set_optimization_attribute(layout_optimizer::optimization_attributes_type::fs_b_yx_fsv32_network, 0);
|
||||
lo.set_optimization_attribute(layout_optimizer::optimization_attributes_type::b_fs_yx_fsv16_network, 0);
|
||||
lo.set_optimization_attribute(layout_optimizer::optimization_attributes_type::bs_fs_yx_bsv16_fsv16_network, 0);
|
||||
}
|
||||
}
|
||||
|
@ -34,6 +34,8 @@ namespace ngraph
|
||||
NGRAPH_API
|
||||
bool is_output(const ngraph::Node* node);
|
||||
NGRAPH_API
|
||||
bool is_sink(const ngraph::Node* node);
|
||||
NGRAPH_API
|
||||
bool is_constant(const ngraph::Node* node);
|
||||
NGRAPH_API
|
||||
bool is_commutative(const ngraph::Node* node);
|
||||
@ -60,6 +62,8 @@ namespace ngraph
|
||||
NGRAPH_API
|
||||
bool is_output(const std::shared_ptr<ngraph::Node>& node);
|
||||
NGRAPH_API
|
||||
bool is_sink(const std::shared_ptr<ngraph::Node>& node);
|
||||
NGRAPH_API
|
||||
bool is_constant(const std::shared_ptr<ngraph::Node>& node);
|
||||
NGRAPH_API
|
||||
bool is_commutative(const std::shared_ptr<ngraph::Node>& node);
|
||||
|
@ -76,6 +76,11 @@ bool ngraph::op::is_output(const ngraph::Node* node)
|
||||
return dynamic_cast<const ngraph::op::Result*>(node) != nullptr;
|
||||
}
|
||||
|
||||
bool ngraph::op::is_sink(const ngraph::Node* node)
|
||||
{
|
||||
return dynamic_cast<const ngraph::op::Sink*>(node) != nullptr;
|
||||
}
|
||||
|
||||
bool ngraph::op::is_constant(const ngraph::Node* node)
|
||||
{
|
||||
return dynamic_cast<const ngraph::op::Constant*>(node) != nullptr;
|
||||
@ -134,6 +139,10 @@ bool ngraph::op::is_output(const std::shared_ptr<ngraph::Node>& node)
|
||||
{
|
||||
return is_output(node.get());
|
||||
}
|
||||
bool ngraph::op::is_sink(const std::shared_ptr<ngraph::Node>& node)
|
||||
{
|
||||
return is_sink(node.get());
|
||||
}
|
||||
bool ngraph::op::is_constant(const std::shared_ptr<ngraph::Node>& node)
|
||||
{
|
||||
return is_constant(node.get());
|
||||
|
@ -7,7 +7,7 @@ skip_install=True
|
||||
deps =
|
||||
-rrequirements.txt
|
||||
-rrequirements_test.txt
|
||||
mypy
|
||||
mypy<0.900
|
||||
flake8-bugbear
|
||||
pytest-xdist
|
||||
setenv =
|
||||
|
@ -288,7 +288,7 @@ set_source_files_properties(includes.cpp PROPERTIES COMPILE_DEFINITIONS
|
||||
if (ENABLE_MKL_DNN)
|
||||
message(STATUS "NGRAPH_TESTS: IE:CPU enabled")
|
||||
set(ACTIVE_BACKEND_LIST ${ACTIVE_BACKEND_LIST} "IE:CPU")
|
||||
if (NOT ENABLE_STRICT_DEPENDENCIES)
|
||||
if (ENABLE_STRICT_DEPENDENCIES)
|
||||
# For convinience add a runtime dependency to build along with this target.
|
||||
# Warning: Parallel build with -GNinja may not be efficient.
|
||||
list(APPEND UNIT_TESTS_DEPENDENCIES MKLDNNPlugin)
|
||||
@ -298,7 +298,7 @@ endif()
|
||||
if (ENABLE_CLDNN)
|
||||
message(STATUS "NGRAPH_TESTS: IE:GPU enabled")
|
||||
set(ACTIVE_BACKEND_LIST ${ACTIVE_BACKEND_LIST} "IE:GPU")
|
||||
if (NOT ENABLE_STRICT_DEPENDENCIES)
|
||||
if (ENABLE_STRICT_DEPENDENCIES)
|
||||
# For convinience add a runtime dependency to build along with this target.
|
||||
# Warning: Parallel build with -GNinja may not be efficient.
|
||||
list(APPEND UNIT_TESTS_DEPENDENCIES clDNNPlugin)
|
||||
|
Loading…
Reference in New Issue
Block a user