diff --git a/.ci/azure/linux.yml b/.ci/azure/linux.yml
index f45f4e410c6..146775f6189 100644
--- a/.ci/azure/linux.yml
+++ b/.ci/azure/linux.yml
@@ -112,6 +112,7 @@ jobs:
         -DNGRAPH_ONNX_IMPORT_ENABLE=ON
         -DNGRAPH_ONNX_EDITOR_ENABLE=ON
         -DENABLE_FASTER_BUILD=ON
+        -DENABLE_STRICT_DEPENDENCIES=OFF
         -DIE_EXTRA_MODULES=$(OPENVINO_CONTRIB_REPO_DIR)/modules
         $(REPO_DIR)
     workingDirectory: $(BUILD_DIR)
diff --git a/.ci/azure/mac.yml b/.ci/azure/mac.yml
index 680ef281ac2..04d4c16ea23 100644
--- a/.ci/azure/mac.yml
+++ b/.ci/azure/mac.yml
@@ -90,7 +90,7 @@ jobs:
     # Disable errors with Ninja
     export CXXFLAGS="-Wno-error=unused-command-line-argument"
     export CFLAGS="-Wno-error=unused-command-line-argument"
-    cmake -GNinja -DVERBOSE_BUILD=ON -DCMAKE_BUILD_TYPE=$(BUILD_TYPE) -DENABLE_PYTHON=ON -DENABLE_TESTS=ON -DIE_EXTRA_MODULES=$(OPENVINO_CONTRIB_REPO_DIR)/modules $(REPO_DIR)
+    cmake -GNinja -DVERBOSE_BUILD=ON -DCMAKE_BUILD_TYPE=$(BUILD_TYPE) -DENABLE_PYTHON=ON -DENABLE_TESTS=ON -DENABLE_STRICT_DEPENDENCIES=OFF -DIE_EXTRA_MODULES=$(OPENVINO_CONTRIB_REPO_DIR)/modules $(REPO_DIR)
   workingDirectory: $(BUILD_DIR)
   displayName: 'CMake'
diff --git a/.ci/azure/windows.yml b/.ci/azure/windows.yml
index 6b4e5203dd0..21a36392e33 100644
--- a/.ci/azure/windows.yml
+++ b/.ci/azure/windows.yml
@@ -92,7 +92,7 @@ jobs:
 - script: |
     set PATH=$(WORK_DIR)\ninja-win;%PATH%
-    call "$(MSVS_VARS_PATH)" && cmake -GNinja -DENABLE_FASTER_BUILD=ON -DENABLE_TEMPLATE_PLUGIN=ON -DCMAKE_BUILD_TYPE=$(BUILD_TYPE) -DENABLE_TESTS=ON -DIE_EXTRA_MODULES=$(OPENVINO_CONTRIB_REPO_DIR)\modules -DCMAKE_C_COMPILER:PATH="$(MSVC_COMPILER_PATH)" -DCMAKE_CXX_COMPILER:PATH="$(MSVC_COMPILER_PATH)" $(REPO_DIR)
+    call "$(MSVS_VARS_PATH)" && cmake -GNinja -DENABLE_FASTER_BUILD=ON -DENABLE_TEMPLATE_PLUGIN=ON -DCMAKE_BUILD_TYPE=$(BUILD_TYPE) -DENABLE_TESTS=ON -DENABLE_STRICT_DEPENDENCIES=OFF -DIE_EXTRA_MODULES=$(OPENVINO_CONTRIB_REPO_DIR)\modules -DCMAKE_C_COMPILER:PATH="$(MSVC_COMPILER_PATH)" -DCMAKE_CXX_COMPILER:PATH="$(MSVC_COMPILER_PATH)" $(REPO_DIR)
   workingDirectory: $(BUILD_DIR)
   displayName: 'CMake'
diff --git a/cmake/features.cmake b/cmake/features.cmake
index 7518c99c868..aff805adb15 100644
--- a/cmake/features.cmake
+++ b/cmake/features.cmake
@@ -6,7 +6,7 @@ ie_dependent_option (ENABLE_MKL_DNN "MKL-DNN plugin for inference engine" ON "X8
 ie_option (ENABLE_TESTS "unit, behavior and functional tests" OFF)
-ie_option (ENABLE_STRICT_DEPENDENCIES "Skip configuring \"convinient\" dependencies for efficient parallel builds" OFF)
+ie_option (ENABLE_STRICT_DEPENDENCIES "Skip configuring \"convinient\" dependencies for efficient parallel builds" ON)
 ie_dependent_option (ENABLE_CLDNN "clDnn based plugin for inference engine" ON "X86_64;NOT APPLE;NOT MINGW;NOT WINDOWS_STORE;NOT WINDOWS_PHONE" OFF)
diff --git a/docs/IE_DG/Intro_to_Performance.md b/docs/IE_DG/Intro_to_Performance.md
index 94d0173dbbe..0c9457ed4bf 100644
--- a/docs/IE_DG/Intro_to_Performance.md
+++ b/docs/IE_DG/Intro_to_Performance.md
@@ -22,7 +22,8 @@ $ benchmark_app -m <model> -enforcebf16=false
 Notice that for quantized (e.g. INT8) models the bfloat16 calculations (of the layers that remain in FP32) is disabled by default. Refer to the [CPU Plugin documentation](supported_plugins/CPU.md) for more details.
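The documentation hunk above references the `-enforcebf16=false` switch of benchmark_app. As an illustrative aside (not part of this patch), the same behavior can be requested programmatically through the plugin configuration. This is a minimal sketch assuming the 2021.x `InferenceEngine::Core` API and the `KEY_ENFORCE_BF16` key from `ie_plugin_config.hpp`; the model path is a placeholder:

```cpp
#include <ie_core.hpp>
#include <ie_plugin_config.hpp>

int main() {
    InferenceEngine::Core core;

    // Keep non-quantized layers in FP32 instead of bfloat16 on CPUs with native bf16 support;
    // this mirrors the effect of `benchmark_app -enforcebf16=false`.
    core.SetConfig({{CONFIG_KEY(ENFORCE_BF16), CONFIG_VALUE(NO)}}, "CPU");

    auto network = core.ReadNetwork("model.xml");  // placeholder path, not taken from this patch
    auto executableNetwork = core.LoadNetwork(network, "CPU");
    return 0;
}
```

The same key can alternatively be passed per network through the config argument of `Core::LoadNetwork`.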
-Similarly, the GPU device has a dedicated config key to enable FP16 execution of the layers that remain in FP32 in the quantized models (as the quantization is typically performed on the FP32 models), refer to the ENABLE_FP16_FOR_QUANTIZED_MODELS key in the [GPU Plugin documentation](supported_plugins/GPU.md)
+Similarly, when a quantized model is produced from an FP16 original, the GPU device automatically runs the layers that remain unquantized in FP16 precision.
+Refer to the ENABLE_FP16_FOR_QUANTIZED_MODELS key in the [GPU Plugin documentation](supported_plugins/GPU.md).

 ## Latency vs. Throughput
 One way to increase computational efficiency is batching, which combines many (potentially tens) of
diff --git a/inference-engine/include/ie_version.hpp b/inference-engine/include/ie_version.hpp
index 13215d0b68d..10e649a09d3 100644
--- a/inference-engine/include/ie_version.hpp
+++ b/inference-engine/include/ie_version.hpp
@@ -20,8 +20,8 @@
  * @brief Defines Inference Engine patch version
  */
-#define IE_VERSION_MAJOR 2021
-#define IE_VERSION_MINOR 4
+#define IE_VERSION_MAJOR 2022
+#define IE_VERSION_MINOR 1
 #define IE_VERSION_PATCH 0

 #include "ie_api.h"
diff --git a/inference-engine/src/gna_plugin/backend/gna_limitations.hpp b/inference-engine/src/gna_plugin/backend/gna_limitations.hpp
index 9b0eccaea59..3a283cae895 100644
--- a/inference-engine/src/gna_plugin/backend/gna_limitations.hpp
+++ b/inference-engine/src/gna_plugin/backend/gna_limitations.hpp
@@ -10,13 +10,18 @@
 namespace GNAPluginNS {
 namespace GNALimitations {

+constexpr uint32_t bufferMaxSize = 65528;
+
 constexpr uint32_t convMinFiltersNum = 4;
 constexpr uint32_t convMaxFiltersNum = 65532;
 constexpr uint32_t convFiltersNumDivider = 4;
+constexpr uint32_t convFilterMaxSize = 768;
 constexpr uint32_t convEachKernelByteAlignment = 16;
 constexpr uint32_t noOfInputsDivisor = 8;
 constexpr uint32_t noOfInputsLowPrecDivisor = 16;

+constexpr uint32_t affineMaxBatchSize = 8;
+
 namespace Cnn2D {
 struct RangeLimit {
     uint32_t min;
diff --git a/inference-engine/src/gna_plugin/frontend/scale_factor_calc.hpp b/inference-engine/src/gna_plugin/frontend/scale_factor_calc.hpp
index 3c1fdaac0e7..11f13a7a9ac 100644
--- a/inference-engine/src/gna_plugin/frontend/scale_factor_calc.hpp
+++ b/inference-engine/src/gna_plugin/frontend/scale_factor_calc.hpp
@@ -370,14 +370,8 @@ class ScaleFactorPerLayer {
             auto minOutValue = quantizedParams->_dst_quant.GetMinValues().front();
             auto maxOutValue = quantizedParams->_dst_quant.GetMaxValues().front();
             auto absMax = std::max(std::abs(minOutValue), std::abs(maxOutValue));
-            auto absMin = std::min(std::abs(minOutValue), std::abs(maxOutValue));

             result = (quantizedParams->_dst_quant.GetLevels() - 1) / (maxOutValue - minOutValue);
-            if (0 && fp32eq(absMin, 0.0f) && !fp32eq(absMax, 0.0f)) {
-                result = (quantizedParams->_dst_quant.GetLevels() - 1) / (2 * absMax);
-            }
-            //
-            //result = MAX_VAL_2B_FEAT / absMax;
             if (std::isinf(result) || fp32eq(absMax, 0.0f)) {
                 result = max_activation_scale_factor;
             }
@@ -401,6 +395,7 @@ class ScaleFactorPerLayer {
                 (layer.isIdentity() || layer.isFakeQuantize()) && LayerInfo(prevLayer).isWeightableIdentity()) {
                 auto prevLayerQuant = InferenceEngine::getInjectedData(*prevLayer);
                 if (!fp32eq(prevLayerQuant->_src_quant.GetScale(), 1.0f) &&
+                    prevLayerQuant->_src_quant.IsStatsSet() &&
                     (prevLayer2 == nullptr || LayerInfo(prevLayer2).has8BOr16BOutput())) {
                     result = prevLayerQuant->_src_quant.GetScale();
                     usePrevScaleFactor = true;
diff --git a/inference-engine/src/gna_plugin/gna_graph_compiler.cpp
b/inference-engine/src/gna_plugin/gna_graph_compiler.cpp index 23685b4734f..bf44e437af0 100644 --- a/inference-engine/src/gna_plugin/gna_graph_compiler.cpp +++ b/inference-engine/src/gna_plugin/gna_graph_compiler.cpp @@ -158,25 +158,27 @@ void GNAGraphCompiler::fillSplitConnections(InferenceEngine::CNNLayerPtr layer) THROW_GNA_LAYER_EXCEPTION(layer) << " outData["<< i << "]" << " connected by " << j <<" connection doesnt connect to functional layer"; } - auto dataOutput = outFunctionalLayer.first->insData[outFunctionalLayer.second].lock(); + for (int idx : outFunctionalLayer.second) { + auto dataOutput = outFunctionalLayer.first->insData[idx].lock(); - padding = std::max(padding, LayerInfo(outFunctionalLayer.first).paddingSize()) - * dataOutput->getPrecision().size(); - output_layer_size = - InferenceEngine::details::product(begin(dataOutput->getDims()), - end(dataOutput->getDims())) * dataOutput->getPrecision().size(); + padding = std::max(padding, LayerInfo(outFunctionalLayer.first).paddingSize()) + * dataOutput->getPrecision().size(); + output_layer_size = + InferenceEngine::details::product(begin(dataOutput->getDims()), + end(dataOutput->getDims())) * dataOutput->getPrecision().size(); - if (LayerInfo(outFunctionalLayer.first).isAffineFilter()) { - size_t aligned64_offset = outFunctionalLayer.first->GetParamAsInt("offset"); - layerInfoItem.splitOutputLayers.emplace_back( - outFunctionalLayer.first, - outFunctionalLayer.second, - aligned64_offset * dataOutput->getPrecision().size(), - output_layer_size); - } else { - layerInfoItem.splitOutputLayers.emplace_back( - outFunctionalLayer.first, outFunctionalLayer.second, split_size, output_layer_size); - } + if (LayerInfo(outFunctionalLayer.first).isAffineFilter()) { + size_t aligned64_offset = outFunctionalLayer.first->GetParamAsInt("offset"); + layerInfoItem.splitOutputLayers.emplace_back( + outFunctionalLayer.first, + idx, + aligned64_offset * dataOutput->getPrecision().size(), + output_layer_size); + } else { + layerInfoItem.splitOutputLayers.emplace_back( + outFunctionalLayer.first, idx, split_size, output_layer_size); + } + } } // in case of unconnected split - we need properly increment size diff --git a/inference-engine/src/gna_plugin/gna_graph_tools.hpp b/inference-engine/src/gna_plugin/gna_graph_tools.hpp index e9cf70790ac..51701268209 100644 --- a/inference-engine/src/gna_plugin/gna_graph_tools.hpp +++ b/inference-engine/src/gna_plugin/gna_graph_tools.hpp @@ -155,14 +155,14 @@ inline InferenceEngine::CNNLayerPtr CNNNetPrevLayerSkipCertain(Layer layer, int */ template -inline std::pair CNNNetCheckNextLayerSkipCertain(Layer layer, int oidx, int iidx, bool bOnlyCheck, +inline std::pair> CNNNetCheckNextLayerSkipCertain(Layer layer, int oidx, int iidx, bool bOnlyCheck, const std::function &shouldSkip) { if (oidx >= layer->outData.size()) { - if (bOnlyCheck) return {nullptr, 0}; + if (bOnlyCheck) return {nullptr, {}}; THROW_GNA_LAYER_EXCEPTION(layer) << " no next output layer for outdata: " << oidx; } if (getInputTo(layer->outData[oidx]).empty() || iidx >= getInputTo(layer->outData[oidx]).size()) { - if (bOnlyCheck) return {nullptr, 0}; + if (bOnlyCheck) return {nullptr, {}}; THROW_GNA_LAYER_EXCEPTION(layer) << " no next output layer for outdata: " << oidx << " and inputTo index: " << iidx; } @@ -174,12 +174,12 @@ inline std::pair CNNNetCheckNextLayerSkipCer while (shouldSkip(outLayer->second)) { if (outLayer->second->outData.size() <= new_oidx) { - if (bOnlyCheck) return { nullptr, 0 }; + if (bOnlyCheck) return { nullptr, {} }; 
THROW_GNA_LAYER_EXCEPTION(outLayer->second) << " no next output layer for outdata: " << new_oidx; } if (getInputTo(outLayer->second->outData[new_oidx]).size() <= new_iidx) { - if (bOnlyCheck) return { nullptr, 0 }; + if (bOnlyCheck) return { nullptr, {} }; THROW_GNA_LAYER_EXCEPTION(outLayer->second) << " no next output layer for outdata: " << new_oidx << " and inputTo index: " << new_iidx; } @@ -188,11 +188,7 @@ inline std::pair CNNNetCheckNextLayerSkipCer } auto insDataIdx = CNNLayerFindInsDataIdxes(layer->outData[new_oidx], outLayer->second); - if (insDataIdx.size() != 1) { - if (bOnlyCheck) return { nullptr, 0 }; - THROW_GNA_LAYER_EXCEPTION(layer) << " has multiple connection to " << new_oidx << " outData"; - } - return { outLayer->second, insDataIdx.front() }; + return { outLayer->second, insDataIdx }; } /** @@ -256,7 +252,7 @@ inline std::pair CNNNetCheckNextLayerSkipCer /// @brief alias for strict checkNextLayer (false) template -inline std::pair CNNNetGetNextLayerSkipCertain(Layer layer, int oidx, int iidx, +inline std::pair> CNNNetGetNextLayerSkipCertain(Layer layer, int oidx, int iidx, const std::function &shouldSkip) { return CNNNetCheckNextLayerSkipCertain(layer, oidx, iidx, false, shouldSkip); } diff --git a/inference-engine/src/gna_plugin/gna_groups.hpp b/inference-engine/src/gna_plugin/gna_groups.hpp index 21abe5d0124..2449338821c 100644 --- a/inference-engine/src/gna_plugin/gna_groups.hpp +++ b/inference-engine/src/gna_plugin/gna_groups.hpp @@ -46,14 +46,10 @@ inline InferenceEngine::DataPtr Get2DReshapedData(InferenceEngine::DataPtr input * @param layer */ inline bool HasTo2DReshapeData(InferenceEngine::CNNLayerPtr layer) { - if (GNAPluginNS::LayerInfo(layer).isPower()) + if (GNAPluginNS::LayerInfo(layer).isPower() || GNAPluginNS::LayerInfo(layer).isCopy()) return true; - if (!GNAPluginNS::LayerInfo(layer).isScaleShift()) - return false; - - // Don't reshape user-defined ScaleShift layers - if (layer->name.rfind("SyntheticScaleShift", 0) == std::string::npos) + if (!GNAPluginNS::LayerInfo(layer).isSyntheticScaleShift()) return false; // Don't reshape the first dnn layer since it breaks groups recognition @@ -61,8 +57,7 @@ inline bool HasTo2DReshapeData(InferenceEngine::CNNLayerPtr layer) { return LayerInfo(ptr).isNonValuesChangable(); }); IE_ASSERT(prevLayer != nullptr); - if (LayerInfo(prevLayer).isInput()) - return false; + if (LayerInfo(prevLayer).isInput()) return false; // Don't reshape diagonallayers with bias connection return !GNAPluginNS::LayerInfo(getCreatorLayer(layer->insData.front().lock()).lock()).has32BOutput(); diff --git a/inference-engine/src/gna_plugin/gna_model_serial.cpp b/inference-engine/src/gna_plugin/gna_model_serial.cpp index fdb99d7f273..e32ded8a9e3 100644 --- a/inference-engine/src/gna_plugin/gna_model_serial.cpp +++ b/inference-engine/src/gna_plugin/gna_model_serial.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #endif @@ -133,10 +134,11 @@ GNAPluginNS::HeaderLatest::ModelHeader GNAModelSerial::ReadHeader(std::istream & } case 5: case 6: + case 7: readNBytes(&header, sizeof(HeaderLatest::ModelHeader), is); break; default: - THROW_GNA_EXCEPTION << "Imported file unsupported. minor version should have values in range 1 to 4 and is: " << header.version.minor; + THROW_GNA_EXCEPTION << "Imported file unsupported. 
minor version should have values in range 1 to 7 and is: " << header.version.minor; } break; default: @@ -154,6 +156,40 @@ GNAPluginNS::HeaderLatest::ModelHeader GNAModelSerial::ReadHeader(std::istream & return header; } +GNAPluginNS::HeaderLatest::RuntimeEndPoint GNAModelSerial::ReadEndPoint(std::istream &is) { + is.exceptions(std::istream::failbit); + + HeaderLatest::RuntimeEndPoint endPoint; + switch (modelHeader.version.major) { + case 2: + switch (modelHeader.version.minor) { + case 1: + case 2: + case 3: + case 4: + case 5: + case 6: + { + Header2dot6::RuntimeEndPoint tempEndPoint2dot6; + readBits(tempEndPoint2dot6, is); + endPoint = HeaderLatest::RuntimeEndPoint(tempEndPoint2dot6, modelHeader.nGroup); + break; + } + case 7: + readNBytes(&endPoint, sizeof(HeaderLatest::RuntimeEndPoint), is); + break; + default: + THROW_GNA_EXCEPTION << "Imported file unsupported. minor version should have values in range 1 to 7 and is: " << modelHeader.version.minor; + } + break; + default: + THROW_GNA_EXCEPTION << "Imported file unsupported. Import for files with major version equal to: " + << modelHeader.version.major << " is not implemented"; + } + + return endPoint; +} + #define offsetFromBase(field)\ getOffsetFromBase(field, #field) @@ -324,18 +360,6 @@ void GNAModelSerial::Import(void *basePointer, is.read(reinterpret_cast(basePointer), gnaGraphSize); } - -uint32_t guessGrouping(Gna2Model const& model) { - if (model.NumberOfOperations == 0 || - model.Operations == nullptr || - model.Operations[0].Operands == nullptr || - model.Operations[0].NumberOfOperands == 0 || - model.Operations[0].Operands[0]->Shape.NumberOfDimensions < 2) { - THROW_GNA_EXCEPTION << "Can not guess grouping"; - } - return (std::min)(model.Operations[0].Operands[0]->Shape.Dimensions[0], model.Operations[0].Operands[0]->Shape.Dimensions[1]); -} - void GNAModelSerial::Export(void * basePointer, size_t gnaGraphSize, std::ostream & os) const { os.exceptions(std::ostream::failbit); @@ -366,6 +390,9 @@ void GNAModelSerial::Export(void * basePointer, size_t gnaGraphSize, std::ostrea out.descriptor_offset = offsetFromBase(ep.descriptor_ptr); out.scaleFactor = ep.scaleFactor; out.element_size = ep.element_size; + out.shape = ep.shape; + out.layout = ep.layout; + out.precision = ep.precision; out.orientation = ep.orientation; return out; }; @@ -381,7 +408,7 @@ void GNAModelSerial::Export(void * basePointer, size_t gnaGraphSize, std::ostrea header.headerSize = sizeof(HeaderLatest::ModelHeader); header.gnaMemSize = gnaGraphSize; header.layersCount = layers.size(); - header.nGroup = guessGrouping(*gna2Model); + header.nGroup = 1; // just to support the old models header.nInputs = inputs.size(); header.nOutputs = outputs.size(); header.nTransposeInputs = transposeInputsInfo.size(); @@ -796,13 +823,22 @@ std::vector GNAModelSerial::serializeOutputs(cons std::size_t outputIndex = 0; for (auto const &output : outputsDataMap) { auto outputName = output.first; - auto inputDims = output.second->getTensorDesc().getDims(); - uint32_t elementsCount = static_cast(InferenceEngine::details::product(inputDims.begin(), inputDims.end())); - + auto outputDims = output.second->getTensorDesc().getDims(); + HeaderLatest::RuntimeEndPoint::Shape outputShape; + outputShape.NumberOfDimensions = outputDims.size(); + for (size_t i=0; i < outputShape.NumberOfDimensions; ++i) { + outputShape.Dimensions[i] = static_cast(outputDims[i]); + } + uint32_t elementsCount = static_cast(InferenceEngine::details::product(outputDims.begin(), outputDims.end())); + 
InferenceEngine::Layout outputLayout = output.second->getLayout(); + InferenceEngine::Precision::ePrecision outputPrecision = InferenceEngine::Precision::FP32; HeaderLatest::RuntimeEndPoint endPoint(outputsDesc[outputIndex].scale_factor, outputsDesc[outputIndex].ptrs[0], outputsDesc[outputIndex].num_bytes_per_element, elementsCount, + outputShape, + outputLayout, + outputPrecision, outputsDesc[outputIndex].orientation); endPoints.push_back(endPoint); outputIndex++; @@ -818,18 +854,26 @@ std::vector GNAModelSerial::serializeInputs(const for (auto const& input : inputsDataMap) { auto inputName = input.first; auto inputDims = input.second->getTensorDesc().getDims(); - + HeaderLatest::RuntimeEndPoint::Shape inputShape; + inputShape.NumberOfDimensions = inputDims.size(); + for (size_t i=0; i < inputShape.NumberOfDimensions; ++i) { + inputShape.Dimensions[i] = static_cast(inputDims[i]); + } double scaleFactor = inputDesc->getScaleFactor(inputIndex); std::vector descriptor_ptr = inputDesc->getPtrInputsGlobal(inputName); IE_ASSERT(descriptor_ptr.size() > 0); uint32_t element_size = 2u; uint32_t elementsCount = static_cast(InferenceEngine::details::product(inputDims.begin(), inputDims.end())); intel_dnn_orientation_t orientation = inputDesc->getOrientation(inputName); - + InferenceEngine::Layout inputLayout = input.second->getLayout(); + InferenceEngine::Precision::ePrecision inputPrecision = InferenceEngine::Precision::FP32; HeaderLatest::RuntimeEndPoint endPoint(scaleFactor, descriptor_ptr[0], element_size, elementsCount, + inputShape, + inputLayout, + inputPrecision, orientation); endPoints.push_back(endPoint); inputIndex++; @@ -846,20 +890,24 @@ void GNAModelSerial::ImportInputs(std::istream &is, for (uint32_t inputIndex = 0; inputIndex < modelHeader.nInputs; inputIndex++) { const std::string& name = (modelHeader.version.major == 2 && modelHeader.version.minor >= 3) ? inputNames.at(inputIndex) : std::string("input" + std::to_string(inputIndex)); - HeaderLatest::RuntimeEndPoint input; - is.read(reinterpret_cast(&input), sizeof(input)); + + HeaderLatest::RuntimeEndPoint input = ReadEndPoint(is); inputsDesc->getPtrInputsGlobal(name).push_back(reinterpret_cast(reinterpret_cast (basePtr) + input.descriptor_offset)); inputsDesc->orientation_in[name] = input.orientation; inputsDesc->bytes_allocated_for_input[name] = input.element_size * input.elements_count; - auto inputDims = InferenceEngine::SizeVector({modelHeader.nGroup, input.elements_count / modelHeader.nGroup}); - + auto inputDims = InferenceEngine::SizeVector(); + for (auto i = 0; i < input.shape.NumberOfDimensions; ++i) { + inputDims.push_back(input.shape.Dimensions[i]); + } + InferenceEngine::Layout inputLayout = static_cast(input.layout); + InferenceEngine::Precision inputPresicion = InferenceEngine::Precision(static_cast(input.precision)); dataMap[name] = std::make_shared(); dataMap[name]->setInputData(std::make_shared(name, InferenceEngine::TensorDesc( - InferenceEngine::Precision::FP32, + inputPresicion, inputDims, - InferenceEngine::Layout::NC))); + inputLayout))); inputsDesc->inputScaleFactors.push_back(input.scaleFactor); } } @@ -875,8 +923,8 @@ void GNAModelSerial::ImportOutputs(std::istream &is, for (uint32_t outputIndex = 0; outputIndex < modelHeader.nOutputs; outputIndex++) { const std::string& name = (modelHeader.version.major == 2 && modelHeader.version.minor >= 3) ? 
outputNames.at(outputIndex) : std::string("output" + std::to_string(outputIndex)); - HeaderLatest::RuntimeEndPoint output; - is.read(reinterpret_cast(&output), sizeof(output)); + + HeaderLatest::RuntimeEndPoint output = ReadEndPoint(is); OutputDesc description; description.ptrs.push_back(reinterpret_cast(reinterpret_cast (basePtr) + output.descriptor_offset)); description.orientation = kDnnInterleavedOrientation; @@ -884,12 +932,17 @@ void GNAModelSerial::ImportOutputs(std::istream &is, description.num_bytes_per_element = output.element_size; description.scale_factor = output.scaleFactor; - auto outputDims = InferenceEngine::SizeVector({modelHeader.nGroup, output.elements_count / modelHeader.nGroup}); + auto outputDims = InferenceEngine::SizeVector(); + for (auto i = 0; i < output.shape.NumberOfDimensions; ++i) { + outputDims.push_back(output.shape.Dimensions[i]); + } + InferenceEngine::Layout outputLayout = static_cast(output.layout); + InferenceEngine::Precision outputPresicion = InferenceEngine::Precision(static_cast(output.precision)); dataMap[name] = std::make_shared(name, InferenceEngine::TensorDesc( - InferenceEngine::Precision::FP32, + outputPresicion, outputDims, - InferenceEngine::Layout::NC)); + outputLayout)); desc.at(outputIndex) = description; } } diff --git a/inference-engine/src/gna_plugin/gna_model_serial.hpp b/inference-engine/src/gna_plugin/gna_model_serial.hpp index d756a23f9fc..f5310d826c4 100644 --- a/inference-engine/src/gna_plugin/gna_model_serial.hpp +++ b/inference-engine/src/gna_plugin/gna_model_serial.hpp @@ -138,6 +138,8 @@ private: */ static GNAPluginNS::HeaderLatest::ModelHeader ReadHeader(std::istream &is); + GNAPluginNS::HeaderLatest::RuntimeEndPoint ReadEndPoint(std::istream &is); + /** * @brief Import model from FS into preallocated buffer, * buffers for pLayers, and pStructs are allocated here and required manual deallocation using mm_free diff --git a/inference-engine/src/gna_plugin/gna_plugin.cpp b/inference-engine/src/gna_plugin/gna_plugin.cpp index e76eafa6d53..f49d543def1 100644 --- a/inference-engine/src/gna_plugin/gna_plugin.cpp +++ b/inference-engine/src/gna_plugin/gna_plugin.cpp @@ -54,12 +54,17 @@ #include #include #include +#include #include "transformations/remove_extra_reshapes.hpp" #include "transformations/insert_transpose_after_convolution_or_pooling.hpp" #include "transformations/insert_transpose_before_matmul.hpp" #include "transformations/reorder_activation_and_pooling.hpp" #include "transformations/swap_input_matmul_gna.hpp" +#include "transformations/convert_matmul_to_pointwise_convolution.hpp" +#include "transformations/split_convolution_with_large_buffer_size.hpp" + +#include #if GNA_LIB_VER == 2 #include @@ -667,6 +672,15 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) { // WA: ConvertPriorBox must be executed before the 1st ConstantFolding pass manager.register_pass(); manager.register_pass(); + // TODO enable this transformation for networks with convolutions + if (!ngraph::op::util::has_op_with_type(graph)) { + manager.register_pass(); + manager.register_pass(); + manager.register_pass(); + } + manager.register_pass(); + manager.register_pass(); + manager.register_pass(); manager.register_pass(); manager.register_pass(); manager.register_pass(); @@ -735,6 +749,7 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) { passes->registerPass(); passes->registerPass(); + passes->registerPass(); passes->registerPass(); passes->registerPass(); passes->registerPass(); @@ -753,7 +768,6 @@ void GNAPlugin::LoadNetwork(CNNNetwork & 
_network) { passes->registerPass(); passes->registerPass(); - passes->registerPass(); passes->registerPass(); passes->registerPass(); #if GNA_LIB_VER == 2 @@ -1465,7 +1479,11 @@ static InferenceEngine::Layout GetLayoutForDims(const InferenceEngine::SizeVecto Blob::Ptr GNAPlugin::GetOutputBlob(const std::string& name, InferenceEngine::Precision precision) { // need to have intermediate blob for interleave conversion InferenceEngine::Blob::Ptr outputBlob; - auto outputDims = outputsDataMap[name]->getTensorDesc().getDims(); + auto outputDataIt = outputsDataMap.find(name); + if (outputDataIt == std::end(outputsDataMap)) { + THROW_GNA_EXCEPTION << "Output " << name << " isn't found"; + } + auto outputDims = outputDataIt->second->getTensorDesc().getDims(); outputBlob = make_blob_with_precision(TensorDesc(precision, outputDims, GetLayoutForDims(outputDims))); outputBlob->allocate(); return outputBlob; @@ -1475,7 +1493,11 @@ Blob::Ptr GNAPlugin::GetInputBlob(const std::string& name, InferenceEngine::Prec InferenceEngine::Blob::Ptr inputBlob; // need to have intermediate blob for interleave conversion // TODO: NCHW format support is experimental = c++ MO did insert reshape, while TF mo - not - auto inputDims = inputsDataMap[name]->getTensorDesc().getDims(); + auto inputDataIt = inputsDataMap.find(name); + if (inputDataIt == std::end(inputsDataMap)) { + THROW_GNA_EXCEPTION << "Input " << name << " isn't found"; + } + auto inputDims = inputDataIt->second->getTensorDesc().getDims(); inputBlob = make_blob_with_precision(TensorDesc(precision, inputDims, GetLayoutForDims(inputDims))); inputBlob->allocate(); return inputBlob; diff --git a/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp b/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp index 4d3b71b9622..b8962cebd36 100644 --- a/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp +++ b/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp @@ -86,7 +86,7 @@ static void insertDiagonalLayerBetween(InferenceEngine::CNNLayerPtr prevLayer, }); IE_ASSERT(inputLayer != nullptr); size_t weightsSize = (LayerInfo(prevLayer).has32BOutput() || LayerInfo(inputLayer).isInput()) ? 
- weightsSize = nextLayer->outData[0]->getDims().back() : + nextLayer->outData[0]->getDims().back() : Get2DReshapedData(nextLayer->outData[0], 8)->getDims()[1]; std::vector weightsValues(weightsSize, fillValue); IE_ASSERT(diagLayer != nullptr); @@ -314,6 +314,7 @@ void HandleMultipleActivationsForTheLayerPass::run() { LayerInfo info(inputTo.second); if (info.isActivation()) { + if (odata->getDims().empty()) continue; if (!activations.empty() && odata->getDims()[0] != 1) { THROW_GNA_EXCEPTION << "Unsupported batch size " << odata->getDims()[0] << " for diagonal layer insertion"; @@ -741,12 +742,17 @@ void RemovePermutationsNHWCToNCHWPass::run() { IE_ASSERT(!input_to.empty()); auto current_layer = input_to.begin()->second; setNHWCOrder(current_layer->input()); - while (current_layer != pattern_end) { - setNHWCOrder(current_layer->outData[0]); - input_to = getInputTo(current_layer->outData[0]); - IE_ASSERT(!input_to.empty()); - current_layer = input_to.begin()->second; - } + std::function propogateNHWCOrderRecursive = + [pattern_end, &propogateNHWCOrderRecursive, &setNHWCOrder](CNNLayerPtr current_layer) { + if (current_layer == pattern_end) return; + for (size_t i = 0; i < current_layer->outData.size(); ++i) { + setNHWCOrder(current_layer->outData[i]); + auto input_to = getInputTo(current_layer->outData[i]); + IE_ASSERT(!input_to.empty()); + propogateNHWCOrderRecursive(input_to.begin()->second); + } + }; + propogateNHWCOrderRecursive(current_layer); if (LayerInfo(pattern_start).isPermute() && !getInputTo(pattern_start->outData.front()).empty()) { auto layer_before_permute = CNNNetPrevLayer(pattern_start); @@ -1447,21 +1453,19 @@ void EltwiseSplitOverChannelsPass::run() { THROW_GNA_LAYER_EXCEPTION(l) << "number of outputs expected to be 1"; } auto oData = l->outData.front(); + auto out_width = GetDataDimSize(oData, DataDimName::W); auto totalElementsForOutput = details::product(oData->getDims().begin(), oData->getDims().end()); auto maxAffineElements = getPassManager()->getPolicy().GNAAffineDiagonalPolicy.limitedTo; if (totalElementsForOutput <= maxAffineElements) { continue; } - // TODO: for now lets put split of 2 elements as restrictions auto totalSplits = 1 + totalElementsForOutput / maxAffineElements; - if (totalSplits > 2) { - THROW_GNA_LAYER_EXCEPTION(l) << "split layer over output channels on more than 2 layers unsupported"; - } pass_trace() << "transforming " << LAYER_NAME(l) << " by splitting it to multiple eltwise operations\n"; auto quantized = InferenceEngine::getInjectedData(l); + bool sameInputs = l->insData[0].lock() == l->insData[1].lock(); std::vector splitLayers(2); for (size_t kThEltwiseInput = 0; kThEltwiseInput != 2; kThEltwiseInput++) { // create split layer @@ -1472,31 +1476,38 @@ void EltwiseSplitOverChannelsPass::run() { split->insData.push_back(l->insData[kThEltwiseInput]); auto inputDesc = l->insData[kThEltwiseInput].lock()->getTensorDesc(); - // need to split this desc - if (inputDesc.getLayout() != Layout::NC) { - THROW_GNA_LAYER_EXCEPTION(l) - << "cannot split over channel: input " << std::to_string(kThEltwiseInput) - << " layout need to be NC"; - } // create split layer outputs - for (size_t i = 0;; i++) { - auto elements_num = std::min(totalElementsForOutput - i * maxAffineElements, + size_t usedElements = 0; + for (size_t i = 0; i < totalSplits; i++) { + SizeVector newDims; + size_t elements_num = std::min(totalElementsForOutput - usedElements, static_cast(maxAffineElements)); + if (inputDesc.getDims().size() == 2) { + newDims = SizeVector{1, elements_num}; + 
} else { + elements_num = elements_num - elements_num % out_width; + newDims = SizeVector{1, elements_num / out_width, out_width}; + } - SizeVector newDims = {1, elements_num}; auto newDesc = TensorDesc(inputDesc.getPrecision(), newDims, inputDesc.getLayout()); auto data = std::make_shared(l->name + "/" + std::to_string(kThEltwiseInput) + "/1", newDesc); getCreatorLayer(data) = split; split->outData.push_back(data); - if (elements_num != maxAffineElements) { + usedElements += elements_num; + if (usedElements == totalElementsForOutput) { break; } } // replacing connection X->eltwise to X->split auto oData = CNNLayerFindOutData(l, kThEltwiseInput); oData.second->second = split; + + if (sameInputs) { + splitLayers[1] = splitLayers[0]; + break; + } } // create concatlayer @@ -1507,8 +1518,6 @@ void EltwiseSplitOverChannelsPass::run() { concat->outData.push_back(masterEltwise->outData.front()); getCreatorLayer(masterEltwise->outData.front()) = concat; - - // create new eltwise layers - here 2 hardcode for (size_t k = 0; k != totalSplits; k++) { auto eltwiseRaw = std::make_shared( LayerParams{l->name + "/eltwise/" + std::to_string(k), "Eltwise", Precision::FP32}); @@ -1517,7 +1526,6 @@ void EltwiseSplitOverChannelsPass::run() { eltwiseRaw->coeff = masterEltwise->coeff; auto eltwise = quantized ? InferenceEngine::injectData(eltwiseRaw) : eltwiseRaw; - eltwise->insData.push_back(splitLayers[0]->outData[k]); eltwise->insData.push_back(splitLayers[1]->outData[k]); getInputTo(splitLayers[0]->outData[k])[eltwise->name] = eltwise; @@ -1529,6 +1537,15 @@ void EltwiseSplitOverChannelsPass::run() { auto data = std::make_shared(l->name + "/elwise/out/" + std::to_string(k), newDesc); getCreatorLayer(data) = eltwise; eltwise->outData.push_back(data); + if (quantized) { + auto eltwiseQuant = InferenceEngine::getInjectedData(eltwise); + if (quantized->_src_quant.IsStatsSet()) { + eltwiseQuant->_src_quant.CopyStats(quantized->_src_quant); + } + if (quantized->_dst_quant.IsStatsSet()) { + eltwiseQuant->_dst_quant.CopyStats(quantized->_dst_quant); + } + } getInputTo(data)[concat->name] = concat; concat->insData.push_back(data); } @@ -1919,13 +1936,20 @@ void FuseFQIntoWeightsPass::run() { } GNAFakeQuantizeLayer gnaFakeQuantizeLayer(fqLayer); - size_t layers_connected_to_fq_count = getInputTo(fqLayer->outData[0]).size(); + auto inputTo = getInputTo(fqLayer->outData[0]); + size_t layers_connected_to_fq_count = inputTo.size(); + auto layerBeforeWeightable = fqLayer; + while (layers_connected_to_fq_count == 1 && LayerInfo(inputTo.begin()->second).isNonFunctional()) { + layerBeforeWeightable = inputTo.begin()->second; + inputTo = getInputTo(layerBeforeWeightable->outData[0]); + layers_connected_to_fq_count = inputTo.size(); + } for (int index = 0; index < layers_connected_to_fq_count; index++) { - auto weightableLayer = CNNNetGetNextLayerSkipCertain(fqLayer, 0, index, isNonFunctional).first; + auto weightableLayer = CNNNetGetNextLayerSkipCertain(layerBeforeWeightable, 0, index, isNonFunctional).first; if (!LayerInfo(weightableLayer).isWeightable()) { continue; } - if (weightableLayer->insData.size() != 3) { + if (weightableLayer->insData.size() < 2) { continue; } @@ -1942,7 +1966,8 @@ void FuseFQIntoWeightsPass::run() { pass_trace() << "found " << LAYER_NAME(fqLayer) << " that will be converted to weights of " << LAYER_NAME(weightableLayer) << "\n"; - auto biases = LayerUtils::getParamFromInputAsBlob(weightableLayer, biasesIdx); + auto biases = weightableLayer->insData.size() == 3 ? 
+ LayerUtils::getParamFromInputAsBlob(weightableLayer, biasesIdx) : nullptr; auto quantizedWeights = gnaFakeQuantizeLayer.getConstInputData(); // 1. broke existing connections - by detaching fq subgraph from rest of graph @@ -2149,8 +2174,11 @@ void MoveFakeQuantizeLayerIntoQuantParamsPass :: run() { } GNAFakeQuantizeLayer fqLayer(l); auto prevLayer = CNNNetPrevLayerSkipCertain(*fqLayer, 0, donotSkip); - if (prevLayer->outData.size() != 1) { - THROW_GNA_LAYER_EXCEPTION(prevLayer) << " fake quantize input that connected to something else not supported"; + auto prevDataIt = std::find_if(std::begin(prevLayer->outData), std::end(prevLayer->outData), [l](DataPtr data) { + return getInputTo(data).find(l->name) != std::end(getInputTo(data)); + }); + if (prevDataIt == std::end(prevLayer->outData)) { + THROW_GNA_LAYER_EXCEPTION(fqLayer) << "Invalid connection between " << prevLayer->name << " and " << l->name; } auto inputRange = fqLayer.getInputRange(); @@ -2181,8 +2209,18 @@ void MoveFakeQuantizeLayerIntoQuantParamsPass :: run() { quantParamsPrevLayer->_dst_quant.SetMinValues({ outputRange.first[0] }, false); quantParamsPrevLayer->_dst_quant.SetMaxValues({ outputRange.second[0] }, false); + // Propogate destination statistics to multiply layer if it's set for the next sum/sub layer (is considered as bias) + if (LayerInfo(prevLayer).isEltwiseSum() || LayerInfo(prevLayer).isEltwiseSub()) { + auto eltwPrevLayer = CNNNetPrevLayerSkipCertain(prevLayer, 0, donotSkip); + auto constLayer = CNNNetPrevLayerSkipCertain(prevLayer, 1, donotSkip); + if (LayerInfo(eltwPrevLayer).isEltwise() && LayerInfo(constLayer).isConst()) { + auto quantParamsEltwLayer = InferenceEngine::getInjectedData(eltwPrevLayer); + quantParamsEltwLayer->_dst_quant.CopyStats(quantParamsPrevLayer->_dst_quant); + } + } + auto fqQauntParams = InferenceEngine::getInjectedData(l); - fqQauntParams->_dst_quant.SetLevels(fqLevels); + fqQauntParams->_dst_quant.SetLevels(UINT16_MAX); fqQauntParams->_dst_quant.SetMinValues({ inputRange.first[0] }, true); fqQauntParams->_dst_quant.SetMaxValues({ inputRange.second[0] }, true); fqQauntParams->_dst_quant.SetMinValues({ outputRange.first[0] }, false); @@ -2198,7 +2236,7 @@ void MoveFakeQuantizeLayerIntoQuantParamsPass :: run() { // FQ Layer is fused only when previous layer is const, memory or activation layer // or a next layer is activation layer. 
bool isFQFuseAllowed = allowFQFuse(l); - auto prevData = prevLayer->outData.front(); + auto prevData = *prevDataIt; // Find all output layers connected to FQ auto nextLayers = CNNNetGetAllNextLayersSkipCertain(*fqLayer, -1, donotSkip); @@ -2207,7 +2245,7 @@ void MoveFakeQuantizeLayerIntoQuantParamsPass :: run() { } if (isFQFuseAllowed) { - getInputTo(prevLayer->outData.front()).clear(); + getInputTo(prevData).clear(); } // Connect all next layers after FQ to the layer that is before FQ @@ -2222,7 +2260,7 @@ void MoveFakeQuantizeLayerIntoQuantParamsPass :: run() { for (int insDataIdx : insDatas) { nextLayers[i]->insData[insDataIdx] = prevData; } - getInputTo(prevLayer->outData.front())[nextLayers[i]->name] = nextLayers[i]; + getInputTo(prevData)[nextLayers[i]->name] = nextLayers[i]; } propagateStatistics(quantParamsPrevLayer, nextLayers[i]); diff --git a/inference-engine/src/gna_plugin/serial/headers/2dot7/gna_model_header.hpp b/inference-engine/src/gna_plugin/serial/headers/2dot7/gna_model_header.hpp new file mode 100644 index 00000000000..14badf3adcf --- /dev/null +++ b/inference-engine/src/gna_plugin/serial/headers/2dot7/gna_model_header.hpp @@ -0,0 +1,197 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include "backend/dnn_types.h" +#include "serial/headers/2dot4/gna_model_header.hpp" +#include "serial/headers/2dot6/gna_model_header.hpp" +#include "serial/headers/latest/gna_model_header.hpp" +#include "gna_data_types.hpp" + +#pragma pack(push, 1) + +namespace GNAPluginNS { +namespace Header2dot7 { + +/** + Maximal number of supported shape dimensions. + */ +#define GNA_SHAPE_MAXIMUM_NUMBER_OF_DIMENSIONS 8 + +/** + * @brief Header version 2.7 + */ +struct ModelHeader { + /** + *@brief MagicNumber – GNAM in ascii table, equals to hex 0x474e414d + */ + char gnam[4] = {}; + /** + * @brief if header size is not equal to sizeof ModelHeader - some reserved data append in the end of header + * usually it is an indicator of working with version of model different that is current export function produce + */ + uint32_t headerSize = 0u; + struct Version { + /** + * @details Version of format Major – unsigned int, ex: 0x0001 + * every change in the header or in the layers definition should be reflected in version change + * for backward compatibility new parsers can read old versions of model with certain restrictions + */ + uint16_t major = 2u; + /** + * @details Version of Format Minor – unsigned int, corresponding to build revision for example + * changes in minor version are not affected layout of model + */ + uint32_t minor = 7u; + } version; + /** + * @brief Memory required to be allocated using GNAAlloc() + */ + uint64_t gnaMemSize = 0ull; + /** + * @brief Number of GNA Layers + */ + uint64_t layersCount = 0ull; + /** + * @brief Grouping level + * This is depricted field and used for old models only (<=2.6) + */ + uint32_t nGroup = 0u; + + /** + * Convolution related setting - they are affecting input transformation + */ + uint32_t nRotateRows = 0u; + uint32_t nRotateColumns = 0u; + bool doRotateInput = false; + + uint32_t nInputs = 0u; + uint32_t nOutputs = 0u; + + /** + * Convolution related setting - they are affecting output transformation + */ + uint32_t nRotateOutputRows = 0u; + uint32_t nRotateOutputColumns = 0u; + bool doRotateOutput = false; + + uint32_t nTransposeInputs = 0u; + uint32_t nTransposeOutputs = 0u; + + /** + * Reserved Data might be here + */ + ModelHeader() = default; + 
ModelHeader(GNAPluginNS::Header2dot1::ModelHeader const &old) { + gnaMemSize = old.gnaMemSize; + layersCount = old.layersCount; + nGroup = old.nGroup; + nRotateRows = old.nRotateRows; + nRotateColumns = old.nRotateColumns; + nInputs = old.nInputs; + nOutputs = old.nOutputs; + version.minor = old.version.minor; + } + ModelHeader(GNAPluginNS::Header2dot4::ModelHeader const &old) { + gnaMemSize = old.gnaMemSize; + layersCount = old.layersCount; + nGroup = old.nGroup; + nRotateRows = old.nRotateRows; + nRotateColumns = old.nRotateColumns; + nInputs = old.nInputs; + nOutputs = old.nOutputs; + nRotateOutputRows = old.nRotateOutputRows; + nRotateOutputColumns = old.nRotateOutputColumns; + doRotateOutput = old.doRotateOutput; + version.minor = old.version.minor; + } +}; +#pragma pack(pop) + +/* + * In runtime endpoint mostly same as in serial version, except of descriptor field + */ +struct RuntimeEndPoint { + /** + * if scale factor is different then pased into infer , network might need to be requantized + */ + float scaleFactor = 0; + /** + * Pointer descriptor + */ + void* descriptor_ptr = nullptr; + /** + * Endpoint resolution in bytes. + */ + uint32_t element_size = 0; + /** + * Number of elements + */ + uint32_t elements_count = 0; + /** + * Offset in bytes of pointer descriptor + */ + uint64_t descriptor_offset = 0ull; + /** + Shape specifying dimension values. + */ + struct Shape { + /** + Number of dimensions or rank or order. + */ + uint32_t NumberOfDimensions = 0; + /** + array specifying value of each dimension. + Set all zeros for scalars. + */ + uint32_t Dimensions[GNA_SHAPE_MAXIMUM_NUMBER_OF_DIMENSIONS] = {0}; + } shape; + /** + * Blob layout + */ + uint8_t layout = InferenceEngine::Layout::NC; + /** + * Blob precision + */ + uint8_t precision = InferenceEngine::Precision::FP32; + + intel_dnn_orientation_t orientation = kDnnUnknownOrientation; + + RuntimeEndPoint() = default; + RuntimeEndPoint(const GNAPluginNS::Header2dot6::RuntimeEndPoint &old, uint32_t ngroup) { + scaleFactor = old.scaleFactor; + descriptor_ptr = old.descriptor_ptr; + element_size = old.element_size; + elements_count = old.elements_count; + orientation = old.orientation; + layout = InferenceEngine::Layout::NC; + precision = InferenceEngine::Precision::FP32; + descriptor_offset = old.descriptor_offset; + InferenceEngine::SizeVector dims = {ngroup, elements_count / ngroup}; + shape.NumberOfDimensions = static_cast(dims.size()); + for (auto i = 0; i < dims.size(); i++) { + shape.Dimensions[i] = dims[i]; + } + } + RuntimeEndPoint(double scaleFactor, + void* descriptor_ptr, + uint32_t element_size, + uint32_t elements_count, + Shape shape, + uint8_t layout, + uint8_t precision, + intel_dnn_orientation_t orientation) : scaleFactor(scaleFactor), + descriptor_ptr(descriptor_ptr), + element_size(element_size), + elements_count(elements_count), + shape(shape), + layout(layout), + precision(precision), + orientation(orientation) { } +}; +} // namespace Header2dot7 +} // namespace GNAPluginNS diff --git a/inference-engine/src/gna_plugin/serial/headers/latest/gna_model_header.hpp b/inference-engine/src/gna_plugin/serial/headers/latest/gna_model_header.hpp index 89292ab88af..7ec27b2caed 100644 --- a/inference-engine/src/gna_plugin/serial/headers/latest/gna_model_header.hpp +++ b/inference-engine/src/gna_plugin/serial/headers/latest/gna_model_header.hpp @@ -4,11 +4,11 @@ #pragma once -#include "serial/headers/2dot6/gna_model_header.hpp" +#include "serial/headers/2dot7/gna_model_header.hpp" namespace GNAPluginNS { namespace 
HeaderLatest { -using ModelHeader = GNAPluginNS::Header2dot6::ModelHeader; -using RuntimeEndPoint = GNAPluginNS::Header2dot6::RuntimeEndPoint; +using ModelHeader = GNAPluginNS::Header2dot7::ModelHeader; +using RuntimeEndPoint = GNAPluginNS::Header2dot7::RuntimeEndPoint; } } diff --git a/inference-engine/src/gna_plugin/transformations/convert_matmul_to_pointwise_convolution.cpp b/inference-engine/src/gna_plugin/transformations/convert_matmul_to_pointwise_convolution.cpp new file mode 100644 index 00000000000..da7e6279624 --- /dev/null +++ b/inference-engine/src/gna_plugin/transformations/convert_matmul_to_pointwise_convolution.cpp @@ -0,0 +1,180 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "transformations/convert_matmul_to_pointwise_convolution.hpp" + +#include +#include +#include + +#include "layers/gna_permute.hpp" +#include "backend/gna_limitations.hpp" + +using namespace GNAPluginNS; + +NGRAPH_RTTI_DEFINITION(ConvertMatmulToPointWiseConvolution, "ConvertMatmulToPointWiseConvolution", 0); +NGRAPH_RTTI_DEFINITION(ConvertMatmulWithBiasToPointWiseConvolution, "ConvertMatmulWithBiasToPointWiseConvolution", 0); +NGRAPH_RTTI_DEFINITION(ConvertMatmulWithFqToPointWiseConvolution, "ConvertMatmulWithFqToPointWiseConvolution", 0); + +static std::tuple VerifyAndGetConvParams(std::shared_ptr matmul_node) { + auto input1_shape = matmul_node->get_input_shape(0); + auto input2_shape = matmul_node->get_input_shape(1); + auto output_shape = matmul_node->get_output_shape(0); + if (input1_shape.size() == 3 && input1_shape.front() == 1) { + input1_shape.erase(std::begin(input1_shape)); + } + + if (input1_shape.size() != 2 || input2_shape.size() != 2 || output_shape.size() < 2) { + return std::make_tuple(false, 0, 0, 0); + } + + // Check if MatMul or corresponding pointwise convolution are supported by GNA + const uint32_t width = input1_shape.front(); + const uint32_t in_channels = input2_shape.back(); + const uint32_t out_channels = input2_shape.front(); + if (input1_shape.front() <= GNALimitations::affineMaxBatchSize || + out_channels % GNALimitations::convFiltersNumDivider != 0 || + out_channels > GNALimitations::convMaxFiltersNum || + in_channels > GNALimitations::convFilterMaxSize) { + return std::make_tuple(false, 0, 0, 0); + } + + return std::make_tuple(true, width, in_channels, out_channels); +} + +static bool Convert(std::shared_ptr matmul_node, + std::shared_ptr add, + std::shared_ptr bias, + std::shared_ptr fq) { + bool supported; + uint32_t width, in_channels, out_channels; + std::tie(supported, width, in_channels, out_channels) = VerifyAndGetConvParams(matmul_node); + if (!supported) return false; + + auto input_node = matmul_node->input_value(0).get_node_shared_ptr(); + auto weights_node = matmul_node->input_value(1).get_node_shared_ptr(); + auto base_name = matmul_node->get_friendly_name(); + + auto reshape_const_before = std::make_shared(ngraph::element::Type_t::i64, + ngraph::Shape{4}, + ngraph::Shape{1, 1, width, in_channels}); + auto reshape_before = std::make_shared(input_node, reshape_const_before, false); + reshape_before->set_friendly_name(base_name + "/reshape_in"); + + auto transpose_before = std::make_shared(reshape_before, + ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape{4}, + GetPermuteOrder(InferenceEngine::Layout::NHWC, InferenceEngine::Layout::NCHW))); + transpose_before->set_friendly_name(base_name + "/transpose_in"); + + auto weights_reshape_const = std::make_shared(ngraph::element::Type_t::i64, + 
ngraph::Shape{4}, ngraph::Shape{out_channels, in_channels, 1, 1}); + auto weights_reshaped = std::make_shared(weights_node, weights_reshape_const, false); + + std::shared_ptr conv_node = std::make_shared(transpose_before, weights_reshaped, + ngraph::Strides{1, 1}, ngraph::CoordinateDiff{0, 0}, ngraph::CoordinateDiff{0, 0}, + ngraph::Strides{1, 1}, ngraph::op::PadType::VALID); + conv_node->set_friendly_name(base_name + "/conv"); + + std::shared_ptr root_node = matmul_node; + if (bias != nullptr) { + conv_node = std::make_shared(conv_node, bias); + root_node = add; + } + + if (fq != nullptr) { + conv_node = fq->clone_with_new_inputs({conv_node, fq->input_value(1), fq->input_value(2), + fq->input_value(3), fq->input_value(4)}); + root_node = fq; + } + + auto transpose_after = std::make_shared(conv_node, + ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape{4}, + GetPermuteOrder(InferenceEngine::Layout::NCHW, InferenceEngine::Layout::NHWC))); + transpose_after->set_friendly_name(base_name + "/transpose_out"); + + auto output_shape = matmul_node->get_output_shape(0); + output_shape[output_shape.size() - 1] = out_channels; + output_shape[output_shape.size() - 2] = width; + auto reshape_const_after = std::make_shared(ngraph::element::Type_t::i64, + ngraph::Shape{output_shape.size()}, + output_shape); + auto reshape_after = std::make_shared(transpose_after, reshape_const_after, false); + reshape_after->set_friendly_name(base_name); + + ngraph::replace_node(root_node, reshape_after); + return true; +} + +ConvertMatmulToPointWiseConvolution::ConvertMatmulToPointWiseConvolution() { + auto const_input = ngraph::pattern::wrap_type(); + auto const_fq = ngraph::pattern::wrap_type({const_input, + ngraph::pattern::wrap_type(), + ngraph::pattern::wrap_type(), + ngraph::pattern::wrap_type(), + ngraph::pattern::wrap_type()}); + auto second_input = std::make_shared(ngraph::OutputVector{const_input, const_fq}); + auto matmul = ngraph::pattern::wrap_type({ngraph::pattern::any_input(), second_input}); + + ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher &m) { + const auto& pattern_map = m.get_pattern_value_map(); + return Convert(pattern_map.at(matmul).get_node_shared_ptr(), nullptr, nullptr, nullptr); + }; + + auto m = std::make_shared(matmul, "ConvertMatmulToPointWiseConvolution"); + this->register_matcher(m, callback); +} + +ConvertMatmulWithBiasToPointWiseConvolution::ConvertMatmulWithBiasToPointWiseConvolution() { + auto const_input = ngraph::pattern::wrap_type(); + auto const_fq = ngraph::pattern::wrap_type({const_input, + ngraph::pattern::wrap_type(), + ngraph::pattern::wrap_type(), + ngraph::pattern::wrap_type(), + ngraph::pattern::wrap_type()}); + auto second_input = std::make_shared(ngraph::OutputVector{const_input, const_fq}); + auto matmul = ngraph::pattern::wrap_type({ngraph::pattern::any_input(), second_input}); + auto bias = ngraph::pattern::wrap_type(); + auto add = ngraph::pattern::wrap_type({matmul, bias}); + + ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher &m) { + const auto& pattern_map = m.get_pattern_value_map(); + return Convert(pattern_map.at(matmul).get_node_shared_ptr(), pattern_map.at(add).get_node_shared_ptr(), + pattern_map.at(bias).get_node_shared_ptr(), nullptr); + }; + + auto m = std::make_shared(add, "ConvertMatmulWithBiasToPointWiseConvolution"); + this->register_matcher(m, callback); +} + +ConvertMatmulWithFqToPointWiseConvolution::ConvertMatmulWithFqToPointWiseConvolution() { + auto const_input = 
ngraph::pattern::wrap_type(); + auto const_fq = ngraph::pattern::wrap_type({const_input, + ngraph::pattern::wrap_type(), + ngraph::pattern::wrap_type(), + ngraph::pattern::wrap_type(), + ngraph::pattern::wrap_type()}); + auto second_input = std::make_shared(ngraph::OutputVector{const_input, const_fq}); + auto matmul = ngraph::pattern::wrap_type({ngraph::pattern::any_input(), second_input}); + auto bias = ngraph::pattern::wrap_type(); + auto add = ngraph::pattern::wrap_type({matmul, bias}); + auto matmul_out = std::make_shared(ngraph::OutputVector{add, matmul}); + auto out_fq = ngraph::pattern::wrap_type({matmul_out, + ngraph::pattern::wrap_type(), + ngraph::pattern::wrap_type(), + ngraph::pattern::wrap_type(), + ngraph::pattern::wrap_type()}); + + ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher &m) { + const auto& pattern_map = m.get_pattern_value_map(); + auto add_it = pattern_map.find(add); + auto add_node = (add_it == std::end(pattern_map) ? nullptr : add_it->second.get_node_shared_ptr()); + auto bias_it = pattern_map.find(bias); + auto bias_node = (bias_it == std::end(pattern_map) ? nullptr : bias_it->second.get_node_shared_ptr()); + return Convert(pattern_map.at(matmul).get_node_shared_ptr(), add_node, bias_node, + pattern_map.at(out_fq).get_node_shared_ptr()); + }; + + auto m = std::make_shared(out_fq, "ConvertMatmulWithFqToPointWiseConvolution"); + this->register_matcher(m, callback); +} \ No newline at end of file diff --git a/inference-engine/src/gna_plugin/transformations/convert_matmul_to_pointwise_convolution.hpp b/inference-engine/src/gna_plugin/transformations/convert_matmul_to_pointwise_convolution.hpp new file mode 100644 index 00000000000..999b529194d --- /dev/null +++ b/inference-engine/src/gna_plugin/transformations/convert_matmul_to_pointwise_convolution.hpp @@ -0,0 +1,71 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace GNAPluginNS { + +/** + * @brief Convert a MatMul with batch size unsupported by GNA to a point-wise convolution with NHWC layout + * with transposes around it: + * Transose (NHWC -> NCHW) + * | + * Matmul Convolution in NHWC layout + * Input1: [A, B] B > 8 -------> Input: [1, 1, A, B] + * Input2: [B, C] Kernel: [C, B, 1, 1] + * Output: [A, C] Output: [1, 1, A, C] + * | + * Transose (NCHW -> NHWC) + */ +class ConvertMatmulToPointWiseConvolution : public ngraph::pass::MatcherPass { +public: + NGRAPH_RTTI_DECLARATION; + ConvertMatmulToPointWiseConvolution(); +}; + +/** + * @brief Convert a MatMul with batch size unsupported by GNA to a point-wise convolution with NHWC layout + * with transposes around it, moved add with bias before the last transpose: + * Transose (NHWC -> NCHW) + * | + * Matmul Convolution in NHWC layout + * Input1: [A, B] B > 8 -------> Input: [1, 1, A, B] + * Input2: [B, C] Kernel: [C, B, 1, 1] + * Output: [A, C] Output: [1, 1, A, C] + * | | + * Add (const) Add (const) + * | + * Transose (NCHW -> NHWC) + */ +class ConvertMatmulWithBiasToPointWiseConvolution : public ngraph::pass::MatcherPass { +public: + NGRAPH_RTTI_DECLARATION; + ConvertMatmulWithBiasToPointWiseConvolution(); +}; + +/** + * @brief Convert a MatMul with batch size unsupported by GNA to a point-wise convolution with NHWC layout + * with transposes around it, moved add with bias and/or fake quantize before the last transpose: + * Transose (NHWC -> NCHW) + * | + * Matmul Convolution in NHWC layout + * Input1: [A, B] B > 8 -------> Input: [1, 1, A, B] + * Input2: [B, C] 
Kernel: [C, B, 1, 1] + * Output: [A, C] Output: [1, 1, A, C] + * | | + * Add (const) Add (const) + * | | + * FakeQuantize FakeQuantize + * | + * Transose (NCHW -> NHWC) + */ +class ConvertMatmulWithFqToPointWiseConvolution : public ngraph::pass::MatcherPass { +public: + NGRAPH_RTTI_DECLARATION; + ConvertMatmulWithFqToPointWiseConvolution(); +}; + +} // namespace GNAPluginNS \ No newline at end of file diff --git a/inference-engine/src/gna_plugin/transformations/split_convolution_with_large_buffer_size.cpp b/inference-engine/src/gna_plugin/transformations/split_convolution_with_large_buffer_size.cpp new file mode 100644 index 00000000000..a9d79c831ab --- /dev/null +++ b/inference-engine/src/gna_plugin/transformations/split_convolution_with_large_buffer_size.cpp @@ -0,0 +1,131 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "transformations/split_convolution_with_large_buffer_size.hpp" + +#include + +#include +#include +#include + +#include "backend/gna_limitations.hpp" + +using namespace GNAPluginNS; + +NGRAPH_RTTI_DEFINITION(SplitConvolution, "SplitConvolution", 0); +NGRAPH_RTTI_DEFINITION(SplitConvolutionWithBias, "SplitConvolutionWithBias", 0); +NGRAPH_RTTI_DEFINITION(SplitConvolutionWithFq, "SplitConvolutionWithFq", 0); + +static std::vector GetConvSplitSizes(std::shared_ptr conv) { + uint32_t width = conv->get_input_shape(0).back(); + uint32_t in_channels = conv->get_input_shape(0).at(1); + uint32_t usedWidth = 0; + std::vector split_sizes; + uint32_t width_max_size = GNALimitations::bufferMaxSize / in_channels; + width_max_size = width_max_size - width_max_size % 64; + while (usedWidth < width) { + uint32_t width_part = std::min(width - usedWidth, width_max_size); + split_sizes.push_back(width_part); + usedWidth += width_part; + } + IE_ASSERT(usedWidth == width); + return split_sizes; +} + +static bool Convert(std::shared_ptr conv, + std::shared_ptr add, + std::shared_ptr bias, + std::shared_ptr fq) { + auto input_size = std::accumulate(std::begin(conv->get_input_shape(0)), + std::end(conv->get_input_shape(0)), 1, std::multiplies()); + if (input_size <= GNALimitations::bufferMaxSize) { + return false; + } + + auto split_sizes = GetConvSplitSizes(conv); + IE_ASSERT(split_sizes.size() > 1); + + /* TODO check if it's NHWC convolution wrapped with transposes or all input dimensions except of width == 1, + otherwise this split axis isn't supported */ + const int64_t width_axis = conv->get_input_shape(0).size() - 1; + auto split_node = std::make_shared(conv->input_value(0), + ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape({1}), std::vector{width_axis}), + ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape({split_sizes.size()}), split_sizes)); + split_node->set_friendly_name(conv->get_friendly_name() + "/split"); + ngraph::OutputVector convOutputs; + std::shared_ptr root_node = fq ? fq : (add ? 
add : conv); + for (int i = 0; i < split_sizes.size(); ++i) { + std::shared_ptr output = conv->clone_with_new_inputs({split_node->output(i), conv->input_value(1)}); + output->set_friendly_name(conv->get_friendly_name() + "_" + std::to_string(i)); + if (bias) { + output = std::make_shared(output, bias); + } + + if (fq) { + output = fq->clone_with_new_inputs({output, fq->input_value(1), fq->input_value(2), + fq->input_value(3), fq->input_value(4)}); + } + convOutputs.push_back(output); + } + + auto concat = std::make_shared(convOutputs, width_axis); + concat->set_friendly_name(conv->get_friendly_name()); + ngraph::replace_node(root_node, concat); + return true; +} + +SplitConvolution::SplitConvolution() { + auto conv = ngraph::pattern::wrap_type({ngraph::pattern::any_input(), + ngraph::pattern::any_input()}); + + ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher &m) { + const auto& pattern_map = m.get_pattern_value_map(); + return Convert(pattern_map.at(conv).get_node_shared_ptr(), nullptr, nullptr, nullptr); + }; + + auto m = std::make_shared(conv, "SplitConvolution"); + this->register_matcher(m, callback); +} + +SplitConvolutionWithBias::SplitConvolutionWithBias() { + auto conv = ngraph::pattern::wrap_type({ngraph::pattern::any_input(), + ngraph::pattern::any_input()}); + auto bias = ngraph::pattern::wrap_type(); + auto add = ngraph::pattern::wrap_type({conv, bias}); + + ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher &m) { + const auto& pattern_map = m.get_pattern_value_map(); + return Convert(pattern_map.at(conv).get_node_shared_ptr(), pattern_map.at(add).get_node_shared_ptr(), + pattern_map.at(bias).get_node_shared_ptr(), nullptr); + }; + + auto m = std::make_shared(add, "SplitConvolutionWithBias"); + this->register_matcher(m, callback); +} + +SplitConvolutionWithFq::SplitConvolutionWithFq() { + auto conv = ngraph::pattern::wrap_type({ngraph::pattern::any_input(), + ngraph::pattern::any_input()}); + auto bias = ngraph::pattern::wrap_type(); + auto add = ngraph::pattern::wrap_type({conv, bias}); + auto conv_output = std::make_shared(ngraph::OutputVector{conv, add}); + auto out_fq = ngraph::pattern::wrap_type({conv_output, + ngraph::pattern::wrap_type(), + ngraph::pattern::wrap_type(), + ngraph::pattern::wrap_type(), + ngraph::pattern::wrap_type()}); + + ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher &m) { + const auto& pattern_map = m.get_pattern_value_map(); + auto add_it = pattern_map.find(add); + auto add_node = (add_it == std::end(pattern_map) ? nullptr : add_it->second.get_node_shared_ptr()); + auto bias_it = pattern_map.find(bias); + auto bias_node = (bias_it == std::end(pattern_map) ? 
nullptr : bias_it->second.get_node_shared_ptr()); + return Convert(pattern_map.at(conv).get_node_shared_ptr(), add_node, bias_node, pattern_map.at(out_fq).get_node_shared_ptr()); + }; + + auto m = std::make_shared(out_fq, "SplitConvolutionWithFq"); + this->register_matcher(m, callback); +} \ No newline at end of file diff --git a/inference-engine/src/gna_plugin/transformations/split_convolution_with_large_buffer_size.hpp b/inference-engine/src/gna_plugin/transformations/split_convolution_with_large_buffer_size.hpp new file mode 100644 index 00000000000..8667f4273bf --- /dev/null +++ b/inference-engine/src/gna_plugin/transformations/split_convolution_with_large_buffer_size.hpp @@ -0,0 +1,34 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace GNAPluginNS { + +// @brief Splits convolution with large input buffer +class SplitConvolution : public ngraph::pass::MatcherPass { +public: + NGRAPH_RTTI_DECLARATION; + SplitConvolution(); +}; + +// @brief Splits convolution with large input buffer, move add with bias to each convolution before concat +class SplitConvolutionWithBias : public ngraph::pass::MatcherPass { +public: + NGRAPH_RTTI_DECLARATION; + SplitConvolutionWithBias(); +}; + +/* @brief Splits convolution with large input buffer, + * move add with bias and/or fake quantize to each convolution before concat + */ +class SplitConvolutionWithFq : public ngraph::pass::MatcherPass { +public: + NGRAPH_RTTI_DECLARATION; + SplitConvolutionWithFq(); +}; + +} // namespace GNAPluginNS \ No newline at end of file diff --git a/inference-engine/src/hetero_plugin/hetero_executable_network.cpp b/inference-engine/src/hetero_plugin/hetero_executable_network.cpp index 9f0135aa25e..994ba866f7a 100644 --- a/inference-engine/src/hetero_plugin/hetero_executable_network.cpp +++ b/inference-engine/src/hetero_plugin/hetero_executable_network.cpp @@ -312,6 +312,7 @@ HeteroExecutableNetwork::HeteroExecutableNetwork(const InferenceEngine::CNNNetwo struct Subgraph { ngraph::ResultVector _results; ngraph::ParameterVector _parameters; + ngraph::SinkVector _sinks; std::string _affinity; }; std::unordered_map subgraphs; @@ -325,6 +326,9 @@ HeteroExecutableNetwork::HeteroExecutableNetwork(const InferenceEngine::CNNNetwo } else if (ngraph::op::is_parameter(node)) { subgraph._parameters.emplace_back( std::dynamic_pointer_cast(node->shared_from_this())); + } else if (ngraph::op::is_sink(node)) { + subgraph._sinks.emplace_back( + std::dynamic_pointer_cast(node->shared_from_this())); } auto itAffinity = affinities.find(node); if (itAffinity != affinities.end()) { @@ -373,7 +377,7 @@ HeteroExecutableNetwork::HeteroExecutableNetwork(const InferenceEngine::CNNNetwo for (auto&& subgraph : orderedSubgraphs) { _networks[id]._device = subgraph._affinity; subFunctions[id] = - std::make_shared(subgraph._results, subgraph._parameters, + std::make_shared(subgraph._results, subgraph._sinks, subgraph._parameters, _name + '_' + std::to_string(id)); _networks[id]._clonedNetwork = CNNNetwork{subFunctions[id]}; // update of pre-processing info diff --git a/inference-engine/src/transformations/include/transformations/op_conversions/gather_normalize_negative_indices.hpp b/inference-engine/src/transformations/include/transformations/op_conversions/gather_normalize_negative_indices.hpp new file mode 100644 index 00000000000..1ec1ffe628e --- /dev/null +++ b/inference-engine/src/transformations/include/transformations/op_conversions/gather_normalize_negative_indices.hpp @@ 
-0,0 +1,29 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace ngraph { +namespace pass { + + class TRANSFORMATIONS_API GatherNegativeConstIndicesNormalize; + +} // namespace pass +} // namespace ngraph + +/** + * @ingroup ie_transformation_common_api + * @brief GatherNegativeConstIndicesNormalize checks if the indices value is a negative scalar and + * normalizes it using a ShapeOf->Add->Cast subgraph. + * This transformation should be removed after support for negative indices is added in a + * future version of the Gather operation. + */ +class ngraph::pass::GatherNegativeConstIndicesNormalize : public ngraph::pass::MatcherPass { +public: + NGRAPH_RTTI_DECLARATION; + GatherNegativeConstIndicesNormalize(); +}; diff --git a/inference-engine/src/transformations/src/transformations/common_optimizations/common_optimizations.cpp b/inference-engine/src/transformations/src/transformations/common_optimizations/common_optimizations.cpp index 79f1dee8882..4ab5cf1e80d 100644 --- a/inference-engine/src/transformations/src/transformations/common_optimizations/common_optimizations.cpp +++ b/inference-engine/src/transformations/src/transformations/common_optimizations/common_optimizations.cpp @@ -70,6 +70,7 @@ #include "transformations/op_conversions/log_softmax_decomposition.hpp" #include "transformations/op_conversions/mvn6_decomposition.hpp" #include "transformations/op_conversions/simplify_ctc_greedy_decoder_seq_len.hpp" +#include "transformations/op_conversions/gather_normalize_negative_indices.hpp" #include #include @@ -157,6 +158,7 @@ bool ngraph::pass::CommonOptimizations::run_on_function(std::shared_ptradd_matcher(); decomp->add_matcher(); decomp->add_matcher(); + decomp->add_matcher(); decomp->set_name("ngraph::pass::CommonDecompositions"); // CF is required after all decompositions diff --git a/inference-engine/src/transformations/src/transformations/op_conversions/gather_normalize_negative_indices.cpp b/inference-engine/src/transformations/src/transformations/op_conversions/gather_normalize_negative_indices.cpp new file mode 100644 index 00000000000..86713451869 --- /dev/null +++ b/inference-engine/src/transformations/src/transformations/op_conversions/gather_normalize_negative_indices.cpp @@ -0,0 +1,77 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "transformations/op_conversions/gather_normalize_negative_indices.hpp" + +#include + +#include +#include +#include +#include "itt.hpp" + +NGRAPH_RTTI_DEFINITION(ngraph::pass::GatherNegativeConstIndicesNormalize, "GatherNegativeConstIndicesNormalize", 0); + +ngraph::pass::GatherNegativeConstIndicesNormalize::GatherNegativeConstIndicesNormalize() { + MATCHER_SCOPE(GatherNegativeConstIndicesNormalize); + auto data_input = ngraph::pattern::any_input(pattern::has_static_rank()); + auto axis_input = ngraph::pattern::wrap_type(); + auto indices_input = ngraph::pattern::wrap_type(); + auto gather_node = std::make_shared(data_input, indices_input, axis_input); + + ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher& m) { + auto& pattern_to_output = m.get_pattern_value_map(); + auto gather = std::dynamic_pointer_cast(pattern_to_output.at(gather_node).get_node_shared_ptr()); + auto data = pattern_to_output.at(data_input); + auto axis_constant = std::dynamic_pointer_cast(pattern_to_output.at(axis_input).get_node_shared_ptr()); + auto indices_constant =
std::dynamic_pointer_cast(pattern_to_output.at(indices_input).get_node_shared_ptr()); + + if (!gather || !axis_constant || !indices_constant) { + return false; + } + + auto indices = indices_constant->cast_vector(); + if (indices.size() != 1 || indices[0] >= 0) { + return false; + } + + auto axis = axis_constant->cast_vector(); + if (axis.size() != 1) { + return false; + } + + auto axis_value = axis[0]; + + // normalize `axis` value if it is negative + if (axis_value < 0) { + axis_value = axis_value + data.get_partial_shape().rank().get_length(); + } + + if (data.get_partial_shape().rank().get_length() < axis_value) { + return false; + } + + // check `axis` dimension of data tensor is static + if (!data.get_partial_shape()[axis_value].is_static()) { + return false; + } + + auto input_type = indices_constant->get_element_type(); + auto shape_of = std::make_shared(data, input_type); + auto input_gather = std::make_shared(shape_of, + ngraph::opset7::Constant::create(input_type, Shape{}, {axis_value}), ngraph::opset7::Constant::create(input_type, Shape{}, {0})); + + auto add = std::make_shared(input_gather, indices_constant); + auto gather_new = gather_node->copy_with_new_inputs({data, add, axis_constant}); + gather_new->set_friendly_name(gather->get_friendly_name()); + + ngraph::copy_runtime_info(gather, {shape_of, input_gather, add, gather_new}); + ngraph::replace_node(gather, gather_new); + + return true; + }; + + auto m = std::make_shared(gather_node, matcher_name); + register_matcher(m, callback); +} diff --git a/inference-engine/src/vpu/graph_transformer/src/middleend/passes/weights_analysis.cpp b/inference-engine/src/vpu/graph_transformer/src/middleend/passes/weights_analysis.cpp index 213e06ee1f5..27e703ec4f5 100644 --- a/inference-engine/src/vpu/graph_transformer/src/middleend/passes/weights_analysis.cpp +++ b/inference-engine/src/vpu/graph_transformer/src/middleend/passes/weights_analysis.cpp @@ -92,7 +92,7 @@ bool checkGrowingOutput(const Model& model) { return false; } - static const float SCALE_THRESHOLD = 0.125f; + static const float SCALE_THRESHOLD = 0.1f; for (const auto& stage : model->getStages()) { if (stage->type() != StageType::Power && @@ -248,14 +248,13 @@ void PassImpl::run(const Model& model) { if (firstStage && shift < 4 && isGrowingOutput && weights->desc().dim(Dim::C) > 1) { normalVal = 5; } - shift = correctShift(shift, firstStage, stage->origLayer()->type); shift -= normalVal; } firstStage = false; scale = 1; - if (shift > scaleThreshold) { + if (shift >= scaleThreshold) { scale = static_cast(1ULL << static_cast(shift)); } diff --git a/inference-engine/tests/functional/inference_engine/transformations/gather_normalize_negative_indices_test.cpp b/inference-engine/tests/functional/inference_engine/transformations/gather_normalize_negative_indices_test.cpp new file mode 100644 index 00000000000..ec6c4204a9b --- /dev/null +++ b/inference-engine/tests/functional/inference_engine/transformations/gather_normalize_negative_indices_test.cpp @@ -0,0 +1,306 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include "common_test_utils/ngraph_test_utils.hpp" + +using namespace testing; + +TEST(TransformationTests, GatherNegativeIndicesNormalize) { + std::shared_ptr f(nullptr), f_ref(nullptr); + { + auto data = std::make_shared(ngraph::element::f32, ngraph::Shape{1, 15, 128}); + auto indices = ngraph::opset7::Constant::create(ngraph::element::i32, 
ngraph::Shape{}, {-1}); + auto axis = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {1}); + + auto gather = std::make_shared(data, indices, axis, 0); + + f = std::make_shared(ngraph::NodeVector{gather}, ngraph::ParameterVector{data}); + + ngraph::pass::Manager manager; + manager.register_pass(); + manager.register_pass(); + manager.run_passes(f); + ASSERT_NO_THROW(check_rt_info(f)); + } + + { + auto indices_type = ngraph::element::i32; + + auto data = std::make_shared(ngraph::element::f32, ngraph::Shape{1, 15, 128}); + auto indices = ngraph::opset7::Constant::create(indices_type, ngraph::Shape{}, {-1}); + auto axis = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {1}); + + auto shape_of = std::make_shared(data, indices_type); + auto input_gather = std::make_shared(shape_of, + ngraph::opset7::Constant::create(indices_type, ngraph::Shape{}, {1}), ngraph::opset7::Constant::create(indices_type, ngraph::Shape{}, {0})); + auto add = std::make_shared(input_gather, indices); + auto gather = std::make_shared(data, add, axis); + + f_ref = std::make_shared(ngraph::NodeVector{gather}, ngraph::ParameterVector{data}); + } + + auto res = compare_functions(f, f_ref); + ASSERT_TRUE(res.first) << res.second; +} + +TEST(TransformationTests, GatherNegativeIndicesNormalize_neg_axis) { + std::shared_ptr f(nullptr), f_ref(nullptr); + { + auto data = std::make_shared(ngraph::element::f32, ngraph::Shape{1, 15, 128}); + auto indices = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {-1}); + auto axis = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {-2}); + + auto gather = std::make_shared(data, indices, axis, 0); + + f = std::make_shared(ngraph::NodeVector{gather}, ngraph::ParameterVector{data}); + + ngraph::pass::Manager manager; + manager.register_pass(); + manager.register_pass(); + manager.run_passes(f); + ASSERT_NO_THROW(check_rt_info(f)); + } + + { + auto indices_type = ngraph::element::i32; + + auto data = std::make_shared(ngraph::element::f32, ngraph::Shape{1, 15, 128}); + auto indices = ngraph::opset7::Constant::create(indices_type, ngraph::Shape{}, {-1}); + auto axis = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {-2}); + + auto shape_of = std::make_shared(data, indices_type); + auto input_gather = std::make_shared(shape_of, + ngraph::opset7::Constant::create(indices_type, ngraph::Shape{}, {1}), ngraph::opset7::Constant::create(indices_type, ngraph::Shape{}, {0})); + auto add = std::make_shared(input_gather, indices); + auto gather = std::make_shared(data, add, axis); + + f_ref = std::make_shared(ngraph::NodeVector{gather}, ngraph::ParameterVector{data}); + } + + auto res = compare_functions(f, f_ref); + ASSERT_TRUE(res.first) << res.second; +} + +TEST(TransformationTests, GatherNegativeIndicesNormalize_dif_input_types) { + std::shared_ptr f(nullptr), f_ref(nullptr); + { + auto data = std::make_shared(ngraph::element::f32, ngraph::Shape{1, 15, 128}); + auto indices = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {-1}); + auto axis = ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape{}, {1}); + + auto gather = std::make_shared(data, indices, axis, 0); + + f = std::make_shared(ngraph::NodeVector{gather}, ngraph::ParameterVector{data}); + + ngraph::pass::Manager manager; + manager.register_pass(); + manager.register_pass(); + manager.run_passes(f); + ASSERT_NO_THROW(check_rt_info(f)); + } + + { + auto indices_type = ngraph::element::i32; + 
+ auto data = std::make_shared(ngraph::element::f32, ngraph::Shape{1, 15, 128}); + auto indices = ngraph::opset7::Constant::create(indices_type, ngraph::Shape{}, {-1}); + auto axis = ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape{}, {1}); + + auto shape_of = std::make_shared(data, indices_type); + auto input_gather = std::make_shared(shape_of, + ngraph::opset7::Constant::create(indices_type, ngraph::Shape{}, {1}), ngraph::opset7::Constant::create(indices_type, ngraph::Shape{}, {0})); + auto add = std::make_shared(input_gather, indices); + auto gather = std::make_shared(data, add, axis); + + f_ref = std::make_shared(ngraph::NodeVector{gather}, ngraph::ParameterVector{data}); + } + + auto res = compare_functions(f, f_ref); + ASSERT_TRUE(res.first) << res.second; +} + +TEST(TransformationTests, GatherNegativeIndicesNormalize_static_axis_dim) { + std::shared_ptr f(nullptr), f_ref(nullptr); + { + auto data = std::make_shared(ngraph::element::f32, ngraph::PartialShape{DYN, 15, DYN}); + auto indices = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {-1}); + auto axis = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {1}); + + auto gather = std::make_shared(data, indices, axis, 0); + + f = std::make_shared(ngraph::NodeVector{gather}, ngraph::ParameterVector{data}); + + ngraph::pass::Manager manager; + manager.register_pass(); + manager.register_pass(); + manager.run_passes(f); + ASSERT_NO_THROW(check_rt_info(f)); + } + + { + auto indices_type = ngraph::element::i32; + + auto data = std::make_shared(ngraph::element::f32, ngraph::PartialShape{DYN, 15, DYN}); + auto indices = ngraph::opset7::Constant::create(indices_type, ngraph::Shape{}, {-1}); + auto axis = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {1}); + + auto shape_of = std::make_shared(data, indices_type); + auto input_gather = std::make_shared(shape_of, + ngraph::opset7::Constant::create(indices_type, ngraph::Shape{}, {1}), ngraph::opset7::Constant::create(indices_type, ngraph::Shape{}, {0})); + auto add = std::make_shared(input_gather, indices); + auto gather = std::make_shared(data, add, axis); + + f_ref = std::make_shared(ngraph::NodeVector{gather}, ngraph::ParameterVector{data}); + } + + auto res = compare_functions(f, f_ref); + ASSERT_TRUE(res.first) << res.second; +} + +TEST(TransformationTests, GatherNegativeIndicesNormalize_static_axis_dim_neg_axis) { + std::shared_ptr f(nullptr), f_ref(nullptr); + { + auto data = std::make_shared(ngraph::element::f32, ngraph::PartialShape{DYN, 15, DYN}); + auto indices = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {-1}); + auto axis = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {-2}); + + auto gather = std::make_shared(data, indices, axis, 0); + + f = std::make_shared(ngraph::NodeVector{gather}, ngraph::ParameterVector{data}); + + ngraph::pass::Manager manager; + manager.register_pass(); + manager.register_pass(); + manager.run_passes(f); + ASSERT_NO_THROW(check_rt_info(f)); + } + + { + auto indices_type = ngraph::element::i32; + + auto data = std::make_shared(ngraph::element::f32, ngraph::PartialShape{DYN, 15, DYN}); + auto indices = ngraph::opset7::Constant::create(indices_type, ngraph::Shape{}, {-1}); + auto axis = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {-2}); + + auto shape_of = std::make_shared(data, indices_type); + auto input_gather = std::make_shared(shape_of, + 
ngraph::opset7::Constant::create(indices_type, ngraph::Shape{}, {1}), ngraph::opset7::Constant::create(indices_type, ngraph::Shape{}, {0})); + auto add = std::make_shared(input_gather, indices); + auto gather = std::make_shared(data, add, axis); + + f_ref = std::make_shared(ngraph::NodeVector{gather}, ngraph::ParameterVector{data}); + } + + auto res = compare_functions(f, f_ref); + ASSERT_TRUE(res.first) << res.second; +} + +TEST(TransformationTests, GatherNegativeIndicesNormalize_non_static_axis_dim) { + std::shared_ptr f(nullptr), f_ref(nullptr); + { + auto data = std::make_shared(ngraph::element::f32, ngraph::PartialShape{DYN, DYN, DYN}); + auto indices = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {-1}); + auto axis = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {1}); + + auto gather = std::make_shared(data, indices, axis, 0); + + f = std::make_shared(ngraph::NodeVector{gather}, ngraph::ParameterVector{data}); + + ngraph::pass::Manager manager; + manager.register_pass(); + manager.register_pass(); + manager.run_passes(f); + ASSERT_NO_THROW(check_rt_info(f)); + } + + { + auto indices_type = ngraph::element::i32; + + auto data = std::make_shared(ngraph::element::f32, ngraph::PartialShape{DYN, DYN, DYN}); + auto indices = ngraph::opset7::Constant::create(indices_type, ngraph::Shape{}, {-1}); + auto axis = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {1}); + + auto gather = std::make_shared(data, indices, axis); + + f_ref = std::make_shared(ngraph::NodeVector{gather}, ngraph::ParameterVector{data}); + } + + auto res = compare_functions(f, f_ref); + ASSERT_TRUE(res.first) << res.second; +} + +TEST(TransformationTests, GatherNegativeIndicesNormalize_positive_ind) { + std::shared_ptr f(nullptr), f_ref(nullptr); + { + auto data = std::make_shared(ngraph::element::f32, ngraph::Shape{2, 3}); + auto indices = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {1}); + auto axis = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {0}); + + auto gather = std::make_shared(data, indices, axis, 0); + + f = std::make_shared(ngraph::NodeVector{gather}, ngraph::ParameterVector{data}); + + ngraph::pass::Manager manager; + manager.register_pass(); + manager.register_pass(); + manager.run_passes(f); + ASSERT_NO_THROW(check_rt_info(f)); + } + + { + auto data = std::make_shared(ngraph::element::f32, ngraph::Shape{2, 3}); + auto indices = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {1}); + auto axis = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {0}); + + auto gather = std::make_shared(data, indices, axis); + + f_ref = std::make_shared(ngraph::NodeVector{gather}, ngraph::ParameterVector{data}); + } + + auto res = compare_functions(f, f_ref); + ASSERT_TRUE(res.first) << res.second; +} + +TEST(TransformationTests, GatherNegativeIndicesNormalize_non_static_rank) { + std::shared_ptr f(nullptr), f_ref(nullptr); + { + auto data = std::make_shared(ngraph::element::f32, ngraph::PartialShape::dynamic(ngraph::Rank::dynamic())); + auto indices = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {-1}); + auto axis = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {0}); + + auto gather = std::make_shared(data, indices, axis, 0); + + f = std::make_shared(ngraph::NodeVector{gather}, ngraph::ParameterVector{data}); + + ngraph::pass::Manager manager; + manager.register_pass(); + manager.register_pass(); + 
manager.run_passes(f); + ASSERT_NO_THROW(check_rt_info(f)); + } + + { + auto data = std::make_shared(ngraph::element::f32, ngraph::PartialShape::dynamic()); + auto indices = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {-1}); + auto axis = ngraph::opset7::Constant::create(ngraph::element::i32, ngraph::Shape{}, {0}); + + auto gather = std::make_shared(data, indices, axis); + + f_ref = std::make_shared(ngraph::NodeVector{gather}, ngraph::ParameterVector{data}); + } + + auto res = compare_functions(f, f_ref); + ASSERT_TRUE(res.first) << res.second; +} diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/memory.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/memory.cpp index 9ab20c3eda4..062ea0cad91 100644 --- a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/memory.cpp +++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/memory.cpp @@ -38,7 +38,7 @@ INSTANTIATE_TEST_CASE_P(smoke_MemoryTest, MemoryTest, ::testing::ValuesIn(iterationCount), ::testing::ValuesIn(inShapes), ::testing::ValuesIn(inputPrecisions), - ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ::testing::Values(CommonTestUtils::DEVICE_CPU, "HETERO:CPU")), MemoryTest::getTestCaseName); } // namespace diff --git a/inference-engine/tests/functional/plugin/gna/pass_tests/convert_matmul_to_pointwise_conv.cpp b/inference-engine/tests/functional/plugin/gna/pass_tests/convert_matmul_to_pointwise_conv.cpp new file mode 100644 index 00000000000..7e3d15174f3 --- /dev/null +++ b/inference-engine/tests/functional/plugin/gna/pass_tests/convert_matmul_to_pointwise_conv.cpp @@ -0,0 +1,230 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include +#include + +#include + +#include "common_test_utils/common_utils.hpp" +#include "functional_test_utils/plugin_cache.hpp" +#include "shared_test_classes/base/layer_test_utils.hpp" +#include "functional_test_utils/blob_utils.hpp" +#include "ngraph_functions/utils/ngraph_helpers.hpp" +#include "ngraph_functions/builders.hpp" + +#include "ngraph_functions/pass/convert_prc.hpp" + +typedef std::tuple< + InferenceEngine::Precision, // Network Precision + std::string, // Target Device + std::map, // Configuration + std::vector // Input Shape +> convertMatmulToPointwiseConvParams; + +typedef std::tuple< + InferenceEngine::Precision, // Network Precision + std::string, // Target Device + std::map, // Configuration + std::vector, // Input Shape + std::pair // Input Min and Max +> convertMatmulToPointwiseConvWithFqParams; + +namespace LayerTestsDefinitions { + +class ConvertMatmulToPointwiseConv : public testing::WithParamInterface, + public LayerTestsUtils::LayerTestsCommon { +public: + static std::string getTestCaseName(testing::TestParamInfo obj) { + InferenceEngine::Precision netPrecision; + std::string targetDevice; + std::map configuration; + std::vector inputShape; + std::tie(netPrecision, targetDevice, configuration, inputShape) = obj.param; + + std::ostringstream result; + result << "netPRC=" << netPrecision.name() << "_"; + result << "targetDevice=" << targetDevice << "_"; + for (auto const& configItem : configuration) { + result << "_configItem=" << configItem.first << "_" << configItem.second; + } + result << "_inputShape=" << CommonTestUtils::vec2str(inputShape); + return result.str(); + } + + InferenceEngine::Blob::Ptr 
GenerateInput(const InferenceEngine::InputInfo& info) const { + InferenceEngine::Blob::Ptr blob = make_blob_with_precision(info.getTensorDesc()); + blob->allocate(); + + auto* rawBlobDataPtr = blob->buffer().as(); + std::vector values = CommonTestUtils::generate_float_numbers(blob->size(), -0.2f, 0.2f); + for (size_t i = 0; i < blob->size(); i++) { + rawBlobDataPtr[i] = values[i]; + } + return blob; + } + +protected: + void SetUp() override { + InferenceEngine::Precision netPrecision; + std::vector inputShape; + std::tie(netPrecision, targetDevice, configuration, inputShape) = this->GetParam(); + auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision); + + auto params = ngraph::builder::makeParams(ngPrc, {inputShape}); + + size_t batch = inputShape[inputShape.size() - 2]; + size_t elemNum = inputShape[inputShape.size() - 1]; + std::vector weights = CommonTestUtils::generate_float_numbers(elemNum * elemNum, -0.1f, 0.1f); + auto weightsNode = std::make_shared(ngPrc, ngraph::Shape{elemNum, elemNum}, weights); + auto matmul = ngraph::builder::makeMatMul(params[0], weightsNode, false, true); + + auto bias = ngraph::builder::makeConstant(ngPrc, std::vector{1, batch, 1}, std::vector{1.0f}); + auto add = ngraph::builder::makeEltwise(matmul, bias, ngraph::helpers::EltwiseTypes::ADD); + + auto pattern = std::make_shared(ngraph::element::Type_t::i64, + ngraph::Shape{ inputShape.size() }, inputShape); + auto reshape = std::make_shared(matmul, pattern, false); + auto relu = std::make_shared(reshape); + + ngraph::ResultVector results{ std::make_shared(relu)}; + function = std::make_shared(results, params, "ConvertMatmulToPointwiseConv"); + } +}; + +class ConvertMatmulToPointwiseConvWithFq : public testing::WithParamInterface, + public LayerTestsUtils::LayerTestsCommon { + float inputDataMin = -10.0f; + float inputDataMax = 10.0f; + float inputDataResolution = 1.0f; + +public: + static std::string getTestCaseName(testing::TestParamInfo obj) { + InferenceEngine::Precision netPrecision; + std::string targetDevice; + std::map configuration; + std::vector inputShape; + std::pair inputMinMax; + std::tie(netPrecision, targetDevice, configuration, inputShape, inputMinMax) = obj.param; + + std::ostringstream result; + result << "netPRC=" << netPrecision.name() << "_"; + result << "targetDevice=" << targetDevice << "_"; + for (auto const& configItem : configuration) { + result << "_configItem=" << configItem.first << "_" << configItem.second; + } + result << "_inputShape=" << CommonTestUtils::vec2str(inputShape); + result << "_inputMinMax=(" << inputMinMax.first << ".." 
<< inputMinMax.second << ")"; + return result.str(); + } + + InferenceEngine::Blob::Ptr GenerateInput(const InferenceEngine::InputInfo& info) const { + return FuncTestUtils::createAndFillBlob(info.getTensorDesc(), inputDataMax - inputDataMin, inputDataMin, + 1 / inputDataResolution); + } + +protected: + void SetUp() override { + InferenceEngine::Precision netPrecision; + std::vector inputShape; + std::pair inputMinMax; + std::tie(netPrecision, targetDevice, configuration, inputShape, inputMinMax) = this->GetParam(); + std::tie(inputDataMin, inputDataMax) = inputMinMax; + auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision); + + auto params = ngraph::builder::makeParams(ngPrc, {inputShape}); + + auto inputLowNode = ngraph::builder::makeConstant(ngPrc, std::vector{ 1 }, + std::vector{ inputDataMin }); + auto inputHighNode = ngraph::builder::makeConstant(ngPrc, std::vector{ 1 }, + std::vector{ inputDataMax }); + auto inputFQ = std::make_shared(params[0], + inputLowNode, inputHighNode, inputLowNode, inputHighNode, UINT16_MAX); + + size_t elemNum = inputShape[inputShape.size() - 1]; + + const float weightsMin = -0.2f; + const float weightsMax = 0.2f; + std::vector weights = CommonTestUtils::generate_float_numbers(elemNum * elemNum, weightsMin, weightsMax); + auto weightsNode = std::make_shared(ngPrc, ngraph::Shape{elemNum, elemNum}, weights); + auto weightsLowNode = ngraph::builder::makeConstant(ngPrc, std::vector{ 1 }, + std::vector{ weightsMin }); + auto weightsHighNode = ngraph::builder::makeConstant(ngPrc, std::vector{ 1 }, + std::vector{ weightsMax }); + auto weightsFQNode = std::make_shared(weightsNode, + weightsLowNode, weightsHighNode, weightsLowNode, weightsHighNode, UINT16_MAX); + auto matmul = ngraph::builder::makeMatMul(inputFQ, weightsFQNode, false, true); + + auto bias = ngraph::builder::makeConstant(ngPrc, std::vector{1, 1, 1}, std::vector{1.0f}); + auto add = ngraph::builder::makeEltwise(matmul, bias, ngraph::helpers::EltwiseTypes::ADD); + + auto outputLowNode = ngraph::builder::makeConstant(ngPrc, std::vector{ 1 }, + std::vector{ -inputDataMax * weightsMax * elemNum }); + auto outputHighNode = ngraph::builder::makeConstant(ngPrc, std::vector{ 1 }, + std::vector{ inputDataMax * weightsMax * elemNum }); + auto outputFQ = std::make_shared(add, + outputLowNode, outputHighNode, outputLowNode, outputHighNode, UINT16_MAX); + + auto pattern = std::make_shared(ngraph::element::Type_t::i64, + ngraph::Shape{ inputShape.size() }, inputShape); + auto reshape = std::make_shared(outputFQ, pattern, false); + + auto relu = std::make_shared(reshape); + + ngraph::ResultVector results{ std::make_shared(relu)}; + function = std::make_shared(results, params, "ConvertMatmulToPointwiseConv"); + } +}; + +TEST_P(ConvertMatmulToPointwiseConv, CompareWithRefImpl) { + Run(); +}; + +TEST_P(ConvertMatmulToPointwiseConvWithFq, CompareWithRefImpl) { + Run(); +}; + +const std::vector netPrecisions = { + InferenceEngine::Precision::FP32, + InferenceEngine::Precision::FP16 +}; + +const std::vector> configs = { + { + {"GNA_DEVICE_MODE", "GNA_SW_EXACT"}, + } +}; + +const std::vector> inputShape = { + {1, 64, 64}, + {1, 256, 128}, + {1, 512, 128} +}; + +const std::vector> fqStats = { + {-0.5, 0.5} +}; + +INSTANTIATE_TEST_CASE_P(smoke_ConvertMatmulToPointwiseConvTest, ConvertMatmulToPointwiseConv, + ::testing::Combine( + ::testing::ValuesIn(netPrecisions), + ::testing::Values(CommonTestUtils::DEVICE_GNA), + ::testing::ValuesIn(configs), + ::testing::ValuesIn(inputShape)), + 
ConvertMatmulToPointwiseConv::getTestCaseName); + +// Issue 55662 +INSTANTIATE_TEST_CASE_P(DISABLED_smoke_ConvertMatmulToPointwiseConvTest, ConvertMatmulToPointwiseConvWithFq, + ::testing::Combine( + ::testing::ValuesIn(netPrecisions), + ::testing::Values(CommonTestUtils::DEVICE_GNA), + ::testing::ValuesIn(configs), + ::testing::ValuesIn(inputShape), + ::testing::ValuesIn(fqStats)), + ConvertMatmulToPointwiseConvWithFq::getTestCaseName); + +} // namespace LayerTestsDefinitions diff --git a/inference-engine/tests/functional/plugin/gna/pass_tests/eltwise_split_over_channels_pass.cpp b/inference-engine/tests/functional/plugin/gna/pass_tests/eltwise_split_over_channels_pass.cpp index 17da73dfc99..f4c6cc98d34 100644 --- a/inference-engine/tests/functional/plugin/gna/pass_tests/eltwise_split_over_channels_pass.cpp +++ b/inference-engine/tests/functional/plugin/gna/pass_tests/eltwise_split_over_channels_pass.cpp @@ -18,19 +18,21 @@ typedef std::tuple< InferenceEngine::Precision, // Network Precision std::string, // Target Device - std::map //Configuration + std::map, // Configuration + std::vector // Input Shape > EltwiseSplitOverChannelsPassParams; namespace LayerTestsDefinitions { class EltwiseSplitOverChannelsPassTest : public testing::WithParamInterface, - public LayerTestsUtils::LayerTestsCommon { + public LayerTestsUtils::LayerTestsCommon { public: static std::string getTestCaseName(testing::TestParamInfo obj) { InferenceEngine::Precision netPrecision; std::string targetDevice; std::map configuration; - std::tie(netPrecision, targetDevice, configuration) = obj.param; + std::vector inputShape; + std::tie(netPrecision, targetDevice, configuration, inputShape) = obj.param; std::ostringstream result; result << "netPRC=" << netPrecision.name() << "_"; @@ -38,20 +40,22 @@ public: for (auto const& configItem : configuration) { result << "_configItem=" << configItem.first << "_" << configItem.second; } + result << "_inputShape=" << CommonTestUtils::vec2str(inputShape); return result.str(); } protected: void SetUp() override { InferenceEngine::Precision netPrecision; - std::tie(netPrecision, targetDevice, configuration) = this->GetParam(); + std::vector inputShape; + std::tie(netPrecision, targetDevice, configuration, inputShape) = this->GetParam(); auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision); - auto params = ngraph::builder::makeParams(ngPrc, { {1, 67000} }); - auto const_mult2 = ngraph::builder::makeConstant(ngPrc, {1, 67000}, {-1.0f}); + auto params = ngraph::builder::makeParams(ngPrc, { inputShape }); + auto const_mult2 = ngraph::builder::makeConstant(ngPrc, inputShape, {-1.0f}); auto sum = ngraph::builder::makeEltwise(params[0], const_mult2, ngraph::helpers::EltwiseTypes::MULTIPLY); - function = std::make_shared(sum, params, "RemovePermutationPass"); + function = std::make_shared(sum, params, "EltwiseSplitOverChannelsPassTest"); } }; @@ -71,11 +75,17 @@ const std::vector> configs = { } }; +const std::vector> inputShape = { + {1, 67000}, + {1, 500000} +}; + INSTANTIATE_TEST_CASE_P(smoke_EltwiseSplitOverChennels, EltwiseSplitOverChannelsPassTest, ::testing::Combine( ::testing::ValuesIn(netPrecisions), ::testing::Values(CommonTestUtils::DEVICE_GNA), - ::testing::ValuesIn(configs)), + ::testing::ValuesIn(configs), + ::testing::ValuesIn(inputShape)), EltwiseSplitOverChannelsPassTest::getTestCaseName); } // namespace LayerTestsDefinitions diff --git a/inference-engine/tests/functional/plugin/gna/shared_tests_instances/skip_tests_config.cpp 
b/inference-engine/tests/functional/plugin/gna/shared_tests_instances/skip_tests_config.cpp index cb4cc459a95..a59ad83eaed 100644 --- a/inference-engine/tests/functional/plugin/gna/shared_tests_instances/skip_tests_config.cpp +++ b/inference-engine/tests/functional/plugin/gna/shared_tests_instances/skip_tests_config.cpp @@ -60,8 +60,6 @@ std::vector disabledTestPatterns() { R"(.*ConstantResultSubgraphTest.*inPrc=(U8|I8|I32|U64|I64|BOOL).*)", // TODO: Issue 51528 R"(.*CachingSupport.*_(u8|i16)_.*)", - // TODO: Issue 51527 - R"(.*CachingSupport.*_batch2_.*)", // TODO: Issue 51525 R"(.*CachingSupport.*KSOFunction.*)", // TODO: Issue 57363 (Param -> Result subgraphs) diff --git a/inference-engine/tests/functional/plugin/shared/src/base/import_export_base/import_export_base.cpp b/inference-engine/tests/functional/plugin/shared/src/base/import_export_base/import_export_base.cpp index 0db7264cb74..c30945dc914 100644 --- a/inference-engine/tests/functional/plugin/shared/src/base/import_export_base/import_export_base.cpp +++ b/inference-engine/tests/functional/plugin/shared/src/base/import_export_base/import_export_base.cpp @@ -69,13 +69,16 @@ void ImportNetworkTestBase::Run() { for (const auto& next_input : importedExecNetwork.GetInputsInfo()) { ASSERT_NO_THROW(compiledExecNetwork.GetInputsInfo()[next_input.first]); + Compare(next_input.second->getTensorDesc(), compiledExecNetwork.GetInputsInfo()[next_input.first]->getTensorDesc()); } for (const auto& next_output : importedExecNetwork.GetOutputsInfo()) { ASSERT_NO_THROW(compiledExecNetwork.GetOutputsInfo()[next_output.first]); } auto importedOutputs = GetOutputs(); ASSERT_EQ(actualOutputs.size(), importedOutputs.size()); + for (size_t i = 0; i < actualOutputs.size(); i++) { + Compare(actualOutputs[i]->getTensorDesc(), importedOutputs[i]->getTensorDesc()); Compare(actualOutputs[i], importedOutputs[i]); } } diff --git a/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/base/layer_test_utils.hpp b/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/base/layer_test_utils.hpp index 9b8b78b6ef0..9d132515743 100644 --- a/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/base/layer_test_utils.hpp +++ b/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/base/layer_test_utils.hpp @@ -72,6 +72,8 @@ public: virtual void Compare(const InferenceEngine::Blob::Ptr &expected, const InferenceEngine::Blob::Ptr &actual); + virtual void Compare(const InferenceEngine::TensorDesc &actualDesc, const InferenceEngine::TensorDesc &expectedDesc); + virtual void SetRefMode(RefMode mode); std::shared_ptr GetFunction(); diff --git a/inference-engine/tests/functional/shared_test_classes/src/base/layer_test_utils.cpp b/inference-engine/tests/functional/shared_test_classes/src/base/layer_test_utils.cpp index cc3927b25c5..056826aff86 100644 --- a/inference-engine/tests/functional/shared_test_classes/src/base/layer_test_utils.cpp +++ b/inference-engine/tests/functional/shared_test_classes/src/base/layer_test_utils.cpp @@ -274,6 +274,17 @@ void LayerTestsCommon::Compare(const InferenceEngine::Blob::Ptr &expected, const } } +void LayerTestsCommon::Compare(const InferenceEngine::TensorDesc &actualDesc, const InferenceEngine::TensorDesc &expectedDesc) { + auto expectedDims = actualDesc.getDims(); + auto actualDims = expectedDesc.getDims(); + ASSERT_EQ(actualDims.size(), expectedDims.size()); + for (size_t j = 0; j < actualDims.size(); ++j) { + 
ASSERT_EQ(actualDims.at(j), expectedDims.at(j)); + } + ASSERT_EQ(actualDesc.getLayout(), expectedDesc.getLayout()); + ASSERT_EQ(actualDesc.getPrecision(), expectedDesc.getPrecision()); +} + void LayerTestsCommon::ConfigureNetwork() { for (const auto &in : cnnNetwork.getInputsInfo()) { if (inLayout != InferenceEngine::Layout::ANY) { diff --git a/inference-engine/thirdparty/clDNN/src/program.cpp b/inference-engine/thirdparty/clDNN/src/program.cpp index ef988cb12da..97eacdf9ae1 100644 --- a/inference-engine/thirdparty/clDNN/src/program.cpp +++ b/inference-engine/thirdparty/clDNN/src/program.cpp @@ -1176,9 +1176,6 @@ void program_impl::set_layout_optimizer_attributes(layout_optimizer& lo) { size_t opt_deconv_layers_b_fs_zyx_fsv16 = 0; size_t total_crop_layers = 0; - size_t weighted_sum_feature_size = 0; - size_t weight_sum = 0; - for (auto& node : get_processing_order()) { auto &prim = *node; if (prim.type() == cldnn::convolution::type_id()) { @@ -1324,35 +1321,4 @@ void program_impl::set_layout_optimizer_attributes(layout_optimizer& lo) { if (should_use_bs_fs_yx_bsv16_fsv16) lo.set_optimization_attribute(layout_optimizer::optimization_attributes_type::bs_fs_yx_bsv16_fsv16_network, 1); - - - // This is to avoid using fsv16 for shallow-feature networks. - // This may not be exactly same as real execution graph as layer fusing is not done yet, - // but it is a reasonable approximation. - // Check the expected network efficiency after setting layer optimization attributes. - // If network depth is shallow, it is faster with fsv4. - for (auto& node : get_processing_order()) { - auto &prim = *node; - - if (prim.is_in_data_flow() && prim.type() == cldnn::convolution::type_id()) { - size_t num_feature = prim.get_output_layout().size.feature.vector()[0]; - size_t num_spatial = 1; - for (auto s : prim.get_output_layout().size.spatial.vector()) - num_spatial *= s; - - if (lo.get_preferred_format(prim) != format::b_fs_yx_fsv4) { - weight_sum += num_spatial; - weighted_sum_feature_size += num_spatial * num_feature; - } - } - } - - size_t weighted_average_feature_depth = weighted_sum_feature_size / std::max(weight_sum, static_cast(1)); - - // Need to confirm that weighted_average_feature_depth > 1 to keep unittest behavior. 
- if (is_quantized_int8_model && weighted_average_feature_depth < 8 && weighted_average_feature_depth > 1) { - lo.set_optimization_attribute(layout_optimizer::optimization_attributes_type::fs_b_yx_fsv32_network, 0); - lo.set_optimization_attribute(layout_optimizer::optimization_attributes_type::b_fs_yx_fsv16_network, 0); - lo.set_optimization_attribute(layout_optimizer::optimization_attributes_type::bs_fs_yx_bsv16_fsv16_network, 0); - } } diff --git a/ngraph/core/include/ngraph/op/util/op_types.hpp b/ngraph/core/include/ngraph/op/util/op_types.hpp index b672f5518c4..6d162157ab5 100644 --- a/ngraph/core/include/ngraph/op/util/op_types.hpp +++ b/ngraph/core/include/ngraph/op/util/op_types.hpp @@ -34,6 +34,8 @@ namespace ngraph NGRAPH_API bool is_output(const ngraph::Node* node); NGRAPH_API + bool is_sink(const ngraph::Node* node); + NGRAPH_API bool is_constant(const ngraph::Node* node); NGRAPH_API bool is_commutative(const ngraph::Node* node); @@ -60,6 +62,8 @@ namespace ngraph NGRAPH_API bool is_output(const std::shared_ptr& node); NGRAPH_API + bool is_sink(const std::shared_ptr& node); + NGRAPH_API bool is_constant(const std::shared_ptr& node); NGRAPH_API bool is_commutative(const std::shared_ptr& node); diff --git a/ngraph/core/src/op/util/op_types.cpp b/ngraph/core/src/op/util/op_types.cpp index f0852233ec7..354c605ced1 100644 --- a/ngraph/core/src/op/util/op_types.cpp +++ b/ngraph/core/src/op/util/op_types.cpp @@ -76,6 +76,11 @@ bool ngraph::op::is_output(const ngraph::Node* node) return dynamic_cast(node) != nullptr; } +bool ngraph::op::is_sink(const ngraph::Node* node) +{ + return dynamic_cast(node) != nullptr; +} + bool ngraph::op::is_constant(const ngraph::Node* node) { return dynamic_cast(node) != nullptr; @@ -134,6 +139,10 @@ bool ngraph::op::is_output(const std::shared_ptr& node) { return is_output(node.get()); } +bool ngraph::op::is_sink(const std::shared_ptr& node) +{ + return is_sink(node.get()); +} bool ngraph::op::is_constant(const std::shared_ptr& node) { return is_constant(node.get()); diff --git a/ngraph/python/tox.ini b/ngraph/python/tox.ini index e0ccc85785e..de7bb8337b2 100644 --- a/ngraph/python/tox.ini +++ b/ngraph/python/tox.ini @@ -7,7 +7,7 @@ skip_install=True deps = -rrequirements.txt -rrequirements_test.txt - mypy + mypy<0.900 flake8-bugbear pytest-xdist setenv = diff --git a/ngraph/test/CMakeLists.txt b/ngraph/test/CMakeLists.txt index 110d57c8b1d..eb6d83f0d70 100644 --- a/ngraph/test/CMakeLists.txt +++ b/ngraph/test/CMakeLists.txt @@ -288,7 +288,7 @@ set_source_files_properties(includes.cpp PROPERTIES COMPILE_DEFINITIONS if (ENABLE_MKL_DNN) message(STATUS "NGRAPH_TESTS: IE:CPU enabled") set(ACTIVE_BACKEND_LIST ${ACTIVE_BACKEND_LIST} "IE:CPU") - if (NOT ENABLE_STRICT_DEPENDENCIES) + if (ENABLE_STRICT_DEPENDENCIES) # For convinience add a runtime dependency to build along with this target. # Warning: Parallel build with -GNinja may not be efficient. list(APPEND UNIT_TESTS_DEPENDENCIES MKLDNNPlugin) @@ -298,7 +298,7 @@ endif() if (ENABLE_CLDNN) message(STATUS "NGRAPH_TESTS: IE:GPU enabled") set(ACTIVE_BACKEND_LIST ${ACTIVE_BACKEND_LIST} "IE:GPU") - if (NOT ENABLE_STRICT_DEPENDENCIES) + if (ENABLE_STRICT_DEPENDENCIES) # For convinience add a runtime dependency to build along with this target. # Warning: Parallel build with -GNinja may not be efficient. list(APPEND UNIT_TESTS_DEPENDENCIES clDNNPlugin)