[GNA] 1D convolution support for native NCHW models (#4067)

This commit is contained in:
Elizaveta Lobanova
2021-02-05 14:55:11 +03:00
committed by GitHub
parent 08a527602f
commit 4393525313
27 changed files with 1980 additions and 235 deletions

View File

@@ -1,4 +1,4 @@
// Copyright (C) 2018-2020 Intel Corporation
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
@@ -20,6 +20,14 @@
#define FROM_IR_DIM(mem, idx)\
((mem->getTensorDesc().getDims().size() > (idx) - 1) ? mem->getTensorDesc().getDims()[mem->getTensorDesc().getDims().size() - (idx)] : 1)
/**
 * @brief describes one contiguous fragment of a blob when converting data between
 * NCHW and NHWC layouts. A blob is processed as a sequence of such fragments.
 * NOTE: instances are serialized verbatim (readNBytes/writeNBytes with
 * sizeof(TranspositionInfo)) by the model import/export code, so the field
 * layout must stay binary-stable.
 */
struct TranspositionInfo {
    // true if this fragment must be transposed; false if it is copied as-is
    bool transpose;
    // number of rows of the fragment when treated as a 2D matrix
    size_t num_transpose_rows;
    // number of columns of the fragment when treated as a 2D matrix
    size_t num_transpose_columns;
};

// Maps a layer (input/output) name to the transposition fragments of its blob
using TranspositionInfoMap = std::map<std::string, std::vector<TranspositionInfo>>;
namespace GNAPluginNS {
#if GNA_LIB_VER == 2
using dnn_ptr = std::shared_ptr<CPPWrapper<Gna2Model>>;

View File

@@ -435,10 +435,7 @@ void GNAGraphCompiler::finalizeConvolution1DPrimitive(InferenceEngine::CNNLayerP
currentComponent.orientation_out = kDnnInterleavedOrientation;
}
size_t num_data_bytes_out =
InferenceEngine::details::product(begin(outputs->getDims()), end(outputs->getDims()))
* outputs->getPrecision().size();
size_t num_data_bytes_out = num_columns_out * outputs->getPrecision().size();
size_t num_data_bytes_in = (num_inputs + num_input_padding) * inputs->getPrecision().size();
auto connectedInputLayer = connectInput(layer, ptr_inputs, num_data_bytes_in).input;
@@ -457,30 +454,6 @@ void GNAGraphCompiler::finalizeConvolution1DPrimitive(InferenceEngine::CNNLayerP
connectOutput(layer, ptr_outputs, num_data_bytes_out);
// When there's a NCHW convolution as a last layer, the output needs to be transposed back to NCHW
// TODO: Jira: 43659 - the issue also appears when after conv there's an eltwise or activation
// For last layer or when next ones are only non functional, the data can be reordered when exporting scores
// For other cases inserting permute is required if data are reordered
auto isNonFunctional = [](CNNLayerPtr l) {
return LayerInfo(l).isNonFunctional();
};
if (getInputTo(outputs).empty() || !CNNNetHasNextLayerSkipCertain(layer, 0, 0, isNonFunctional)) {
// if height dim and width dim both equal 1, the permute is not needed to return correct results
// if height dim doesn't equal 1, the case requires additional permute
auto inputDimsCheck = (outputs->getLayout() == Layout::NHWC ||
in_channels != 1 ||
(in_height == 1 && in_width == 1) ||
in_height != 1);
//if kernel is pow of 2 and heigher than 8, then the issue doesn't appear
auto kernelCheck = convolution._kernel_x > 15 && !(convolution._kernel_x & (convolution._kernel_x - 1));
if (!inputDimsCheck && !kernelCheck) {
dnn->do_rotate_output = true;
dnn->num_rotate_output_rows = out_width;
dnn->num_rotate_output_columns = out_channels;
}
}
std::vector<uint8_t> transposedWeights;
for (uint32_t k = 0; k < convolution._out_depth; k++) {
uint8_t * ptr_filt_current
@@ -1303,8 +1276,9 @@ void GNAGraphCompiler::AffinePrimitive(InferenceEngine::CNNLayerPtr layer, bool
auto inputPrecision = quantized ? Precision(Precision::I16) : inputs->getPrecision();
auto input_data = HasTo2DReshapeData(layer) ? Get2DReshapedData(inputs, 8) : inputs;
uint32_t num_rows_in = FROM_IR_DIM(input_data, 1);
uint32_t num_columns_in = FROM_IR_DIM(input_data, 2);
auto in_dims = input_data->getDims();
uint32_t num_rows_in = InferenceEngine::details::product(in_dims) / in_dims.front();
uint32_t num_columns_in = in_dims.front();
uint32_t num_rows_out = isDiag ? num_rows_in : FROM_IR_DIM(outputs, 1);
uint32_t num_padding = ALIGN(num_rows_in, 8) - num_rows_in;
uint32_t num_padding_out = isDiag ? num_padding : 0;

View File

@@ -0,0 +1,231 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <legacy/graph_tools.hpp>
#include <legacy/details/ie_cnn_network_tools.h>
#include "gna_data_types.hpp"
#include "gna_graph_tools.hpp"
#include "gna_plugin_log.hpp"
#include "gna_upstream_iterator.hpp"
#include "layers/gna_layer_info.hpp"
namespace GNAPluginNS {
/**
 * @brief searches for a pattern: Permute(0,3,1,2) -> ... -> Convolution -> ... -> Permute(0,2,3,1) or
 * Reshape -> ... -> Convolution -> ... -> Permute(0,2,3,1) if Convolution has only one input dimension not equal to 1
 * @param layer convolution layer
 * @return the found permutations before and after convolution ({nullptr, nullptr} when the pattern is not found)
 */
inline std::pair<InferenceEngine::CNNLayerPtr, InferenceEngine::CNNLayerPtr> FindPermutationsAroundConvolutionInNHWCModel(
    InferenceEngine::CNNLayerPtr layer) {
    // Skip a convolution which doesn't have previous or next layers
    if (layer->outData.size() != 1) {
        return std::make_pair(nullptr, nullptr);
    }
    if (getInputTo(layer->outData.front()).empty()) {
        return std::make_pair(nullptr, nullptr);
    }
    if (!InferenceEngine::CNNNetHasPrevLayer(layer.get())) {
        return std::make_pair(nullptr, nullptr);
    }
    // Walk down the chain after the convolution following single consumers only
    auto next = getInputTo(layer->outData.front()).begin()->second;
    // Permute is inserted before Reshape by MO in NHWC models, so we need to find either permute, or reshape, or output
    while (!LayerInfo(next).isPermute() && !LayerInfo(next).isNonFunctional() && !LayerInfo(next).isOutput() &&
           next->outData.size() == 1) {
        auto input_to = getInputTo(next->outData.front());
        // Stop at a branching point: the pattern requires a linear chain
        if (input_to.size() != 1) break;
        next = input_to.begin()->second;
    }
    // Check if the found layer is NCHW to NHWC permute, if it's not just skip this convolution
    if (!LayerInfo(next).isPermute() || next->input()->getLayout() != InferenceEngine::Layout::NCHW ||
        next->GetParamAsInts("order") != GetPermuteOrder(InferenceEngine::Layout::NCHW, InferenceEngine::Layout::NHWC)) {
        return std::make_pair(nullptr, nullptr);
    }
    // Permute is inserted after Reshape by MO in NHWC models, so we need to find either permute, or reshape, or input
    auto parent = InferenceEngine::CNNNetPrevLayer(layer);
    auto prev = parent;
    // Walk up through non-functional layers to the layer feeding the convolution
    while (!LayerInfo(prev).isPermute() && !LayerInfo(prev).isNonFunctional() && !LayerInfo(prev).isInput() &&
           InferenceEngine::CNNNetHasPrevLayer(prev.get())) {
        prev = InferenceEngine::CNNNetPrevLayer(prev);
    }
    // Check if the found layer is NHWC to NCHW permute or have 1D data, if it's not just skip this convolution
    if (LayerInfo(prev).isPermute()) {
        if (prev->outData[0]->getLayout() != InferenceEngine::Layout::NCHW ||
            prev->GetParamAsInts("order") != GetPermuteOrder(InferenceEngine::Layout::NHWC, InferenceEngine::Layout::NCHW)) {
            return std::make_pair(nullptr, nullptr);
        }
    } else {
        // No permute before the convolution: accept the pattern only if the producer
        // has a single consumer and its data is effectively one-dimensional
        if (parent->outData.size() != 1 || InferenceEngine::getInputTo(parent->outData[0]).size() != 1) {
            return std::make_pair(nullptr, nullptr);
        }
        auto parent_dims = parent->outData[0]->getDims();
        // Check if the previous layer has all dimensions except one to be equal to 1
        if (std::count_if(std::begin(parent_dims), std::end(parent_dims), [](size_t dim) { return dim != 1; }) > 1) {
            return std::make_pair(nullptr, nullptr);
        }
    }
    return std::make_pair(prev, next);
}
/**
 * @brief searches for a pattern Convolution -> ... -> Permute(0,3,2,1) -> ... -> ScaleShift | FullyConnected
 * @param layer convolution layer
 * @return the found permutation layer (nullptr when the pattern is not found)
 */
inline InferenceEngine::CNNLayerPtr FindPermutationAfterConvolutionInKaldiModel(InferenceEngine::CNNLayerPtr layer) {
    // Skip a convolution which doesn't have next layers
    if (layer->outData.size() != 1) {
        return nullptr;
    }
    if (getInputTo(layer->outData.front()).empty()) {
        return nullptr;
    }
    /* Permute is inserted between a convolution and a scaleshift|fullyconnected layer by MO in Kaldi models,
     * so we need to find either permute, or fullyconnected, or scaleshift, or output, or reshape to 2D
     */
    auto next = getInputTo(layer->outData.front()).begin()->second;
    // NOTE(review): this walk follows only the first consumer of each layer and assumes every
    // visited non-terminal layer has at least one consumer - confirm branching graphs cannot
    // reach this point with an empty getInputTo() map
    while (!LayerInfo(next).isPermute() && !LayerInfo(next).isFullyConnected() && !LayerInfo(next).isScaleShift() &&
           !LayerInfo(next).isOutput() &&
           (!LayerInfo(next).isNonFunctional() || next->outData[0]->getDims().size() == next->input()->getDims().size())) {
        next = getInputTo(next->outData.front()).begin()->second;
    }
    // Check if the found layer is NCHW to NWHC permute
    if (!LayerInfo(next).isPermute() || next->input()->getLayout() != InferenceEngine::Layout::NCHW ||
        next->GetParamAsInts("order") != std::vector<int>{0, 3, 2, 1}) {
        return nullptr;
    }
    return next;
}
/**
 * @brief identifies if a model must be converted to NHWC, it must not be neither NHWC, nor Kaldi
 * @param layers model sorted layers
 * @return true when at least one convolution lacks both the NHWC and the Kaldi permute patterns
 */
inline bool MustBeConvertedFromNCHWToNHWC(const std::vector<InferenceEngine::CNNLayerPtr> &layers) {
    // A dims vector describes effectively 1D data when at most one dimension differs from 1
    auto isEffectively1D = [](const InferenceEngine::SizeVector &dims) {
        auto nonTrivial = std::count_if(std::begin(dims), std::end(dims),
                                        [](size_t dim) { return dim != 1; });
        return nonTrivial <= 1;
    };
    for (const auto &candidate : layers) {
        if (!LayerInfo(candidate).isConvolution()) {
            continue;
        }
        // A surrounding NHWC permute pattern means the model is already NHWC
        auto permutations = FindPermutationsAroundConvolutionInNHWCModel(candidate);
        if (permutations.second != nullptr) {
            return false;
        }
        // If a convolution has only 1-dimension input and output we should skip it
        auto inputDims = candidate->insData.begin()->lock()->getDims();
        auto outputDims = candidate->outData.front()->getDims();
        if (isEffectively1D(inputDims) && isEffectively1D(outputDims)) {
            continue;
        }
        // Kaldi models carry their own permute after the convolution; absence means conversion is required
        return FindPermutationAfterConvolutionInKaldiModel(candidate) == nullptr;
    }
    return false;
}
/**
 * @brief returns transposition information for a layer based on the previous convolution or pooling dimensions order
 * @param layer layer from which transposition info search must be started
 * @return a list of TranspositionInfo fragments; fragments with transpose == true must be
 *         transposed (num_transpose_rows x num_transpose_columns), the others only record part sizes
 * @throws GNA exception if a split layer is reached (transposition info can't be described then)
 */
inline std::vector<TranspositionInfo> FindTranspositionInfoFromPrevLayers(InferenceEngine::CNNLayerPtr layer) {
    std::function<std::vector<TranspositionInfo>(InferenceEngine::CNNLayerPtr)> findTranspositionInfoRecursive =
        [&findTranspositionInfoRecursive](InferenceEngine::CNNLayerPtr layer) -> std::vector<TranspositionInfo> {
        // A split mixes parts of the blob in a way this info can't describe, so give up
        if (LayerInfo(layer).isSplit()) {
            THROW_GNA_EXCEPTION << layer->name << " Failed to find transposition info";
        }
        // Convolution/pooling output must be transposed as channels x (height * width)
        if (LayerInfo(layer).isConvolution() || LayerInfo(layer).isPooling()) {
            auto out_dims = layer->outData[0]->getDims();
            // NOTE(review): out_dims is indexed up to [3], i.e. a 4D output is assumed here - confirm for other ranks
            return {{true, out_dims[1], out_dims[2] * out_dims[3]}};
        }
        /* If a fullyconnected or input layers are reached, it means that transposition isn't needed, but we should keep
         * its output size to skip this part during transposition if transposed layer is a result of concatenation */
        if (LayerInfo(layer).isFullyConnected() || LayerInfo(layer).isInput()) {
            auto out_dims = layer->outData[0]->getDims();
            return {{false, 1, InferenceEngine::details::product(std::begin(out_dims) + 1, std::end(out_dims))}};
        }
        // If an eltwise is reached we should follow only one not-const direction
        if (LayerInfo(layer).isEltwise()) {
            // NOTE(review): assumes the eltwise has at least two inputs - confirm for unary eltwise layers
            auto input1 = InferenceEngine::CNNNetPrevLayer(layer, 0);
            auto input2 = InferenceEngine::CNNNetPrevLayer(layer, 1);
            if (LayerInfo(input1).isConst()) return findTranspositionInfoRecursive(input2);
            return findTranspositionInfoRecursive(input1);
        }
        // Otherwise gather transposition info from every previous layer (e.g. all concat inputs)
        std::vector<TranspositionInfo> transpositionInfo;
        // size_t index avoids the signed/unsigned comparison with insData.size()
        for (size_t idx = 0; idx < layer->insData.size(); ++idx) {
            if (!InferenceEngine::CNNNetHasPrevLayer(layer.get(), static_cast<int>(idx))) continue;
            auto inputLayer = InferenceEngine::CNNNetPrevLayer(layer, static_cast<int>(idx));
            // If a concat input is a const we should keep its size to skip this part during transposition
            if (LayerInfo(layer).isConcat() && LayerInfo(inputLayer).isConst()) {
                auto in_dims = layer->insData[idx].lock()->getDims();
                auto data_size = InferenceEngine::details::product(std::begin(in_dims) + 1, std::end(in_dims));
                transpositionInfo.push_back({false, 1, data_size});
            } else {
                std::vector<TranspositionInfo> results = findTranspositionInfoRecursive(inputLayer);
                transpositionInfo.insert(std::end(transpositionInfo), std::begin(results), std::end(results));
            }
        }
        return transpositionInfo;
    };
    return findTranspositionInfoRecursive(layer);
}
/**
 * @brief returns transposition information for a layer based on the next convolution layer dimensions order
 * @param layer layer from which transposition info search must be started
 * @return a list of TranspositionInfo fragments; fragments with transpose == true must be transposed
 * @throws GNA exception if an output branch yields no transposition info at all
 */
inline std::vector<TranspositionInfo> FindTranspositionInfoFromNextLayers(InferenceEngine::CNNLayerPtr layer) {
    std::function<std::vector<TranspositionInfo>(InferenceEngine::CNNLayerPtr)> findTranspositionInfoRecursive =
        [&findTranspositionInfoRecursive](InferenceEngine::CNNLayerPtr layer) -> std::vector<TranspositionInfo> {
        // A concat below means this blob is only a part of the concatenated data: stop here
        if (LayerInfo(layer).isConcat()) return {};
        // Convolution input must be transposed as channels x (height * width)
        if (LayerInfo(layer).isConvolution()) {
            auto in_dims = layer->input()->getDims();
            // NOTE(review): in_dims is indexed up to [3], i.e. a 4D input is assumed here - confirm for other ranks
            return {{true, in_dims[1], in_dims[2] * in_dims[3]}};
        }
        /* If a fullyconnected or output layers are reached, it means that transposition isn't needed, but we should keep
         * its input size to skip this part during transposition if transposed layer is splitting */
        if (LayerInfo(layer).isFullyConnected() || LayerInfo(layer).isOutput()) {
            auto in_dims = layer->input()->getDims();
            return {{false, 1, InferenceEngine::details::product(std::begin(in_dims) + 1, std::end(in_dims))}};
        }
        std::vector<TranspositionInfo> transpositionInfo;
        for (const auto &output : layer->outData) {
            if (getInputTo(output).empty()) continue;
            std::vector<TranspositionInfo> results;
            // Return transposition info from the first branch where convolution is found
            for (const auto &inputTo : getInputTo(output)) {
                results = findTranspositionInfoRecursive(inputTo.second);
                auto found = std::find_if(std::begin(results), std::end(results), [](const TranspositionInfo & result) {
                    return result.transpose;
                });
                if (found != std::end(results)) break;
            }
            if (results.empty()) {
                THROW_GNA_EXCEPTION << layer->name << " Failed to find transposition info";
            }
            transpositionInfo.insert(std::end(transpositionInfo), std::begin(results), std::end(results));
        }
        return transpositionInfo;
    };
    return findTranspositionInfoRecursive(layer);
}
} // namespace GNAPluginNS

View File

@@ -1,4 +1,4 @@
// Copyright (C) 2018-2020 Intel Corporation
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
@@ -651,10 +651,10 @@ std::vector<std::pair<CNNLayerPtr, int> > CNNNetGetPrevLayersSkip(CNNLayerPtr or
* @brief remove given layer from topology, currently only layers with one input data and one output data supported
*/
inline void CNNNetworkRemoveLayer(CNNLayerPtr layer, bool checkDims = true) {
gnalog() << "Removing " << layer->name << " layer\n";
if (!layer) {
THROW_IE_EXCEPTION << "Cannot remove layer pointed to NULL";
}
gnalog() << "Removing " << layer->name << " layer\n";
if (layer->insData.size() != 1) {
THROW_IE_EXCEPTION << "Cannot remove layer : "<< layer->name <<" that has not 1 input";
}

View File

@@ -1,4 +1,4 @@
// Copyright (C) 2018-2020 Intel Corporation
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
@@ -112,14 +112,16 @@ GNAPluginNS::HeaderLatest::ModelHeader GNAModelSerial::ReadHeader(std::istream &
header = HeaderLatest::ModelHeader(tempHeader2dot1);
break;
case 2:
case 3: {
Header2dot3::ModelHeader tempHeader2dot3;
readBits(tempHeader2dot3, is);
header = HeaderLatest::ModelHeader(tempHeader2dot3);
case 3:
case 4:
{
Header2dot4::ModelHeader tempHeader2dot4;
readBits(tempHeader2dot4, is);
header = HeaderLatest::ModelHeader(tempHeader2dot4);
break;
}
case 4:
readNBytes(&header, sizeof(Header2dot4::ModelHeader), is);
case 5:
readNBytes(&header, sizeof(Header2dot5::ModelHeader), is);
break;
default:
THROW_GNA_EXCEPTION << "Imported file unsupported. minor version should have values in range 1 to 4 and is: " << header.version.minor;
@@ -172,7 +174,9 @@ void GNAModelSerial::Import(void *basePointer,
std::shared_ptr<GNAPluginNS::InputDesc> inputsDesc,
std::vector<GNAPluginNS::OutputDesc> &desc,
InferenceEngine::InputsDataMap& inputsDataMap,
InferenceEngine::OutputsDataMap& outputsDataMap) {
InferenceEngine::OutputsDataMap& outputsDataMap,
TranspositionInfoMap& inputsTranspositionInfo,
TranspositionInfoMap& outputsTranspositionInfo) {
is.exceptions(std::istream::failbit);
if (modelHeader.version.major == 2) {
@@ -185,6 +189,20 @@ void GNAModelSerial::Import(void *basePointer,
inputNames.push_back(inName.substr(0, nameSize - 1));
}
}
if (modelHeader.version.minor >= 5) {
for (int inputIx = 0; inputIx < modelHeader.nTransposeInputs; ++inputIx) {
std::string inputName;
std::vector<TranspositionInfo> transpositionInfo;
ImportTranspositionInfo(is, inputName, transpositionInfo);
inputsTranspositionInfo[inputName] = transpositionInfo;
}
for (int outputIx = 0; outputIx < modelHeader.nTransposeOutputs; ++outputIx) {
std::string outputName;
std::vector<TranspositionInfo> transpositionInfo;
ImportTranspositionInfo(is, outputName, transpositionInfo);
outputsTranspositionInfo[outputName] = transpositionInfo;
}
}
}
ImportInputs(is, basePointer, inputsDesc, inputsDataMap);
@@ -322,6 +340,7 @@ void GNAModelSerial::Export(void * basePointer, size_t gnaGraphSize, std::ostrea
out.orientation = ep.orientation;
return out;
};
/**
* writing header
*/
@@ -336,12 +355,8 @@ void GNAModelSerial::Export(void * basePointer, size_t gnaGraphSize, std::ostrea
header.nGroup = guessGrouping(*gna2Model);
header.nInputs = inputs.size();
header.nOutputs = outputs.size();
header.nRotateRows = nRotateRows;
header.nRotateColumns = nRotateColumns;
header.doRotateInput = doRotateInput;
header.nRotateOutputRows = nRotateOutputRows;
header.nRotateOutputColumns = nRotateOutputColumns;
header.doRotateOutput = doRotateOutput;
header.nTransposeInputs = transposeInputsInfo.size();
header.nTransposeOutputs = transposeOutputsInfo.size();
writeBits(header, os);
@@ -350,6 +365,8 @@ void GNAModelSerial::Export(void * basePointer, size_t gnaGraphSize, std::ostrea
writeBits(static_cast<uint32_t>(nameSize), os);
writeNBytes(name.c_str(), nameSize , os);
}
ExportTranspositionInfo(os, transposeInputsInfo);
ExportTranspositionInfo(os, transposeOutputsInfo);
for (const auto &input : inputs) {
writeBits(convert_to_serial(input), os);
}
@@ -416,9 +433,27 @@ void GNAModelSerial::Import(void *basePointer,
std::shared_ptr<GNAPluginNS::InputDesc> inputsDesc,
std::vector<GNAPluginNS::OutputDesc> &desc,
InferenceEngine::InputsDataMap& inputsDataMap,
InferenceEngine::OutputsDataMap& outputsDataMap) {
InferenceEngine::OutputsDataMap& outputsDataMap,
TranspositionInfoMap& inputsTranspositionInfo,
TranspositionInfoMap& outputsTranspositionInfo) {
is.exceptions(std::istream::failbit);
if (modelHeader.version.major == 2) {
if (modelHeader.version.minor >= 5) {
for (int inputIx = 0; inputIx < modelHeader.nTransposeInputs; ++inputIx) {
std::string inputName;
std::vector<TranspositionInfo> transpositionInfo;
ImportTranspositionInfo(is, inputName, transpositionInfo);
inputsTranspositionInfo[inputName] = transpositionInfo;
}
for (int outputIx = 0; outputIx < modelHeader.nTransposeOutputs; ++outputIx) {
std::string outputName;
std::vector<TranspositionInfo> transpositionInfo;
ImportTranspositionInfo(is, outputName, transpositionInfo);
outputsTranspositionInfo[outputName] = transpositionInfo;
}
}
}
ImportInputs(is, basePointer, inputsDesc, inputsDataMap);
ImportOutputs(is, basePointer, desc, outputsDataMap);
@@ -579,6 +614,7 @@ void GNAModelSerial::Export(void * basePointer, size_t gnaGraphSize, std::ostrea
out.orientation = ep.orientation;
return out;
};
/**
* writing header
*/
@@ -595,9 +631,11 @@ void GNAModelSerial::Export(void * basePointer, size_t gnaGraphSize, std::ostrea
header.nInputs = 1;
header.nOutputs = 1;
header.headerSize = sizeof(HeaderLatest::ModelHeader);
header.nRotateRows = nRotateRows;
header.nRotateColumns = nRotateColumns;
header.nTransposeInputs = transposeInputsInfo.size();
header.nTransposeOutputs = transposeOutputsInfo.size();
ExportTranspositionInfo(os, transposeInputsInfo);
ExportTranspositionInfo(os, transposeOutputsInfo);
writeBits(header, os);
writeBits(convert_to_serial(inputs[0]), os);
@@ -785,6 +823,36 @@ void GNAModelSerial::ImportOutputs(std::istream &is,
}
}
/**
 * @brief reads one (name, transposition fragments) record from the imported model stream.
 * Wire layout (matches ExportTranspositionInfo): uint32 name size, raw name bytes,
 * uint32 fragment count, then raw TranspositionInfo structures.
 * @param is input stream positioned at the record
 * @param name receives the layer name read from the stream
 * @param transpositionInfo receives the transposition fragments appended in stream order
 */
void GNAModelSerial::ImportTranspositionInfo(std::istream &is,
        std::string &name,
        std::vector<TranspositionInfo> &transpositionInfo) {
    uint32_t nameSize = 0;
    readNBits<32>(nameSize, is);
    name.resize(nameSize, '\0');
    readNBytes(&name[0], nameSize, is);
    // NOTE(review): the exported size includes the trailing '\0', so 'name' keeps an embedded
    // terminator here, while the input-name import path strips it (substr(0, nameSize - 1)) -
    // confirm the transposition map keys are matched consistently against layer names
    uint32_t transposeFragmentsSize = 0;
    readNBits<32>(transposeFragmentsSize, is);
    // unsigned index matches transposeFragmentsSize and avoids a signed/unsigned comparison
    for (uint32_t rotFragmIx = 0; rotFragmIx < transposeFragmentsSize; ++rotFragmIx) {
        TranspositionInfo fragmentTranspositionInfo;
        readNBytes(&fragmentTranspositionInfo, sizeof(TranspositionInfo), is);
        transpositionInfo.push_back(fragmentTranspositionInfo);
    }
}
/**
 * @brief writes all (name, transposition fragments) records to the exported model stream.
 * Wire layout per record (matches ImportTranspositionInfo): uint32 name size (including
 * the '\0' terminator), the name bytes with terminator, uint32 fragment count, then raw
 * TranspositionInfo structures.
 * @param os output stream
 * @param transpositionInfoMap transposition fragments per layer name
 */
void GNAModelSerial::ExportTranspositionInfo(std::ostream &os,
        const TranspositionInfoMap &transpositionInfoMap) const {
    for (const auto &transpositionInfo : transpositionInfoMap) {
        // +1 serializes the terminating '\0' together with the name;
        // std::string::size() replaces the redundant strlen(c_str()) scan
        auto nameSize = transpositionInfo.first.size() + 1;
        writeBits(static_cast<uint32_t>(nameSize), os);
        writeNBytes(transpositionInfo.first.c_str(), nameSize, os);
        auto fragmentsNum = transpositionInfo.second.size();
        writeBits(static_cast<uint32_t>(fragmentsNum), os);
        for (const auto &transposeFragmentInfo : transpositionInfo.second) {
            writeNBytes(&transposeFragmentInfo, sizeof(TranspositionInfo), os);
        }
    }
}
// Stores the parsed model header so subsequent Import calls can branch on its version fields
// (e.g. the minor >= 5 transposition-info sections).
void GNAModelSerial::setHeader(HeaderLatest::ModelHeader header) {
    modelHeader = header;
}

View File

@@ -1,4 +1,4 @@
// Copyright (C) 2018-2020 Intel Corporation
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
@@ -34,12 +34,8 @@ private:
std::vector<GNAPluginNS::HeaderLatest::RuntimeEndPoint> outputs;
std::vector<std::string> inputNames;
std::vector<std::string> outputNames;
uint32_t nRotateRows = 0;
uint32_t nRotateColumns = 0;
bool doRotateInput = false;
uint32_t nRotateOutputRows = 0;
uint32_t nRotateOutputColumns = 0;
bool doRotateOutput = false;
TranspositionInfoMap transposeInputsInfo;
TranspositionInfoMap transposeOutputsInfo;
MemoryType states, *pstates = nullptr;
GNAPluginNS::HeaderLatest::ModelHeader modelHeader;
@@ -54,6 +50,13 @@ private:
std::vector<GNAPluginNS::OutputDesc> &desc,
InferenceEngine::OutputsDataMap& dataMap);
void ImportTranspositionInfo(std::istream &is,
std::string &name,
std::vector<TranspositionInfo> &transpositionInfo);
void ExportTranspositionInfo(std::ostream &os,
const TranspositionInfoMap &transpositionInfoMap) const;
public:
#if GNA_LIB_VER == 2
GNAModelSerial(Gna2Model * model, MemoryType & states_holder)
@@ -105,17 +108,13 @@ private:
}
#endif
GNAModelSerial & SetInputRotation(uint32_t nRotateRows, uint32_t nRotateColumns, bool do_rotate_inputs) {
this->nRotateColumns = nRotateColumns;
this->nRotateRows = nRotateRows;
this->doRotateInput = do_rotate_inputs;
GNAModelSerial & SetInputRotation(const TranspositionInfoMap &transposeInputsInfo) {
this->transposeInputsInfo = transposeInputsInfo;
return *this;
}
GNAModelSerial& SetOutputRotation(uint32_t nRotateOutputRows, uint32_t nRotateOutputColumns, bool do_rotate_outputs) {
this->nRotateOutputColumns = nRotateOutputColumns;
this->nRotateOutputRows = nRotateOutputRows;
this->doRotateOutput = do_rotate_outputs;
GNAModelSerial& SetOutputRotation(const TranspositionInfoMap &transposeOutputsInfo) {
this->transposeOutputsInfo = transposeOutputsInfo;
return *this;
}
@@ -145,12 +144,14 @@ private:
* @param is - stream without header structure - TBD heder might be needed
*/
void Import(void *basePointer,
size_t gnaGraphSize,
std::istream & is,
std::shared_ptr<GNAPluginNS::InputDesc> inputsDesc,
std::vector<GNAPluginNS::OutputDesc> &desc,
InferenceEngine::InputsDataMap& inputsDataMap,
InferenceEngine::OutputsDataMap& outputsDataMap);
size_t gnaGraphSize,
std::istream & is,
std::shared_ptr<GNAPluginNS::InputDesc> inputsDesc,
std::vector<GNAPluginNS::OutputDesc> &desc,
InferenceEngine::InputsDataMap& inputsDataMap,
InferenceEngine::OutputsDataMap& outputsDataMap,
TranspositionInfoMap& inputstranspositionInfo,
TranspositionInfoMap& outputstranspositionInfo);
/**
* save gna graph to an outpus stream

View File

@@ -1,4 +1,4 @@
// Copyright (C) 2018-2020 Intel Corporation
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
@@ -38,6 +38,7 @@
#include "gna_model_serial.hpp"
#include "runtime/gna_float_runtime.hpp"
#include <layers/gna_fake_quantize_layer.hpp>
#include "gna_graph_patterns.hpp"
#include <generic_ie.hpp>
#include <ngraph/pass/manager.hpp>
@@ -431,6 +432,197 @@ void GNAPlugin::UpdateInputScaleFromNetwork(InferenceEngine::CNNNetwork & networ
}
}
/**
 * @brief transposes fragments of a flat tensor buffer in place according to transpositionInfo,
 * converting data between NCHW and NHWC ordering.
 * The buffer is treated as a rows x columns matrix of elements of 'precision' bytes; each
 * TranspositionInfo part describes one contiguous fragment along the opposite axis.
 * @param precision element size in bytes
 * @param rows number of matrix rows (e.g. weights rows)
 * @param columns number of matrix columns (e.g. weights columns)
 * @param buffer data to transpose, overwritten in place when any part is transposed
 * @param transpose_rows true to reorder data within each row, false to reorder whole rows
 * @param transpositionInfo per-fragment transposition descriptors; fragment sizes are assumed
 *        to add up to the transposed axis length (callers validate this - see the checks in
 *        ConvertModelLayoutFromNCHWToNHWC)
 */
static void TransposeTensorFromNCHWToNHWC(size_t precision, size_t rows, size_t columns, uint8_t* buffer, bool transpose_rows,
                                          const std::vector<TranspositionInfo> &transpositionInfo) {
    size_t weightsTotalSize = rows * columns * precision;
    // Scratch copy: the result is assembled here and copied back only if something was transposed
    std::vector<uint8_t> transposedWeights(weightsTotalSize);
    size_t weightsPartOffset = 0;
    bool transposed = false;
    for (const auto &transpositionInfoPart : transpositionInfo) {
        // Number of elements in this fragment (rows x columns of the fragment's 2D view)
        auto partSize = transpositionInfoPart.num_transpose_rows * transpositionInfoPart.num_transpose_columns;
        size_t weightsPartSize = partSize * precision * (transpose_rows ? rows : columns);
        if (transpositionInfoPart.transpose &&
            transpositionInfoPart.num_transpose_rows != 1 &&
            transpositionInfoPart.num_transpose_columns != 1) {
            if (transpose_rows) {
                // Transpose the fragment independently inside every matrix row
                // NOTE(review): int loop indices compared with size_t bounds - fine for realistic
                // sizes but triggers signed/unsigned warnings
                for (int weightsRowIx = 0; weightsRowIx < rows; ++weightsRowIx) {
                    auto weightsRowsOffset = weightsRowIx * partSize * precision;
                    auto cbuffer = buffer + weightsPartOffset + weightsRowsOffset;
                    auto weights_ptr = transposedWeights.data() + weightsPartOffset + weightsRowsOffset;
                    for (int colsIx = 0; colsIx < transpositionInfoPart.num_transpose_columns; ++colsIx) {
                        for (int rowIx = 0; rowIx < transpositionInfoPart.num_transpose_rows; ++rowIx) {
                            // Element-by-element transposed copy: (rowIx, colsIx) -> (colsIx, rowIx)
                            auto offsetWrite = (colsIx * transpositionInfoPart.num_transpose_rows + rowIx) * precision;
                            auto offsetRead = (transpositionInfoPart.num_transpose_columns * rowIx + colsIx) * precision;
                            ie_memcpy(weights_ptr + offsetWrite, weightsPartSize - weightsRowsOffset - offsetWrite,
                                cbuffer + offsetRead, precision);
                        }
                    }
                }
            } else {
                // Reorder whole matrix rows (each of 'columns' elements) inside the fragment
                auto cbuffer = buffer + weightsPartOffset;
                auto weights_ptr = transposedWeights.data() + weightsPartOffset;
                for (int colsIx = 0; colsIx < transpositionInfoPart.num_transpose_columns; ++colsIx) {
                    for (int rowIx = 0; rowIx < transpositionInfoPart.num_transpose_rows; ++rowIx) {
                        auto offsetWrite = (colsIx * transpositionInfoPart.num_transpose_rows + rowIx) * columns * precision;
                        auto offsetRead = (transpositionInfoPart.num_transpose_columns * rowIx + colsIx) * columns * precision;
                        ie_memcpy(weights_ptr + offsetWrite, weightsPartSize - offsetWrite, cbuffer + offsetRead, columns * precision);
                    }
                }
            }
            transposed = true;
        } else {
            // Just copy data which should not be transposed
            ie_memcpy(transposedWeights.data() + weightsPartOffset,
                weightsPartSize,
                buffer + weightsPartOffset,
                weightsPartSize);
        }
        weightsPartOffset += weightsPartSize;
    }
    if (transposed) {
        // Commit the reordered data back into the caller's buffer
        ie_memcpy(buffer, weightsTotalSize, transposedWeights.data(), weightsTotalSize);
    }
}
/**
 * @brief prepares a native-NCHW model for the NHWC-ordered GNA execution: collects
 * transposition info for network inputs/outputs (stored in transpose_inputs_info /
 * transpose_outputs_info for use at inference and serialization time) and transposes
 * the weights/biases of scaleshift, fullyconnected and const-fed eltwise layers in place.
 * @param layers topologically sorted layers of the network
 */
void GNAPlugin::ConvertModelLayoutFromNCHWToNHWC(const std::vector<CNNLayerPtr> &layers) {
    // Debug helper: dumps every transposition fragment to the GNA log
    auto printTranspositionInfo = [](const std::vector<TranspositionInfo> &transpositionInfo) {
        for (const auto &transpositionInfoPart : transpositionInfo) {
            gnalog() << "transpose=" << transpositionInfoPart.transpose << " rows_num=" << transpositionInfoPart.num_transpose_rows
                     << " columns_num=" << transpositionInfoPart.num_transpose_columns << "\n";
        }
    };
    // True when at least one fragment actually requires a transposition
    auto foundPartToTranspose = [](const std::vector<TranspositionInfo> &transpositionInfo) {
        auto partToTranspose = std::find_if(std::begin(transpositionInfo), std::end(transpositionInfo),
            [](const TranspositionInfo &infoPart) { return infoPart.transpose; });
        return partToTranspose != std::end(transpositionInfo);
    };
    for (auto& l : layers) {
        // Collect information for inputs transposition
        if (LayerInfo(l).isInput()) {
            auto transpositionInfo = FindTranspositionInfoFromNextLayers(l);
            if (!transpositionInfo.empty()) {
                transpose_inputs_info.insert({l->name, transpositionInfo});
                gnalog() << "Input " << l->name << " transposition info: \n";
                printTranspositionInfo(transpositionInfo);
            }
        }
        // Collect information for outputs transposition
        if (LayerInfo(l).isOutput()) {
            auto transpositionInfo = FindTranspositionInfoFromPrevLayers(l);
            if (!transpositionInfo.empty()) {
                // Swap transposition info rows and columns since we need to transpose output back from NHWC to NCHW
                for (auto && transpositionInfoPart : transpositionInfo) {
                    if (transpositionInfoPart.transpose) {
                        std::swap(transpositionInfoPart.num_transpose_rows, transpositionInfoPart.num_transpose_columns);
                    }
                }
                transpose_outputs_info.insert({l->name, transpositionInfo});
                gnalog() << "Output " << l->name << " transposition info: \n";
                printTranspositionInfo(transpositionInfo);
            }
        }
        // Transpose weights
        if (LayerInfo(l).isScaleShift()) {
            std::vector<TranspositionInfo> transpositionInfo;
            // Try to find a convolution in previous layers
            if (InferenceEngine::CNNNetHasPrevLayer(l.get())) {
                transpositionInfo = FindTranspositionInfoFromPrevLayers(InferenceEngine::CNNNetPrevLayer(l));
                // If no convolutions are found try to find them in next layers
                if (!foundPartToTranspose(transpositionInfo)) {
                    transpositionInfo = FindTranspositionInfoFromNextLayers(getInputTo(l->outData[0]).begin()->second);
                }
            }
            if (!transpositionInfo.empty()) {
                auto weightable = dynamic_cast<WeightableLayer*>(l.get());
                IE_ASSERT(weightable != nullptr);
                // ScaleShift weights/biases are per-channel vectors, so they are reordered as a single row
                TransposeTensorFromNCHWToNHWC(weightable->precision.size(), 1, weightable->_weights->size(),
                    weightable->_weights->cbuffer().as<uint8_t*>(), true, transpositionInfo);
                if (weightable->_biases) {
                    TransposeTensorFromNCHWToNHWC(weightable->precision.size(), 1, weightable->_biases->size(),
                        weightable->_biases->cbuffer().as<uint8_t*>(), true, transpositionInfo);
                }
                gnalog() << l->name << " weights and biases rows transposition info:\n";
                printTranspositionInfo(transpositionInfo);
            }
        }
        if (LayerInfo(l).isFullyConnected()) {
            auto weightable = dynamic_cast<WeightableLayer*>(l.get());
            IE_ASSERT(weightable != nullptr);
            auto precision = weightable->precision.size();
            auto out_dims = l->outData[0]->getDims();
            auto in_dims = l->input()->getDims();
            // FC weights matrix is (output size) x (input size), batch dimension excluded
            auto weightsRows = InferenceEngine::details::product(std::begin(out_dims) + 1, std::end(out_dims));
            auto weightsColumns = InferenceEngine::details::product(std::begin(in_dims) + 1, std::end(in_dims));
            // Find a convolution in previous layers to rotate weights rows
            if (InferenceEngine::CNNNetHasPrevLayer(l.get())) {
                auto transpositionInfo = FindTranspositionInfoFromPrevLayers(InferenceEngine::CNNNetPrevLayer(l));
                if (!transpositionInfo.empty()) {
                    // Sanity check: fragment sizes must add up to the FC input size
                    size_t totalColumns = 0;
                    for (auto && transpositionInfoPart : transpositionInfo) {
                        totalColumns += transpositionInfoPart.num_transpose_rows * transpositionInfoPart.num_transpose_columns;
                    }
                    if (weightsColumns != totalColumns) {
                        THROW_GNA_EXCEPTION << l->name << " weights columns from transposition info (" << totalColumns
                                            << ") don't match input dimensions (" << weightsColumns << ")";
                    }
                    TransposeTensorFromNCHWToNHWC(precision, weightsRows, weightsColumns, weightable->_weights->cbuffer().as<uint8_t*>(),
                        true, transpositionInfo);
                    gnalog() << l->name << " weights rows transposition info:\n";
                    printTranspositionInfo(transpositionInfo);
                }
            }
            // Find a convolution in next layers to rotate weights columns
            if (!l->outData.empty() && !getInputTo(l->outData[0]).empty()) {
                auto transpositionInfo = FindTranspositionInfoFromNextLayers(getInputTo(l->outData[0]).begin()->second);
                if (!transpositionInfo.empty()) {
                    // Sanity check: fragment sizes must add up to the FC output size
                    size_t totalRows = 0;
                    for (const auto& transpositionInfoPart : transpositionInfo) {
                        totalRows += transpositionInfoPart.num_transpose_rows * transpositionInfoPart.num_transpose_columns;
                    }
                    if (weightsRows != totalRows) {
                        // NOTE(review): this message is missing a space after the layer name
                        THROW_GNA_EXCEPTION << l->name << "weights rows from transposition info (" << totalRows
                                            << ") don't match output dimensions (" << weightsRows << ")";
                    }
                    TransposeTensorFromNCHWToNHWC(precision, weightsRows, weightsColumns, weightable->_weights->cbuffer().as<uint8_t*>(),
                        false, transpositionInfo);
                    gnalog() << l->name << " weights columns transposition info:\n";
                    printTranspositionInfo(transpositionInfo);
                }
            }
        }
        if (LayerInfo(l).isEltwise()) {
            // We need to transpose a constant which is an eltwise input
            auto firstInput = InferenceEngine::CNNNetPrevLayer(l, 0);
            auto secondInput = InferenceEngine::CNNNetPrevLayer(l, 1);
            if (!LayerInfo(firstInput).isConst() && !LayerInfo(secondInput).isConst()) {
                continue;
            }
            // Let a constant to be the second input
            if (LayerInfo(firstInput).isConst()) {
                std::swap(firstInput, secondInput);
            }
            // Find a convolution in previous or next layers
            auto transpositionInfo = FindTranspositionInfoFromPrevLayers(firstInput);
            if (!foundPartToTranspose(transpositionInfo)) {
                transpositionInfo = FindTranspositionInfoFromNextLayers(getInputTo(l->outData[0]).begin()->second);
            }
            if (!transpositionInfo.empty()) {
                // The const data blob is reordered as a single row, same as scaleshift weights
                auto blob = secondInput->blobs["custom"];
                TransposeTensorFromNCHWToNHWC(blob->getTensorDesc().getPrecision().size(), 1, blob->size(),
                    blob->buffer().as<uint8_t*>(), true, transpositionInfo);
                gnalog() << l->name << " data transposition info:\n";
                printTranspositionInfo(transpositionInfo);
            }
        }
    }
}
void GNAPlugin::LoadNetwork(CNNNetwork & _network) {
std::shared_ptr<InferenceEngine::details::CNNNetworkImpl> convertedNetwork;
if (_network.getFunction()) {
@@ -474,6 +666,11 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) {
UpdateGnaQuantModeFromNetwork(network);
UpdateInputScaleFromNetwork(network);
auto layers = details::CNNNetSortTopologically(network);
if (MustBeConvertedFromNCHWToNHWC(layers)) {
ConvertModelLayoutFromNCHWToNHWC(layers);
}
// network optimisation phases
int passIdx = 0;
auto run_passes = [&] (const CNNNetwork& network, bool runBeforeCopy) {
@@ -561,7 +758,7 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) {
#ifdef PLOT
std::ofstream file("gna_passes.dot");
saveGraphToDot(*newNet, file, [](const CNNLayerPtr layer,
saveGraphToDot(newNet, file, [](const CNNLayerPtr layer,
ordered_properties &printed_properties,
ordered_properties &node_properties) {
// printing quantized params
@@ -873,13 +1070,12 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) {
}
}
do_rotate_input = dnn->do_rotate_input;
num_rotate_rows = dnn->num_rotate_rows;
num_rotate_columns = dnn->num_rotate_columns;
do_rotate_output = dnn->do_rotate_output;
num_rotate_output_rows = dnn->num_rotate_output_rows;
num_rotate_output_columns = dnn->num_rotate_output_columns;
if (dnn->do_rotate_input && transpose_inputs_info.empty()) {
for (auto &inputLayer : inputLayers) {
transpose_inputs_info.insert({inputLayer->name,
{TranspositionInfo{dnn->do_rotate_input, dnn->num_rotate_rows, dnn->num_rotate_columns}}});
}
}
DumpXNNToFile();
@@ -962,33 +1158,6 @@ void GNAPlugin::DumpXNNToFile() const {
#endif
}
// Transposes each feature vector in-place: the vector is treated as a
// (num_rotate_rows x num_rotate_columns) row-major matrix and rewritten
// column-major. ptr_feat points at num_feature_vectors consecutive vectors of
// num_feature_vector_elements elements, element_size bytes each.
// Throws if rows*columns does not cover the vector length.
void RotateFeatures(uint8_t *ptr_feat,
                    size_t element_size,
                    uint32_t num_feature_vectors,
                    uint32_t num_feature_vector_elements,
                    uint32_t num_rotate_rows,
                    uint32_t num_rotate_columns) {
    if (num_feature_vector_elements != num_rotate_rows * num_rotate_columns) {
        THROW_GNA_EXCEPTION << "Rotate dimensions (" << num_rotate_rows << "," << num_rotate_columns
            <<") do not match buffer length of "<< num_feature_vector_elements <<" in RotateFeatures()!";
    }
    std::vector<uint8_t> temp(num_feature_vector_elements * element_size);
    for (uint32_t k = 0; k < num_feature_vectors; k++) {
        uint8_t *ptr_in = ptr_feat + k * num_feature_vector_elements * element_size;
        for (uint32_t i = 0; i < num_rotate_rows; i++) {
            for (uint32_t j = 0; j < num_rotate_columns; j++) {
                // Source element (i, j) of the row-major layout lands at (j, i)
                // of the column-major layout. Bugfix: the remaining-destination-
                // size argument must be derived from the *destination* offset;
                // the original code derived it from the source offset, so the
                // bound passed to ie_memcpy was wrong for individual elements.
                const size_t dst_offset = (j * num_rotate_rows + i) * element_size;
                const size_t src_offset = (i * num_rotate_columns + j) * element_size;
                ie_memcpy(&temp.front() + dst_offset,
                          temp.size() - dst_offset,
                          ptr_in + src_offset,
                          element_size);
            }
        }
        ie_memcpy(ptr_in, num_feature_vector_elements * element_size,
                  &temp.front(), num_feature_vector_elements * element_size);
    }
}
uint32_t GNAPlugin::QueueInference(const InferenceEngine::BlobMap &inputs, InferenceEngine::BlobMap &result) {
#if GNA_LIB_VER == 2
auto& nnets = gnaRequestConfigToRequestIdMap;
@@ -1079,30 +1248,20 @@ uint32_t GNAPlugin::QueueInference(const InferenceEngine::BlobMap &inputs, Infer
importedElements,
importedElements);
bool CNN2DAtInput = input.second->getTensorDesc().getLayout() == Layout::NCHW && inputOrientation == kDnnNonInterleavedOrientation;
bool isOneChannel = input.second->getTensorDesc().getDims()[1] == 1;
if (do_rotate_input && ((inputLayout == Layout::NC)
!= (inputOrientation == kDnnInterleavedOrientation))
&& !isOneChannel
&& !CNN2DAtInput) {
RotateFeatures(reinterpret_cast<uint8_t *>(inputsDesc->getPtrInputsGlobal(input.first)[idx]),
gnadevice ? 2 : 4,
// TODO: only works for cnn4a and google command so far
dims[0],
InferenceEngine::details::product(dims) / dims[0],
num_rotate_rows,
num_rotate_columns);
}
if (CNN2DAtInput) {
auto dims = input.second->getTensorDesc().getDims();
auto hwDim = dims[2] * dims[3];
auto chanelsDim = dims[1];
RotateFeatures(reinterpret_cast<uint8_t*>(inputsDesc->getPtrInputsGlobal(input.first)[idx]),
gnadevice ? 2 : 4,
dims[0],
chanelsDim* hwDim,
chanelsDim,
hwDim);
auto transpose_info = transpose_inputs_info.find(input.first);
if (transpose_info != std::end(transpose_inputs_info)) {
size_t batchSize = dims[0];
size_t elementsPerBatch = InferenceEngine::details::product(dims) / dims[0];
size_t transposed_data_size = 0;
for (const auto &part_transposition_info : transpose_info->second) {
transposed_data_size += part_transposition_info.num_transpose_rows * part_transposition_info.num_transpose_columns;
}
if (elementsPerBatch != transposed_data_size) {
THROW_GNA_EXCEPTION << "Transposed data size (" << transposed_data_size
<< ") do not match input buffer length of " << elementsPerBatch;
}
auto input_ptr = reinterpret_cast<uint8_t *>(inputsDesc->getPtrInputsGlobal(input.first)[idx]);
TransposeTensorFromNCHWToNHWC(gnadevice ? 2 : 4, batchSize, elementsPerBatch, input_ptr, true, transpose_info->second);
}
++inputNum;
}
@@ -1191,24 +1350,22 @@ GnaWaitStatus GNAPlugin::WaitFor(uint32_t request_idx, int64_t millisTimeout) {
* exportOutputDims[exportOutputDims.size() - 2]
* exportOutputDims[exportOutputDims.size() - 3];
if (do_rotate_output) {
if (batchSize * elementsPerBatch != num_rotate_output_columns * num_rotate_output_rows) {
THROW_GNA_EXCEPTION << "Rotate output dimensions (" << num_rotate_output_rows << "," << num_rotate_output_columns
<< ") do not match output buffer length of " << batchSize * elementsPerBatch;
auto transpose_output_info = transpose_outputs_info.find(outputBlobIt.first);
if (transpose_output_info != std::end(transpose_outputs_info)) {
size_t transposed_data_size = 0;
for (const auto &part_transposition_info : transpose_output_info->second) {
transposed_data_size += part_transposition_info.num_transpose_rows * part_transposition_info.num_transpose_columns;
}
uint32_t element_size = outputDesc.num_bytes_per_element;
std::vector<uint8_t> temp(num_rotate_output_columns * num_rotate_output_rows * element_size);
for (uint32_t k = 0; k < num_rotate_output_columns; ++k) {
uint8_t* ptr_in = reinterpret_cast<uint8_t*>(outputDesc.ptrs[request_idx]) + k * element_size;
for (uint32_t i = 0; i < num_rotate_output_rows; ++i) {
ie_memcpy(&temp.front() + (k *num_rotate_output_rows + i) * element_size,
element_size,
ptr_in + (i * num_rotate_output_columns) * element_size,
element_size);
}
if (elementsPerBatch != transposed_data_size) {
THROW_GNA_EXCEPTION << "Transposed data size (" << transposed_data_size
<< ") do not match output buffer length of " << elementsPerBatch;
}
ie_memcpy(outputDesc.ptrs[request_idx], num_rotate_output_columns * num_rotate_output_rows * element_size,
&temp.front(), num_rotate_output_columns * num_rotate_output_rows * element_size);
TransposeTensorFromNCHWToNHWC(outputDesc.num_bytes_per_element,
batchSize,
elementsPerBatch,
reinterpret_cast<uint8_t*>(outputDesc.ptrs[request_idx]),
true,
transpose_output_info->second);
}
ExportScores(outputBlob->buffer(),
@@ -1364,7 +1521,9 @@ InferenceEngine::ExecutableNetwork GNAPlugin::ImportNetwork(std::istream& networ
inputsDesc,
outputsDesc,
inputsDataMap,
outputsDataMap);
outputsDataMap,
transpose_inputs_info,
transpose_outputs_info);
#if GNA_LIB_VER == 2
auto getOrientation = [](Gna2Operation & gnaOperation) {
@@ -1383,13 +1542,16 @@ InferenceEngine::ExecutableNetwork GNAPlugin::ImportNetwork(std::istream& networ
outputsDesc[0].orientation = getOrientation(std::get<0>(nnets.back())->obj.pLayers[std::get<0>(nnets.back())->obj.nLayers - 1]);
#endif
do_rotate_input = header.doRotateInput;
num_rotate_rows = header.nRotateRows;
num_rotate_columns = header.nRotateColumns;
do_rotate_output = header.doRotateOutput;
num_rotate_output_rows = header.nRotateOutputRows;
num_rotate_output_columns = header.nRotateOutputColumns;
if (header.doRotateInput) {
for (auto && input : inputsDataMap) {
transpose_inputs_info.insert({input.first, {{header.doRotateInput, header.nRotateRows, header.nRotateColumns}}});
}
}
if (header.doRotateOutput) {
for (auto && output : outputsDataMap) {
transpose_outputs_info.insert({output.first, {{header.doRotateOutput, header.nRotateOutputRows, header.nRotateOutputColumns}}});
}
}
for (auto && memory : mt) {
GNAMemoryLayer memoryLayer(nullptr, nullptr, gnaFlags->sw_fp32 ? 4 : 2);
@@ -1441,8 +1603,8 @@ void GNAPlugin::Export(const std::string &fileName) {
outputsDesc,
inputsDataMap,
outputsDataMap)
.SetInputRotation(dnn->num_rotate_rows, dnn->num_rotate_columns, dnn->do_rotate_input)
.SetOutputRotation(dnn->num_rotate_output_rows, dnn->num_rotate_output_columns, dnn->do_rotate_output);
.SetInputRotation(transpose_inputs_info)
.SetOutputRotation(transpose_outputs_info);
for (auto && memoryConnection : graphCompiler.memory_connection) {
serial.AddState(memoryConnection.second.gna_ptr, memoryConnection.second.reserved_size);

View File

@@ -1,4 +1,4 @@
// Copyright (C) 2018-2020 Intel Corporation
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
@@ -55,12 +55,8 @@ class GNAPlugin : public InferenceEngine::IInferencePlugin {
#if GNA_LIB_VER == 2
uint32_t activeLayerIndex = 0xffffffff;
#endif
bool do_rotate_input = false;
uint32_t num_rotate_rows = 0;
uint32_t num_rotate_columns = 0;
bool do_rotate_output = false;
uint32_t num_rotate_output_rows = 0;
uint32_t num_rotate_output_columns = 0;
TranspositionInfoMap transpose_inputs_info;
TranspositionInfoMap transpose_outputs_info;
uint32_t *ptr_active_indices = nullptr;
uint32_t num_active_indices = 0;
uint32_t num_group_in = 0;
@@ -224,6 +220,14 @@ class GNAPlugin : public InferenceEngine::IInferencePlugin {
void UpdateFieldsFromConfig();
void UpdateGnaQuantModeFromNetwork(InferenceEngine::CNNNetwork &);
void UpdateInputScaleFromNetwork(InferenceEngine::CNNNetwork &);
/**
* @brief Converts a model from NCHW to NHWC. It fills inputs and outputs transposition info and
* changes weights order for affine, eltwise and scaleshift layers. Information for transposition
* is found from convolution/pooling input or output dimensions.
* @param layers model sorted layers
*/
void ConvertModelLayoutFromNCHWToNHWC(const std::vector<InferenceEngine::CNNLayerPtr> &layers);
};
} // namespace GNAPluginNS

View File

@@ -38,6 +38,7 @@
#include "gna_upstream_iterator.hpp"
#include "frontend/quantization.h"
#include "gna_groups.hpp"
#include "gna_graph_patterns.hpp"
using namespace InferenceEngine;
using namespace InferenceEngine::details;
@@ -635,59 +636,19 @@ void RemovePermutationsNHWCToNCHWPass::run() {
continue;
}
if (l->outData.size() != 1) {
continue;
CNNLayerPtr prev, next;
std::tie(prev, next) = FindPermutationsAroundConvolutionInNHWCModel(l);
if (prev == nullptr || next == nullptr) continue;
if (LayerInfo(prev).isPermute() && getPassManager()->getPolicy().NHWCToNCHWPolicy == Policy::NHWCToNCHW::REMOVE_ALL) {
permutations_to_remove.insert(prev);
}
if (getInputTo(l->outData.front()).empty()) {
continue;
if (LayerInfo(next).isPermute()) {
permutations_to_remove.insert(next);
}
if (!CNNNetHasPrevLayer(l.get())) {
continue;
}
auto next = getInputTo(l->outData.front()).begin()->second;
while (!LayerInfo(next).isPermute() && !LayerInfo(next).isNonFunctional() && !LayerInfo(next).isOutput() &&
next->outData.size() == 1) {
auto input_to = getInputTo(next->outData.front());
if (input_to.size() != 1) break;
next = input_to.begin()->second;
}
// The next layer must be NCHW to NHWC permute
if (!LayerInfo(next).isPermute() || next->input()->getLayout() != Layout::NCHW ||
next->GetParamAsInts("order") != GetPermuteOrder(Layout::NCHW, Layout::NHWC)) {
continue;
}
auto parent = CNNNetPrevLayer(l);
auto prev = parent;
while (!LayerInfo(prev).isPermute() && !LayerInfo(prev).isNonFunctional() &&
!LayerInfo(prev).isInput() && CNNNetHasPrevLayer(prev.get())) {
prev = CNNNetPrevLayer(prev);
}
// The previous layer must be NHWC to NCHW permute or have 1D data
if (LayerInfo(prev).isPermute()) {
if (prev->outData[0]->getLayout() != Layout::NCHW ||
prev->GetParamAsInts("order") != GetPermuteOrder(Layout::NHWC, Layout::NCHW)) {
continue;
}
if (getPassManager()->getPolicy().NHWCToNCHWPolicy == Policy::NHWCToNCHW::REMOVE_ALL) {
permutations_to_remove.insert(prev);
}
} else {
if (parent->outData.size() != 1 || getInputTo(parent->outData[0]).size() != 1) {
continue;
}
auto parent_dims = parent->outData[0]->getDims();
// Check if the previous layer has all dimensions except one to be equal to 1
if (std::count_if(std::begin(parent_dims), std::end(parent_dims), [](size_t dim) { return dim != 1; }) > 1) {
continue;
}
}
permutations_to_remove.insert(next);
nhwc_layout_patterns.push_back({prev, next});
auto* convolution = dynamic_cast<ConvolutionLayer*>(l.get());
@@ -2014,7 +1975,7 @@ int PassManager::run(int index) {
saveGraphToDot(network, out, [](const CNNLayerPtr layer,
ordered_properties &printed_properties,
ordered_properties &node_properties) {});
network.serialize(name + ".xml", name + ".bin", nullptr);
network.serialize(name + ".xml", name + ".bin");
};
#else
auto dumpNetworkAfterPass = [] (std::shared_ptr<Pass> ) {};

View File

@@ -0,0 +1,144 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <cstdint>
#include <map>
#include "backend/dnn_types.h"
#include "serial/headers/2dot4/gna_model_header.hpp"
#include "gna_data_types.hpp"
#pragma pack(push, 1)
namespace GNAPluginNS {
namespace Header2dot5 {
/**
* @brief Header version 2.5
*/
/**
 * @brief Header version 2.5.
 * NOTE: this struct is serialized verbatim under #pragma pack(1); member order,
 * types and sizes must not change within this version.
 */
struct ModelHeader {
    /**
     *@brief MagicNumber GNAM in ascii table, equals to hex 0x474e414d
     */
    char gnam[4] = {};
    /**
     * @brief if header size is not equal to sizeof ModelHeader - some reserved data append in the end of header
     * usually it is an indicator of working with version of model different that is current export function produce
     */
    uint32_t headerSize = 0u;
    struct Version {
        /**
         * @details Version of format Major unsigned int, ex: 0x0001
         * every change in the header or in the layers definition should be reflected in version change
         * for backward compatibility new parsers can read old versions of model with certain restrictions
         */
        uint16_t major = 2u;
        /**
         * @details Version of Format Minor unsigned int, corresponding to build revision for example
         * changes in minor version are not affected layout of model
         */
        uint32_t minor = 5u;
    } version;
    /**
     * @brief Memory required to be allocated using GNAAlloc()
     */
    uint64_t gnaMemSize = 0ull;
    /**
     * @brief Number of GNA Layers
     */
    uint64_t layersCount = 0ull;
    /**
     * @brief Grouping level
     */
    uint32_t nGroup = 0u;
    /**
     * Convolution related setting - they are affecting input transformation
     */
    uint32_t nRotateRows = 0u;
    uint32_t nRotateColumns = 0u;
    bool doRotateInput = false;
    uint32_t nInputs = 0u;
    uint32_t nOutputs = 0u;
    /**
     * Convolution related setting - they are affecting output transformation
     */
    uint32_t nRotateOutputRows = 0u;
    uint32_t nRotateOutputColumns = 0u;
    bool doRotateOutput = false;
    // New in 2.5 — NOTE(review): presumably counts of per-input/per-output
    // transposition descriptors serialized after the header; confirm against
    // the model serializer.
    uint32_t nTransposeInputs = 0u;
    uint32_t nTransposeOutputs = 0u;
    /**
     * Reserved Data might be here
     */
    ModelHeader() = default;
    /**
     * @brief Upgrade constructor from a v2.1 header.
     */
    ModelHeader(GNAPluginNS::Header2dot1::ModelHeader const &old) {
        gnaMemSize = old.gnaMemSize;
        layersCount = old.layersCount;
        nGroup = old.nGroup;
        nRotateRows = old.nRotateRows;
        nRotateColumns = old.nRotateColumns;
        // Fix: propagate the input-rotation flag as well. Without it the copied
        // nRotateRows/nRotateColumns were dead data, since the import path is
        // gated on doRotateInput.
        doRotateInput = old.doRotateInput;
        nInputs = old.nInputs;
        nOutputs = old.nOutputs;
    }
    /**
     * @brief Upgrade constructor from a v2.4 header.
     */
    ModelHeader(GNAPluginNS::Header2dot4::ModelHeader const &old) {
        gnaMemSize = old.gnaMemSize;
        layersCount = old.layersCount;
        nGroup = old.nGroup;
        nRotateRows = old.nRotateRows;
        nRotateColumns = old.nRotateColumns;
        doRotateInput = old.doRotateInput;  // fix: was lost on upgrade (same issue as v2.1 path)
        nInputs = old.nInputs;
        nOutputs = old.nOutputs;
        nRotateOutputRows = old.nRotateOutputRows;
        nRotateOutputColumns = old.nRotateOutputColumns;
        doRotateOutput = old.doRotateOutput;
        version.minor = old.version.minor;
    }
};
#pragma pack(pop)
/*
 * Runtime endpoint — mostly the same as the serialized version, except that the
 * descriptor field is a live pointer here instead of a file offset.
 * NOTE: member order is part of the packed layout; do not reorder.
 */
struct RuntimeEndPoint {
    /**
     * If the scale factor is different than the one passed into infer,
     * the network might need to be requantized.
     */
    float scaleFactor = 0;
    /**
     * Pointer descriptor
     */
    void* descriptor_ptr = nullptr;
    /**
     * Endpoint resolution in bytes.
     */
    uint32_t element_size = 0;
    /**
     * Number of elements
     */
    uint32_t elements_count = 0;
    /**
     * Offset in bytes of pointer descriptor
     */
    uint64_t descriptor_offset = 0ull;
    // Data layout of the endpoint (interleaved/deinterleaved); unknown by default.
    intel_dnn_orientation_t orientation = kDnnUnknownOrientation;
    RuntimeEndPoint() = default;
    // NOTE(review): scaleFactor narrows double -> float here; presumably
    // intentional for header compatibility — confirm.
    RuntimeEndPoint(double scaleFactor,
                    void* descriptor_ptr,
                    uint32_t element_size,
                    uint32_t elements_count,
                    intel_dnn_orientation_t orientation) : scaleFactor(scaleFactor),
                    descriptor_ptr(descriptor_ptr),
                    element_size(element_size),
                    elements_count(elements_count),
                    orientation(orientation) { }
};
} // namespace Header2dot5
} // namespace GNAPluginNS

View File

@@ -1,14 +1,14 @@
// Copyright (C) 2020 Intel Corporation
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "serial/headers/2dot4/gna_model_header.hpp"
#include "serial/headers/2dot5/gna_model_header.hpp"
namespace GNAPluginNS {
namespace HeaderLatest {
using ModelHeader = GNAPluginNS::Header2dot4::ModelHeader;
using RuntimeEndPoint = GNAPluginNS::Header2dot4::RuntimeEndPoint;
using ModelHeader = GNAPluginNS::Header2dot5::ModelHeader;
using RuntimeEndPoint = GNAPluginNS::Header2dot5::RuntimeEndPoint;
}
}

View File

@@ -29,8 +29,8 @@ const std::vector<std::vector<size_t >> dilationsH1 = {{1, 1},
{1, 3}};
// TODO: Currently C != 1 is not supported for graphs with native NCHW layout (will be fixed in 40496)
const std::vector<std::vector<size_t>> inputShapesH1 = {{1, 1, 1, 32},
{1, 1, 1, 160},
{1, 1, 1, 64}};
{1, 32, 1, 160},
{1, 8, 1, 64}};
const std::vector<std::vector<size_t >> kernelsW1 = {{3, 1},
{5, 1}};
const std::vector<std::vector<size_t >> stridesW1 = {{1, 1},
@@ -43,8 +43,8 @@ const std::vector<std::vector<size_t >> dilationsW1 = {{1, 1},
{3, 1}};
// TODO: Currently C != 1 is not supported for graphs with native NCHW layout (will be fixed in 40496)
const std::vector<std::vector<size_t>> inputShapesW1 = {{1, 1, 32, 1},
{1, 1, 160, 1},
{1, 1, 64, 1}};
{1, 32, 160, 1},
{1, 8, 64, 1}};
const std::vector<size_t> numOutCannels = {4, 8, 12};
const std::vector<std::vector<size_t >> kernels2D = {

View File

@@ -0,0 +1,77 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <vector>
#include "common_test_utils/test_constants.hpp"
#include "subgraph_tests/eltwise_conv_eltwise.hpp"
using namespace SubgraphTestsDefinitions;
namespace {
const std::vector<InferenceEngine::Precision> netPrecisions = {
InferenceEngine::Precision::FP32,
InferenceEngine::Precision::FP16
};
const std::vector<std::map<std::string, std::string>> configs = {
{
{"GNA_DEVICE_MODE", "GNA_SW_FP32"}
},
{
{"GNA_DEVICE_MODE", "GNA_SW_EXACT"}
}
};
std::vector<convParams> params = {
std::make_tuple(
std::vector<size_t>{1, 64}, //InputShape
std::vector<size_t>{1, 3}, //KernelShape
1), //Stride
std::make_tuple(std::vector<size_t>{1, 128}, std::vector<size_t>{1, 5}, 1),
std::make_tuple(std::vector<size_t>{1, 168}, std::vector<size_t>{1, 9}, 2),
std::make_tuple(std::vector<size_t>{1, 320}, std::vector<size_t>{1, 8}, 4)
};
std::vector<size_t> inputChannels = {
1,
4,
8
};
std::vector<size_t> outputChannels = {
4,
8
};
INSTANTIATE_TEST_CASE_P(smoke_EltwiseAfterConvTest, EltwiseAfterConvTest,
::testing::Combine(
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_GNA),
::testing::ValuesIn(configs),
::testing::ValuesIn(params),
::testing::ValuesIn(inputChannels),
::testing::ValuesIn(outputChannels)),
EltwiseAfterConvTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_EltwiseBeforeConvTest, EltwiseBeforeConvTest,
::testing::Combine(
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_GNA),
::testing::ValuesIn(configs),
::testing::ValuesIn(params),
::testing::ValuesIn(inputChannels),
::testing::ValuesIn(outputChannels)),
EltwiseBeforeConvTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_EltwiseWithTwoConvsAsInputsTest, EltwiseWithTwoConvsAsInputsTest,
::testing::Combine(
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_GNA),
::testing::ValuesIn(configs),
::testing::ValuesIn(params),
::testing::ValuesIn(inputChannels),
::testing::ValuesIn(outputChannels)),
EltwiseWithTwoConvsAsInputsTest::getTestCaseName);
} // namespace

View File

@@ -0,0 +1,77 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <vector>
#include "common_test_utils/test_constants.hpp"
#include "subgraph_tests/fc_conv_fc.hpp"
using namespace SubgraphTestsDefinitions;
namespace {
const std::vector<InferenceEngine::Precision> netPrecisions = {
InferenceEngine::Precision::FP32,
InferenceEngine::Precision::FP16
};
const std::vector<std::map<std::string, std::string>> configs = {
{
{"GNA_DEVICE_MODE", "GNA_SW_FP32"}
},
{
{"GNA_DEVICE_MODE", "GNA_SW_EXACT"}
}
};
std::vector<convParams> params = {
std::make_tuple(
std::vector<size_t>{1, 64}, //InputShape
std::vector<size_t>{1, 3}, //KernelShape
1), //Stride
std::make_tuple(std::vector<size_t>{1, 128}, std::vector<size_t>{1, 5}, 1),
std::make_tuple(std::vector<size_t>{1, 168}, std::vector<size_t>{1, 3}, 2),
std::make_tuple(std::vector<size_t>{1, 320}, std::vector<size_t>{1, 8}, 4)
};
std::vector<size_t> inputChannels = {
1,
4,
8
};
std::vector<size_t> outputChannels = {
4,
8
};
INSTANTIATE_TEST_CASE_P(smoke_FcAfterConvTest, FcAfterConvTest,
::testing::Combine(
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_GNA),
::testing::ValuesIn(configs),
::testing::ValuesIn(params),
::testing::ValuesIn(inputChannels),
::testing::ValuesIn(outputChannels)),
FcAfterConvTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_FcBeforeConvTest, FcBeforeConvTest,
::testing::Combine(
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_GNA),
::testing::ValuesIn(configs),
::testing::ValuesIn(params),
::testing::ValuesIn(inputChannels),
::testing::ValuesIn(outputChannels)),
FcBeforeConvTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_FcBetweenConvsTest, FcBetweenConvsTest,
::testing::Combine(
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_GNA),
::testing::ValuesIn(configs),
::testing::ValuesIn(params),
::testing::ValuesIn(inputChannels),
::testing::ValuesIn(outputChannels)),
FcBetweenConvsTest::getTestCaseName);
} // namespace

View File

@@ -0,0 +1,67 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <vector>
#include "common_test_utils/test_constants.hpp"
#include "subgraph_tests/scaleshift_conv_scaleshift.hpp"
using namespace SubgraphTestsDefinitions;
namespace {
const std::vector<InferenceEngine::Precision> netPrecisions = {
InferenceEngine::Precision::FP32,
InferenceEngine::Precision::FP16
};
const std::vector<std::map<std::string, std::string>> configs = {
{
{"GNA_DEVICE_MODE", "GNA_SW_FP32"}
},
{
{"GNA_DEVICE_MODE", "GNA_SW_EXACT"}
}
};
std::vector<convParams> params = {
std::make_tuple(
std::vector<size_t>{1, 64}, //InputShape
std::vector<size_t>{1, 3}, //KernelShape
1), //Stride
std::make_tuple(std::vector<size_t>{1, 128}, std::vector<size_t>{1, 5}, 1),
std::make_tuple(std::vector<size_t>{1, 168}, std::vector<size_t>{1, 9}, 2),
std::make_tuple(std::vector<size_t>{1, 320}, std::vector<size_t>{1, 8}, 4)
};
std::vector<size_t> inputChannels = {
1,
4,
8
};
std::vector<size_t> outputChannels = {
4,
8
};
INSTANTIATE_TEST_CASE_P(smoke_ScaleShiftAfterConvTest, ScaleShiftAfterConvTest,
::testing::Combine(
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_GNA),
::testing::ValuesIn(configs),
::testing::ValuesIn(params),
::testing::ValuesIn(inputChannels),
::testing::ValuesIn(outputChannels)),
ScaleShiftAfterConvTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_ScaleShiftBeforeConvTest, ScaleShiftBeforeConvTest,
::testing::Combine(
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_GNA),
::testing::ValuesIn(configs),
::testing::ValuesIn(params),
::testing::ValuesIn(inputChannels),
::testing::ValuesIn(outputChannels)),
ScaleShiftBeforeConvTest::getTestCaseName);
} // namespace

View File

@@ -1,4 +1,4 @@
// Copyright (C) 2019 Intel Corporation
// Copyright (C) 2019-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
@@ -9,14 +9,20 @@
using namespace SubgraphTestsDefinitions;
const std::vector<InferenceEngine::Precision> netPrecisions = {
InferenceEngine::Precision::FP32,
InferenceEngine::Precision::FP16
InferenceEngine::Precision::FP32,
InferenceEngine::Precision::FP16
};
// TODO: Issue: 26421 (Concat issue)
INSTANTIATE_TEST_CASE_P(DISABLED_smoke_ReshapeNoReshape, SplitConvConcat,
std::vector<std::vector<size_t>> inputShapes = {
{1, 32, 1, 130},
{1, 64, 1, 170},
{1, 32, 1, 1026}
};
INSTANTIATE_TEST_CASE_P(smoke_SplitConvConcat, SplitConvConcat,
::testing::Combine(
::testing::ValuesIn(netPrecisions),
::testing::Values(std::vector<size_t >({1, 6, 40, 40})),
::testing::ValuesIn(inputShapes),
::testing::Values(CommonTestUtils::DEVICE_GNA)),
SplitConvConcat::getTestCaseName);

View File

@@ -0,0 +1,31 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "shared_test_classes/subgraph/eltwise_conv_eltwise.hpp"
namespace SubgraphTestsDefinitions {
// Subgraph: Convolution -> Eltwise. Validation rebuilds the function via
// SetUp() because the GNA plugin mutates the loaded one (see comment below).
TEST_P(EltwiseAfterConvTest, CompareWithRefImpl) {
    LoadNetwork();
    Infer();
    // Create another copy of function for validation since some data will be changed by GNA plugin
    SetUp();
    Validate();
};
// Subgraph: Eltwise -> Convolution. Same manual Load/Infer/SetUp/Validate
// sequence, for the same reason as above.
TEST_P(EltwiseBeforeConvTest, CompareWithRefImpl) {
    LoadNetwork();
    Infer();
    // Create another copy of function for validation since some data will be changed by GNA plugin
    SetUp();
    Validate();
};
// Subgraph: two Convolutions feeding one Eltwise; the standard Run() flow suffices here.
TEST_P(EltwiseWithTwoConvsAsInputsTest, CompareWithRefImpl) {
    Run();
};
} // namespace SubgraphTestsDefinitions

View File

@@ -0,0 +1,23 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "shared_test_classes/subgraph/fc_conv_fc.hpp"
namespace SubgraphTestsDefinitions {
// FullyConnected placed after / before / between convolutions; the standard
// Run() flow (load, infer, validate against the reference) is used for all three.
TEST_P(FcAfterConvTest, CompareWithRefImpl) {
    Run();
};
TEST_P(FcBeforeConvTest, CompareWithRefImpl) {
    Run();
};
TEST_P(FcBetweenConvsTest, CompareWithRefImpl) {
    Run();
};
} // namespace SubgraphTestsDefinitions

View File

@@ -0,0 +1,27 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "shared_test_classes/subgraph/scaleshift_conv_scaleshift.hpp"
namespace SubgraphTestsDefinitions {
// Subgraph: Convolution -> ScaleShift. Validation rebuilds the function via
// SetUp() because the GNA plugin mutates the loaded one (see comment below).
TEST_P(ScaleShiftAfterConvTest, CompareWithRefImpl) {
    LoadNetwork();
    Infer();
    // Create another copy of function for validation since some data will be changed by GNA plugin
    SetUp();
    Validate();
};
// Subgraph: ScaleShift -> Convolution. Same manual sequence, for the same reason.
TEST_P(ScaleShiftBeforeConvTest, CompareWithRefImpl) {
    LoadNetwork();
    Infer();
    // Create another copy of function for validation since some data will be changed by GNA plugin
    SetUp();
    Validate();
};
} // namespace SubgraphTestsDefinitions

View File

@@ -0,0 +1,63 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <memory>
#include <string>
#include <tuple>
#include <vector>
#include "shared_test_classes/base/layer_test_utils.hpp"
#include "ngraph_functions/builders.hpp"
#include "ngraph_functions/utils/ngraph_helpers.hpp"
namespace SubgraphTestsDefinitions {

// Convolution configuration shared by the subgraph tests below.
// (Modernized typedef -> using; the type names are unchanged.)
using convParams = std::tuple<
    std::vector<size_t>,                // Input Shapes
    std::vector<size_t>,                // Kernel Shape
    size_t                              // Stride
>;

// Full parameter set for the Eltwise/Convolution subgraph tests.
using EltwiseConvEltwiseParams = std::tuple<
    InferenceEngine::Precision,         // Network Precision
    std::string,                        // Target Device
    std::map<std::string, std::string>, // Configuration
    convParams,                         // Convolution Params
    size_t,                             // Input Channels
    size_t                              // Output Channels
>;

// Subgraph test: Convolution followed by an Eltwise operation.
class EltwiseAfterConvTest : public testing::WithParamInterface<EltwiseConvEltwiseParams>,
                             public LayerTestsUtils::LayerTestsCommon {
public:
    static std::string getTestCaseName(testing::TestParamInfo<EltwiseConvEltwiseParams> obj);
    InferenceEngine::Blob::Ptr GenerateInput(const InferenceEngine::InputInfo& info) const override;

protected:
    void SetUp() override;
};

// Subgraph test: Eltwise operation followed by a Convolution.
class EltwiseBeforeConvTest : public testing::WithParamInterface<EltwiseConvEltwiseParams>,
                              public LayerTestsUtils::LayerTestsCommon {
public:
    static std::string getTestCaseName(testing::TestParamInfo<EltwiseConvEltwiseParams> obj);
    InferenceEngine::Blob::Ptr GenerateInput(const InferenceEngine::InputInfo& info) const override;

protected:
    void SetUp() override;
};

// Subgraph test: an Eltwise whose both inputs are Convolutions.
class EltwiseWithTwoConvsAsInputsTest : public testing::WithParamInterface<EltwiseConvEltwiseParams>,
                                        public LayerTestsUtils::LayerTestsCommon {
public:
    static std::string getTestCaseName(testing::TestParamInfo<EltwiseConvEltwiseParams> obj);
    InferenceEngine::Blob::Ptr GenerateInput(const InferenceEngine::InputInfo& info) const override;

protected:
    void SetUp() override;
};
} // namespace SubgraphTestsDefinitions

View File

@@ -0,0 +1,63 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <memory>
#include <string>
#include <tuple>
#include <vector>
#include "shared_test_classes/base/layer_test_utils.hpp"
#include "ngraph_functions/builders.hpp"
#include "ngraph_functions/utils/ngraph_helpers.hpp"
namespace SubgraphTestsDefinitions {

// Convolution configuration shared by the subgraph tests below.
// (Modernized typedef -> using; the type names are unchanged.)
using convParams = std::tuple<
    std::vector<size_t>,                // Input Shapes
    std::vector<size_t>,                // Kernel Shape
    size_t                              // Stride
>;

// Full parameter set for the FullyConnected/Convolution subgraph tests.
using FcConvFcParams = std::tuple<
    InferenceEngine::Precision,         // Network Precision
    std::string,                        // Target Device
    std::map<std::string, std::string>, // Configuration
    convParams,                         // Convolution Params
    size_t,                             // Input Channels
    size_t                              // Output Channels
>;

// Subgraph test: Convolution followed by a FullyConnected layer.
class FcAfterConvTest : public testing::WithParamInterface<FcConvFcParams>,
                        public LayerTestsUtils::LayerTestsCommon {
public:
    static std::string getTestCaseName(testing::TestParamInfo<FcConvFcParams> obj);
    InferenceEngine::Blob::Ptr GenerateInput(const InferenceEngine::InputInfo& info) const override;

protected:
    void SetUp() override;
};

// Subgraph test: FullyConnected layer followed by a Convolution.
class FcBeforeConvTest : public testing::WithParamInterface<FcConvFcParams>,
                         public LayerTestsUtils::LayerTestsCommon {
public:
    static std::string getTestCaseName(testing::TestParamInfo<FcConvFcParams> obj);
    InferenceEngine::Blob::Ptr GenerateInput(const InferenceEngine::InputInfo& info) const override;

protected:
    void SetUp() override;
};

// Subgraph test: FullyConnected layer placed between two Convolutions.
class FcBetweenConvsTest : public testing::WithParamInterface<FcConvFcParams>,
                           public LayerTestsUtils::LayerTestsCommon {
public:
    static std::string getTestCaseName(testing::TestParamInfo<FcConvFcParams> obj);
    InferenceEngine::Blob::Ptr GenerateInput(const InferenceEngine::InputInfo& info) const override;

protected:
    void SetUp() override;
};
} // namespace SubgraphTestsDefinitions

View File

@@ -0,0 +1,53 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <memory>
#include <string>
#include <tuple>
#include <vector>
#include "shared_test_classes/base/layer_test_utils.hpp"
#include "ngraph_functions/builders.hpp"
#include "ngraph_functions/utils/ngraph_helpers.hpp"
namespace SubgraphTestsDefinitions {
typedef std::tuple<
std::vector<size_t>, // Input Shapes
std::vector<size_t>, // Kernel Shape
size_t // Stride
> convParams;
typedef std::tuple<
InferenceEngine::Precision, // Network Precision
std::string, // Target Device
std::map<std::string, std::string>, // Configuration
convParams, // Convolution Params
size_t, // Input Channels
size_t // Output Channels
> ScaleShiftConvScaleShiftParams;
// Subgraph test: a scale-shift (Multiply/Add with constants) placed after a
// convolution (per the test name; see the matching SetUp for the exact graph).
class ScaleShiftAfterConvTest : public testing::WithParamInterface<ScaleShiftConvScaleShiftParams>,
                                public LayerTestsUtils::LayerTestsCommon {
public:
    // Builds a human-readable test name from the parameter tuple.
    static std::string getTestCaseName(testing::TestParamInfo<ScaleShiftConvScaleShiftParams> obj);
    // Fills the input blob with random float data.
    InferenceEngine::Blob::Ptr GenerateInput(const InferenceEngine::InputInfo& info) const override;
protected:
    // Constructs the ngraph function under test.
    void SetUp() override;
};
// Subgraph test: a scale-shift (Multiply/Add with constants) placed before a
// convolution (per the test name; see the matching SetUp for the exact graph).
class ScaleShiftBeforeConvTest : public testing::WithParamInterface<ScaleShiftConvScaleShiftParams>,
                                 public LayerTestsUtils::LayerTestsCommon {
public:
    // Builds a human-readable test name from the parameter tuple.
    static std::string getTestCaseName(testing::TestParamInfo<ScaleShiftConvScaleShiftParams> obj);
    // Fills the input blob with random float data.
    InferenceEngine::Blob::Ptr GenerateInput(const InferenceEngine::InputInfo& info) const override;
protected:
    // Constructs the ngraph function under test.
    void SetUp() override;
};
} // namespace SubgraphTestsDefinitions

View File

@@ -1,4 +1,4 @@
// Copyright (C) 2019 Intel Corporation
// Copyright (C) 2019-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
@@ -54,9 +54,15 @@ void ConvolutionLayerTest::SetUp() {
auto params = ngraph::builder::makeParams(ngPrc, {inputShape});
auto paramOuts = ngraph::helpers::convert2OutputVector(
ngraph::helpers::castOps2Nodes<ngraph::op::Parameter>(params));
std::vector<float> filter_weights;
if (targetDevice == CommonTestUtils::DEVICE_GNA) {
auto filter_size = std::accumulate(std::begin(kernel), std::end(kernel), 1, std::multiplies<size_t>());
filter_weights = CommonTestUtils::generate_float_numbers(convOutChannels * inputShape[1] * filter_size,
-0.5f, 0.5f);
}
auto conv = std::dynamic_pointer_cast<ngraph::opset1::Convolution>(
ngraph::builder::makeConvolution(paramOuts[0], ngPrc, kernel, stride, padBegin,
padEnd, dilation, padType, convOutChannels));
padEnd, dilation, padType, convOutChannels, false, filter_weights));
ngraph::ResultVector results{std::make_shared<ngraph::opset1::Result>(conv)};
function = std::make_shared<ngraph::Function>(results, params, "convolution");
}

View File

@@ -0,0 +1,256 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "shared_test_classes/subgraph/eltwise_conv_eltwise.hpp"
#include "ngraph_functions/builders.hpp"
namespace SubgraphTestsDefinitions {
// Builds a human-readable test name encoding all parameters of the case.
std::string EltwiseAfterConvTest::getTestCaseName(testing::TestParamInfo<EltwiseConvEltwiseParams> obj) {
    InferenceEngine::Precision prc;
    std::string device;
    std::map<std::string, std::string> config;
    convParams convPar;
    size_t inCh;
    size_t outCh;
    std::tie(prc, device, config, convPar, inCh, outCh) = obj.param;
    std::vector<size_t> shape;
    std::vector<size_t> kernel;
    size_t strideVal;
    std::tie(shape, kernel, strideVal) = convPar;
    std::ostringstream name;
    name << "IS=" << CommonTestUtils::vec2str(shape) << "_"
         << "KS=" << CommonTestUtils::vec2str(kernel) << "_"
         << "S=" << strideVal << "_"
         << "IC=" << inCh << "_"
         << "OC=" << outCh << "_"
         << "netPRC=" << prc.name() << "_"
         << "targetDevice=" << device;
    for (const auto& item : config) {
        name << "_configItem=" << item.first << "_" << item.second;
    }
    return name.str();
}
// Fills the network input blob with random floats in [-2.0, 2.0].
InferenceEngine::Blob::Ptr EltwiseAfterConvTest::GenerateInput(const InferenceEngine::InputInfo& info) const {
    auto blob = make_blob_with_precision(info.getTensorDesc());
    blob->allocate();
    const std::vector<float> data = CommonTestUtils::generate_float_numbers(blob->size(), -2.0f, 2.0f);
    auto* dst = blob->buffer().as<float*>();
    for (size_t idx = 0; idx < data.size(); ++idx) {
        dst[idx] = data[idx];
    }
    return blob;
}
// Builds: Parameter -> Reshape(4D, H==1) -> Convolution -> Reshape(2D)
//         -> Multiply(const) -> Add(const)
// i.e. eltwise operations that consume the output of a convolution.
void EltwiseAfterConvTest::SetUp() {
    InferenceEngine::Precision netPrecision;
    std::map<std::string, std::string> tempConfig;
    convParams convolutionParams;
    size_t inputChannels;
    size_t outputChannels;
    std::tie(netPrecision, targetDevice, tempConfig, convolutionParams, inputChannels, outputChannels) = this->GetParam();
    configuration.insert(tempConfig.begin(), tempConfig.end());
    std::vector<size_t> inputShape;
    std::vector<size_t> kernelShape;
    size_t stride;
    std::tie(inputShape, kernelShape, stride) = convolutionParams;
    auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
    auto params = ngraph::builder::makeParams(ngPrc, { inputShape });
    // Reshape the flat input into NCHW with H == 1 (effectively a 1D convolution).
    std::vector<size_t> convInputShape = {1, inputChannels, 1, inputShape[0] * inputShape[1] / inputChannels};
    auto reshapePattern1 = std::make_shared<ngraph::opset1::Constant>(ngraph::element::Type_t::i64, ngraph::Shape{ 4 }, convInputShape);
    auto reshape1 = std::make_shared<ngraph::opset1::Reshape>(params[0], reshapePattern1, false);
    auto filterWeights = CommonTestUtils::generate_float_numbers(outputChannels * convInputShape[1] * kernelShape[0] * kernelShape[1],
                                                                 -0.2f, 0.2f);
    auto conv = ngraph::builder::makeConvolution(reshape1, ngPrc, { kernelShape[0], kernelShape[1] }, { stride, stride }, { 0, 0 },
                                                 { 0, 0 }, { 1, 1 }, ngraph::op::PadType::VALID, outputChannels, false, filterWeights);
    // VALID padding: output width = (W - KW) / stride + 1.
    auto widthAfterConv = (convInputShape[3] - kernelShape[1]) / stride + 1;
    std::vector<size_t> outFormShapes = {1, outputChannels * widthAfterConv };
    auto reshapePattern2 = std::make_shared<ngraph::opset1::Constant>(ngraph::element::Type_t::i64, ngraph::Shape{ 2 }, outFormShapes);
    auto reshape2 = std::make_shared<ngraph::opset1::Reshape>(conv, reshapePattern2, false);
    auto scale = CommonTestUtils::generate_float_numbers(outFormShapes[1], -2.0f, 2.0f);
    auto shift = CommonTestUtils::generate_float_numbers(outFormShapes[1], -2.0f, 2.0f);
    auto mul_const = std::make_shared<ngraph::op::Constant>(ngPrc, outFormShapes, scale);
    auto mul = std::make_shared<ngraph::opset1::Multiply>(reshape2, mul_const);
    auto add_const = std::make_shared<ngraph::op::Constant>(ngPrc, outFormShapes, shift);
    auto add = std::make_shared<ngraph::opset1::Add>(mul, add_const);
    // Fix: the function previously terminated at `mul`, leaving `add` as dead
    // code so the Add eltwise was never part of the tested graph.
    function = std::make_shared<ngraph::Function>(add, params, "EltwiseAfterConvTest");
}
// Builds a human-readable test name encoding all parameters of the case.
std::string EltwiseBeforeConvTest::getTestCaseName(testing::TestParamInfo<EltwiseConvEltwiseParams> obj) {
    InferenceEngine::Precision prc;
    std::string device;
    std::map<std::string, std::string> config;
    convParams convPar;
    size_t inCh;
    size_t outCh;
    std::tie(prc, device, config, convPar, inCh, outCh) = obj.param;
    std::vector<size_t> shape;
    std::vector<size_t> kernel;
    size_t strideVal;
    std::tie(shape, kernel, strideVal) = convPar;
    std::ostringstream name;
    name << "IS=" << CommonTestUtils::vec2str(shape) << "_"
         << "KS=" << CommonTestUtils::vec2str(kernel) << "_"
         << "S=" << strideVal << "_"
         << "IC=" << inCh << "_"
         << "OC=" << outCh << "_"
         << "netPRC=" << prc.name() << "_"
         << "targetDevice=" << device;
    for (const auto& item : config) {
        name << "_configItem=" << item.first << "_" << item.second;
    }
    return name.str();
}
// Fills the network input blob with random floats in [-2.0, 2.0].
InferenceEngine::Blob::Ptr EltwiseBeforeConvTest::GenerateInput(const InferenceEngine::InputInfo& info) const {
    auto blob = make_blob_with_precision(info.getTensorDesc());
    blob->allocate();
    const std::vector<float> data = CommonTestUtils::generate_float_numbers(blob->size(), -2.0f, 2.0f);
    auto* dst = blob->buffer().as<float*>();
    for (size_t idx = 0; idx < data.size(); ++idx) {
        dst[idx] = data[idx];
    }
    return blob;
}
// Builds: Parameter -> Multiply(const) -> Add(const) -> Reshape(4D, H==1)
//         -> Convolution -> Reshape(2D)
// i.e. eltwise operations that feed a convolution.
void EltwiseBeforeConvTest::SetUp() {
    InferenceEngine::Precision netPrecision;
    std::map<std::string, std::string> tempConfig;
    convParams convolutionParams;
    size_t inputChannels;
    size_t outputChannels;
    std::tie(netPrecision, targetDevice, tempConfig, convolutionParams, inputChannels, outputChannels) = this->GetParam();
    configuration.insert(tempConfig.begin(), tempConfig.end());
    std::vector<size_t> inputShape;
    std::vector<size_t> kernelShape;
    size_t stride;
    std::tie(inputShape, kernelShape, stride) = convolutionParams;
    auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
    auto params = ngraph::builder::makeParams(ngPrc, { inputShape });
    auto scale = CommonTestUtils::generate_float_numbers(inputShape[1], -2.0f, 2.0f);
    auto shift = CommonTestUtils::generate_float_numbers(inputShape[1], -2.0f, 2.0f);
    auto mul_const = std::make_shared<ngraph::op::Constant>(ngPrc, inputShape, scale);
    auto mul = std::make_shared<ngraph::opset1::Multiply>(params[0], mul_const);
    auto add_const = std::make_shared<ngraph::op::Constant>(ngPrc, inputShape, shift);
    auto add = std::make_shared<ngraph::opset1::Add>(mul, add_const);
    // Reshape into NCHW with H == 1 (effectively a 1D convolution).
    std::vector<size_t> convInputShape = {1, inputChannels, 1, inputShape[0] * inputShape[1] / inputChannels};
    auto reshapePattern1 = std::make_shared<ngraph::opset1::Constant>(ngraph::element::Type_t::i64, ngraph::Shape{ 4 }, convInputShape);
    // Fix: the reshape previously consumed `mul`, leaving `add` as dead code so
    // the Add eltwise was never part of the tested graph.
    auto reshape1 = std::make_shared<ngraph::opset1::Reshape>(add, reshapePattern1, false);
    auto filterWeights = CommonTestUtils::generate_float_numbers(outputChannels * convInputShape[1] * kernelShape[0] * kernelShape[1],
                                                                 -0.2f, 0.2f);
    auto conv = ngraph::builder::makeConvolution(reshape1, ngPrc, { kernelShape[0], kernelShape[1] }, { stride, stride }, { 0, 0 },
                                                 { 0, 0 }, { 1, 1 }, ngraph::op::PadType::VALID, outputChannels, false, filterWeights);
    // VALID padding: output width = (W - KW) / stride + 1.
    auto widthAfterReshape = (convInputShape[3] - kernelShape[1]) / stride + 1;
    std::vector<size_t> outFormShapes = {1, outputChannels * widthAfterReshape };
    auto reshapePattern2 = std::make_shared<ngraph::opset1::Constant>(ngraph::element::Type_t::i64, ngraph::Shape{ 2 }, outFormShapes);
    auto reshape2 = std::make_shared<ngraph::opset1::Reshape>(conv, reshapePattern2, false);
    function = std::make_shared<ngraph::Function>(reshape2, params, "EltwiseBeforeConvTest");
}
// Builds a human-readable test name encoding all parameters of the case.
std::string EltwiseWithTwoConvsAsInputsTest::getTestCaseName(testing::TestParamInfo<EltwiseConvEltwiseParams> obj) {
    InferenceEngine::Precision prc;
    std::string device;
    std::map<std::string, std::string> config;
    convParams convPar;
    size_t inCh;
    size_t outCh;
    std::tie(prc, device, config, convPar, inCh, outCh) = obj.param;
    std::vector<size_t> shape;
    std::vector<size_t> kernel;
    size_t strideVal;
    std::tie(shape, kernel, strideVal) = convPar;
    std::ostringstream name;
    name << "IS=" << CommonTestUtils::vec2str(shape) << "_"
         << "KS=" << CommonTestUtils::vec2str(kernel) << "_"
         << "S=" << strideVal << "_"
         << "IC=" << inCh << "_"
         << "OC=" << outCh << "_"
         << "netPRC=" << prc.name() << "_"
         << "targetDevice=" << device;
    for (const auto& item : config) {
        name << "_configItem=" << item.first << "_" << item.second;
    }
    return name.str();
}
// Fills the network input blob with random floats in [-2.0, 2.0].
InferenceEngine::Blob::Ptr EltwiseWithTwoConvsAsInputsTest::GenerateInput(const InferenceEngine::InputInfo& info) const {
    auto blob = make_blob_with_precision(info.getTensorDesc());
    blob->allocate();
    const std::vector<float> data = CommonTestUtils::generate_float_numbers(blob->size(), -2.0f, 2.0f);
    auto* dst = blob->buffer().as<float*>();
    for (size_t idx = 0; idx < data.size(); ++idx) {
        dst[idx] = data[idx];
    }
    return blob;
}
// Builds two parallel Parameter -> Reshape(4D) -> Convolution -> Reshape(2D)
// branches and sums their outputs with an eltwise Add, so the Add's inputs are
// both produced by convolutions.
void EltwiseWithTwoConvsAsInputsTest::SetUp() {
    InferenceEngine::Precision netPrecision;
    std::map<std::string, std::string> tempConfig;
    convParams convolutionParams;
    size_t inputChannels;
    size_t outputChannels;
    std::tie(netPrecision, targetDevice, tempConfig, convolutionParams, inputChannels, outputChannels) = this->GetParam();
    configuration.insert(tempConfig.begin(), tempConfig.end());
    std::vector<size_t> inputShape;
    std::vector<size_t> kernelShape;
    size_t stride;
    std::tie(inputShape, kernelShape, stride) = convolutionParams;
    auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
    // Two network inputs of the same shape, one per convolution branch.
    auto params = ngraph::builder::makeParams(ngPrc, { inputShape, inputShape });
    // NCHW shape with H == 1, i.e. effectively a 1D convolution.
    std::vector<size_t> convInputShape = {1, inputChannels, 1, inputShape[0] * inputShape[1] / inputChannels};
    auto reshapePattern1 = std::make_shared<ngraph::opset1::Constant>(ngraph::element::Type_t::i64, ngraph::Shape{ 4 }, convInputShape);
    auto reshape1 = std::make_shared<ngraph::opset1::Reshape>(params[0], reshapePattern1, false);
    auto filterWeights1 = CommonTestUtils::generate_float_numbers(outputChannels * convInputShape[1] * kernelShape[0] * kernelShape[1],
        -0.2f, 0.2f);
    auto conv1 = ngraph::builder::makeConvolution(reshape1, ngPrc, { kernelShape[0], kernelShape[1] }, { stride, stride }, { 0, 0 },
        { 0, 0 }, { 1, 1 }, ngraph::op::PadType::VALID, outputChannels, false, filterWeights1);
    // VALID padding: output width = (W - KW) / stride + 1.
    auto widthAfterReshape = (convInputShape[3] - kernelShape[1]) / stride + 1;
    std::vector<size_t> outFormShapes = {1, outputChannels * widthAfterReshape };
    auto reshapePattern2 = std::make_shared<ngraph::opset1::Constant>(ngraph::element::Type_t::i64, ngraph::Shape{ 2 }, outFormShapes);
    auto reshape2 = std::make_shared<ngraph::opset1::Reshape>(conv1, reshapePattern2, false);
    // Second branch mirrors the first but with independently generated weights.
    auto reshapePattern3 = std::make_shared<ngraph::opset1::Constant>(ngraph::element::Type_t::i64, ngraph::Shape{ 4 }, convInputShape);
    auto reshape3 = std::make_shared<ngraph::opset1::Reshape>(params[1], reshapePattern3, false);
    auto filterWeights2 = CommonTestUtils::generate_float_numbers(outputChannels * convInputShape[1] * kernelShape[0] * kernelShape[1],
        -0.2f, 0.2f);
    auto conv2 = ngraph::builder::makeConvolution(reshape3, ngPrc, { kernelShape[0], kernelShape[1] }, { stride, stride }, { 0, 0 },
        { 0, 0 }, { 1, 1 }, ngraph::op::PadType::VALID, outputChannels, false, filterWeights2);
    auto reshapePattern4 = std::make_shared<ngraph::opset1::Constant>(ngraph::element::Type_t::i64, ngraph::Shape{ 2 }, outFormShapes);
    auto reshape4 = std::make_shared<ngraph::opset1::Reshape>(conv2, reshapePattern4, false);
    // The eltwise under test: both operands come from convolutions.
    auto add = std::make_shared<ngraph::opset1::Add>(reshape2, reshape4);
    function = std::make_shared<ngraph::Function>(add, params, "EltwiseWithTwoConvsAsInputsTest");
}
} // namespace SubgraphTestsDefinitions

View File

@@ -0,0 +1,259 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "shared_test_classes/subgraph/fc_conv_fc.hpp"
#include "ngraph_functions/builders.hpp"
namespace SubgraphTestsDefinitions {
// Builds a human-readable test name encoding all parameters of the case.
std::string FcAfterConvTest::getTestCaseName(testing::TestParamInfo<FcConvFcParams> obj) {
    InferenceEngine::Precision prc;
    std::string device;
    std::map<std::string, std::string> config;
    convParams convPar;
    size_t inCh;
    size_t outCh;
    std::tie(prc, device, config, convPar, inCh, outCh) = obj.param;
    std::vector<size_t> shape;
    std::vector<size_t> kernel;
    size_t strideVal;
    std::tie(shape, kernel, strideVal) = convPar;
    std::ostringstream name;
    name << "IS=" << CommonTestUtils::vec2str(shape) << "_"
         << "KS=" << CommonTestUtils::vec2str(kernel) << "_"
         << "S=" << strideVal << "_"
         << "IC=" << inCh << "_"
         << "OC=" << outCh << "_"
         << "netPRC=" << prc.name() << "_"
         << "targetDevice=" << device;
    for (const auto& item : config) {
        name << "_configItem=" << item.first << "_" << item.second;
    }
    return name.str();
}
// Fills the network input blob with random floats in [-2.0, 2.0].
InferenceEngine::Blob::Ptr FcAfterConvTest::GenerateInput(const InferenceEngine::InputInfo& info) const {
    auto blob = make_blob_with_precision(info.getTensorDesc());
    blob->allocate();
    const std::vector<float> data = CommonTestUtils::generate_float_numbers(blob->size(), -2.0f, 2.0f);
    auto* dst = blob->buffer().as<float*>();
    for (size_t idx = 0; idx < data.size(); ++idx) {
        dst[idx] = data[idx];
    }
    return blob;
}
// Builds: Parameter -> Reshape(4D, H==1) -> Convolution -> Reshape(2D)
//         -> ReLU -> FullyConnected -> FullyConnected
// i.e. fully-connected layers that consume a convolution's output.
void FcAfterConvTest::SetUp() {
    InferenceEngine::Precision netPrecision;
    std::map<std::string, std::string> tempConfig;
    convParams convolutionParams;
    size_t inputChannels;
    size_t outputChannels;
    std::tie(netPrecision, targetDevice, tempConfig, convolutionParams, inputChannels, outputChannels) = this->GetParam();
    configuration.insert(tempConfig.begin(), tempConfig.end());
    std::vector<size_t> inputShape;
    std::vector<size_t> kernelShape;
    size_t stride;
    std::tie(inputShape, kernelShape, stride) = convolutionParams;
    auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
    auto params = ngraph::builder::makeParams(ngPrc, { inputShape });
    // NCHW shape with H == 1, i.e. effectively a 1D convolution.
    std::vector<size_t> convInputShape = {1, inputChannels, 1, inputShape[0] * inputShape[1] / inputChannels};
    auto reshapePattern1 = std::make_shared<ngraph::opset1::Constant>(ngraph::element::Type_t::i64, ngraph::Shape{ 4 }, convInputShape);
    auto reshape1 = std::make_shared<ngraph::opset1::Reshape>(params[0], reshapePattern1, false);
    auto filterWeights = CommonTestUtils::generate_float_numbers(outputChannels * convInputShape[1] * kernelShape[0] * kernelShape[1],
        -0.1f, 0.1f);
    auto conv = ngraph::builder::makeConvolution(reshape1, ngPrc, { kernelShape[0], kernelShape[1] }, { stride, stride }, { 0, 0 },
        { 0, 0 }, { 1, 1 }, ngraph::op::PadType::VALID, outputChannels, false, filterWeights);
    // VALID padding: output width = (W - KW) / stride + 1.
    auto widthAfterConv = (convInputShape[3] - kernelShape[1]) / stride + 1;
    std::vector<size_t> outFormShapes = {1, outputChannels * widthAfterConv };
    auto reshapePattern2 = std::make_shared<ngraph::opset1::Constant>(ngraph::element::Type_t::i64, ngraph::Shape{ 2 }, outFormShapes);
    auto reshape2 = std::make_shared<ngraph::opset1::Reshape>(conv, reshapePattern2, false);
    auto relu1 = std::make_shared<ngraph::opset3::Relu>(reshape2);
    // Two square FC layers (in == out == outFormShapes[1]) after the conv.
    std::vector<float> fc3_weights = CommonTestUtils::generate_float_numbers(outFormShapes[1] * outFormShapes[1], -0.1f, 0.1f);
    auto fc3 = ngraph::builder::makeFullyConnected(relu1, ngPrc, outFormShapes[1], false, {}, fc3_weights);
    auto fc4_weights = CommonTestUtils::generate_float_numbers(outFormShapes[1] * outFormShapes[1], -0.1f, 0.1f);
    auto fc4 = ngraph::builder::makeFullyConnected(fc3, ngPrc, outFormShapes[1], false, {}, fc4_weights);
    function = std::make_shared<ngraph::Function>(fc4, params, "FcAfterConvTest");
}
// Builds a human-readable test name encoding all parameters of the case.
std::string FcBeforeConvTest::getTestCaseName(testing::TestParamInfo<FcConvFcParams> obj) {
    InferenceEngine::Precision prc;
    std::string device;
    std::map<std::string, std::string> config;
    convParams convPar;
    size_t inCh;
    size_t outCh;
    std::tie(prc, device, config, convPar, inCh, outCh) = obj.param;
    std::vector<size_t> shape;
    std::vector<size_t> kernel;
    size_t strideVal;
    std::tie(shape, kernel, strideVal) = convPar;
    std::ostringstream name;
    name << "IS=" << CommonTestUtils::vec2str(shape) << "_"
         << "KS=" << CommonTestUtils::vec2str(kernel) << "_"
         << "S=" << strideVal << "_"
         << "IC=" << inCh << "_"
         << "OC=" << outCh << "_"
         << "netPRC=" << prc.name() << "_"
         << "targetDevice=" << device;
    for (const auto& item : config) {
        name << "_configItem=" << item.first << "_" << item.second;
    }
    return name.str();
}
// Fills the network input blob with random floats in [-0.1, 0.1].
InferenceEngine::Blob::Ptr FcBeforeConvTest::GenerateInput(const InferenceEngine::InputInfo& info) const {
    auto blob = make_blob_with_precision(info.getTensorDesc());
    blob->allocate();
    const std::vector<float> data = CommonTestUtils::generate_float_numbers(blob->size(), -0.1f, 0.1f);
    auto* dst = blob->buffer().as<float*>();
    for (size_t idx = 0; idx < data.size(); ++idx) {
        dst[idx] = data[idx];
    }
    return blob;
}
// Builds: Parameter -> FullyConnected -> FullyConnected
//         -> Reshape(4D, H==1) -> Convolution -> Reshape(2D)
// i.e. fully-connected layers that feed a convolution.
void FcBeforeConvTest::SetUp() {
    InferenceEngine::Precision netPrecision;
    std::map<std::string, std::string> tempConfig;
    convParams convolutionParams;
    size_t inputChannels;
    size_t outputChannels;
    std::tie(netPrecision, targetDevice, tempConfig, convolutionParams, inputChannels, outputChannels) = this->GetParam();
    configuration.insert(tempConfig.begin(), tempConfig.end());
    std::vector<size_t> inputShape;
    std::vector<size_t> kernelShape;
    size_t stride;
    std::tie(inputShape, kernelShape, stride) = convolutionParams;
    auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
    auto params = ngraph::builder::makeParams(ngPrc, { inputShape });
    // Two square FC layers (in == out == inputShape[1]) before the conv.
    auto fc1_weights = CommonTestUtils::generate_float_numbers(inputShape[1] * inputShape[1], -0.1f, 0.1f);
    auto fc1 = ngraph::builder::makeFullyConnected(params[0], ngPrc, inputShape[1], false, {}, fc1_weights);
    auto fc2_weights = CommonTestUtils::generate_float_numbers(inputShape[1] * inputShape[1], -0.05f, 0.05f);
    auto fc2 = ngraph::builder::makeFullyConnected(fc1, ngPrc, inputShape[1], false, {}, fc2_weights);
    // NCHW shape with H == 1, i.e. effectively a 1D convolution.
    std::vector<size_t> convInputShape = {1, inputChannels, 1, inputShape[0] * inputShape[1] / inputChannels};
    auto reshapePattern1 = std::make_shared<ngraph::opset1::Constant>(ngraph::element::Type_t::i64, ngraph::Shape{ 4 }, convInputShape);
    auto reshape1 = std::make_shared<ngraph::opset1::Reshape>(fc2, reshapePattern1, false);
    auto filterWeights = CommonTestUtils::generate_float_numbers(outputChannels * convInputShape[1] * kernelShape[0] * kernelShape[1],
        -0.1f, 0.1f);
    auto conv = ngraph::builder::makeConvolution(reshape1, ngPrc, { kernelShape[0], kernelShape[1] }, { stride, stride }, { 0, 0 },
        { 0, 0 }, { 1, 1 }, ngraph::op::PadType::VALID, outputChannels, false, filterWeights);
    // VALID padding: output width = (W - KW) / stride + 1.
    auto widthAfterConv = (convInputShape[3] - kernelShape[1]) / stride + 1;
    std::vector<size_t> outFormShapes = {1, outputChannels * widthAfterConv };
    auto reshapePattern2 = std::make_shared<ngraph::opset1::Constant>(ngraph::element::Type_t::i64, ngraph::Shape{ 2 }, outFormShapes);
    auto reshape2 = std::make_shared<ngraph::opset1::Reshape>(conv, reshapePattern2, false);
    function = std::make_shared<ngraph::Function>(reshape2, params, "FcBeforeConvTest");
}
// Builds a human-readable test name encoding all parameters of the case.
std::string FcBetweenConvsTest::getTestCaseName(testing::TestParamInfo<FcConvFcParams> obj) {
    InferenceEngine::Precision prc;
    std::string device;
    std::map<std::string, std::string> config;
    convParams convPar;
    size_t inCh;
    size_t outCh;
    std::tie(prc, device, config, convPar, inCh, outCh) = obj.param;
    std::vector<size_t> shape;
    std::vector<size_t> kernel;
    size_t strideVal;
    std::tie(shape, kernel, strideVal) = convPar;
    std::ostringstream name;
    name << "IS=" << CommonTestUtils::vec2str(shape) << "_"
         << "KS=" << CommonTestUtils::vec2str(kernel) << "_"
         << "S=" << strideVal << "_"
         << "IC=" << inCh << "_"
         << "OC=" << outCh << "_"
         << "netPRC=" << prc.name() << "_"
         << "targetDevice=" << device;
    for (const auto& item : config) {
        name << "_configItem=" << item.first << "_" << item.second;
    }
    return name.str();
}
// Fills the network input blob with random floats in [-0.2, 0.2].
InferenceEngine::Blob::Ptr FcBetweenConvsTest::GenerateInput(const InferenceEngine::InputInfo& info) const {
    auto blob = make_blob_with_precision(info.getTensorDesc());
    blob->allocate();
    const std::vector<float> data = CommonTestUtils::generate_float_numbers(blob->size(), -0.2f, 0.2f);
    auto* dst = blob->buffer().as<float*>();
    for (size_t idx = 0; idx < data.size(); ++idx) {
        dst[idx] = data[idx];
    }
    return blob;
}
// Builds: Parameter -> Reshape(4D) -> Conv1 -> Reshape(2D) -> ReLU
//         -> FullyConnected -> Reshape(4D) -> Conv2(1x1) -> Reshape(2D)
// i.e. a fully-connected layer sandwiched between two convolutions.
void FcBetweenConvsTest::SetUp() {
    InferenceEngine::Precision netPrecision;
    std::map<std::string, std::string> tempConfig;
    convParams convolutionParams;
    size_t inputChannels;
    size_t outputChannels;
    std::tie(netPrecision, targetDevice, tempConfig, convolutionParams, inputChannels, outputChannels) = this->GetParam();
    configuration.insert(tempConfig.begin(), tempConfig.end());
    std::vector<size_t> inputShape;
    std::vector<size_t> kernelShape;
    size_t stride;
    std::tie(inputShape, kernelShape, stride) = convolutionParams;
    auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
    auto params = ngraph::builder::makeParams(ngPrc, { inputShape });
    // First conv input: NCHW with H == 1, i.e. effectively a 1D convolution.
    std::vector<size_t> conv1InputShape = {1, inputChannels, 1, inputShape[0] * inputShape[1] / inputChannels};
    auto reshapePattern1 = std::make_shared<ngraph::opset1::Constant>(ngraph::element::Type_t::i64, ngraph::Shape{ 4 }, conv1InputShape);
    auto reshape1 = std::make_shared<ngraph::opset1::Reshape>(params[0], reshapePattern1, false);
    auto filter1Weights = CommonTestUtils::generate_float_numbers(outputChannels * conv1InputShape[1] * kernelShape[0] * kernelShape[1],
        -0.2f, 0.2f);
    auto conv1 = ngraph::builder::makeConvolution(reshape1, ngPrc, { kernelShape[0], kernelShape[1] }, { stride, stride }, { 0, 0 },
        { 0, 0 }, { 1, 1 }, ngraph::op::PadType::VALID, outputChannels, false, filter1Weights);
    // VALID padding: output width = (W - KW) / stride + 1.
    auto widthAfterConv1 = (conv1InputShape[3] - kernelShape[1]) / stride + 1;
    std::vector<size_t> outFormShapes1 = {1, outputChannels * widthAfterConv1 };
    auto reshapePattern2 = std::make_shared<ngraph::opset1::Constant>(ngraph::element::Type_t::i64, ngraph::Shape{ 2 }, outFormShapes1);
    auto reshape2 = std::make_shared<ngraph::opset1::Reshape>(conv1, reshapePattern2, false);
    auto relu = std::make_shared<ngraph::opset3::Relu>(reshape2);
    // Square FC layer (in == out == outFormShapes1[1]) between the convs.
    auto fc_weights = CommonTestUtils::generate_float_numbers(outFormShapes1[1] * outFormShapes1[1], -0.2f, 0.2f);
    auto fc = ngraph::builder::makeFullyConnected(relu, ngPrc, outFormShapes1[1], false, {}, fc_weights);
    // Second conv consumes the FC output reshaped back to 4D; 1x1 kernel.
    std::vector<size_t> conv2InputShape = {1, outputChannels, 1, widthAfterConv1};
    auto reshapePattern3 = std::make_shared<ngraph::opset1::Constant>(ngraph::element::Type_t::i64, ngraph::Shape{ 4 }, conv2InputShape);
    auto reshape3 = std::make_shared<ngraph::opset1::Reshape>(fc, reshapePattern3, false);
    auto filter2Weights = CommonTestUtils::generate_float_numbers(outputChannels * conv2InputShape[1],
        -0.2f, 0.2f);
    auto conv2 = ngraph::builder::makeConvolution(reshape3, ngPrc, { 1, 1 }, { 1, 1 }, { 0, 0 },
        { 0, 0 }, { 1, 1 }, ngraph::op::PadType::VALID, outputChannels, false, filter2Weights);
    std::vector<size_t> outFormShapes2 = {1, outputChannels * conv2InputShape[3]};
    auto reshapePattern4 = std::make_shared<ngraph::opset1::Constant>(ngraph::element::Type_t::i64, ngraph::Shape{ 2 }, outFormShapes2);
    auto reshape4 = std::make_shared<ngraph::opset1::Reshape>(conv2, reshapePattern4, false);
    function = std::make_shared<ngraph::Function>(reshape4, params, "FcBetweenConvsTest");
}
} // namespace SubgraphTestsDefinitions

View File

@@ -0,0 +1,178 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "shared_test_classes/subgraph/scaleshift_conv_scaleshift.hpp"
#include "ngraph_functions/builders.hpp"
namespace SubgraphTestsDefinitions {
// Builds a human-readable test name encoding all parameters of the case.
std::string ScaleShiftAfterConvTest::getTestCaseName(testing::TestParamInfo<ScaleShiftConvScaleShiftParams> obj) {
    InferenceEngine::Precision prc;
    std::string device;
    std::map<std::string, std::string> config;
    convParams convPar;
    size_t inCh;
    size_t outCh;
    std::tie(prc, device, config, convPar, inCh, outCh) = obj.param;
    std::vector<size_t> shape;
    std::vector<size_t> kernel;
    size_t strideVal;
    std::tie(shape, kernel, strideVal) = convPar;
    std::ostringstream name;
    name << "IS=" << CommonTestUtils::vec2str(shape) << "_"
         << "KS=" << CommonTestUtils::vec2str(kernel) << "_"
         << "S=" << strideVal << "_"
         << "IC=" << inCh << "_"
         << "OC=" << outCh << "_"
         << "netPRC=" << prc.name() << "_"
         << "targetDevice=" << device;
    for (const auto& item : config) {
        name << "_configItem=" << item.first << "_" << item.second;
    }
    return name.str();
}
// Fills the network input blob with random floats in [-2.0, 2.0].
InferenceEngine::Blob::Ptr ScaleShiftAfterConvTest::GenerateInput(const InferenceEngine::InputInfo& info) const {
    auto blob = make_blob_with_precision(info.getTensorDesc());
    blob->allocate();
    const std::vector<float> data = CommonTestUtils::generate_float_numbers(blob->size(), -2.0f, 2.0f);
    auto* dst = blob->buffer().as<float*>();
    for (size_t idx = 0; idx < data.size(); ++idx) {
        dst[idx] = data[idx];
    }
    return blob;
}
// Builds: Parameter -> Reshape(4D, H==1) -> Convolution -> Reshape(4D)
//         -> Multiply(const) -> Add(const) -> Reshape(2D)
// i.e. a scale-shift that consumes the output of a convolution.
void ScaleShiftAfterConvTest::SetUp() {
    InferenceEngine::Precision netPrecision;
    std::map<std::string, std::string> tempConfig;
    convParams convolutionParams;
    size_t inputChannels;
    size_t outputChannels;
    std::tie(netPrecision, targetDevice, tempConfig, convolutionParams, inputChannels, outputChannels) = this->GetParam();
    configuration.insert(tempConfig.begin(), tempConfig.end());
    std::vector<size_t> inputShape;
    std::vector<size_t> kernelShape;
    size_t stride;
    std::tie(inputShape, kernelShape, stride) = convolutionParams;
    auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
    auto params = ngraph::builder::makeParams(ngPrc, { inputShape });
    // Reshape the flat input into NCHW with H == 1 (effectively a 1D convolution).
    std::vector<size_t> convInputShape = {1, inputChannels, 1, inputShape[0] * inputShape[1] / inputChannels};
    auto reshapePattern1 = std::make_shared<ngraph::opset1::Constant>(ngraph::element::Type_t::i64, ngraph::Shape{ 4 }, convInputShape);
    auto reshape1 = std::make_shared<ngraph::opset1::Reshape>(params[0], reshapePattern1, false);
    auto filterWeights = CommonTestUtils::generate_float_numbers(outputChannels * convInputShape[1] * kernelShape[0] * kernelShape[1],
                                                                 -0.2f, 0.2f);
    auto conv = ngraph::builder::makeConvolution(reshape1, ngPrc, { kernelShape[0], kernelShape[1] }, { stride, stride }, { 0, 0 },
                                                 { 0, 0 }, { 1, 1 }, ngraph::op::PadType::VALID, outputChannels, false, filterWeights);
    // VALID padding: output width = (W - KW) / stride + 1.
    auto widthAfterConv = (convInputShape[3] - kernelShape[1]) / stride + 1;
    std::vector<size_t> outFormShapes = { 1, outputChannels * widthAfterConv, 1, 1 };
    auto reshapePattern2 = std::make_shared<ngraph::opset1::Constant>(ngraph::element::Type_t::i64, ngraph::Shape{ 4 }, outFormShapes);
    auto reshape2 = std::make_shared<ngraph::opset1::Reshape>(conv, reshapePattern2, false);
    // Per-element scale-shift applied to the convolution output.
    auto scale = CommonTestUtils::generate_float_numbers(outputChannels * widthAfterConv, -2.0f, 2.0f);
    auto shift = CommonTestUtils::generate_float_numbers(outputChannels * widthAfterConv, -2.0f, 2.0f);
    auto mul_const = std::make_shared<ngraph::op::Constant>(ngPrc, outFormShapes, scale);
    auto mul = std::make_shared<ngraph::opset1::Multiply>(reshape2, mul_const);
    auto add_const = std::make_shared<ngraph::op::Constant>(ngPrc, outFormShapes, shift);
    auto add = std::make_shared<ngraph::opset1::Add>(mul, add_const);
    outFormShapes = {1, outputChannels * widthAfterConv };
    auto reshapePattern3 = std::make_shared<ngraph::opset1::Constant>(ngraph::element::Type_t::i64, ngraph::Shape{ 2 }, outFormShapes);
    auto reshape3 = std::make_shared<ngraph::opset1::Reshape>(add, reshapePattern3, false);
    // Fix: the function previously terminated at `mul`, leaving `add` and
    // `reshape3` as dead code so the shift half of the scale-shift was never
    // part of the tested graph.
    function = std::make_shared<ngraph::Function>(reshape3, params, "ScaleShiftAfterConvTest");
}
// Builds a human-readable test name encoding all parameters of the case.
std::string ScaleShiftBeforeConvTest::getTestCaseName(testing::TestParamInfo<ScaleShiftConvScaleShiftParams> obj) {
    InferenceEngine::Precision prc;
    std::string device;
    std::map<std::string, std::string> config;
    convParams convPar;
    size_t inCh;
    size_t outCh;
    std::tie(prc, device, config, convPar, inCh, outCh) = obj.param;
    std::vector<size_t> shape;
    std::vector<size_t> kernel;
    size_t strideVal;
    std::tie(shape, kernel, strideVal) = convPar;
    std::ostringstream name;
    name << "IS=" << CommonTestUtils::vec2str(shape) << "_"
         << "KS=" << CommonTestUtils::vec2str(kernel) << "_"
         << "S=" << strideVal << "_"
         << "IC=" << inCh << "_"
         << "OC=" << outCh << "_"
         << "netPRC=" << prc.name() << "_"
         << "targetDevice=" << device;
    for (const auto& item : config) {
        name << "_configItem=" << item.first << "_" << item.second;
    }
    return name.str();
}
// Fills the network input blob with random floats in [-0.1, 0.1].
InferenceEngine::Blob::Ptr ScaleShiftBeforeConvTest::GenerateInput(const InferenceEngine::InputInfo& info) const {
    auto blob = make_blob_with_precision(info.getTensorDesc());
    blob->allocate();
    const std::vector<float> data = CommonTestUtils::generate_float_numbers(blob->size(), -0.1f, 0.1f);
    auto* dst = blob->buffer().as<float*>();
    for (size_t idx = 0; idx < data.size(); ++idx) {
        dst[idx] = data[idx];
    }
    return blob;
}
void ScaleShiftBeforeConvTest::SetUp() {
    // Builds the test graph:
    //   Parameter -> Reshape(1xCx1x1) -> Multiply -> Add -> Reshape(NCHW) -> Convolution -> Reshape(2D)
    // i.e. a scale-shift applied right before a 1D convolution, to verify the
    // plugin handles the preceding per-channel affine op correctly.
    InferenceEngine::Precision netPrecision;
    std::map<std::string, std::string> tempConfig;
    convParams convolutionParams;
    size_t inputChannels;
    size_t outputChannels;
    std::tie(netPrecision, targetDevice, tempConfig, convolutionParams, inputChannels, outputChannels) = this->GetParam();
    configuration.insert(tempConfig.begin(), tempConfig.end());

    std::vector<size_t> inputShape;
    std::vector<size_t> kernelShape;
    size_t stride;
    std::tie(inputShape, kernelShape, stride) = convolutionParams;

    auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
    auto params = ngraph::builder::makeParams(ngPrc, { inputShape });

    // Put all data into the channel dimension so Multiply/Add act per element.
    std::vector<size_t> convInputShape = {1, inputShape[1], 1, 1};
    auto reshapePattern1 = std::make_shared<ngraph::opset1::Constant>(ngraph::element::Type_t::i64, ngraph::Shape{ 4 }, convInputShape);
    auto reshape1 = std::make_shared<ngraph::opset1::Reshape>(params[0], reshapePattern1, false);

    auto scale = CommonTestUtils::generate_float_numbers(convInputShape[1], -2.0f, 2.0f);
    auto shift = CommonTestUtils::generate_float_numbers(convInputShape[1], -2.0f, 2.0f);
    auto mul_const = std::make_shared<ngraph::op::Constant>(ngPrc, convInputShape, scale);
    auto mul = std::make_shared<ngraph::opset1::Multiply>(reshape1, mul_const);
    auto add_const = std::make_shared<ngraph::op::Constant>(ngPrc, convInputShape, shift);
    auto add = std::make_shared<ngraph::opset1::Add>(mul, add_const);

    // Re-lay the scale-shifted data out as [N, C, H, W] for the convolution.
    convInputShape = {1, inputChannels, 1, inputShape[1] / inputChannels};
    auto reshapePattern2 = std::make_shared<ngraph::opset1::Constant>(ngraph::element::Type_t::i64, ngraph::Shape{ 4 }, convInputShape);
    // Fix: the convolution must consume the full scale-shift result ('add').
    // Previously 'mul' was fed here, silently dropping the Add node from the
    // built function, so the shift half of the scale-shift was never tested.
    auto reshape2 = std::make_shared<ngraph::opset1::Reshape>(add, reshapePattern2, false);

    auto filterWeights = CommonTestUtils::generate_float_numbers(outputChannels * convInputShape[1] * kernelShape[0] * kernelShape[1],
                                                                 -0.1f, 0.1f);
    auto conv = ngraph::builder::makeConvolution(reshape2, ngPrc, { kernelShape[0], kernelShape[1] }, { stride, stride }, { 0, 0 },
                                                 { 0, 0 }, { 1, 1 }, ngraph::op::PadType::VALID, outputChannels, false, filterWeights);

    // VALID padding: output width = (W - kernelW) / stride + 1.
    auto widthAfterReshape = (convInputShape[3] - kernelShape[1]) / stride + 1;
    std::vector<size_t> outFormShapes = {1, outputChannels * widthAfterReshape };
    auto reshapePattern3 = std::make_shared<ngraph::opset1::Constant>(ngraph::element::Type_t::i64, ngraph::Shape{ 2 }, outFormShapes);
    auto reshape3 = std::make_shared<ngraph::opset1::Reshape>(conv, reshapePattern3, false);
    function = std::make_shared<ngraph::Function>(reshape3, params, "ScaleShiftBeforeConvTest");
}
} // namespace SubgraphTestsDefinitions

View File

@@ -1,4 +1,4 @@
// Copyright (C) 2019 Intel Corporation
// Copyright (C) 2019-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
@@ -29,12 +29,18 @@ void SplitConvConcat::SetUp() {
auto split = ngraph::builder::makeSplit(params[0], ngPrc, 2, 1);
auto conv1 = ngraph::builder::makeConvolution(split->output(0), ngPrc, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
ngraph::op::PadType::EXPLICIT, 5);
std::vector<float> filterWeights1;
std::vector<float> filterWeights2;
if (targetDevice == CommonTestUtils::DEVICE_GNA) {
filterWeights1 = CommonTestUtils::generate_float_numbers(8 * inputShape[1] / 2 * 3, -0.2f, 0.2f);
filterWeights2 = CommonTestUtils::generate_float_numbers(8 * inputShape[1] / 2 * 3, -0.2f, 0.2f);
}
auto conv1 = ngraph::builder::makeConvolution(split->output(0), ngPrc, {1, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
ngraph::op::PadType::VALID, 8, false, filterWeights1);
auto relu1 = std::make_shared<ngraph::opset1::Relu>(conv1);
auto conv2 = ngraph::builder::makeConvolution(split->output(1), ngPrc, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
ngraph::op::PadType::EXPLICIT, 5);
auto conv2 = ngraph::builder::makeConvolution(split->output(1), ngPrc, {1, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
ngraph::op::PadType::VALID, 8, false, filterWeights2);
auto relu2 = std::make_shared<ngraph::opset1::Relu>(conv2);
auto concat = std::make_shared<ngraph::opset1::Concat>(ngraph::OutputVector{relu1->output(0), relu2->output(0)}, 1);