diff --git a/inference-engine/src/gna_plugin/gna_graph_compiler.cpp b/inference-engine/src/gna_plugin/gna_graph_compiler.cpp
index 76e2f81940e..541bd142c3e 100644
--- a/inference-engine/src/gna_plugin/gna_graph_compiler.cpp
+++ b/inference-engine/src/gna_plugin/gna_graph_compiler.cpp
@@ -453,6 +453,12 @@ void GNAGraphCompiler::finalizeConvolution1DPrimitive(InferenceEngine::CNNLayerP
     size_t num_data_bytes_in = (num_inputs + num_input_padding) * inputs->getPrecision().size();
 
     auto connectedInputLayer = connectInput(layer, ptr_inputs, num_data_bytes_in).input;
+    // Skip FakeQuantize and ScaleShift between Convolution and Input
+    if (LayerInfo(connectedInputLayer).isFakeQuantize()) {
+        connectedInputLayer = CNNNetPrevLayerSkipCertain(connectedInputLayer, 0, [](CNNLayerPtr l) {
+            return LayerInfo(l).isScaleShift();
+        });
+    }
 
     // TODO: convolution might be not the first layer in sorted order but connected via split for example - dont know how kaldi will handle that
     if (!dnn->do_rotate_input) {
@@ -626,6 +632,7 @@ void GNAGraphCompiler::finalizeConvolution2DPrimitive(InferenceEngine::CNNLayerP
         ptr_weights,
         ptr_biases);
 
+    currentComponent.num_bytes_per_input = inputs->getPrecision().size();
     currentComponent.num_bytes_per_output = outputs->getPrecision().size();
 
     if (inputs->getLayout() == Layout::NHWC) {
diff --git a/inference-engine/src/gna_plugin/gna_plugin.cpp b/inference-engine/src/gna_plugin/gna_plugin.cpp
index df0a71fc1ce..a1f7e003dc7 100644
--- a/inference-engine/src/gna_plugin/gna_plugin.cpp
+++ b/inference-engine/src/gna_plugin/gna_plugin.cpp
@@ -56,6 +56,8 @@
 #include 
 #include 
 
+#include "transformations/remove_extra_reshapes.hpp"
+
 #if GNA_LIB_VER == 2
 #include 
@@ -663,6 +665,7 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) {
         manager.register_pass();
         manager.register_pass();
         manager.register_pass();
+        manager.register_pass<RemoveExtraReshapes>();
         // UnrollTI should be the last transformation in the transformation pipeline
         manager.register_pass();
diff --git a/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp b/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp
index 0a7a4a44e02..4c40692d239 100644
--- a/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp
+++ b/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp
@@ -371,19 +371,21 @@ namespace {
 void ReorderMaxPoolPass::run() {
     // detecting following pattern
-    // conv->relu->maxpooling
-    // changing it to conv->maxpooling->relu
+    // conv->activation->maxpooling
+    // changing it to conv->maxpooling->activation
     for (auto & l : *pLayers) {
         auto pool = LayerInfo(l);
         if (!pool.isMaxPooling()) continue;
 
         // don't reorder if pooling is 2D for CNN2D
         auto pooling = dynamic_cast<PoolingLayer*>(l.get());
-        if (pooling == nullptr || (is2D(pooling->_kernel) || is2D(pooling->_stride))) continue;
+        // todo: return the check for stride after it'll be fixed in MO for Kaldi models
+        if (pooling == nullptr || (is2D(pooling->_kernel))) continue;
 
         // checking prev layer type
-        auto activation = LayerInfo(CNNNetPrevLayer(l));
-        if (!activation.isActivation()) continue;
+        auto actLayer = CNNNetPrevLayer(l);
+        auto activation = LayerInfo(actLayer);
+        if (!activation.isActivation() || actLayer->insData.size() > 1) continue;
 
         // if activation came from convolution
         auto convolution = LayerInfo(CNNNetPrevLayer(static_cast<InferenceEngine::CNNLayerPtr>(activation)));
diff --git a/inference-engine/src/gna_plugin/transformations/remove_extra_reshapes.cpp b/inference-engine/src/gna_plugin/transformations/remove_extra_reshapes.cpp
new file mode 100644
index 00000000000..cbb4cb625d0
--- /dev/null
+++ b/inference-engine/src/gna_plugin/transformations/remove_extra_reshapes.cpp
@@ -0,0 +1,31 @@
+// Copyright (C) 2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "transformations/remove_extra_reshapes.hpp"
+
+#include <ngraph/opsets/opset7.hpp>
+#include <ngraph/pattern/op/wrap_type.hpp>
+
+using namespace GNAPluginNS;
+
+NGRAPH_RTTI_DEFINITION(RemoveExtraReshapes, "RemoveExtraReshapes", 0);
+
+RemoveExtraReshapes::RemoveExtraReshapes() {
+    const auto reshape = ngraph::pattern::wrap_type<ngraph::opset7::Reshape>();
+    const auto pooling = ngraph::pattern::wrap_type<ngraph::opset7::MaxPool>({reshape});
+
+    ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher &m) {
+        const auto& pattern_map = m.get_pattern_value_map();
+        const auto reshape_node = pattern_map.at(reshape).get_node_shared_ptr();
+        if (reshape_node->get_input_shape(0) != reshape_node->get_output_shape(0)) {
+            return false;
+        }
+
+        ngraph::replace_output_update_name(reshape_node->output(0), reshape_node->input_value(0));
+        return true;
+    };
+
+    auto m = std::make_shared<ngraph::pattern::Matcher>(pooling, "RemoveExtraReshapes");
+    this->register_matcher(m, callback);
+}
diff --git a/inference-engine/src/gna_plugin/transformations/remove_extra_reshapes.hpp b/inference-engine/src/gna_plugin/transformations/remove_extra_reshapes.hpp
new file mode 100644
index 00000000000..4f189abdba5
--- /dev/null
+++ b/inference-engine/src/gna_plugin/transformations/remove_extra_reshapes.hpp
@@ -0,0 +1,20 @@
+// Copyright (C) 2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <ngraph/pass/graph_rewrite.hpp>
+
+namespace GNAPluginNS {
+
+/**
+ * @brief Removes reshapes before MaxPool which do nothing. Such reshapes can be a result of conversion from IR10 to IR7.
+ */
+class RemoveExtraReshapes : public ngraph::pass::MatcherPass {
+public:
+  NGRAPH_RTTI_DECLARATION;
+  RemoveExtraReshapes();
+};
+
+} // namespace GNAPluginNS
\ No newline at end of file
diff --git a/inference-engine/tests/functional/plugin/gna/pass_tests/fq_maxpool_reordering.cpp b/inference-engine/tests/functional/plugin/gna/pass_tests/fq_maxpool_reordering.cpp
new file mode 100644
index 00000000000..316df2ca9d7
--- /dev/null
+++ b/inference-engine/tests/functional/plugin/gna/pass_tests/fq_maxpool_reordering.cpp
@@ -0,0 +1,148 @@
+// Copyright (C) 2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <memory>
+#include <tuple>
+#include <vector>
+#include <string>
+#include <map>
+
+#include <ie_core.hpp>
+
+#include "common_test_utils/common_utils.hpp"
+#include "functional_test_utils/plugin_cache.hpp"
+#include "shared_test_classes/base/layer_test_utils.hpp"
+#include "functional_test_utils/blob_utils.hpp"
+#include "ngraph_functions/utils/ngraph_helpers.hpp"
+#include "ngraph_functions/builders.hpp"
+
+#include "ngraph_functions/pass/convert_prc.hpp"
+
+typedef std::tuple<
+    InferenceEngine::Precision,         // Network Precision
+    std::string,                        // Target Device
+    std::map<std::string, std::string>, // Configuration
+    std::vector<size_t>,                // Input Shape
+    std::pair<float, float>,            // Input Min and Max
+    size_t                              // Levels
+> fqMaxpoolReorderingParams;
+
+namespace LayerTestsDefinitions {
+
+class FQMaxpoolReordering : public testing::WithParamInterface<fqMaxpoolReorderingParams>,
+    public LayerTestsUtils::LayerTestsCommon {
+    float inputDataMin = 0.0f;
+    float inputDataMax = 0.0f;
+    float inputDataResolution = 1.0f;
+
+public:
+    static std::string getTestCaseName(testing::TestParamInfo<fqMaxpoolReorderingParams> obj) {
+        InferenceEngine::Precision netPrecision;
+        std::string targetDevice;
+        std::map<std::string, std::string> configuration;
+        std::vector<size_t> inputShape;
+        std::pair<float, float> inputMinMax;
+        size_t levels = 0;
+        std::tie(netPrecision, targetDevice, configuration, inputShape, inputMinMax, levels) = obj.param;
+        std::ostringstream result;
+        result << "netPRC=" << netPrecision.name() << "_";
+        result << "targetDevice=" << targetDevice << "_";
+        for (auto const& configItem : configuration) {
+            result << "_configItem=" << configItem.first << "_" << configItem.second;
+        }
+        result << "_inputShape=" << CommonTestUtils::vec2str(inputShape);
+        result << "_inputMinMax=(" << inputMinMax.first << ".." << inputMinMax.second << ")";
+        result << "_levels=" << levels;
+
+        return result.str();
+    }
+
+    InferenceEngine::Blob::Ptr GenerateInput(const InferenceEngine::InputInfo& info) const {
+        return FuncTestUtils::createAndFillBlob(info.getTensorDesc(), inputDataMax - inputDataMin, inputDataMin, 1 / inputDataResolution);
+    }
+
+protected:
+    void SetUp() override {
+        InferenceEngine::Precision netPrecision;
+
+        std::vector<size_t> inputShape;
+        std::pair<float, float> inputMinMax;
+        size_t levels = 0;
+        std::tie(netPrecision, targetDevice, configuration, inputShape, inputMinMax, levels) = this->GetParam();
+        auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
+
+        // Store the tested range so GenerateInput and the weight FQ below use it instead of the default 0.0f
+        inputDataMin = inputMinMax.first;
+        inputDataMax = inputMinMax.second;
+
+        auto inputLowNode = ngraph::builder::makeConstant<float>(ngPrc, {1}, { inputMinMax.first });
+        auto inputHighNode = ngraph::builder::makeConstant<float>(ngPrc, {1}, { inputMinMax.second });
+
+        auto inputVector = ngraph::builder::makeParams(ngPrc, {inputShape});
+
+        auto inputFQ = std::make_shared<ngraph::opset7::FakeQuantize>(inputVector[0],
+            inputLowNode, inputHighNode, inputLowNode, inputHighNode, levels);
+
+        auto filterWeightsNode = ngraph::builder::makeConstant<float>(ngPrc, {8, inputShape[1], 1, 8}, { 1.0f });
+        auto convLowNode = ngraph::builder::makeConstant<float>(ngraph::element::f32, std::vector<size_t>{ 1 }, std::vector<float>{inputDataMin});
+        auto convHighNode = ngraph::builder::makeConstant<float>(ngraph::element::f32, std::vector<size_t>{ 1 }, std::vector<float>{inputDataMax});
+        auto convWeightsFQNode = std::make_shared<ngraph::opset7::FakeQuantize>(filterWeightsNode,
+            convLowNode, convHighNode, convLowNode, convHighNode, levels);
+        auto convWeightsFQ = std::dynamic_pointer_cast<ngraph::opset7::FakeQuantize>(convWeightsFQNode);
+
+        auto conv = std::make_shared<ngraph::opset7::Convolution>(inputFQ, convWeightsFQ, std::vector<size_t>{ 1, 1 },
+                                                                  std::vector<ptrdiff_t>{ 0, 0 }, std::vector<ptrdiff_t>{ 0, 0 },
+                                                                  std::vector<size_t>{ 1, 1 },
+                                                                  ngraph::op::PadType::VALID);
+        auto biasesWeightsNode = ngraph::builder::makeConstant<float>(ngPrc, {}, std::vector<float>{ 0.0f });
+        auto add = std::make_shared<ngraph::opset7::Add>(conv, biasesWeightsNode);
+
+        auto convFQNode = std::make_shared<ngraph::opset7::FakeQuantize>(add,
+            inputLowNode, inputHighNode, inputLowNode, inputHighNode, levels);
+
+        auto maxpool = ngraph::builder::makePooling(convFQNode, {1, 2}, {0, 0}, {0, 0}, {1, 2}, ngraph::op::RoundingType::FLOOR,
+                                                    ngraph::op::PadType::VALID, false, ngraph::helpers::PoolingTypes::MAX);
+
+        ngraph::ResultVector results{ std::make_shared<ngraph::opset7::Result>(maxpool)};
+        function = std::make_shared<ngraph::Function>(results, inputVector, "FQMaxPoolReorder");
+    }
+};
+
+TEST_P(FQMaxpoolReordering, CompareWithRefImpl) {
+    Run();
+};
+
+const std::vector<InferenceEngine::Precision> netPrecisions = {
+    InferenceEngine::Precision::FP32,
+    InferenceEngine::Precision::FP16
+};
+
+const std::vector<std::map<std::string, std::string>> configs = {
+    {
+        {"GNA_DEVICE_MODE", "GNA_SW_EXACT"},
+    }
+};
+
+const std::vector<std::vector<size_t>> inputShape = {
+    {1, 1, 1, 1024},
+    {1, 8, 1, 168},
+};
+
+const std::vector<std::pair<float, float>> inputMinMax = {
+    {-0.5, 0.5},
+    {-2, 2},
+    {-8, 8}
+};
+
+const std::vector<size_t> levels = {
+    65535,
+};
+
+INSTANTIATE_TEST_CASE_P(smoke_fq_maxpool_reordering, FQMaxpoolReordering,
+    ::testing::Combine(
+        ::testing::ValuesIn(netPrecisions),
+        ::testing::Values(CommonTestUtils::DEVICE_GNA),
+        ::testing::ValuesIn(configs),
+        ::testing::ValuesIn(inputShape),
+        ::testing::ValuesIn(inputMinMax),
+        ::testing::ValuesIn(levels)),
+    FQMaxpoolReordering::getTestCaseName);
+} // namespace LayerTestsDefinitions
diff --git a/inference-engine/tests/functional/plugin/gna/shared_tests_instances/subgraph_tests/fq_conv_fq_affine.cpp b/inference-engine/tests/functional/plugin/gna/shared_tests_instances/subgraph_tests/fq_conv_fq_affine.cpp
index 28f414ee11d..e48d4ad12c0 100644
--- a/inference-engine/tests/functional/plugin/gna/shared_tests_instances/subgraph_tests/fq_conv_fq_affine.cpp
+++ b/inference-engine/tests/functional/plugin/gna/shared_tests_instances/subgraph_tests/fq_conv_fq_affine.cpp
@@ -46,10 +46,13 @@ const auto convParams = ::testing::Combine(
     ::testing::ValuesIn(outputChannels)
 );
 
+const std::vector<bool> permute = {false, true};
+
 INSTANTIATE_TEST_CASE_P(smoke_FqConvFqAffineTest, FqConvFqAffineTest,
                         ::testing::Combine(
                             fqParams,
                             convParams,
+                            ::testing::ValuesIn(permute),
                             ::testing::ValuesIn(netPrecisions),
                             ::testing::ValuesIn(inputShapes),
                             ::testing::Values(CommonTestUtils::DEVICE_GNA),
diff --git a/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/fq_conv_fq_affine.hpp b/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/fq_conv_fq_affine.hpp
index a8ca812b749..30c014dd498 100644
--- a/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/fq_conv_fq_affine.hpp
+++ b/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/fq_conv_fq_affine.hpp
@@ -30,6 +30,7 @@ typedef std::tuple<
 typedef std::tuple<
     FqSpecificParams,
     ConvParams,
+    bool,                           // Permute after convolution
     InferenceEngine::Precision,     // Net precision
     InferenceEngine::SizeVector,    // Input shapes
     LayerTestsUtils::TargetDevice,  // Device name
diff --git a/inference-engine/tests/functional/shared_test_classes/src/subgraph/fq_conv_fq_affine.cpp b/inference-engine/tests/functional/shared_test_classes/src/subgraph/fq_conv_fq_affine.cpp
index cd4370d9661..6255b41db01 100644
--- a/inference-engine/tests/functional/shared_test_classes/src/subgraph/fq_conv_fq_affine.cpp
+++ b/inference-engine/tests/functional/shared_test_classes/src/subgraph/fq_conv_fq_affine.cpp
@@ -9,11 +9,12 @@ namespace SubgraphTestsDefinitions {
 std::string FqConvFqAffineTest::getTestCaseName(testing::TestParamInfo<FqConvFqAffineTestParamsSet> obj) {
     FqSpecificParams fqParams;
     ConvParams convParams;
+    bool permute;
     InferenceEngine::Precision netPrecision;
     InferenceEngine::SizeVector inputShapes;
     std::string targetDevice;
     std::map<std::string, std::string> config;
-    std::tie(fqParams, convParams, netPrecision, inputShapes, targetDevice, config) = obj.param;
+    std::tie(fqParams, convParams, permute, netPrecision, inputShapes, targetDevice, config) = obj.param;
 
     std::vector<size_t> levels;
     std::vector<float> inputArg;
@@ -39,17 +40,19 @@ std::string FqConvFqAffineTest::getTestCaseName(testing::TestParamInfo
 void FqConvFqAffineTest::SetUp() {
     FqSpecificParams fqParams;
     ConvParams convParams;
+    bool permute;
     std::vector<size_t> inputShape;
     std::map<std::string, std::string> config;
     auto netPrecision = InferenceEngine::Precision::UNSPECIFIED;
-    std::tie(fqParams, convParams, netPrecision, inputShape, targetDevice, config) = this->GetParam();
+    std::tie(fqParams, convParams, permute, netPrecision, inputShape, targetDevice, config) = this->GetParam();
     configuration.insert(config.begin(), config.end());
 
     std::vector<size_t> levels;
@@ -100,8 +103,19 @@ void FqConvFqAffineTest::SetUp() {
     auto heightAfterConv = (convInputShape[2] - kernelShape[0]) / strides[0] + 1;
     std::vector<size_t> outFormShapes = {1, outputChannels * widthAfterConv * heightAfterConv };
 
+    ngraph::Output<ngraph::Node> nodeBeforeReshape;
+    if (permute) {
+        auto permuteOrder = std::make_shared<ngraph::opset1::Constant>(ngraph::element::i64,
+                                                                       ngraph::Shape{4},
+                                                                       ngraph::Shape{{0, 3, 2, 1}});
+        auto transpose = std::make_shared<ngraph::opset1::Transpose>(add, permuteOrder);
+        nodeBeforeReshape = transpose;
+    } else {
+        nodeBeforeReshape = add;
+    }
+
     auto reshapePattern2 = std::make_shared<ngraph::opset1::Constant>(ngraph::element::Type_t::i64, ngraph::Shape{ 2 }, outFormShapes);
-    auto reshape2 = std::make_shared<ngraph::opset1::Reshape>(add, reshapePattern2, false);
+    auto reshape2 = std::make_shared<ngraph::opset1::Reshape>(nodeBeforeReshape, reshapePattern2, false);
 
     auto matMulWeightsNode = ngraph::builder::makeConstant<float>(ngPrc, {outFormShapes[1], outFormShapes[1]}, { 1.0f });
     auto matMulLowNode = ngraph::builder::makeConstant<float>(ngraph::element::f32, std::vector<size_t>{ 1 }, std::vector<float>{inputDataMin});