From 3578ee9c3f04bbf3c7d8c582c86065a34b846f5b Mon Sep 17 00:00:00 2001
From: Elizaveta Lobanova <elizaveta.lobanova@intel.com>
Date: Thu, 31 Mar 2022 13:21:27 +0300
Subject: [PATCH] [GNA] Remove extra FQ layers from the final network (#10599)

* [GNA] Fuse all FakeQuantize layers with their previous layers

* [GNA] Fuse FQ with the previous layer when the FQ is not required for a precision change

* [GNA] Fix MatMulOverloadCorrectionTest
---
 src/plugins/intel_gna/backend/make_pwl.cpp    |  4 +-
 .../intel_gna/frontend/scale_factor_calc.hpp  |  3 +-
 .../intel_gna/optimizer/gna_pass_manager.cpp  | 66 ++++++++-----------
 .../scale_factors_tests/eltwise_act_fq.cpp    |  2 +-
 .../matmul_overload_correction.cpp            |  6 +-
 .../weighable_layer_without_fq.cpp            | 27 ++++----
 .../src/subgraph/multiple_input_fq.cpp        | 34 +++++++---
 7 files changed, 76 insertions(+), 66 deletions(-)

diff --git a/src/plugins/intel_gna/backend/make_pwl.cpp b/src/plugins/intel_gna/backend/make_pwl.cpp
index 6362cee472a..38c263082e0 100644
--- a/src/plugins/intel_gna/backend/make_pwl.cpp
+++ b/src/plugins/intel_gna/backend/make_pwl.cpp
@@ -229,7 +229,7 @@ void make_gna_pwl(const DnnActivation&  fun,
             int32_t x_upper = INT32_MAX;
             int16_t y_lower = y_min;
             int16_t y_upper = y_max;
-            if (fun == kActFakeQuantize && fun.fqParams.set) {
+            if ((fun == kActFakeQuantize || fun == kActIdentity) && fun.fqParams.set) {
                 x_lower = std::max(static_cast<int64_t>(*fun.fqParams.input_low * in_scale), static_cast<int64_t>(x_lower));
                 x_upper = std::min(static_cast<int64_t>(*fun.fqParams.input_high * in_scale), static_cast<int64_t>(x_upper));
                 y_lower = std::max(static_cast<int32_t>(*fun.fqParams.input_low * out_scale), static_cast<int32_t>(y_lower));
@@ -253,7 +253,7 @@ void make_gna_pwl(const DnnActivation&  fun,
                         x_upper = FLOAT_TO_INT32(y_upper  * in_scale / out_scale);
                     }
                 }
-            } else if (fun == kActIdentity) {
+            } else if (fun == kActIdentity && !fun.fqParams.set) {
                 if (x_lower < y_lower * in_scale / out_scale) x_lower = FLOAT_TO_INT32(y_lower * in_scale / out_scale);
                 if (x_upper > y_upper * in_scale / out_scale) x_upper = FLOAT_TO_INT32(y_upper * in_scale / out_scale);
                 if (y_lower < x_lower * out_scale / in_scale) y_lower = FLOAT_TO_INT16(x_lower * out_scale / in_scale);
diff --git a/src/plugins/intel_gna/frontend/scale_factor_calc.hpp b/src/plugins/intel_gna/frontend/scale_factor_calc.hpp
index 2f97459257f..1ef90832b8d 100644
--- a/src/plugins/intel_gna/frontend/scale_factor_calc.hpp
+++ b/src/plugins/intel_gna/frontend/scale_factor_calc.hpp
@@ -538,7 +538,8 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer*, QUANT_DESC> {
             auto maxOutValue = quantizedParams->_dst_quant.GetMaxValues().front();
             auto absMax = std::max(std::abs(minOutValue), std::abs(maxOutValue));
 
-            result = CalculateScaleFactorFromStats(quantizedParams->_dst_quant.GetLevels(), minOutValue, maxOutValue);
+            auto levels = std::min(quantizedParams->_dst_quant.GetLevels(), static_cast<size_t>(std::numeric_limits<uint16_t>::max()) + 1);
+            result = CalculateScaleFactorFromStats(levels, minOutValue, maxOutValue);
             if (std::isinf(result) || fp32eq(absMax, 0.0f)) {
                 result = max_activation_scale_factor;
             }
diff --git a/src/plugins/intel_gna/optimizer/gna_pass_manager.cpp b/src/plugins/intel_gna/optimizer/gna_pass_manager.cpp
index ef759b492b9..7ca2ba615b4 100644
--- a/src/plugins/intel_gna/optimizer/gna_pass_manager.cpp
+++ b/src/plugins/intel_gna/optimizer/gna_pass_manager.cpp
@@ -74,7 +74,8 @@ static const char softSignLayersCounter[] = "numSoftSignLayers";
 static void insertDiagonalLayerBetween(InferenceEngine::CNNLayerPtr prevLayer,
                                        InferenceEngine::CNNLayerPtr nextLayer,
                                        std::shared_ptr<IPassManager> passmanager,
-                                       float fillValue) {
+                                       float fillValue,
+                                       size_t in_data_idx = invalid_data_idx) {
     auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(prevLayer);
     auto diagName = std::string("SyntheticScaleShift_") + std::to_string(passmanager->getIntVar(diagonalLayersCounterName)++);
     gnalog() << "Inserted Diagonal Layer " << diagName <<" between: " << prevLayer->name << " and " << nextLayer->name << "\n" << std::flush;
@@ -104,7 +105,7 @@ static void insertDiagonalLayerBetween(InferenceEngine::CNNLayerPtr prevLayer,
     getCreatorLayer(dataPtr) = diagonalWithQuant;
     diagonalWithQuant->outData.push_back(dataPtr);
     // actual insertion
-    CNNNetworkInsertLayer(prevLayer, nextLayer, diagonalWithQuant);
+    CNNNetworkInsertLayer(prevLayer, nextLayer, diagonalWithQuant, invalid_data_idx, in_data_idx);
 }
 
 /**
@@ -137,7 +138,8 @@ static bool hasNextFuncLayer(const CNNLayerPtr layer) {
             });
 }
 
-static std::vector<CNNLayerPtr> getCandidatesForIdentityInsertion(const CNNLayerPtr l, std::shared_ptr<IPassManager> passmanager) {
+static std::vector<CNNLayerPtr> getCandidatesForIdentityInsertion(const CNNLayerPtr l, std::shared_ptr<IPassManager> passmanager,
+                                                                  bool skipFq = false) {
     std::vector<CNNLayerPtr> prevLayers;
 
     // skipping memory inputs and true inputs layers
@@ -146,9 +148,9 @@ static std::vector<CNNLayerPtr> getCandidatesForIdentityInsertion(const CNNLayer
     auto eltwise = dynamic_cast<InferenceEngine::EltwiseLayer *>(l.get());
     auto concat = dynamic_cast<InferenceEngine::ConcatLayer *>(l.get());
 
-    auto PrevFunctionalLayer = [](CNNLayerPtr l, int idx = 0) {
-        auto prevLayer = CNNNetPrevLayerSkipCertain(l, idx, [](CNNLayerPtr ptr) {
-            return LayerInfo(ptr).isNonFunctional();
+    auto PrevFunctionalLayer = [skipFq](CNNLayerPtr l, int idx = 0) {
+        auto prevLayer = CNNNetPrevLayerSkipCertain(l, idx, [skipFq](CNNLayerPtr ptr) {
+            return LayerInfo(ptr).isNonFunctional() || skipFq && LayerInfo(ptr).isFakeQuantize();
         });
         gnalog() << "CNNNetPrevLayerSkipCertain for :: " << l->name << "returned: " << prevLayer->name << std::endl;
         return prevLayer;
@@ -308,7 +310,7 @@ void InsertDiagonalLayerPass::run() {
                 continue;
         }
         auto prevDirectLayer = CNNNetPrevLayer(l, 0);
-        insertDiagonalLayerBetween(prevDirectLayer, l, getPassManager(), 1.f);
+        insertDiagonalLayerBetween(prevDirectLayer, l, getPassManager(), 1.f, 0);
     }
 }
 
@@ -820,18 +822,6 @@ void InsertIdentityLayerPass::run() {
 
             gnalog() << "Inserted "<< identityLayer->name << " between: " << prev->name << " and " << true_layer->name << "\n" << std::flush;
 
-            // wether 1 identity or all outputs TODO possible grouping here, need to implement special grouped inserter
-            bool notAll = false;
-            for (auto && nextData  : prev->outData) {
-                for (auto && nextLayer : getInputTo(nextData)) {
-                    if (nextLayer.second.get() == l.get())
-                        continue;
-                    if (getCandidatesForIdentityInsertion(nextLayer.second, getPassManager()).empty() &&
-                        hasNextFuncLayer(nextLayer.second)) {
-                        notAll = true;
-                    }
-                }
-            }
             // copy offset - to be used while connecting outputs
             if (prev->params.find("output_offset") != prev->params.end()) {
                 identityLayer->params["output_offset"] = prev->params["output_offset"];
@@ -841,7 +831,7 @@ void InsertIdentityLayerPass::run() {
                 identityLayer->params["original_num_rows"] = prev->params["original_num_rows"];
             }
 
-            CNNNetworkInsertLayer(prev, notAll ? true_layer : CNNLayerPtr(nullptr), identityLayer);
+            CNNNetworkInsertLayer(prev, CNNLayerPtr(nullptr), identityLayer);
         }
     }
 }
@@ -1341,6 +1331,12 @@ void InsertSplitAligningFilterPass::run() {
                 if (getInputTo(splitOutput).empty()) {
                     gnalog() << "Output port: " << splitOutIndex << " of " << l->name << " unconnected, skipping\n";
                 } else {
+                    auto lastDimSize = GetDataDimSize(splitOutput, 1);
+                    if (lastDimSize != outputSize) {
+                        THROW_GNA_EXCEPTION << l->name << " Convolution Filter doesn't support these input dimensions: lastDimSize="
+                            << lastDimSize << ", outputSize=" << outputSize;
+                    }
+
                     // this split output not beginning from 64 bytes aligned boundary - need to correct by aligning filter layer
                     // insert the filter
                     auto filterName = std::string("AlignFilter_") + std::to_string(numOfFilterLayers++);
@@ -2054,32 +2050,25 @@ void MoveFakeQuantizeLayerIntoQuantParamsPass :: run() {
         return false;
     };
 
-    auto allowFQFuse = [](CNNLayerPtr layer) -> bool {
-        auto doNotSkip = [](CNNLayerPtr layer) {
-            return false;
+    auto allowFQFuse = [this](CNNLayerPtr layer) -> bool {
+        auto skipNonFunctionalOrMemory = [](CNNLayerPtr layer) {
+            return LayerInfo(layer).isNonFunctional() || LayerInfo(layer).isMemory();
         };
-
-        if (CNNNetGetAllNextLayersSkipCertain(layer, -1, doNotSkip).empty()) {
-            return false;
-        }
-
         auto skipNonFunctional = [](CNNLayerPtr layer) {
             return LayerInfo(layer).isNonFunctional();
         };
-
-        auto prevLayer = CNNNetPrevLayerSkipCertain(layer, 0, skipNonFunctional);
-        if (LayerInfo(prevLayer).isActivation() || LayerInfo(prevLayer).isConst() || LayerInfo(prevLayer).isMemory()) {
-            return true;
+        // Don't fuse FQ if it's the output layer for the network
+        if (CNNNetGetAllNextLayersSkipCertain(layer, -1, skipNonFunctionalOrMemory).empty()) {
+            return false;
         }
-
+        // Fuse FQ if it's not required to change precision from int32 to int16
         auto nextLayers = CNNNetGetAllNextLayersSkipCertain(layer, -1, skipNonFunctional);
         for (auto& l : nextLayers) {
-            if (!LayerInfo(l).isActivation()) {
-                return false;
+            if (getCandidatesForIdentityInsertion(l, getPassManager(), true).empty()) {
+                return true;
             }
         }
-
-        return true;
+        return false;
     };
 
     std::function<void(QuantizedLayerParams*, CNNLayerPtr)> propagateStatistics =
@@ -2213,8 +2202,7 @@ void MoveFakeQuantizeLayerIntoQuantParamsPass :: run() {
         }
 
         // Allow FQ Fuse checks if FQ layer can be fused to a layer before or after.
-        // FQ Layer is fused only when previous layer is const, memory or activation layer
-        // or a next layer is activation layer.
+        // FQ layer is fused unless it is the network output or is required to change precision from int32 to int16.
         bool isFQFuseAllowed = allowFQFuse(l);
         auto prevData = *prevDataIt;
 
diff --git a/src/tests/functional/plugin/gna/scale_factors_tests/eltwise_act_fq.cpp b/src/tests/functional/plugin/gna/scale_factors_tests/eltwise_act_fq.cpp
index 71602113d04..bbf6944212d 100644
--- a/src/tests/functional/plugin/gna/scale_factors_tests/eltwise_act_fq.cpp
+++ b/src/tests/functional/plugin/gna/scale_factors_tests/eltwise_act_fq.cpp
@@ -93,7 +93,7 @@ protected:
         const ngraph::Shape shape = {1, 128};
         auto params = ngraph::builder::makeParams(ngPrc, {shape});
 
-        auto lowNodeIn = ngraph::builder::makeConstant<float>(ngPrc, {1}, { 100 * inputDataMin });
+        auto lowNodeIn = ngraph::builder::makeConstant<float>(ngPrc, {1}, { 100 * -inputDataMax });
         auto highNodeIn = ngraph::builder::makeConstant<float>(ngPrc, {1}, { 100 * inputDataMax });
         auto fqIn = std::make_shared<ngraph::opset8::FakeQuantize>(params[0], lowNodeIn, highNodeIn,
             lowNodeIn, highNodeIn, levels16);
diff --git a/src/tests/functional/plugin/gna/scale_factors_tests/matmul_overload_correction.cpp b/src/tests/functional/plugin/gna/scale_factors_tests/matmul_overload_correction.cpp
index 073144fd09a..32ed52d44f1 100644
--- a/src/tests/functional/plugin/gna/scale_factors_tests/matmul_overload_correction.cpp
+++ b/src/tests/functional/plugin/gna/scale_factors_tests/matmul_overload_correction.cpp
@@ -67,6 +67,8 @@ protected:
         const ngraph::Shape shape2 = {1, inputShape[1] * inputShape[1]};
         const float maxInputValue = 10.0f;
         auto params = ngraph::builder::makeParams(ngPrc, {shape1});
+        auto relu = std::make_shared<ngraph::opset8::Relu>(params[0]);
+
         std::shared_ptr<ngraph::Node> input2;
         if (isSecondInputConst) {
             input2 = ngraph::builder::makeConstant<float>(ngPrc, ngraph::Shape{shape1[1], shape1[1]},
@@ -78,7 +80,7 @@ protected:
 
         auto lowNodeIn1 = ngraph::builder::makeConstant<float>(ngPrc, {1}, { -maxInputValue });
         auto highNodeIn1 = ngraph::builder::makeConstant<float>(ngPrc, {1}, { maxInputValue });
-        auto fqIn1 = std::make_shared<ngraph::opset8::FakeQuantize>(params[0], lowNodeIn1, highNodeIn1,
+        auto fqIn1 = std::make_shared<ngraph::opset8::FakeQuantize>(relu, lowNodeIn1, highNodeIn1,
             lowNodeIn1, highNodeIn1, levels16);
 
         auto lowNodeIn2 = ngraph::builder::makeConstant<float>(ngPrc, {1}, { -maxInputValue });
@@ -138,4 +140,4 @@ INSTANTIATE_TEST_SUITE_P(smoke_base, MatMulOverloadCorrectionTest,
         ::testing::ValuesIn({true, false}),
         ::testing::ValuesIn({true, false})),
     MatMulOverloadCorrectionTest::getTestCaseName);
-} // namespace LayerTestsDefinitions
\ No newline at end of file
+} // namespace LayerTestsDefinitions
diff --git a/src/tests/functional/plugin/gna/scale_factors_tests/weighable_layer_without_fq.cpp b/src/tests/functional/plugin/gna/scale_factors_tests/weighable_layer_without_fq.cpp
index 96facb4bc4c..e1066d9b782 100644
--- a/src/tests/functional/plugin/gna/scale_factors_tests/weighable_layer_without_fq.cpp
+++ b/src/tests/functional/plugin/gna/scale_factors_tests/weighable_layer_without_fq.cpp
@@ -56,22 +56,23 @@ protected:
 
         auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
         auto params = ngraph::builder::makeParams(ngPrc, {inputShape});
+        auto relu = std::make_shared<ngraph::opset8::Relu>(params[0]);
         auto fq1 = std::make_shared<ngraph::opset8::FakeQuantize>(
-            params[0],
-            ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {1.}),
-            ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {1.}),
-            ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {1.}),
-            ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {1.}),
-            255);
+            relu,
+            ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {-10.}),
+            ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {10.}),
+            ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {-10.}),
+            ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {10.}),
+            static_cast<uint32_t>(std::numeric_limits<uint16_t>::max()) + 1);
         auto constant = ngraph::builder::makeConstant(ngPrc, constantShape, std::vector<float>{}, true);
         auto fq2 = std::make_shared<ngraph::opset8::FakeQuantize>(
             constant,
-            ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {1}),
-            ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {1.}),
-            ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {1.}),
-            ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {1.}),
-            255);
-        auto concat = ngraph::builder::makeConcat({fq1, fq2}, 0);
+            ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {-10}),
+            ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {10.}),
+            ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {-10.}),
+            ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {10.}),
+            static_cast<uint32_t>(std::numeric_limits<uint16_t>::max()) + 1);
+        auto concat = ngraph::builder::makeConcat({fq1, fq2}, 1);
         function = std::make_shared<ngraph::Function>(concat, params, "WeighableLayerWithoutFq");
     }
 }; // class WeighableLayerWithoutFqTest
@@ -91,7 +92,7 @@ const std::vector<std::vector<size_t>> inputShapes = {
 };
 
 const std::vector<std::vector<size_t>> constantShapes = {
-    {{16, 5}}
+    {{1, 16}}
 };
 
 const std::vector<std::map<std::string, std::string>> configs = {
diff --git a/src/tests/functional/shared_test_classes/src/subgraph/multiple_input_fq.cpp b/src/tests/functional/shared_test_classes/src/subgraph/multiple_input_fq.cpp
index 367ceed7029..f63305f694c 100644
--- a/src/tests/functional/shared_test_classes/src/subgraph/multiple_input_fq.cpp
+++ b/src/tests/functional/shared_test_classes/src/subgraph/multiple_input_fq.cpp
@@ -27,15 +27,33 @@ void MultipleInputTest::SetUp() {
     std::tie(targetDevice, netPrecision, inputSize, config) = this->GetParam();
     configuration.insert(config.begin(), config.end());
     auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
+
+    const float minInput = -10.0;
+    const float maxInput = 10.0;
     auto input = ngraph::builder::makeParams(ngPrc, {{1, inputSize}, {1, inputSize}, {1, inputSize}});
-    auto fake1 = ngraph::builder::makeFakeQuantize(input[0], ngPrc, 255, { 1 }, { -0.5 }, { 0.5 }, { -0.5 }, { 0.5 });
-    auto mul1 = ngraph::builder::makeEltwise(input[0], fake1, ngraph::helpers::EltwiseTypes::ADD);
-    auto fake2 = ngraph::builder::makeFakeQuantize(input[1], ngPrc, 255, { 1 }, { -0.5 }, { 0.5 }, { -0.5 }, { 0.5 });
-    auto mul2 = ngraph::builder::makeEltwise(input[1], fake2, ngraph::helpers::EltwiseTypes::ADD);
-    auto mul3 = ngraph::builder::makeEltwise(mul1, mul2, ngraph::helpers::EltwiseTypes::ADD);
-    auto fake3 = ngraph::builder::makeFakeQuantize(input[2], ngPrc, 255, { 1 }, { -0.5 }, { 0.5 }, { -0.5 }, { 0.5 });
-    auto mul4 = ngraph::builder::makeEltwise(fake3, mul3, ngraph::helpers::EltwiseTypes::ADD);
-    auto result = std::make_shared<ngraph::opset7::Result>(mul4);
+    auto fake1 = ngraph::builder::makeFakeQuantize(input[0], ngPrc, std::numeric_limits<uint16_t>::max(), { 1 },
+        { minInput }, { maxInput }, { minInput }, { maxInput });
+    auto add1 = ngraph::builder::makeEltwise(input[0], fake1, ngraph::helpers::EltwiseTypes::ADD);
+    auto fake_add1 = ngraph::builder::makeFakeQuantize(add1, ngPrc, std::numeric_limits<uint16_t>::max(), { 1 },
+        { 2 * minInput }, { 2 * maxInput }, { 2 * minInput }, { 2 * maxInput });
+
+    auto fake2 = ngraph::builder::makeFakeQuantize(input[1], ngPrc, std::numeric_limits<uint16_t>::max(), { 1 },
+        { minInput }, { maxInput }, { minInput }, { maxInput });
+    auto add2 = ngraph::builder::makeEltwise(input[1], fake2, ngraph::helpers::EltwiseTypes::ADD);
+    auto fake_add2 = ngraph::builder::makeFakeQuantize(add2, ngPrc, std::numeric_limits<uint16_t>::max(), { 1 },
+        { 2 * minInput }, { 2 * maxInput }, { 2 * minInput }, { 2 * maxInput });
+
+    auto add3 = ngraph::builder::makeEltwise(fake_add1, fake_add2, ngraph::helpers::EltwiseTypes::ADD);
+    auto fake_add3 = ngraph::builder::makeFakeQuantize(add3, ngPrc, std::numeric_limits<uint16_t>::max(), { 1 },
+        { 4 * minInput }, { 4 * maxInput }, { 4 * minInput }, { 4 * maxInput });
+
+    auto fake3 = ngraph::builder::makeFakeQuantize(input[2], ngPrc, std::numeric_limits<uint16_t>::max(), { 1 },
+        { minInput }, { maxInput }, { minInput }, { maxInput });
+    auto add4 = ngraph::builder::makeEltwise(fake3, fake_add3, ngraph::helpers::EltwiseTypes::ADD);
+    auto fake_add4 = ngraph::builder::makeFakeQuantize(add4, ngPrc, std::numeric_limits<uint16_t>::max(), { 1 },
+        { 5 * minInput }, { 5 * maxInput }, { 5 * minInput }, { 5 * maxInput });
+
+    auto result = std::make_shared<ngraph::opset7::Result>(fake_add4);
     function = std::make_shared<ngraph::Function>(ngraph::ResultVector{result}, input, "multiple_input");
 }