[GNA] Remove extra FQ layers from the final network (#10599)

* [GNA] Fuse all FakeQuantize layers with their previous layers

* [GNA] Fuse FQ with previous layer if it's not required for precision change

* [GNA] Fixed MatMulOverloadCorrectionTest
Elizaveta Lobanova 2022-03-31 13:21:27 +03:00 committed by GitHub
parent 79e3272237
commit 3578ee9c3f
7 changed files with 76 additions and 66 deletions

View File

@@ -229,7 +229,7 @@ void make_gna_pwl(const DnnActivation& fun,
     int32_t x_upper = INT32_MAX;
     int16_t y_lower = y_min;
     int16_t y_upper = y_max;
-    if (fun == kActFakeQuantize && fun.fqParams.set) {
+    if ((fun == kActFakeQuantize || fun == kActIdentity) && fun.fqParams.set) {
         x_lower = std::max(static_cast<int64_t>(*fun.fqParams.input_low * in_scale), static_cast<int64_t>(x_lower));
         x_upper = std::min(static_cast<int64_t>(*fun.fqParams.input_high * in_scale), static_cast<int64_t>(x_upper));
         y_lower = std::max(static_cast<int32_t>(*fun.fqParams.input_low * out_scale), static_cast<int32_t>(y_lower));
@@ -253,7 +253,7 @@ void make_gna_pwl(const DnnActivation& fun,
                 x_upper = FLOAT_TO_INT32(y_upper * in_scale / out_scale);
             }
         }
-    } else if (fun == kActIdentity) {
+    } else if (fun == kActIdentity && !fun.fqParams.set) {
         if (x_lower < y_lower * in_scale / out_scale) x_lower = FLOAT_TO_INT32(y_lower * in_scale / out_scale);
         if (x_upper > y_upper * in_scale / out_scale) x_upper = FLOAT_TO_INT32(y_upper * in_scale / out_scale);
         if (y_lower < x_lower * out_scale / in_scale) y_lower = FLOAT_TO_INT16(x_lower * out_scale / in_scale);
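
For context, the clamping in this hunk reduces to scaling the FakeQuantize range into the integer input/output domains and narrowing the default saturation limits. A minimal standalone sketch of that arithmetic (the scale factors and FQ range are made-up example values, not taken from the plugin):

    #include <algorithm>
    #include <cstdint>
    #include <iostream>

    int main() {
        // Example values only: in_scale/out_scale come from quantization in the real code.
        const float in_scale = 16384.0f, out_scale = 2048.0f;
        const float fq_low = -1.0f, fq_high = 1.0f;  // FakeQuantize input range

        // Default saturation limits of the PWL input (int32) and output (int16).
        int64_t x_lower = INT32_MIN, x_upper = INT32_MAX;
        int32_t y_lower = INT16_MIN, y_upper = INT16_MAX;

        // Narrow the limits by the FQ range scaled into each domain, mirroring
        // the branch now taken for both kActFakeQuantize and kActIdentity.
        x_lower = std::max(static_cast<int64_t>(fq_low * in_scale), x_lower);
        x_upper = std::min(static_cast<int64_t>(fq_high * in_scale), x_upper);
        y_lower = std::max(static_cast<int32_t>(fq_low * out_scale), y_lower);
        y_upper = std::min(static_cast<int32_t>(fq_high * out_scale), y_upper);

        std::cout << "x in [" << x_lower << ", " << x_upper << "], y in ["
                  << y_lower << ", " << y_upper << "]\n";  // x in [-16384, 16384], y in [-2048, 2048]
    }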

View File

@@ -538,7 +538,8 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer*, QUANT_DESC> {
     auto maxOutValue = quantizedParams->_dst_quant.GetMaxValues().front();
     auto absMax = std::max(std::abs(minOutValue), std::abs(maxOutValue));
-    result = CalculateScaleFactorFromStats(quantizedParams->_dst_quant.GetLevels(), minOutValue, maxOutValue);
+    auto levels = std::min(quantizedParams->_dst_quant.GetLevels(), static_cast<size_t>(std::numeric_limits<uint16_t>::max()) + 1);
+    result = CalculateScaleFactorFromStats(levels, minOutValue, maxOutValue);
     if (std::isinf(result) || fp32eq(absMax, 0.0f)) {
         result = max_activation_scale_factor;
     }
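
The cap on the level count matters because GNA activation outputs are at most 16-bit, while statistics propagated from a fused FQ can advertise far more levels. A hedged sketch of the computation (the (levels - 1) / range formula is an assumption about CalculateScaleFactorFromStats, not a copy of it):

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <iostream>
    #include <limits>

    // Cap the level count at 2^16 before deriving a scale factor from stats.
    float calcScaleFactor(std::size_t levels, float minVal, float maxVal) {
        const std::size_t maxLevels =
            static_cast<std::size_t>(std::numeric_limits<std::uint16_t>::max()) + 1;  // 65536
        levels = std::min(levels, maxLevels);
        const float range = maxVal - minVal;
        // An infinite result signals the caller to fall back to max_activation_scale_factor.
        return range == 0.0f ? std::numeric_limits<float>::infinity()
                             : (levels - 1) / range;
    }

    int main() {
        // A fused FQ may report 2^20 levels; the cap keeps the factor within 16-bit reach.
        std::cout << calcScaleFactor(std::size_t{1} << 20, -1.0f, 1.0f) << "\n";  // 32767.5
    }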

View File

@@ -74,7 +74,8 @@ static const char softSignLayersCounter[] = "numSoftSignLayers";
 static void insertDiagonalLayerBetween(InferenceEngine::CNNLayerPtr prevLayer,
                                        InferenceEngine::CNNLayerPtr nextLayer,
                                        std::shared_ptr<IPassManager> passmanager,
-                                       float fillValue) {
+                                       float fillValue,
+                                       size_t in_data_idx = invalid_data_idx) {
     auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(prevLayer);
     auto diagName = std::string("SyntheticScaleShift_") + std::to_string(passmanager->getIntVar(diagonalLayersCounterName)++);
     gnalog() << "Inserted Diagonal Layer " << diagName <<" between: " << prevLayer->name << " and " << nextLayer->name << "\n" << std::flush;
@@ -104,7 +105,7 @@ static void insertDiagonalLayerBetween(InferenceEngine::CNNLayerPtr prevLayer,
     getCreatorLayer(dataPtr) = diagonalWithQuant;
     diagonalWithQuant->outData.push_back(dataPtr);
     // actual insertion
-    CNNNetworkInsertLayer(prevLayer, nextLayer, diagonalWithQuant);
+    CNNNetworkInsertLayer(prevLayer, nextLayer, diagonalWithQuant, invalid_data_idx, in_data_idx);
 }
 /**
@@ -137,7 +138,8 @@ static bool hasNextFuncLayer(const CNNLayerPtr layer) {
     });
 }
-static std::vector<CNNLayerPtr> getCandidatesForIdentityInsertion(const CNNLayerPtr l, std::shared_ptr<IPassManager> passmanager) {
+static std::vector<CNNLayerPtr> getCandidatesForIdentityInsertion(const CNNLayerPtr l, std::shared_ptr<IPassManager> passmanager,
+                                                                  bool skipFq = false) {
     std::vector<CNNLayerPtr> prevLayers;
     // skipping memory inputs and true inputs layers
@@ -146,9 +148,9 @@ static std::vector<CNNLayerPtr> getCandidatesForIdentityInsertion(const CNNLayer
     auto eltwise = dynamic_cast<InferenceEngine::EltwiseLayer *>(l.get());
     auto concat = dynamic_cast<InferenceEngine::ConcatLayer *>(l.get());
-    auto PrevFunctionalLayer = [](CNNLayerPtr l, int idx = 0) {
-        auto prevLayer = CNNNetPrevLayerSkipCertain(l, idx, [](CNNLayerPtr ptr) {
-            return LayerInfo(ptr).isNonFunctional();
+    auto PrevFunctionalLayer = [skipFq](CNNLayerPtr l, int idx = 0) {
+        auto prevLayer = CNNNetPrevLayerSkipCertain(l, idx, [skipFq](CNNLayerPtr ptr) {
+            return LayerInfo(ptr).isNonFunctional() || skipFq && LayerInfo(ptr).isFakeQuantize();
         });
         gnalog() << "CNNNetPrevLayerSkipCertain for :: " << l->name << "returned: " << prevLayer->name << std::endl;
         return prevLayer;
@@ -308,7 +310,7 @@ void InsertDiagonalLayerPass::run() {
             continue;
         }
         auto prevDirectLayer = CNNNetPrevLayer(l, 0);
-        insertDiagonalLayerBetween(prevDirectLayer, l, getPassManager(), 1.f);
+        insertDiagonalLayerBetween(prevDirectLayer, l, getPassManager(), 1.f, 0);
     }
 }
@@ -820,18 +822,6 @@ void InsertIdentityLayerPass::run() {
     gnalog() << "Inserted "<< identityLayer->name << " between: " << prev->name << " and " << true_layer->name << "\n" << std::flush;
-    // wether 1 identity or all outputs TODO possible grouping here, need to implement special grouped inserter
-    bool notAll = false;
-    for (auto && nextData : prev->outData) {
-        for (auto && nextLayer : getInputTo(nextData)) {
-            if (nextLayer.second.get() == l.get())
-                continue;
-            if (getCandidatesForIdentityInsertion(nextLayer.second, getPassManager()).empty() &&
-                hasNextFuncLayer(nextLayer.second)) {
-                notAll = true;
-            }
-        }
-    }
     // copy offset - to be used while connecting outputs
     if (prev->params.find("output_offset") != prev->params.end()) {
         identityLayer->params["output_offset"] = prev->params["output_offset"];
@@ -841,7 +831,7 @@ void InsertIdentityLayerPass::run() {
         identityLayer->params["original_num_rows"] = prev->params["original_num_rows"];
     }
-    CNNNetworkInsertLayer(prev, notAll ? true_layer : CNNLayerPtr(nullptr), identityLayer);
+    CNNNetworkInsertLayer(prev, CNNLayerPtr(nullptr), identityLayer);
 }
 }
 }
@@ -1341,6 +1331,12 @@ void InsertSplitAligningFilterPass::run() {
     if (getInputTo(splitOutput).empty()) {
         gnalog() << "Output port: " << splitOutIndex << " of " << l->name << " unconnected, skipping\n";
     } else {
+        auto lastDimSize = GetDataDimSize(splitOutput, 1);
+        if (lastDimSize != outputSize) {
+            THROW_GNA_EXCEPTION << l->name << " Convolution Filter doesn't support these input dimensions: lastDimSize="
+                                << lastDimSize << ", outputSize=" << outputSize;
+        }
         // this split output not beginning from 64 bytes aligned boundary - need to correct by aligning filter layer
         // insert the filter
         auto filterName = std::string("AlignFilter_") + std::to_string(numOfFilterLayers++);
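
For reference, the condition behind this branch is the 64-byte alignment comment in the hunk above: GNA reads each layer input from a 64-byte-aligned address, so a split output that starts at an unaligned byte offset needs a synthetic affine filter to shift the data. A toy illustration of just the alignment test (the offsets are invented):

    #include <cstddef>
    #include <iostream>

    // True when a split output would need the AlignFilter inserted above.
    bool needsAlignFilter(std::size_t byteOffset) {
        return byteOffset % 64 != 0;
    }

    int main() {
        for (std::size_t off : {std::size_t{0}, std::size_t{64}, std::size_t{100}})
            std::cout << off << " -> " << (needsAlignFilter(off) ? "insert AlignFilter" : "aligned") << "\n";
    }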
@@ -2054,32 +2050,25 @@ void MoveFakeQuantizeLayerIntoQuantParamsPass :: run() {
         return false;
     };
-    auto allowFQFuse = [](CNNLayerPtr layer) -> bool {
-        auto doNotSkip = [](CNNLayerPtr layer) {
-            return false;
+    auto allowFQFuse = [this](CNNLayerPtr layer) -> bool {
+        auto skipNonFunctionalOrMemory = [](CNNLayerPtr layer) {
+            return LayerInfo(layer).isNonFunctional() || LayerInfo(layer).isMemory();
         };
-        if (CNNNetGetAllNextLayersSkipCertain(layer, -1, doNotSkip).empty()) {
-            return false;
-        }
         auto skipNonFunctional = [](CNNLayerPtr layer) {
             return LayerInfo(layer).isNonFunctional();
         };
-        auto prevLayer = CNNNetPrevLayerSkipCertain(layer, 0, skipNonFunctional);
-        if (LayerInfo(prevLayer).isActivation() || LayerInfo(prevLayer).isConst() || LayerInfo(prevLayer).isMemory()) {
-            return true;
+        // Don't fuse FQ if it's the output layer for the network
+        if (CNNNetGetAllNextLayersSkipCertain(layer, -1, skipNonFunctionalOrMemory).empty()) {
+            return false;
         }
+        // Fuse FQ if it's not required to change precision from int32 to int16
         auto nextLayers = CNNNetGetAllNextLayersSkipCertain(layer, -1, skipNonFunctional);
         for (auto& l : nextLayers) {
-            if (!LayerInfo(l).isActivation()) {
-                return false;
+            if (getCandidatesForIdentityInsertion(l, getPassManager(), true).empty()) {
+                return true;
             }
         }
-        return true;
+        return false;
     };
     std::function<void(QuantizedLayerParams*, CNNLayerPtr)> propagateStatistics =
@@ -2213,8 +2202,7 @@ void MoveFakeQuantizeLayerIntoQuantParamsPass :: run() {
         }
         // Allow FQ Fuse checks if FQ layer can be fused to a layer before or after.
-        // FQ Layer is fused only when previous layer is const, memory or activation layer
-        // or a next layer is activation layer.
+        // FQ Layer is fused if it's not required for precision change.
         bool isFQFuseAllowed = allowFQFuse(l);
         auto prevData = *prevDataIt;
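
Taken together, the new allowFQFuse boils down to: keep the FQ when it drives a network output, or when every consumer would still need an identity activation for the int32-to-int16 precision change; otherwise fold its statistics into the neighbouring layers and drop it. A minimal standalone sketch of that decision (Layer and needsIdentity are illustrative stand-ins, not the plugin's types):

    #include <iostream>
    #include <vector>

    // Illustrative stand-in for a graph node; not the GNA plugin API.
    struct Layer {
        bool needsIdentity = false;  // a precision-changing activation would be required here
    };

    // Sketch of the fusion rule: fuse unless the FQ feeds a network output
    // (no functional consumers) or every consumer still needs the precision change.
    bool allowFQFuse(const std::vector<Layer>& consumers) {
        if (consumers.empty())
            return false;  // FQ feeds a network output: keep it
        for (const auto& c : consumers)
            if (!c.needsIdentity)
                return true;  // some consumer can absorb the FQ statistics directly
        return false;
    }

    int main() {
        std::cout << std::boolalpha
                  << allowFQFuse({}) << "\n"                            // false: network output
                  << allowFQFuse({Layer{true}}) << "\n"                 // false: precision change needed
                  << allowFQFuse({Layer{false}, Layer{true}}) << "\n";  // true: fuse
    }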

View File

@@ -93,7 +93,7 @@ protected:
     const ngraph::Shape shape = {1, 128};
     auto params = ngraph::builder::makeParams(ngPrc, {shape});
-    auto lowNodeIn = ngraph::builder::makeConstant<float>(ngPrc, {1}, { 100 * inputDataMin });
+    auto lowNodeIn = ngraph::builder::makeConstant<float>(ngPrc, {1}, { 100 * -inputDataMax });
     auto highNodeIn = ngraph::builder::makeConstant<float>(ngPrc, {1}, { 100 * inputDataMax });
     auto fqIn = std::make_shared<ngraph::opset8::FakeQuantize>(params[0], lowNodeIn, highNodeIn,
         lowNodeIn, highNodeIn, levels16);

View File

@@ -67,6 +67,8 @@ protected:
     const ngraph::Shape shape2 = {1, inputShape[1] * inputShape[1]};
     const float maxInputValue = 10.0f;
     auto params = ngraph::builder::makeParams(ngPrc, {shape1});
+    auto relu = std::make_shared<ngraph::opset8::Relu>(params[0]);
     std::shared_ptr<ngraph::Node> input2;
     if (isSecondInputConst) {
         input2 = ngraph::builder::makeConstant<float>(ngPrc, ngraph::Shape{shape1[1], shape1[1]},
@@ -78,7 +80,7 @@ protected:
     auto lowNodeIn1 = ngraph::builder::makeConstant<float>(ngPrc, {1}, { -maxInputValue });
     auto highNodeIn1 = ngraph::builder::makeConstant<float>(ngPrc, {1}, { maxInputValue });
-    auto fqIn1 = std::make_shared<ngraph::opset8::FakeQuantize>(params[0], lowNodeIn1, highNodeIn1,
+    auto fqIn1 = std::make_shared<ngraph::opset8::FakeQuantize>(relu, lowNodeIn1, highNodeIn1,
         lowNodeIn1, highNodeIn1, levels16);
     auto lowNodeIn2 = ngraph::builder::makeConstant<float>(ngPrc, {1}, { -maxInputValue });
@@ -138,4 +140,4 @@ INSTANTIATE_TEST_SUITE_P(smoke_base, MatMulOverloadCorrectionTest,
     ::testing::ValuesIn({true, false}),
     ::testing::ValuesIn({true, false})),
     MatMulOverloadCorrectionTest::getTestCaseName);
-} // namespace LayerTestsDefinitions
\ No newline at end of file
+} // namespace LayerTestsDefinitions

View File

@@ -56,22 +56,23 @@ protected:
     auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
     auto params = ngraph::builder::makeParams(ngPrc, {inputShape});
+    auto relu = std::make_shared<ngraph::opset8::Relu>(params[0]);
     auto fq1 = std::make_shared<ngraph::opset8::FakeQuantize>(
-        params[0],
-        ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {1.}),
-        ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {1.}),
-        ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {1.}),
-        ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {1.}),
-        255);
+        relu,
+        ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {-10.}),
+        ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {10.}),
+        ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {-10.}),
+        ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {10.}),
+        static_cast<uint32_t>(std::numeric_limits<uint16_t>::max()) + 1);
     auto constant = ngraph::builder::makeConstant(ngPrc, constantShape, std::vector<float>{}, true);
     auto fq2 = std::make_shared<ngraph::opset8::FakeQuantize>(
         constant,
-        ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {1}),
-        ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {1.}),
-        ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {1.}),
-        ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {1.}),
-        255);
-    auto concat = ngraph::builder::makeConcat({fq1, fq2}, 0);
+        ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {-10}),
+        ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {10.}),
+        ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {-10.}),
+        ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {10.}),
+        static_cast<uint32_t>(std::numeric_limits<uint16_t>::max()) + 1);
+    auto concat = ngraph::builder::makeConcat({fq1, fq2}, 1);
     function = std::make_shared<ngraph::Function>(concat, params, "WeighableLayerWithoutFq");
 }
 }; // class WeighableLayerWithoutFqTest
@@ -91,7 +92,7 @@ const std::vector<std::vector<size_t>> inputShapes = {
 };
 const std::vector<std::vector<size_t>> constantShapes = {
-    {{16, 5}}
+    {{1, 16}}
 };
 const std::vector<std::map<std::string, std::string>> configs = {

View File

@@ -27,15 +27,33 @@ void MultipleInputTest::SetUp() {
     std::tie(targetDevice, netPrecision, inputSize, config) = this->GetParam();
     configuration.insert(config.begin(), config.end());
     auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
+    const float minInput = -10.0;
+    const float maxInput = 10.0;
     auto input = ngraph::builder::makeParams(ngPrc, {{1, inputSize}, {1, inputSize}, {1, inputSize}});
-    auto fake1 = ngraph::builder::makeFakeQuantize(input[0], ngPrc, 255, { 1 }, { -0.5 }, { 0.5 }, { -0.5 }, { 0.5 });
-    auto mul1 = ngraph::builder::makeEltwise(input[0], fake1, ngraph::helpers::EltwiseTypes::ADD);
-    auto fake2 = ngraph::builder::makeFakeQuantize(input[1], ngPrc, 255, { 1 }, { -0.5 }, { 0.5 }, { -0.5 }, { 0.5 });
-    auto mul2 = ngraph::builder::makeEltwise(input[1], fake2, ngraph::helpers::EltwiseTypes::ADD);
-    auto mul3 = ngraph::builder::makeEltwise(mul1, mul2, ngraph::helpers::EltwiseTypes::ADD);
-    auto fake3 = ngraph::builder::makeFakeQuantize(input[2], ngPrc, 255, { 1 }, { -0.5 }, { 0.5 }, { -0.5 }, { 0.5 });
-    auto mul4 = ngraph::builder::makeEltwise(fake3, mul3, ngraph::helpers::EltwiseTypes::ADD);
-    auto result = std::make_shared<ngraph::opset7::Result>(mul4);
+    auto fake1 = ngraph::builder::makeFakeQuantize(input[0], ngPrc, std::numeric_limits<uint16_t>::max(), { 1 },
+        { minInput }, { maxInput }, { minInput }, { maxInput });
+    auto add1 = ngraph::builder::makeEltwise(input[0], fake1, ngraph::helpers::EltwiseTypes::ADD);
+    auto fake_add1 = ngraph::builder::makeFakeQuantize(add1, ngPrc, std::numeric_limits<uint16_t>::max(), { 1 },
+        { 2 * minInput }, { 2 * maxInput }, { 2 * minInput }, { 2 * maxInput });
+    auto fake2 = ngraph::builder::makeFakeQuantize(input[1], ngPrc, std::numeric_limits<uint16_t>::max(), { 1 },
+        { minInput }, { maxInput }, { minInput }, { maxInput });
+    auto add2 = ngraph::builder::makeEltwise(input[1], fake2, ngraph::helpers::EltwiseTypes::ADD);
+    auto fake_add2 = ngraph::builder::makeFakeQuantize(add2, ngPrc, std::numeric_limits<uint16_t>::max(), { 1 },
+        { 2 * minInput }, { 2 * maxInput }, { 2 * minInput }, { 2 * maxInput });
+    auto add3 = ngraph::builder::makeEltwise(fake_add1, fake_add2, ngraph::helpers::EltwiseTypes::ADD);
+    auto fake_add3 = ngraph::builder::makeFakeQuantize(add3, ngPrc, std::numeric_limits<uint16_t>::max(), { 1 },
+        { 4 * minInput }, { 4 * maxInput }, { 4 * minInput }, { 4 * maxInput });
+    auto fake3 = ngraph::builder::makeFakeQuantize(input[2], ngPrc, std::numeric_limits<uint16_t>::max(), { 1 },
+        { minInput }, { maxInput }, { minInput }, { maxInput });
+    auto add4 = ngraph::builder::makeEltwise(fake3, fake_add3, ngraph::helpers::EltwiseTypes::ADD);
+    auto fake_add4 = ngraph::builder::makeFakeQuantize(add4, ngPrc, std::numeric_limits<uint16_t>::max(), { 1 },
+        { 5 * minInput }, { 5 * maxInput }, { 5 * minInput }, { 5 * maxInput });
+    auto result = std::make_shared<ngraph::opset7::Result>(fake_add4);
     function = std::make_shared<ngraph::Function>(ngraph::ResultVector{result}, input, "multiple_input");
 }
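
The widened FQ ranges in this rewrite follow plain interval arithmetic on the Adds: summing two tensors bounded by [a, b] and [c, d] gives [a + c, b + d], which is where the 2x, 4x, and 5x multiples of the +/-10 input range come from. A quick check (illustrative only):

    #include <iostream>
    #include <utility>

    using Range = std::pair<float, float>;

    // Interval sum for an eltwise Add of two bounded tensors.
    Range addRange(Range a, Range b) {
        return {a.first + b.first, a.second + b.second};
    }

    int main() {
        const Range in{-10.f, 10.f};              // range of fake1/fake2/fake3
        const Range add1 = addRange(in, in);      // [-20, 20] -> 2 * maxInput
        const Range add3 = addRange(add1, add1);  // [-40, 40] -> 4 * maxInput
        const Range add4 = addRange(in, add3);    // [-50, 50] -> 5 * maxInput
        std::cout << add4.first << " .. " << add4.second << "\n";
    }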