From 3578ee9c3f04bbf3c7d8c582c86065a34b846f5b Mon Sep 17 00:00:00 2001 From: Elizaveta Lobanova Date: Thu, 31 Mar 2022 13:21:27 +0300 Subject: [PATCH] [GNA] Remove extra FQ layers from the final network (#10599) * [GNA] Fuse all FakeQuantize layers with their previous layers * [GNA] Fuse FQ with previous layer if it's not required for precision change * [GNA] Fixed MatMulOverloadCorrectionTest --- src/plugins/intel_gna/backend/make_pwl.cpp | 4 +- .../intel_gna/frontend/scale_factor_calc.hpp | 3 +- .../intel_gna/optimizer/gna_pass_manager.cpp | 66 ++++++++----------- .../scale_factors_tests/eltwise_act_fq.cpp | 2 +- .../matmul_overload_correction.cpp | 6 +- .../weighable_layer_without_fq.cpp | 27 ++++---- .../src/subgraph/multiple_input_fq.cpp | 34 +++++++--- 7 files changed, 76 insertions(+), 66 deletions(-) diff --git a/src/plugins/intel_gna/backend/make_pwl.cpp b/src/plugins/intel_gna/backend/make_pwl.cpp index 6362cee472a..38c263082e0 100644 --- a/src/plugins/intel_gna/backend/make_pwl.cpp +++ b/src/plugins/intel_gna/backend/make_pwl.cpp @@ -229,7 +229,7 @@ void make_gna_pwl(const DnnActivation& fun, int32_t x_upper = INT32_MAX; int16_t y_lower = y_min; int16_t y_upper = y_max; - if (fun == kActFakeQuantize && fun.fqParams.set) { + if ((fun == kActFakeQuantize || fun == kActIdentity) && fun.fqParams.set) { x_lower = std::max(static_cast(*fun.fqParams.input_low * in_scale), static_cast(x_lower)); x_upper = std::min(static_cast(*fun.fqParams.input_high * in_scale), static_cast(x_upper)); y_lower = std::max(static_cast(*fun.fqParams.input_low * out_scale), static_cast(y_lower)); @@ -253,7 +253,7 @@ void make_gna_pwl(const DnnActivation& fun, x_upper = FLOAT_TO_INT32(y_upper * in_scale / out_scale); } } - } else if (fun == kActIdentity) { + } else if (fun == kActIdentity && !fun.fqParams.set) { if (x_lower < y_lower * in_scale / out_scale) x_lower = FLOAT_TO_INT32(y_lower * in_scale / out_scale); if (x_upper > y_upper * in_scale / out_scale) x_upper = FLOAT_TO_INT32(y_upper * in_scale / out_scale); if (y_lower < x_lower * out_scale / in_scale) y_lower = FLOAT_TO_INT16(x_lower * out_scale / in_scale); diff --git a/src/plugins/intel_gna/frontend/scale_factor_calc.hpp b/src/plugins/intel_gna/frontend/scale_factor_calc.hpp index 2f97459257f..1ef90832b8d 100644 --- a/src/plugins/intel_gna/frontend/scale_factor_calc.hpp +++ b/src/plugins/intel_gna/frontend/scale_factor_calc.hpp @@ -538,7 +538,8 @@ class ScaleFactorPerLayer { auto maxOutValue = quantizedParams->_dst_quant.GetMaxValues().front(); auto absMax = std::max(std::abs(minOutValue), std::abs(maxOutValue)); - result = CalculateScaleFactorFromStats(quantizedParams->_dst_quant.GetLevels(), minOutValue, maxOutValue); + auto levels = std::min(quantizedParams->_dst_quant.GetLevels(), static_cast(std::numeric_limits::max()) + 1); + result = CalculateScaleFactorFromStats(levels, minOutValue, maxOutValue); if (std::isinf(result) || fp32eq(absMax, 0.0f)) { result = max_activation_scale_factor; } diff --git a/src/plugins/intel_gna/optimizer/gna_pass_manager.cpp b/src/plugins/intel_gna/optimizer/gna_pass_manager.cpp index ef759b492b9..7ca2ba615b4 100644 --- a/src/plugins/intel_gna/optimizer/gna_pass_manager.cpp +++ b/src/plugins/intel_gna/optimizer/gna_pass_manager.cpp @@ -74,7 +74,8 @@ static const char softSignLayersCounter[] = "numSoftSignLayers"; static void insertDiagonalLayerBetween(InferenceEngine::CNNLayerPtr prevLayer, InferenceEngine::CNNLayerPtr nextLayer, std::shared_ptr passmanager, - float fillValue) { + float fillValue, + size_t in_data_idx = invalid_data_idx) { auto quantized = InferenceEngine::getInjectedData(prevLayer); auto diagName = std::string("SyntheticScaleShift_") + std::to_string(passmanager->getIntVar(diagonalLayersCounterName)++); gnalog() << "Inserted Diagonal Layer " << diagName <<" between: " << prevLayer->name << " and " << nextLayer->name << "\n" << std::flush; @@ -104,7 +105,7 @@ static void insertDiagonalLayerBetween(InferenceEngine::CNNLayerPtr prevLayer, getCreatorLayer(dataPtr) = diagonalWithQuant; diagonalWithQuant->outData.push_back(dataPtr); // actual insertion - CNNNetworkInsertLayer(prevLayer, nextLayer, diagonalWithQuant); + CNNNetworkInsertLayer(prevLayer, nextLayer, diagonalWithQuant, invalid_data_idx, in_data_idx); } /** @@ -137,7 +138,8 @@ static bool hasNextFuncLayer(const CNNLayerPtr layer) { }); } -static std::vector getCandidatesForIdentityInsertion(const CNNLayerPtr l, std::shared_ptr passmanager) { +static std::vector getCandidatesForIdentityInsertion(const CNNLayerPtr l, std::shared_ptr passmanager, + bool skipFq = false) { std::vector prevLayers; // skipping memory inputs and true inputs layers @@ -146,9 +148,9 @@ static std::vector getCandidatesForIdentityInsertion(const CNNLayer auto eltwise = dynamic_cast(l.get()); auto concat = dynamic_cast(l.get()); - auto PrevFunctionalLayer = [](CNNLayerPtr l, int idx = 0) { - auto prevLayer = CNNNetPrevLayerSkipCertain(l, idx, [](CNNLayerPtr ptr) { - return LayerInfo(ptr).isNonFunctional(); + auto PrevFunctionalLayer = [skipFq](CNNLayerPtr l, int idx = 0) { + auto prevLayer = CNNNetPrevLayerSkipCertain(l, idx, [skipFq](CNNLayerPtr ptr) { + return LayerInfo(ptr).isNonFunctional() || skipFq && LayerInfo(ptr).isFakeQuantize(); }); gnalog() << "CNNNetPrevLayerSkipCertain for :: " << l->name << "returned: " << prevLayer->name << std::endl; return prevLayer; @@ -308,7 +310,7 @@ void InsertDiagonalLayerPass::run() { continue; } auto prevDirectLayer = CNNNetPrevLayer(l, 0); - insertDiagonalLayerBetween(prevDirectLayer, l, getPassManager(), 1.f); + insertDiagonalLayerBetween(prevDirectLayer, l, getPassManager(), 1.f, 0); } } @@ -820,18 +822,6 @@ void InsertIdentityLayerPass::run() { gnalog() << "Inserted "<< identityLayer->name << " between: " << prev->name << " and " << true_layer->name << "\n" << std::flush; - // wether 1 identity or all outputs TODO possible grouping here, need to implement special grouped inserter - bool notAll = false; - for (auto && nextData : prev->outData) { - for (auto && nextLayer : getInputTo(nextData)) { - if (nextLayer.second.get() == l.get()) - continue; - if (getCandidatesForIdentityInsertion(nextLayer.second, getPassManager()).empty() && - hasNextFuncLayer(nextLayer.second)) { - notAll = true; - } - } - } // copy offset - to be used while connecting outputs if (prev->params.find("output_offset") != prev->params.end()) { identityLayer->params["output_offset"] = prev->params["output_offset"]; @@ -841,7 +831,7 @@ void InsertIdentityLayerPass::run() { identityLayer->params["original_num_rows"] = prev->params["original_num_rows"]; } - CNNNetworkInsertLayer(prev, notAll ? true_layer : CNNLayerPtr(nullptr), identityLayer); + CNNNetworkInsertLayer(prev, CNNLayerPtr(nullptr), identityLayer); } } } @@ -1341,6 +1331,12 @@ void InsertSplitAligningFilterPass::run() { if (getInputTo(splitOutput).empty()) { gnalog() << "Output port: " << splitOutIndex << " of " << l->name << " unconnected, skipping\n"; } else { + auto lastDimSize = GetDataDimSize(splitOutput, 1); + if (lastDimSize != outputSize) { + THROW_GNA_EXCEPTION << l->name << " Convolution Filter doesn't support these input dimensions: lastDimSize=" + << lastDimSize << ", outputSize=" << outputSize; + } + // this split output not beginning from 64 bytes aligned boundary - need to correct by aligning filter layer // insert the filter auto filterName = std::string("AlignFilter_") + std::to_string(numOfFilterLayers++); @@ -2054,32 +2050,25 @@ void MoveFakeQuantizeLayerIntoQuantParamsPass :: run() { return false; }; - auto allowFQFuse = [](CNNLayerPtr layer) -> bool { - auto doNotSkip = [](CNNLayerPtr layer) { - return false; + auto allowFQFuse = [this](CNNLayerPtr layer) -> bool { + auto skipNonFunctionalOrMemory = [](CNNLayerPtr layer) { + return LayerInfo(layer).isNonFunctional() || LayerInfo(layer).isMemory(); }; - - if (CNNNetGetAllNextLayersSkipCertain(layer, -1, doNotSkip).empty()) { - return false; - } - auto skipNonFunctional = [](CNNLayerPtr layer) { return LayerInfo(layer).isNonFunctional(); }; - - auto prevLayer = CNNNetPrevLayerSkipCertain(layer, 0, skipNonFunctional); - if (LayerInfo(prevLayer).isActivation() || LayerInfo(prevLayer).isConst() || LayerInfo(prevLayer).isMemory()) { - return true; + // Don't fuse FQ if it's the output layer for the network + if (CNNNetGetAllNextLayersSkipCertain(layer, -1, skipNonFunctionalOrMemory).empty()) { + return false; } - + // Fuse FQ if it's not required to change precision from int32 to int16 auto nextLayers = CNNNetGetAllNextLayersSkipCertain(layer, -1, skipNonFunctional); for (auto& l : nextLayers) { - if (!LayerInfo(l).isActivation()) { - return false; + if (getCandidatesForIdentityInsertion(l, getPassManager(), true).empty()) { + return true; } } - - return true; + return false; }; std::function propagateStatistics = @@ -2213,8 +2202,7 @@ void MoveFakeQuantizeLayerIntoQuantParamsPass :: run() { } // Allow FQ Fuse checks if FQ layer can be fused to a layer before or after. - // FQ Layer is fused only when previous layer is const, memory or activation layer - // or a next layer is activation layer. + // FQ Layer is fused if it's not required for precision change. bool isFQFuseAllowed = allowFQFuse(l); auto prevData = *prevDataIt; diff --git a/src/tests/functional/plugin/gna/scale_factors_tests/eltwise_act_fq.cpp b/src/tests/functional/plugin/gna/scale_factors_tests/eltwise_act_fq.cpp index 71602113d04..bbf6944212d 100644 --- a/src/tests/functional/plugin/gna/scale_factors_tests/eltwise_act_fq.cpp +++ b/src/tests/functional/plugin/gna/scale_factors_tests/eltwise_act_fq.cpp @@ -93,7 +93,7 @@ protected: const ngraph::Shape shape = {1, 128}; auto params = ngraph::builder::makeParams(ngPrc, {shape}); - auto lowNodeIn = ngraph::builder::makeConstant(ngPrc, {1}, { 100 * inputDataMin }); + auto lowNodeIn = ngraph::builder::makeConstant(ngPrc, {1}, { 100 * -inputDataMax }); auto highNodeIn = ngraph::builder::makeConstant(ngPrc, {1}, { 100 * inputDataMax }); auto fqIn = std::make_shared(params[0], lowNodeIn, highNodeIn, lowNodeIn, highNodeIn, levels16); diff --git a/src/tests/functional/plugin/gna/scale_factors_tests/matmul_overload_correction.cpp b/src/tests/functional/plugin/gna/scale_factors_tests/matmul_overload_correction.cpp index 073144fd09a..32ed52d44f1 100644 --- a/src/tests/functional/plugin/gna/scale_factors_tests/matmul_overload_correction.cpp +++ b/src/tests/functional/plugin/gna/scale_factors_tests/matmul_overload_correction.cpp @@ -67,6 +67,8 @@ protected: const ngraph::Shape shape2 = {1, inputShape[1] * inputShape[1]}; const float maxInputValue = 10.0f; auto params = ngraph::builder::makeParams(ngPrc, {shape1}); + auto relu = std::make_shared(params[0]); + std::shared_ptr input2; if (isSecondInputConst) { input2 = ngraph::builder::makeConstant(ngPrc, ngraph::Shape{shape1[1], shape1[1]}, @@ -78,7 +80,7 @@ protected: auto lowNodeIn1 = ngraph::builder::makeConstant(ngPrc, {1}, { -maxInputValue }); auto highNodeIn1 = ngraph::builder::makeConstant(ngPrc, {1}, { maxInputValue }); - auto fqIn1 = std::make_shared(params[0], lowNodeIn1, highNodeIn1, + auto fqIn1 = std::make_shared(relu, lowNodeIn1, highNodeIn1, lowNodeIn1, highNodeIn1, levels16); auto lowNodeIn2 = ngraph::builder::makeConstant(ngPrc, {1}, { -maxInputValue }); @@ -138,4 +140,4 @@ INSTANTIATE_TEST_SUITE_P(smoke_base, MatMulOverloadCorrectionTest, ::testing::ValuesIn({true, false}), ::testing::ValuesIn({true, false})), MatMulOverloadCorrectionTest::getTestCaseName); -} // namespace LayerTestsDefinitions \ No newline at end of file +} // namespace LayerTestsDefinitions diff --git a/src/tests/functional/plugin/gna/scale_factors_tests/weighable_layer_without_fq.cpp b/src/tests/functional/plugin/gna/scale_factors_tests/weighable_layer_without_fq.cpp index 96facb4bc4c..e1066d9b782 100644 --- a/src/tests/functional/plugin/gna/scale_factors_tests/weighable_layer_without_fq.cpp +++ b/src/tests/functional/plugin/gna/scale_factors_tests/weighable_layer_without_fq.cpp @@ -56,22 +56,23 @@ protected: auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision); auto params = ngraph::builder::makeParams(ngPrc, {inputShape}); + auto relu = std::make_shared(params[0]); auto fq1 = std::make_shared( - params[0], - ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {1.}), - ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {1.}), - ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {1.}), - ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {1.}), - 255); + relu, + ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {-10.}), + ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {10.}), + ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {-10.}), + ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {10.}), + static_cast(std::numeric_limits::max()) + 1); auto constant = ngraph::builder::makeConstant(ngPrc, constantShape, std::vector{}, true); auto fq2 = std::make_shared( constant, - ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {1}), - ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {1.}), - ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {1.}), - ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {1.}), - 255); - auto concat = ngraph::builder::makeConcat({fq1, fq2}, 0); + ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {-10}), + ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {10.}), + ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {-10.}), + ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {10.}), + static_cast(std::numeric_limits::max()) + 1); + auto concat = ngraph::builder::makeConcat({fq1, fq2}, 1); function = std::make_shared(concat, params, "WeighableLayerWithoutFq"); } }; // class WeighableLayerWithoutFqTest @@ -91,7 +92,7 @@ const std::vector> inputShapes = { }; const std::vector> constantShapes = { - {{16, 5}} + {{1, 16}} }; const std::vector> configs = { diff --git a/src/tests/functional/shared_test_classes/src/subgraph/multiple_input_fq.cpp b/src/tests/functional/shared_test_classes/src/subgraph/multiple_input_fq.cpp index 367ceed7029..f63305f694c 100644 --- a/src/tests/functional/shared_test_classes/src/subgraph/multiple_input_fq.cpp +++ b/src/tests/functional/shared_test_classes/src/subgraph/multiple_input_fq.cpp @@ -27,15 +27,33 @@ void MultipleInputTest::SetUp() { std::tie(targetDevice, netPrecision, inputSize, config) = this->GetParam(); configuration.insert(config.begin(), config.end()); auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision); + + const float minInput = -10.0; + const float maxInput = 10.0; auto input = ngraph::builder::makeParams(ngPrc, {{1, inputSize}, {1, inputSize}, {1, inputSize}}); - auto fake1 = ngraph::builder::makeFakeQuantize(input[0], ngPrc, 255, { 1 }, { -0.5 }, { 0.5 }, { -0.5 }, { 0.5 }); - auto mul1 = ngraph::builder::makeEltwise(input[0], fake1, ngraph::helpers::EltwiseTypes::ADD); - auto fake2 = ngraph::builder::makeFakeQuantize(input[1], ngPrc, 255, { 1 }, { -0.5 }, { 0.5 }, { -0.5 }, { 0.5 }); - auto mul2 = ngraph::builder::makeEltwise(input[1], fake2, ngraph::helpers::EltwiseTypes::ADD); - auto mul3 = ngraph::builder::makeEltwise(mul1, mul2, ngraph::helpers::EltwiseTypes::ADD); - auto fake3 = ngraph::builder::makeFakeQuantize(input[2], ngPrc, 255, { 1 }, { -0.5 }, { 0.5 }, { -0.5 }, { 0.5 }); - auto mul4 = ngraph::builder::makeEltwise(fake3, mul3, ngraph::helpers::EltwiseTypes::ADD); - auto result = std::make_shared(mul4); + auto fake1 = ngraph::builder::makeFakeQuantize(input[0], ngPrc, std::numeric_limits::max(), { 1 }, + { minInput }, { maxInput }, { minInput }, { maxInput }); + auto add1 = ngraph::builder::makeEltwise(input[0], fake1, ngraph::helpers::EltwiseTypes::ADD); + auto fake_add1 = ngraph::builder::makeFakeQuantize(add1, ngPrc, std::numeric_limits::max(), { 1 }, + { 2 * minInput }, { 2 * maxInput }, { 2 * minInput }, { 2 * maxInput }); + + auto fake2 = ngraph::builder::makeFakeQuantize(input[1], ngPrc, std::numeric_limits::max(), { 1 }, + { minInput }, { maxInput }, { minInput }, { maxInput }); + auto add2 = ngraph::builder::makeEltwise(input[1], fake2, ngraph::helpers::EltwiseTypes::ADD); + auto fake_add2 = ngraph::builder::makeFakeQuantize(add2, ngPrc, std::numeric_limits::max(), { 1 }, + { 2 * minInput }, { 2 * maxInput }, { 2 * minInput }, { 2 * maxInput }); + + auto add3 = ngraph::builder::makeEltwise(fake_add1, fake_add2, ngraph::helpers::EltwiseTypes::ADD); + auto fake_add3 = ngraph::builder::makeFakeQuantize(add3, ngPrc, std::numeric_limits::max(), { 1 }, + { 4 * minInput }, { 4 * maxInput }, { 4 * minInput }, { 4 * maxInput }); + + auto fake3 = ngraph::builder::makeFakeQuantize(input[2], ngPrc, std::numeric_limits::max(), { 1 }, + { minInput }, { maxInput }, { minInput }, { maxInput }); + auto add4 = ngraph::builder::makeEltwise(fake3, fake_add3, ngraph::helpers::EltwiseTypes::ADD); + auto fake_add4 = ngraph::builder::makeFakeQuantize(add4, ngPrc, std::numeric_limits::max(), { 1 }, + { 5 * minInput }, { 5 * maxInput }, { 5 * minInput }, { 5 * maxInput }); + + auto result = std::make_shared(fake_add4); function = std::make_shared(ngraph::ResultVector{result}, input, "multiple_input"); }