From 4e0a740eb3ee31062ba0df88fcf438564f67edb7 Mon Sep 17 00:00:00 2001
From: Elizaveta Lobanova
Date: Thu, 10 Mar 2022 15:16:17 +0300
Subject: [PATCH] [GNA] Support of overload correction for MatMul with 2
 non-constant layers (#10447)

---
 .../convert_matmul_to_fc_or_gemm.cpp          |   4 +-
 .../intel_gna/frontend/scale_factor_calc.hpp  | 112 ++++++++++++--
 .../matmul_overload_correction.cpp            | 141 ++++++++++++++++++
 3 files changed, 243 insertions(+), 14 deletions(-)
 create mode 100644 src/tests/functional/plugin/gna/scale_factors_tests/matmul_overload_correction.cpp

diff --git a/src/common/legacy/src/transformations/convert_opset1_to_legacy/convert_matmul_to_fc_or_gemm.cpp b/src/common/legacy/src/transformations/convert_opset1_to_legacy/convert_matmul_to_fc_or_gemm.cpp
index 2006b716fe3..5a7bfb05bf6 100644
--- a/src/common/legacy/src/transformations/convert_opset1_to_legacy/convert_matmul_to_fc_or_gemm.cpp
+++ b/src/common/legacy/src/transformations/convert_opset1_to_legacy/convert_matmul_to_fc_or_gemm.cpp
@@ -112,7 +112,9 @@ ngraph::pass::ConvertMatMulToFC::ConvertMatMulToFC() {
         //  we replace MatMul with FullyConnected operation.
         //  Otherwise we replace MatMul with Gemm.
         auto fq_after_const = std::dynamic_pointer_cast<ngraph::opset1::FakeQuantize>(fc_input_b.get_node_shared_ptr());
-        if ((std::dynamic_pointer_cast<ngraph::opset1::Constant>(fc_input_b.get_node_shared_ptr()) || fq_after_const) &&
+        bool is_fq_after_const = fq_after_const &&
+            std::dynamic_pointer_cast<ngraph::opset1::Constant>(fc_input_b.get_node_shared_ptr()->input_value(0).get_node_shared_ptr());
+        if ((std::dynamic_pointer_cast<ngraph::opset1::Constant>(fc_input_b.get_node_shared_ptr()) || is_fq_after_const) &&
             std::count_if(shape_b.begin(), shape_b.end(), [](size_t x) { return x != 1; }) <= 2) {
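For readers outside this pass, a minimal standalone sketch of the condition the hunk above introduces: the second MatMul input is now treated as FullyConnected weights when it is either a Constant, or a FakeQuantize whose data input is a Constant. The helper name is illustrative and the ngraph::opset1 types are an assumption; the patch itself applies the check inline on fc_input_b.

// Sketch only: mirrors the updated condition, assuming ngraph::opset1 types.
#include <memory>
#include <ngraph/ngraph.hpp>
#include <ngraph/opsets/opset1.hpp>

// Hypothetical helper: true when 'input_b' can be treated as constant FC weights,
// i.e. it is a Constant, or a FakeQuantize whose first input is a Constant.
static bool is_constant_weights_path(const ngraph::Output<ngraph::Node>& input_b) {
    auto node = input_b.get_node_shared_ptr();
    if (std::dynamic_pointer_cast<ngraph::opset1::Constant>(node)) {
        return true;
    }
    auto fq = std::dynamic_pointer_cast<ngraph::opset1::FakeQuantize>(node);
    return fq && std::dynamic_pointer_cast<ngraph::opset1::Constant>(
        fq->input_value(0).get_node_shared_ptr());
}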
diff --git a/src/plugins/intel_gna/frontend/scale_factor_calc.hpp b/src/plugins/intel_gna/frontend/scale_factor_calc.hpp
index dac30e426a9..c16e6d0c88a 100644
--- a/src/plugins/intel_gna/frontend/scale_factor_calc.hpp
+++ b/src/plugins/intel_gna/frontend/scale_factor_calc.hpp
@@ -207,6 +207,86 @@ static double calculateWeightsReducerFromDstStats(Quantization dst_quant) {
     return weightsReducer;
 }
 
+/**
+ * @brief Tries to re-quantize an input to reach the desired output scale factor value.
+ * This function searches for layers above which output scale factors can be changed:
+ * - activations,
+ * - constants,
+ * - weightable layers (output scale factor is modified by modification of weights scale factor).
+ * @param input input to be re-quantized
+ * @param newOutputScale the desired output scale factor value
+ * @param result information about the restarted layer
+ * @return true if the input can be re-quantized
+ */
+static bool requantizeInput(InferenceEngine::CNNLayerPtr input, float newOutputScale, ScaleFactorUpdateResult &result) {
+    auto layer = input;
+    while (layer && !LayerInfo(layer).isInput() && !LayerInfo(layer).isMemory() && !LayerInfo(layer).isCopy()) {
+        size_t prevInputIdx = 0;
+        auto info = LayerInfo(layer);
+        auto quantDataForInputLayer = InferenceEngine::getInjectedData<QuantizedLayerParams>(*layer);
+        if (quantDataForInputLayer->_dst_quant.IsStatsSet()) {
+            auto levels = LayerInfo(layer).has32BOutput() ? (std::numeric_limits<uint32_t>::max() + 1ul) :
+                (std::numeric_limits<uint16_t>::max() + 1ul);
+            auto maxSF = CalculateScaleFactorFromStats(levels, quantDataForInputLayer->_dst_quant.GetMinValues().front(),
+                quantDataForInputLayer->_dst_quant.GetMaxValues().front());
+            if (newOutputScale > maxSF) {
+                gnalog() << layer->name << ": Scale factor " << newOutputScale << " is too large. "
+                    << "The maximum scale factor: " << maxSF << " levels=" << levels
+                    << " min=" << quantDataForInputLayer->_dst_quant.GetMinValues().front()
+                    << " max=" << quantDataForInputLayer->_dst_quant.GetMaxValues().front() << "\n";
+                return false;
+            }
+        }
+
+        if (info.isActivation() || info.isConst()) {
+            gnawarn() << "[WARNING] requantize " << layer->name
+                << ". Layer new output scale: " << newOutputScale
+                << ", was " << quantDataForInputLayer->_dst_quant.GetScale() << std::endl;
+            quantDataForInputLayer->_dst_quant.SetScale(newOutputScale);
+            result = ScaleFactorUpdateResult(layer.get());
+            return true;
+        }
+
+        if (info.isWeightableIdentity() && !fp32eq(quantDataForInputLayer->_weights_quant.GetScale(), 1.0f)) {
+            auto reducer = std::max(1.0f, quantDataForInputLayer->_dst_quant.GetScale() / newOutputScale);
+            auto newWeightsScale = std::max(1.0f, quantDataForInputLayer->_weights_quant.GetScale() / reducer);
+            quantDataForInputLayer->_weights_quant.SetScale(static_cast<int32_t>(newWeightsScale));
+            quantDataForInputLayer->_dst_quant.SetScale(quantDataForInputLayer->_weights_quant.GetScale() *
+                quantDataForInputLayer->_src_quant.GetScale());
+
+            result = ScaleFactorUpdateResult(layer.get());
+            return true;
+        }
+
+        if (info.isFullyConnected() || info.isConvolution()) {
+            quantDataForInputLayer->_dst_quant.SetScale(newOutputScale);
+            quantDataForInputLayer->_weights_quant.SetScale(newOutputScale / quantDataForInputLayer->_src_quant.GetScale());
+            result = ScaleFactorUpdateResult(layer.get());
+            return true;
+        }
+
+        if (LayerInfo(layer).isEltwise()) {
+            // re-quantize bias branch for Eltwise layer
+            if (!LayerInfo(input).has32BOutput()) {
+                break;
+            }
+
+            for (uint8_t ix = 0; ix < 2; ++ix) {
+                if (LayerInfo(InferenceEngine::CNNNetPrevLayer(layer, ix)).has32BOutput()) {
+                    prevInputIdx = ix;
+                    break;
+                }
+            }
+            auto prevLayer = InferenceEngine::CNNNetPrevLayer(layer, prevInputIdx);
+            auto prevQuantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*prevLayer);
+            newOutputScale *= prevQuantData->_dst_quant.GetScale() / quantDataForInputLayer->_dst_quant.GetScale();
+        }
+
+        layer = InferenceEngine::CNNNetHasPrevLayer(layer.get(), prevInputIdx) ?
+            InferenceEngine::CNNNetPrevLayer(layer, prevInputIdx) : nullptr;
+    }
+
+    return false;
+}
+
 /**
  * @brief calculates output scale factor per layer
  * @tparam T
  */
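A minimal sketch of the scale arithmetic in the weightable-identity branch of requantizeInput(), written against plain floats instead of QuantizedLayerParams; the struct and function names are illustrative, not part of the plugin.

// Sketch only: reduce a weightable identity's output scale towards a target
// by shrinking its (integer-valued) weights scale, never below 1.0f.
#include <algorithm>
#include <cstdint>

struct IdentityScales {
    float src;      // input scale factor
    float weights;  // weights scale factor
    float dst;      // output scale factor == src * weights
};

static void requantize_identity(IdentityScales& s, float newOutputScale) {
    const float reducer = std::max(1.0f, s.dst / newOutputScale);
    const float newWeightsScale = std::max(1.0f, s.weights / reducer);
    s.weights = static_cast<float>(static_cast<int32_t>(newWeightsScale));
    s.dst = s.weights * s.src;
}
// Example: src = 128, weights = 16, dst = 2048, target 512
// -> reducer = 4, weights becomes 4, dst becomes 512.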
@@ -1292,22 +1372,16 @@ public:
         auto quantParams0 = InferenceEngine::getInjectedData<QuantizedLayerParams>(in0);
         quantData->_src_quant.SetScale(quantParams0->_dst_quant.GetScale());
         quantData->_weights_quant.SetScale(quantParams1->_dst_quant.GetScale());
-        if (quantData->_src_quant.IsStatsSet()) {
-            auto getScale = [&quantParams0](size_t i) {
-                return CalculateScaleFactorFromStats(quantParams0->_dst_quant.GetLevels(),
-                    quantParams0->_dst_quant.GetMinValues(false)[i], quantParams0->_dst_quant.GetMaxValues(false)[i]);
-            };
-            float min_channel_scale = getScale(0);
-            quantParams0->_dst_quant.SetScale(min_channel_scale);
-            quantData->_src_quant.SetScale(min_channel_scale);
-        }
         quantData->_dst_quant.SetScale(
                 quantData->_src_quant.GetScale() * quantData->_weights_quant.GetScale());
-        // If the first input is const it's possible to reduce its scale factor to avoid overflow
-        if (LayerInfo(in0).isConst() && quantData->_dst_quant.IsStatsSet()) {
-            // Adjust weights scale factor if output values exceed int32 maximum value
-            auto weightsReducer = calculateWeightsReducerFromDstStats(quantData->_dst_quant);
+        if (!quantData->_dst_quant.IsStatsSet()) {
+            return true;
+        }
+
+        // Adjust weights scale factor if output values exceed int32 maximum value
+        auto weightsReducer = calculateWeightsReducerFromDstStats(quantData->_dst_quant);
+        if (LayerInfo(in0).isConst()) {
             if (!fp32eq(weightsReducer, 1.0f)) {
                 quantParams0->_dst_quant.SetScale(quantData->_src_quant.GetScale() / weightsReducer);
                 quantData->_src_quant.SetScale(quantData->_src_quant.GetScale() / weightsReducer);
@@ -1318,6 +1392,18 @@ public:
             }
             quantData->_dst_quant.SetScale(quantData->_weights_quant.GetScale() *
                 quantData->_src_quant.GetScale());
+        } else {
+            if (!fp32eq(weightsReducer, 1.0f)) {
+                for (int i = 0; i < 2; ++i) {
+                    auto input = InferenceEngine::CNNNetPrevLayer(gemmLayer, i);
+                    auto quantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(input);
+                    float newOutputScale = quantParams->_dst_quant.GetScale() / weightsReducer;
+                    if (requantizeInput(input, newOutputScale, result)) {
+                        return true;
+                    }
+                }
+                THROW_GNA_EXCEPTION << "Unable to quantize " << gemmLayer->name;
+            }
         }
         return true;
     }
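A sketch of the idea behind the new non-constant branch: when the estimated Gemm output would exceed the int32 accumulator, each input is asked in turn (via requantizeInput) to lower its own output scale by the same reducer. The reducer formula below is an assumption about what calculateWeightsReducerFromDstStats() computes (an int32-saturation estimate from destination statistics), not its literal body.

// Sketch only: estimate the reducer and the per-input target scale.
#include <algorithm>
#include <cstdint>
#include <limits>

// How much the combined scale must shrink so that
// |max output| * src_scale * weights_scale fits into int32.
static double estimate_weights_reducer(double abs_max_output, double dst_scale) {
    const double int32_max = static_cast<double>(std::numeric_limits<int32_t>::max());
    return std::max(1.0, abs_max_output * dst_scale / int32_max);
}

// Each non-constant Gemm input is then re-quantized towards
// new_scale = input_scale / reducer, as the branch above does.
static double target_input_scale(double input_scale, double reducer) {
    return input_scale / reducer;
}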
diff --git a/src/tests/functional/plugin/gna/scale_factors_tests/matmul_overload_correction.cpp b/src/tests/functional/plugin/gna/scale_factors_tests/matmul_overload_correction.cpp
new file mode 100644
index 00000000000..073144fd09a
--- /dev/null
+++ b/src/tests/functional/plugin/gna/scale_factors_tests/matmul_overload_correction.cpp
@@ -0,0 +1,141 @@
+// Copyright (C) 2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vector>
+#include <memory>
+#include <tuple>
+#include <map>
+#include <string>
+
+#include <ie_core.hpp>
+
+#include "common_test_utils/common_utils.hpp"
+#include "functional_test_utils/plugin_cache.hpp"
+#include "shared_test_classes/base/layer_test_utils.hpp"
+#include "functional_test_utils/blob_utils.hpp"
+#include "ngraph_functions/utils/ngraph_helpers.hpp"
+#include "ngraph_functions/builders.hpp"
+
+#include "ngraph_functions/pass/convert_prc.hpp"
+
+typedef std::tuple<
+    InferenceEngine::Precision,          // Network Precision
+    std::string,                         // Target Device
+    std::map<std::string, std::string>,  // Configuration
+    std::vector<size_t>,                 // Input shape
+    bool,                                // Constant second input
+    bool                                 // Swap inputs
+> matmulOverloadCorrectionParams;
+
+namespace LayerTestsDefinitions {
+
+class MatMulOverloadCorrectionTest : public testing::WithParamInterface<matmulOverloadCorrectionParams>,
+    public LayerTestsUtils::LayerTestsCommon {
+public:
+    static std::string getTestCaseName(testing::TestParamInfo<matmulOverloadCorrectionParams> obj) {
+        InferenceEngine::Precision netPrecision;
+        std::string targetDevice;
+        std::map<std::string, std::string> configuration;
+        std::vector<size_t> inputShape;
+        bool isSecondInputConst, swapInputs;
+        std::tie(netPrecision, targetDevice, configuration, inputShape, isSecondInputConst, swapInputs) = obj.param;
+
+        std::ostringstream result;
+        result << "netPRC=" << netPrecision.name() << "_";
+        result << "targetDevice=" << targetDevice << "_";
+        for (auto const& configItem : configuration) {
+            result << "_configItem=" << configItem.first << "_" << configItem.second;
+        }
+        result << "_IS=" << CommonTestUtils::vec2str(inputShape);
+        result << "_secondInput=" << (isSecondInputConst ? "const" : "param");
+        result << "_swapInputs=" << swapInputs;
+
+        return result.str();
+    }
+
+protected:
+    void SetUp() override {
+        InferenceEngine::Precision netPrecision;
+        bool isSecondInputConst, swapInputs;
+        std::vector<size_t> inputShape;
+
+        std::tie(netPrecision, targetDevice, configuration, inputShape, isSecondInputConst, swapInputs) = this->GetParam();
+        auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
+
+        const ngraph::Shape shape1 = inputShape;
+        const ngraph::Shape shape2 = {1, inputShape[1] * inputShape[1]};
+        const float maxInputValue = 10.0f;
+        auto params = ngraph::builder::makeParams(ngPrc, {shape1});
+        std::shared_ptr<ngraph::Node> input2;
+        if (isSecondInputConst) {
+            input2 = ngraph::builder::makeConstant<float>(ngPrc, ngraph::Shape{shape1[1], shape1[1]},
+                CommonTestUtils::generate_float_numbers(shape2[1], 0.0f, maxInputValue));
+        } else {
+            input2 = ngraph::builder::makeInputLayer(ngPrc, ngraph::helpers::InputLayerType::PARAMETER, shape2);
+            params.push_back(std::dynamic_pointer_cast<ngraph::opset8::Parameter>(input2));
+        }
+
+        auto lowNodeIn1 = ngraph::builder::makeConstant<float>(ngPrc, {1}, { -maxInputValue });
+        auto highNodeIn1 = ngraph::builder::makeConstant<float>(ngPrc, {1}, { maxInputValue });
+        auto fqIn1 = std::make_shared<ngraph::opset8::FakeQuantize>(params[0], lowNodeIn1, highNodeIn1,
+            lowNodeIn1, highNodeIn1, levels16);
+
+        auto lowNodeIn2 = ngraph::builder::makeConstant<float>(ngPrc, {1}, { -maxInputValue });
+        auto highNodeIn2 = ngraph::builder::makeConstant<float>(ngPrc, {1}, { maxInputValue });
+        auto fqIn2 = std::make_shared<ngraph::opset8::FakeQuantize>(input2, lowNodeIn2, highNodeIn2,
+            lowNodeIn2, highNodeIn2, levels16);
+
+        std::shared_ptr<ngraph::Node> matmul_input2 = fqIn2;
+        if (!isSecondInputConst) {
+            auto pattern = std::make_shared<ngraph::opset8::Constant>(ngraph::element::Type_t::i64,
+                ngraph::Shape{ 2 }, ngraph::Shape{shape1[1], shape1[1]});
+            matmul_input2 = std::make_shared<ngraph::opset8::Reshape>(fqIn2, pattern, false);
+        }
+
+        auto matmul = swapInputs ?
+            std::dynamic_pointer_cast<ngraph::opset8::MatMul>(ngraph::builder::makeMatMul(matmul_input2, fqIn1, false, true)) :
+            std::dynamic_pointer_cast<ngraph::opset8::MatMul>(ngraph::builder::makeMatMul(fqIn1, matmul_input2, false, true));
+
+        auto lowNodeOut = ngraph::builder::makeConstant<float>(ngPrc, {1}, { -maxInputValue * maxInputValue * inputShape[1] / 10 });
+        auto highNodeOut = ngraph::builder::makeConstant<float>(ngPrc, {1}, { maxInputValue * maxInputValue * inputShape[1] / 10 });
+        auto fqOut = std::make_shared<ngraph::opset8::FakeQuantize>(matmul, lowNodeOut, highNodeOut,
+            lowNodeOut, highNodeOut, levels32);
+
+        ngraph::ResultVector results{std::make_shared<ngraph::opset8::Result>(fqOut)};
+        function = std::make_shared<ngraph::Function>(results, params, "MatMulOverloadCorrection");
+    }
+
+    const size_t levels16 = std::numeric_limits<uint16_t>::max();
+    const size_t levels32 = std::numeric_limits<uint32_t>::max();
+};
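A back-of-the-envelope check of why these FakeQuantize ranges force overload correction for the {1, 128} case. The numbers come from the test constants above; the int32 bound and the levels / (max - min) rule for deriving a scale factor from statistics are assumptions about the plugin's quantization, used here only for illustration.

// Sketch only: without a reducer, the quantized MatMul output would not fit int32.
#include <cstdint>
#include <cstdio>
#include <limits>

int main() {
    const double maxInputValue = 10.0;                      // both input FQs cover [-10, 10]
    const double k = 128.0;                                 // reduction dimension for shape {1, 128}
    const double levels16 = 65535.0;                        // levels of both input FQs
    const double in_scale = levels16 / (2.0 * maxInputValue);        // ~3276.75 per input
    const double dst_scale = in_scale * in_scale;                    // ~1.07e7 combined
    const double abs_max_out = maxInputValue * maxInputValue * k / 10.0;  // 1280, the output FQ bound
    const double int32_max = static_cast<double>(std::numeric_limits<int32_t>::max());
    std::printf("quantized |max| ~= %.3g, int32 max ~= %.3g, reducer ~= %.1f\n",
                abs_max_out * dst_scale, int32_max, abs_max_out * dst_scale / int32_max);
    return 0;  // prints a reducer of roughly 6.4, so correction must kick in
}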
+
+TEST_P(MatMulOverloadCorrectionTest, CompareWithRefImpl) {
+    Run();
+};
+
+const std::vector<InferenceEngine::Precision> netPrecisions = {
+    InferenceEngine::Precision::FP32,
+    InferenceEngine::Precision::FP16
+};
+
+const std::vector<std::map<std::string, std::string>> configs = {
+    {
+        {"GNA_DEVICE_MODE", "GNA_SW_EXACT"}
+    }
+};
+
+const std::vector<std::vector<size_t>> inputShapes = {
+    {1, 128},
+    {1, 256}
+};
+
+INSTANTIATE_TEST_SUITE_P(smoke_base, MatMulOverloadCorrectionTest,
+    ::testing::Combine(
+        ::testing::ValuesIn(netPrecisions),
+        ::testing::Values(CommonTestUtils::DEVICE_GNA),
+        ::testing::ValuesIn(configs),
+        ::testing::ValuesIn(inputShapes),
+        ::testing::ValuesIn({true, false}),
+        ::testing::ValuesIn({true, false})),
+    MatMulOverloadCorrectionTest::getTestCaseName);
+} // namespace LayerTestsDefinitions
\ No newline at end of file
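For orientation, the Combine() above expands to the full cross product of the parameter lists, so the suite registers 16 test instances; a tiny self-contained check of that arithmetic:

// Sketch only: instance count implied by the parameter lists above.
constexpr int kPrecisions = 2, kConfigs = 1, kShapes = 2, kConstFlags = 2, kSwapFlags = 2;
constexpr int kInstances = kPrecisions * kConfigs * kShapes * kConstFlags * kSwapFlags;
static_assert(kInstances == 16, "2 precisions * 1 config * 2 shapes * 2 * 2 flag values");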