From 4e0a740eb3ee31062ba0df88fcf438564f67edb7 Mon Sep 17 00:00:00 2001
From: Elizaveta Lobanova
Date: Thu, 10 Mar 2022 15:16:17 +0300
Subject: [PATCH] [GNA] Support of overload correction for MatMul with 2
 non-constant layers (#10447)

---
 .../convert_matmul_to_fc_or_gemm.cpp          |   4 +-
 .../intel_gna/frontend/scale_factor_calc.hpp  | 112 ++++++++++++--
 .../matmul_overload_correction.cpp            | 141 ++++++++++++++++++
 3 files changed, 243 insertions(+), 14 deletions(-)
 create mode 100644 src/tests/functional/plugin/gna/scale_factors_tests/matmul_overload_correction.cpp

diff --git a/src/common/legacy/src/transformations/convert_opset1_to_legacy/convert_matmul_to_fc_or_gemm.cpp b/src/common/legacy/src/transformations/convert_opset1_to_legacy/convert_matmul_to_fc_or_gemm.cpp
index 2006b716fe3..5a7bfb05bf6 100644
--- a/src/common/legacy/src/transformations/convert_opset1_to_legacy/convert_matmul_to_fc_or_gemm.cpp
+++ b/src/common/legacy/src/transformations/convert_opset1_to_legacy/convert_matmul_to_fc_or_gemm.cpp
@@ -112,7 +112,9 @@ ngraph::pass::ConvertMatMulToFC::ConvertMatMulToFC() {
         //  we replace MatMul with FullyConnected operation.
         //  Otherwise we replace MatMul with Gemm.
         auto fq_after_const = std::dynamic_pointer_cast<ngraph::opset1::FakeQuantize>(fc_input_b.get_node_shared_ptr());
-        if ((std::dynamic_pointer_cast<ngraph::opset1::Constant>(fc_input_b.get_node_shared_ptr()) || fq_after_const) &&
+        bool is_fq_after_const = fq_after_const &&
+            std::dynamic_pointer_cast<ngraph::opset1::Constant>(fc_input_b.get_node_shared_ptr()->input_value(0).get_node_shared_ptr());
+        if ((std::dynamic_pointer_cast<ngraph::opset1::Constant>(fc_input_b.get_node_shared_ptr()) || is_fq_after_const) &&
             std::count_if(shape_b.begin(), shape_b.end(), [](size_t x) { return x != 1; }) <= 2) {
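For readers outside this pass, a minimal standalone sketch of the condition the hunk above introduces: the second MatMul input is now treated as FullyConnected weights when it is either a Constant, or a FakeQuantize whose data input is a Constant. The helper name is illustrative and the ngraph::opset1 types are an assumption; the patch itself applies the check inline on fc_input_b.

// Sketch only: mirrors the updated condition, assuming ngraph::opset1 types.
#include <memory>
#include <ngraph/ngraph.hpp>
#include <ngraph/opsets/opset1.hpp>

// Hypothetical helper: true when 'input_b' can be treated as constant FC weights,
// i.e. it is a Constant, or a FakeQuantize whose first input is a Constant.
static bool is_constant_weights_path(const ngraph::Output<ngraph::Node>& input_b) {
    auto node = input_b.get_node_shared_ptr();
    if (std::dynamic_pointer_cast<ngraph::opset1::Constant>(node)) {
        return true;
    }
    auto fq = std::dynamic_pointer_cast<ngraph::opset1::FakeQuantize>(node);
    return fq && std::dynamic_pointer_cast<ngraph::opset1::Constant>(
        fq->input_value(0).get_node_shared_ptr());
}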
diff --git a/src/plugins/intel_gna/frontend/scale_factor_calc.hpp b/src/plugins/intel_gna/frontend/scale_factor_calc.hpp
index dac30e426a9..c16e6d0c88a 100644
--- a/src/plugins/intel_gna/frontend/scale_factor_calc.hpp
+++ b/src/plugins/intel_gna/frontend/scale_factor_calc.hpp
@@ -207,6 +207,86 @@ static double calculateWeightsReducerFromDstStats(Quantization dst_quant) {
     return weightsReducer;
 }
 
+/**
+ * @brief Tries to re-quantize an input to reach the desired output scale factor value.
+ * This function searches for layers above which output scale factors can be changed:
+ * - activations,
+ * - constants,
+ * - weightable layers (output scale factor is modified by modification of weights scale factor).
+ * @param input input to be re-quantized
+ * @param newOutputScale the desired output scale factor value
+ * @param result information about the restarted layer
+ * @return true if the input can be re-quantized
+ */
+static bool requantizeInput(InferenceEngine::CNNLayerPtr input, float newOutputScale, ScaleFactorUpdateResult &result) {
+    auto layer = input;
+    while (layer && !LayerInfo(layer).isInput() && !LayerInfo(layer).isMemory() && !LayerInfo(layer).isCopy()) {
+        size_t prevInputIdx = 0;
+        auto info = LayerInfo(layer);
+        auto quantDataForInputLayer = InferenceEngine::getInjectedData<QuantizedLayerParams>(*layer);
+        if (quantDataForInputLayer->_dst_quant.IsStatsSet()) {
+            auto levels = LayerInfo(layer).has32BOutput() ? (std::numeric_limits<uint32_t>::max() + 1ul) :
+                (std::numeric_limits<uint16_t>::max() + 1ul);
+            auto maxSF = CalculateScaleFactorFromStats(levels, quantDataForInputLayer->_dst_quant.GetMinValues().front(),
+                quantDataForInputLayer->_dst_quant.GetMaxValues().front());
+            if (newOutputScale > maxSF) {
+                gnalog() << layer->name << ": Scale factor " << newOutputScale << " is too large. "
+                    << "The maximum scale factor: " << maxSF << " levels=" << levels
+                    << " min=" << quantDataForInputLayer->_dst_quant.GetMinValues().front()
+                    << " max=" << quantDataForInputLayer->_dst_quant.GetMaxValues().front() << "\n";
+                return false;
+            }
+        }
+
+        if (info.isActivation() || info.isConst()) {
+            gnawarn() << "[WARNING] requantize " << layer->name
+                << ". Layer new output scale: " << newOutputScale
+                << ", was " << quantDataForInputLayer->_dst_quant.GetScale() << std::endl;
+            quantDataForInputLayer->_dst_quant.SetScale(newOutputScale);
+            result = ScaleFactorUpdateResult(layer.get());
+            return true;
+        }
+
+        if (info.isWeightableIdentity() && !fp32eq(quantDataForInputLayer->_weights_quant.GetScale(), 1.0f)) {
+            auto reducer = std::max(1.0f, quantDataForInputLayer->_dst_quant.GetScale() / newOutputScale);
+            auto newWeightsScale = std::max(1.0f, quantDataForInputLayer->_weights_quant.GetScale() / reducer);
+            quantDataForInputLayer->_weights_quant.SetScale(static_cast<int32_t>(newWeightsScale));
+            quantDataForInputLayer->_dst_quant.SetScale(quantDataForInputLayer->_weights_quant.GetScale() *
+                quantDataForInputLayer->_src_quant.GetScale());
+
+            result = ScaleFactorUpdateResult(layer.get());
+            return true;
+        }
+
+        if (info.isFullyConnected() || info.isConvolution()) {
+            quantDataForInputLayer->_dst_quant.SetScale(newOutputScale);
+            quantDataForInputLayer->_weights_quant.SetScale(newOutputScale / quantDataForInputLayer->_src_quant.GetScale());
+            result = ScaleFactorUpdateResult(layer.get());
+            return true;
+        }
+
+        if (LayerInfo(layer).isEltwise()) {
+            // re-quantize bias branch for Eltwise layer
+            if (!LayerInfo(input).has32BOutput()) {
+                break;
+            }
+
+            for (uint8_t ix = 0; ix < 2; ++ix) {
+                if (LayerInfo(InferenceEngine::CNNNetPrevLayer(layer, ix)).has32BOutput()) {
+                    prevInputIdx = ix;
+                    break;
+                }
+            }
+            auto prevLayer = InferenceEngine::CNNNetPrevLayer(layer, prevInputIdx);
+            auto prevQuantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*prevLayer);
+            newOutputScale *= prevQuantData->_dst_quant.GetScale() / quantDataForInputLayer->_dst_quant.GetScale();
+        }
+
+        layer = InferenceEngine::CNNNetHasPrevLayer(layer.get(), prevInputIdx) ?
+            InferenceEngine::CNNNetPrevLayer(layer, prevInputIdx) : nullptr;
+    }
+
+    return false;
+}
+
 /**
  * @brief calculates output scale factor per layer
  * @tparam T
  */
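A minimal sketch of the scale arithmetic in the weightable-identity branch of requantizeInput(), written against plain floats instead of QuantizedLayerParams; the struct and function names are illustrative, not part of the plugin.

// Sketch only: reduce a weightable identity's output scale towards a target
// by shrinking its (integer-valued) weights scale, never below 1.0f.
#include <algorithm>
#include <cstdint>

struct IdentityScales {
    float src;      // input scale factor
    float weights;  // weights scale factor
    float dst;      // output scale factor == src * weights
};

static void requantize_identity(IdentityScales& s, float newOutputScale) {
    const float reducer = std::max(1.0f, s.dst / newOutputScale);
    const float newWeightsScale = std::max(1.0f, s.weights / reducer);
    s.weights = static_cast<float>(static_cast<int32_t>(newWeightsScale));
    s.dst = s.weights * s.src;
}
// Example: src = 128, weights = 16, dst = 2048, target 512
// -> reducer = 4, weights becomes 4, dst becomes 512.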
@@ -1292,22 +1372,16 @@ public:
         auto quantParams0 = InferenceEngine::getInjectedData<QuantizedLayerParams>(in0);
         quantData->_src_quant.SetScale(quantParams0->_dst_quant.GetScale());
         quantData->_weights_quant.SetScale(quantParams1->_dst_quant.GetScale());
-        if (quantData->_src_quant.IsStatsSet()) {
-            auto getScale = [&quantParams0](size_t i) {
-                return CalculateScaleFactorFromStats(quantParams0->_dst_quant.GetLevels(),
-                    quantParams0->_dst_quant.GetMinValues(false)[i], quantParams0->_dst_quant.GetMaxValues(false)[i]);
-            };
-            float min_channel_scale = getScale(0);
-            quantParams0->_dst_quant.SetScale(min_channel_scale);
-            quantData->_src_quant.SetScale(min_channel_scale);
-        }
         quantData->_dst_quant.SetScale(
                 quantData->_src_quant.GetScale() * quantData->_weights_quant.GetScale());
-        // If the first input is const it's possible to reduce its scale factor to avoid overflow
-        if (LayerInfo(in0).isConst() && quantData->_dst_quant.IsStatsSet()) {
-            // Adjust weights scale factor if output values exceed int32 maximum value
-            auto weightsReducer = calculateWeightsReducerFromDstStats(quantData->_dst_quant);
+        if (!quantData->_dst_quant.IsStatsSet()) {
+            return true;
+        }
+
+        // Adjust weights scale factor if output values exceed int32 maximum value
+        auto weightsReducer = calculateWeightsReducerFromDstStats(quantData->_dst_quant);
+        if (LayerInfo(in0).isConst()) {
             if (!fp32eq(weightsReducer, 1.0f)) {
                 quantParams0->_dst_quant.SetScale(quantData->_src_quant.GetScale() / weightsReducer);
                 quantData->_src_quant.SetScale(quantData->_src_quant.GetScale() / weightsReducer);
@@ -1318,6 +1392,18 @@ public:
             }
             quantData->_dst_quant.SetScale(quantData->_weights_quant.GetScale() *
                 quantData->_src_quant.GetScale());
+        } else {
+            if (!fp32eq(weightsReducer, 1.0f)) {
+                for (int i = 0; i < 2; ++i) {
+                    auto input = InferenceEngine::CNNNetPrevLayer(gemmLayer, i);
+                    auto quantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(input);
+                    float newOutputScale = quantParams->_dst_quant.GetScale() / weightsReducer;
+                    if (requantizeInput(input, newOutputScale, result)) {
+                        return true;
+                    }
+                }
+                THROW_GNA_EXCEPTION << "Unable to quantize " << gemmLayer->name;
+            }
         }
         return true;
     }
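A sketch of the idea behind the new non-constant branch: when the estimated Gemm output would exceed the int32 accumulator, each input is asked in turn (via requantizeInput) to lower its own output scale by the same reducer. The reducer formula below is an assumption about what calculateWeightsReducerFromDstStats() computes (an int32-saturation estimate from destination statistics), not its literal body.

// Sketch only: estimate the reducer and the per-input target scale.
#include <algorithm>
#include <cstdint>
#include <limits>

// How much the combined scale must shrink so that
// |max output| * src_scale * weights_scale fits into int32.
static double estimate_weights_reducer(double abs_max_output, double dst_scale) {
    const double int32_max = static_cast<double>(std::numeric_limits<int32_t>::max());
    return std::max(1.0, abs_max_output * dst_scale / int32_max);
}

// Each non-constant Gemm input is then re-quantized towards
// new_scale = input_scale / reducer, as the branch above does.
static double target_input_scale(double input_scale, double reducer) {
    return input_scale / reducer;
}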
diff --git a/src/tests/functional/plugin/gna/scale_factors_tests/matmul_overload_correction.cpp b/src/tests/functional/plugin/gna/scale_factors_tests/matmul_overload_correction.cpp
new file mode 100644
index 00000000000..073144fd09a
--- /dev/null
+++ b/src/tests/functional/plugin/gna/scale_factors_tests/matmul_overload_correction.cpp
@@ -0,0 +1,141 @@
+// Copyright (C) 2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vector>
+#include <memory>
+#include <tuple>
+#include <map>
+#include <string>
+
+#include <ie_core.hpp>
+
+#include "common_test_utils/common_utils.hpp"
+#include "functional_test_utils/plugin_cache.hpp"
+#include "shared_test_classes/base/layer_test_utils.hpp"
+#include "functional_test_utils/blob_utils.hpp"
+#include "ngraph_functions/utils/ngraph_helpers.hpp"
+#include "ngraph_functions/builders.hpp"
+
+#include "ngraph_functions/pass/convert_prc.hpp"
+
+typedef std::tuple<
+    InferenceEngine::Precision,          // Network Precision
+    std::string,                         // Target Device
+    std::map<std::string, std::string>,  // Configuration
+    std::vector<size_t>,                 // Input shape
+    bool,                                // Constant second input
+    bool                                 // Swap inputs
+> matmulOverloadCorrectionParams;
+
+namespace LayerTestsDefinitions {
+
+class MatMulOverloadCorrectionTest : public testing::WithParamInterface<matmulOverloadCorrectionParams>,
+    public LayerTestsUtils::LayerTestsCommon {
+public:
+    static std::string getTestCaseName(testing::TestParamInfo<matmulOverloadCorrectionParams> obj) {
+        InferenceEngine::Precision netPrecision;
+        std::string targetDevice;
+        std::map<std::string, std::string> configuration;
+        std::vector<size_t> inputShape;
+        bool isSecondInputConst, swapInputs;
+        std::tie(netPrecision, targetDevice, configuration, inputShape, isSecondInputConst, swapInputs) = obj.param;
+
+        std::ostringstream result;
+        result << "netPRC=" << netPrecision.name() << "_";
+        result << "targetDevice=" << targetDevice << "_";
+        for (auto const& configItem : configuration) {
+            result << "_configItem=" << configItem.first << "_" << configItem.second;
+        }
+        result << "_IS=" << CommonTestUtils::vec2str(inputShape);
+        result << "_secondInput=" << (isSecondInputConst ? "const" : "param");
+        result << "_swapInputs=" << swapInputs;
+
+        return result.str();
+    }
+
+protected:
+    void SetUp() override {
+        InferenceEngine::Precision netPrecision;
+        bool isSecondInputConst, swapInputs;
+        std::vector<size_t> inputShape;
+
+        std::tie(netPrecision, targetDevice, configuration, inputShape, isSecondInputConst, swapInputs) = this->GetParam();
+        auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
+
+        const ngraph::Shape shape1 = inputShape;
+        const ngraph::Shape shape2 = {1, inputShape[1] * inputShape[1]};
+        const float maxInputValue = 10.0f;
+        auto params = ngraph::builder::makeParams(ngPrc, {shape1});
+        std::shared_ptr<ngraph::Node> input2;
+        if (isSecondInputConst) {
+            input2 = ngraph::builder::makeConstant<float>(ngPrc, ngraph::Shape{shape1[1], shape1[1]},
+                CommonTestUtils::generate_float_numbers(shape2[1], 0.0f, maxInputValue));
+        } else {
+            input2 = ngraph::builder::makeInputLayer(ngPrc, ngraph::helpers::InputLayerType::PARAMETER, shape2);
+            params.push_back(std::dynamic_pointer_cast<ngraph::opset8::Parameter>(input2));
+        }
+
+        auto lowNodeIn1 = ngraph::builder::makeConstant<float>(ngPrc, {1}, { -maxInputValue });
+        auto highNodeIn1 = ngraph::builder::makeConstant<float>(ngPrc, {1}, { maxInputValue });
+        auto fqIn1 = std::make_shared<ngraph::opset8::FakeQuantize>(params[0], lowNodeIn1, highNodeIn1,
+            lowNodeIn1, highNodeIn1, levels16);
+
+        auto lowNodeIn2 = ngraph::builder::makeConstant<float>(ngPrc, {1}, { -maxInputValue });
+        auto highNodeIn2 = ngraph::builder::makeConstant<float>(ngPrc, {1}, { maxInputValue });
+        auto fqIn2 = std::make_shared<ngraph::opset8::FakeQuantize>(input2, lowNodeIn2, highNodeIn2,
+            lowNodeIn2, highNodeIn2, levels16);
+
+        std::shared_ptr<ngraph::Node> matmul_input2 = fqIn2;
+        if (!isSecondInputConst) {
+            auto pattern = std::make_shared<ngraph::opset8::Constant>(ngraph::element::Type_t::i64,
+                ngraph::Shape{ 2 }, ngraph::Shape{shape1[1], shape1[1]});
+            matmul_input2 = std::make_shared<ngraph::opset8::Reshape>(fqIn2, pattern, false);
+        }
+
+        auto matmul = swapInputs ?
+            std::dynamic_pointer_cast<ngraph::opset8::MatMul>(ngraph::builder::makeMatMul(matmul_input2, fqIn1, false, true)) :
+            std::dynamic_pointer_cast<ngraph::opset8::MatMul>(ngraph::builder::makeMatMul(fqIn1, matmul_input2, false, true));
+
+        auto lowNodeOut = ngraph::builder::makeConstant<float>(ngPrc, {1}, { -maxInputValue * maxInputValue * inputShape[1] / 10 });
+        auto highNodeOut = ngraph::builder::makeConstant<float>(ngPrc, {1}, { maxInputValue * maxInputValue * inputShape[1] / 10 });
+        auto fqOut = std::make_shared<ngraph::opset8::FakeQuantize>(matmul, lowNodeOut, highNodeOut,
+            lowNodeOut, highNodeOut, levels32);
+
+        ngraph::ResultVector results{std::make_shared<ngraph::opset8::Result>(fqOut)};
+        function = std::make_shared<ngraph::Function>(results, params, "MatMulOverloadCorrection");
+    }
+
+    const size_t levels16 = std::numeric_limits<uint16_t>::max();
+    const size_t levels32 = std::numeric_limits<uint32_t>::max();
+};
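A back-of-the-envelope check of why these FakeQuantize ranges force overload correction for the {1, 128} case. The numbers come from the test constants above; the int32 bound and the levels / (max - min) rule for deriving a scale factor from statistics are assumptions about the plugin's quantization, used here only for illustration.

// Sketch only: without a reducer, the quantized MatMul output would not fit int32.
#include <cstdint>
#include <cstdio>
#include <limits>

int main() {
    const double maxInputValue = 10.0;                      // both input FQs cover [-10, 10]
    const double k = 128.0;                                 // reduction dimension for shape {1, 128}
    const double levels16 = 65535.0;                        // levels of both input FQs
    const double in_scale = levels16 / (2.0 * maxInputValue);        // ~3276.75 per input
    const double dst_scale = in_scale * in_scale;                    // ~1.07e7 combined
    const double abs_max_out = maxInputValue * maxInputValue * k / 10.0;  // 1280, the output FQ bound
    const double int32_max = static_cast<double>(std::numeric_limits<int32_t>::max());
    std::printf("quantized |max| ~= %.3g, int32 max ~= %.3g, reducer ~= %.1f\n",
                abs_max_out * dst_scale, int32_max, abs_max_out * dst_scale / int32_max);
    return 0;  // prints a reducer of roughly 6.4, so correction must kick in
}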
+
+TEST_P(MatMulOverloadCorrectionTest, CompareWithRefImpl) {
+    Run();
+};
+
+const std::vector<InferenceEngine::Precision> netPrecisions = {
+    InferenceEngine::Precision::FP32,
+    InferenceEngine::Precision::FP16
+};
+
+const std::vector<std::map<std::string, std::string>> configs = {
+    {
+        {"GNA_DEVICE_MODE", "GNA_SW_EXACT"}
+    }
+};
+
+const std::vector<std::vector<size_t>> inputShapes = {
+    {1, 128},
+    {1, 256}
+};
+
+INSTANTIATE_TEST_SUITE_P(smoke_base, MatMulOverloadCorrectionTest,
+    ::testing::Combine(
+        ::testing::ValuesIn(netPrecisions),
+        ::testing::Values(CommonTestUtils::DEVICE_GNA),
+        ::testing::ValuesIn(configs),
+        ::testing::ValuesIn(inputShapes),
+        ::testing::ValuesIn({true, false}),
+        ::testing::ValuesIn({true, false})),
+    MatMulOverloadCorrectionTest::getTestCaseName);
+} // namespace LayerTestsDefinitions
\ No newline at end of file
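For orientation, the Combine() above expands to the full cross product of the parameter lists, so the suite registers 16 test instances; a tiny self-contained check of that arithmetic:

// Sketch only: instance count implied by the parameter lists above.
constexpr int kPrecisions = 2, kConfigs = 1, kShapes = 2, kConstFlags = 2, kSwapFlags = 2;
constexpr int kInstances = kPrecisions * kConfigs * kShapes * kConstFlags * kSwapFlags;
static_assert(kInstances == 16, "2 precisions * 1 config * 2 shapes * 2 * 2 flag values");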