[GNA] Support of overload correction for MatMul with 2 non-constant layers (#10447)
parent 09246e2db8
commit 4e0a740eb3
@@ -112,7 +112,9 @@ ngraph::pass::ConvertMatMulToFC::ConvertMatMulToFC() {
         // we replace MatMul with FullyConnected operation.
         // Otherwise we replace MatMul with Gemm.
         auto fq_after_const = std::dynamic_pointer_cast<opset1::FakeQuantize>(fc_input_b.get_node_shared_ptr());
-        if ((std::dynamic_pointer_cast<opset1::Constant> (fc_input_b.get_node_shared_ptr()) || fq_after_const) &&
+        bool is_fq_after_const = fq_after_const &&
+            std::dynamic_pointer_cast<opset1::Constant>(fc_input_b.get_node_shared_ptr()->input_value(0).get_node_shared_ptr());
+        if ((std::dynamic_pointer_cast<opset1::Constant> (fc_input_b.get_node_shared_ptr()) || is_fq_after_const) &&
             std::count_if(shape_b.begin(), shape_b.end(), [](size_t x) {
                 return x != 1;
             }) <= 2) {
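For context, the new is_fq_after_const flag only treats the second MatMul input as constant weights when the FakeQuantize is fed directly by a Constant. The standalone sketch below (illustrative only, not part of this change; the function name, shapes, and values are made up) builds such a graph with the ngraph opset8 API so the accepted pattern is easy to see.

#include <ngraph/ngraph.hpp>
#include <ngraph/opsets/opset8.hpp>

// Second input is Constant -> FakeQuantize, i.e. the case the updated check accepts.
std::shared_ptr<ngraph::Function> makeFqWeightsMatMul() {
    using namespace ngraph;
    auto input = std::make_shared<opset8::Parameter>(element::f32, Shape{1, 8});
    auto weights = opset8::Constant::create(element::f32, Shape{8, 8}, std::vector<float>(64, 0.1f));
    auto low = opset8::Constant::create(element::f32, Shape{1}, {-1.0f});
    auto high = opset8::Constant::create(element::f32, Shape{1}, {1.0f});
    // FakeQuantize directly after the Constant: treated like a plain constant second input.
    auto fq = std::make_shared<opset8::FakeQuantize>(weights, low, high, low, high, 65535);
    auto matmul = std::make_shared<opset8::MatMul>(input, fq, false, true);
    return std::make_shared<Function>(OutputVector{matmul->output(0)}, ParameterVector{input});
}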
@@ -207,6 +207,86 @@ static double calculateWeightsReducerFromDstStats(Quantization dst_quant) {
     return weightsReducer;
 }
 
+/**
+ * @brief Tries to re-quantize an input to reach the desired output scale factor value.
+ * This function searches for layers above which output scale factors can be changed:
+ * - activations,
+ * - constants,
+ * - weightable layers (output scale factor is modified by modification of weights scale factor).
+ * @param input input to be re-quantized
+ * @param newOutputScale the desired output scale factor value
+ * @param result information about the restarted layer
+ * @return true if the input can be re-quantized
+ */
+static bool requantizeInput(InferenceEngine::CNNLayerPtr input, float newOutputScale, ScaleFactorUpdateResult &result) {
+    auto layer = input;
+    while (layer && !LayerInfo(layer).isInput() && !LayerInfo(layer).isMemory() && !LayerInfo(layer).isCopy()) {
+        size_t prevInputIdx = 0;
+        auto info = LayerInfo(layer);
+        auto quantDataForInputLayer = InferenceEngine::getInjectedData<QuantizedLayerParams>(*layer);
+        if (quantDataForInputLayer->_dst_quant.IsStatsSet()) {
+            auto levels = LayerInfo(layer).has32BOutput() ? (std::numeric_limits<uint32_t>::max() + 1ul) :
+                (std::numeric_limits<uint16_t>::max() + 1ul);
+            auto maxSF = CalculateScaleFactorFromStats(levels, quantDataForInputLayer->_dst_quant.GetMinValues().front(),
+                quantDataForInputLayer->_dst_quant.GetMaxValues().front());
+            if (newOutputScale > maxSF) {
+                gnalog() << layer->name << ": Scale factor " << newOutputScale << " is too large. The maximum scale factor: "
+                    << maxSF << " levels=" << levels << " min=" << quantDataForInputLayer->_dst_quant.GetMinValues().front()
+                    << " max=" << quantDataForInputLayer->_dst_quant.GetMaxValues().front() << "\n";
+                return false;
+            }
+        }
+        if (info.isActivation() || info.isConst()) {
+            gnawarn() << "[WARNING] requantize " << layer->name
+                << ". Layer new output scale: " << newOutputScale
+                << ", was " << quantDataForInputLayer->_dst_quant.GetScale() << std::endl;
+            quantDataForInputLayer->_dst_quant.SetScale(newOutputScale);
+            result = ScaleFactorUpdateResult(layer.get());
+            return true;
+        }
+
+        if (info.isWeightableIdentity() && !fp32eq(quantDataForInputLayer->_weights_quant.GetScale(), 1.0f)) {
+            auto reducer = std::max(1.0f, quantDataForInputLayer->_dst_quant.GetScale() / newOutputScale);
+            auto newWeightsScale = std::max(1.0f, quantDataForInputLayer->_weights_quant.GetScale() / reducer);
+            quantDataForInputLayer->_weights_quant.SetScale(static_cast<int32_t>(newWeightsScale));
+            quantDataForInputLayer->_dst_quant.SetScale(quantDataForInputLayer->_weights_quant.GetScale() *
+                quantDataForInputLayer->_src_quant.GetScale());
+
+            result = ScaleFactorUpdateResult(layer.get());
+            return true;
+        }
+
+        if (info.isFullyConnected() || info.isConvolution()) {
+            quantDataForInputLayer->_dst_quant.SetScale(newOutputScale);
+            quantDataForInputLayer->_weights_quant.SetScale(newOutputScale / quantDataForInputLayer->_src_quant.GetScale());
+            result = ScaleFactorUpdateResult(layer.get());
+            return true;
+        }
+
+        if (LayerInfo(layer).isEltwise()) {
+            // re-quantize bias branch for Eltwise layer
+            if (!LayerInfo(input).has32BOutput()) {
+                break;
+            }
+
+            for (uint8_t ix = 0; ix < 2; ++ix) {
+                if (LayerInfo(InferenceEngine::CNNNetPrevLayer(layer, ix)).has32BOutput()) {
+                    prevInputIdx = ix;
+                    break;
+                }
+            }
+            auto prevLayer = InferenceEngine::CNNNetPrevLayer(layer, prevInputIdx);
+            auto prevQuantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*prevLayer);
+            newOutputScale *= prevQuantData->_dst_quant.GetScale() / quantDataForInputLayer->_dst_quant.GetScale();
+        }
+
+        layer = InferenceEngine::CNNNetHasPrevLayer(layer.get(), prevInputIdx) ?
+            InferenceEngine::CNNNetPrevLayer(layer, prevInputIdx) : nullptr;
+    }
+
+    return false;
+}
+
 /**
  * @brief calculates output scale factor per layer
  * @tparam T
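The weightable-identity branch above lowers a layer's output scale factor indirectly, by shrinking its weights scale factor. A standalone arithmetic sketch of that computation (plain C++, made-up scale factor values, not plugin code):

#include <algorithm>
#include <cstdio>

int main() {
    float srcScale = 2048.0f;      // assumed incoming (source) scale factor
    float weightsScale = 16.0f;    // assumed identity weights scale factor
    float dstScale = srcScale * weightsScale;   // 32768
    float newOutputScale = 8192.0f;             // target requested by the layer below

    float reducer = std::max(1.0f, dstScale / newOutputScale);          // 4
    float newWeightsScale = std::max(1.0f, weightsScale / reducer);     // 4
    float newDstScale = newWeightsScale * srcScale;                     // 8192, the requested value

    std::printf("reducer=%g newWeightsScale=%g newDstScale=%g\n",
                reducer, newWeightsScale, newDstScale);
    return 0;
}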
@@ -1292,22 +1372,16 @@ public:
         auto quantParams0 = InferenceEngine::getInjectedData<QuantizedLayerParams>(in0);
         quantData->_src_quant.SetScale(quantParams0->_dst_quant.GetScale());
         quantData->_weights_quant.SetScale(quantParams1->_dst_quant.GetScale());
-        if (quantData->_src_quant.IsStatsSet()) {
-            auto getScale = [&quantParams0](size_t i) {
-                return CalculateScaleFactorFromStats(quantParams0->_dst_quant.GetLevels(),
-                    quantParams0->_dst_quant.GetMinValues(false)[i], quantParams0->_dst_quant.GetMaxValues(false)[i]);
-            };
-            float min_channel_scale = getScale(0);
-            quantParams0->_dst_quant.SetScale(min_channel_scale);
-            quantData->_src_quant.SetScale(min_channel_scale);
-        }
         quantData->_dst_quant.SetScale(
             quantData->_src_quant.GetScale() * quantData->_weights_quant.GetScale());
 
-        // If the first input is const it's possible to reduce its scale factor to avoid overflow
-        if (LayerInfo(in0).isConst() && quantData->_dst_quant.IsStatsSet()) {
-            // Adjust weights scale factor if output values exceed int32 maximum value
-            auto weightsReducer = calculateWeightsReducerFromDstStats(quantData->_dst_quant);
+        if (!quantData->_dst_quant.IsStatsSet()) {
+            return true;
+        }
+
+        // Adjust weights scale factor if output values exceed int32 maximum value
+        auto weightsReducer = calculateWeightsReducerFromDstStats(quantData->_dst_quant);
+        if (LayerInfo(in0).isConst()) {
             if (!fp32eq(weightsReducer, 1.0f)) {
                 quantParams0->_dst_quant.SetScale(quantData->_src_quant.GetScale() / weightsReducer);
                 quantData->_src_quant.SetScale(quantData->_src_quant.GetScale() / weightsReducer);
@@ -1318,6 +1392,18 @@ public:
             }
 
             quantData->_dst_quant.SetScale(quantData->_weights_quant.GetScale() * quantData->_src_quant.GetScale());
+        } else {
+            if (!fp32eq(weightsReducer, 1.0f)) {
+                for (int i = 0; i < 2; ++i) {
+                    auto input = InferenceEngine::CNNNetPrevLayer(gemmLayer, i);
+                    auto quantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(input);
+                    float newOutputScale = quantParams->_dst_quant.GetScale() / weightsReducer;
+                    if (requantizeInput(input, newOutputScale, result)) {
+                        return true;
+                    }
+                }
+                THROW_GNA_EXCEPTION << "Unable to quantize " << gemmLayer->name;
+            }
         }
         return true;
     }
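When neither Gemm input is constant, the overflow cannot be absorbed by rescaling constant weights, so each input is offered a new output scale of its current scale divided by weightsReducer, and requantizeInput walks upwards to apply it. calculateWeightsReducerFromDstStats itself is not shown in this diff; the standalone sketch below only illustrates the int32-accumulator saturation condition such a reducer guards against, with made-up statistics and scale factors rather than the plugin's exact formula.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <limits>

int main() {
    const float maxOutput = 12800.0f;    // assumed |max| of the real output, from collected dst statistics
    const float srcScale = 3276.0f;      // assumed input scale factor
    const float weightsScale = 1024.0f;  // assumed second-input (weights) scale factor
    const double dstScale = static_cast<double>(srcScale) * weightsScale;

    // Factor by which the scaled output would overshoot the int32 range, clamped to >= 1.
    const double reducer = std::max(1.0,
        maxOutput * dstScale / std::numeric_limits<int32_t>::max());

    std::printf("dstScale=%g reducer=%g corrected weightsScale=%g\n",
                dstScale, reducer, weightsScale / reducer);
    return 0;
}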
@@ -0,0 +1,141 @@
+// Copyright (C) 2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vector>
+#include <memory>
+#include <tuple>
+#include <vector>
+#include <string>
+
+#include <ie_core.hpp>
+
+#include "common_test_utils/common_utils.hpp"
+#include "functional_test_utils/plugin_cache.hpp"
+#include "shared_test_classes/base/layer_test_utils.hpp"
+#include "functional_test_utils/blob_utils.hpp"
+#include "ngraph_functions/utils/ngraph_helpers.hpp"
+#include "ngraph_functions/builders.hpp"
+
+#include "ngraph_functions/pass/convert_prc.hpp"
+
+typedef std::tuple<
+    InferenceEngine::Precision,          // Network Precision
+    std::string,                         // Target Device
+    std::map<std::string, std::string>,  // Configuration
+    std::vector<size_t>,                 // Input shape
+    bool,                                // Constant second input
+    bool                                 // Swap inputs
+> matmulOverloadCorrectionParams;
+
+namespace LayerTestsDefinitions {
+
+class MatMulOverloadCorrectionTest : public testing::WithParamInterface<matmulOverloadCorrectionParams>,
+    public LayerTestsUtils::LayerTestsCommon {
+public:
+    static std::string getTestCaseName(testing::TestParamInfo<matmulOverloadCorrectionParams> obj) {
+        InferenceEngine::Precision netPrecision;
+        std::string targetDevice;
+        std::map<std::string, std::string> configuration;
+        std::vector<size_t> inputShape;
+        bool isSecondInputConst, swapInputs;
+        std::tie(netPrecision, targetDevice, configuration, inputShape, isSecondInputConst, swapInputs) = obj.param;
+
+        std::ostringstream result;
+        result << "netPRC=" << netPrecision.name() << "_";
+        result << "targetDevice=" << targetDevice << "_";
+        for (auto const& configItem : configuration) {
+            result << "_configItem=" << configItem.first << "_" << configItem.second;
+        }
+        result << "_IS=" << CommonTestUtils::vec2str(inputShape);
+        result << "_secondInput=" << (isSecondInputConst ? "const" : "param");
+        result << "_swapInputs=" << swapInputs;
+
+        return result.str();
+    }
+
+protected:
+    void SetUp() override {
+        InferenceEngine::Precision netPrecision;
+        bool isSecondInputConst, swapInputs;
+        std::vector<size_t> inputShape;
+
+        std::tie(netPrecision, targetDevice, configuration, inputShape, isSecondInputConst, swapInputs) = this->GetParam();
+        auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
+
+        const ngraph::Shape shape1 = inputShape;
+        const ngraph::Shape shape2 = {1, inputShape[1] * inputShape[1]};
+        const float maxInputValue = 10.0f;
+        auto params = ngraph::builder::makeParams(ngPrc, {shape1});
+        std::shared_ptr<ngraph::Node> input2;
+        if (isSecondInputConst) {
+            input2 = ngraph::builder::makeConstant<float>(ngPrc, ngraph::Shape{shape1[1], shape1[1]},
+                CommonTestUtils::generate_float_numbers(shape2[1], 0.0f, maxInputValue));
+        } else {
+            input2 = ngraph::builder::makeInputLayer(ngPrc, ngraph::helpers::InputLayerType::PARAMETER, shape2);
+            params.push_back(std::dynamic_pointer_cast<ngraph::opset8::Parameter>(input2));
+        }
+
+        auto lowNodeIn1 = ngraph::builder::makeConstant<float>(ngPrc, {1}, { -maxInputValue });
+        auto highNodeIn1 = ngraph::builder::makeConstant<float>(ngPrc, {1}, { maxInputValue });
+        auto fqIn1 = std::make_shared<ngraph::opset8::FakeQuantize>(params[0], lowNodeIn1, highNodeIn1,
+            lowNodeIn1, highNodeIn1, levels16);
+
+        auto lowNodeIn2 = ngraph::builder::makeConstant<float>(ngPrc, {1}, { -maxInputValue });
+        auto highNodeIn2 = ngraph::builder::makeConstant<float>(ngPrc, {1}, { maxInputValue });
+        auto fqIn2 = std::make_shared<ngraph::opset8::FakeQuantize>(input2, lowNodeIn2, highNodeIn2,
+            lowNodeIn2, highNodeIn2, levels16);
+
+        std::shared_ptr<ngraph::Node> matmul_input2 = fqIn2;
+        if (!isSecondInputConst) {
+            auto pattern = std::make_shared<ngraph::opset8::Constant>(ngraph::element::Type_t::i64,
+                ngraph::Shape{ 2 }, ngraph::Shape{shape1[1], shape1[1]});
+            matmul_input2 = std::make_shared<ngraph::opset8::Reshape>(fqIn2, pattern, false);
+        }
+
+        auto matmul = swapInputs ? std::dynamic_pointer_cast<ngraph::opset8::MatMul>(ngraph::builder::makeMatMul(matmul_input2, fqIn1, false, true)) :
+            std::dynamic_pointer_cast<ngraph::opset8::MatMul>(ngraph::builder::makeMatMul(fqIn1, matmul_input2, false, true));
+
+        auto lowNodeOut = ngraph::builder::makeConstant<float>(ngPrc, {1}, { -maxInputValue * maxInputValue * inputShape[1] / 10 });
+        auto highNodeOut = ngraph::builder::makeConstant<float>(ngPrc, {1}, { maxInputValue * maxInputValue * inputShape[1] / 10 });
+        auto fqOut = std::make_shared<ngraph::opset8::FakeQuantize>(matmul, lowNodeOut, highNodeOut,
+            lowNodeOut, highNodeOut, levels32);
+
+        ngraph::ResultVector results{std::make_shared<ngraph::opset8::Result>(fqOut)};
+        function = std::make_shared<ngraph::Function>(results, params, "MatMulOverloadCorrection");
+    }
+
+    const size_t levels16 = std::numeric_limits<uint16_t>::max();
+    const size_t levels32 = std::numeric_limits<uint32_t>::max();
+};
+
+TEST_P(MatMulOverloadCorrectionTest, CompareWithRefImpl) {
+    Run();
+};
+
+const std::vector<InferenceEngine::Precision> netPrecisions = {
+    InferenceEngine::Precision::FP32,
+    InferenceEngine::Precision::FP16
+};
+
+const std::vector<std::map<std::string, std::string>> configs = {
+    {
+        {"GNA_DEVICE_MODE", "GNA_SW_EXACT"}
+    }
+};
+
+const std::vector<std::vector<size_t>> inputShapes = {
+    {1, 128},
+    {1, 256}
+};
+
+INSTANTIATE_TEST_SUITE_P(smoke_base, MatMulOverloadCorrectionTest,
+    ::testing::Combine(
+        ::testing::ValuesIn(netPrecisions),
+        ::testing::Values(CommonTestUtils::DEVICE_GNA),
+        ::testing::ValuesIn(configs),
+        ::testing::ValuesIn(inputShapes),
+        ::testing::ValuesIn({true, false}),
+        ::testing::ValuesIn({true, false})),
+    MatMulOverloadCorrectionTest::getTestCaseName);
+} // namespace LayerTestsDefinitions