[GNA] Fixed a bug with channel multiplier for int8 weights quantization (#9234)

Elizaveta Lobanova 2021-12-20 18:12:56 +03:00 committed by GitHub
parent 1901f33bd3
commit d4fdbba7d8
3 changed files with 157 additions and 4 deletions


@@ -111,9 +111,9 @@ ngraph::pass::ConvertMatMulToFC::ConvertMatMulToFC() {
         // Check that if the second input is a Constant operation and its shape without ones dimensions has length <= 2
         // we replace MatMul with FullyConnected operation.
         // Otherwise we replace MatMul with Gemm.
-        if ((std::dynamic_pointer_cast<opset1::Constant> (fc_input_b.get_node_shared_ptr()) ||
-             std::dynamic_pointer_cast<opset1::FakeQuantize>(fc_input_b.get_node_shared_ptr())) &&
-            std::count_if(shape_b.begin(), shape_b.end(), [](size_t x) {
+        auto fq_after_const = std::dynamic_pointer_cast<opset1::FakeQuantize>(fc_input_b.get_node_shared_ptr());
+        if ((std::dynamic_pointer_cast<opset1::Constant> (fc_input_b.get_node_shared_ptr()) || fq_after_const) &&
+            std::count_if(shape_b.begin(), shape_b.end(), [](size_t x) {
                 return x != 1;
             }) <= 2) {
             Shape shape_a_aligned, shape_b_aligned;
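
As a quick standalone illustration of the rule described in the comment above (the shape value is made up, this is not plugin code): a weight input qualifies for FullyConnected when at most two of its dimensions differ from 1.

#include <algorithm>
#include <cstddef>
#include <vector>

int main() {
    std::vector<size_t> shape_b = {1, 64, 128};  // hypothetical weight shape
    auto non_unit_dims = std::count_if(shape_b.begin(), shape_b.end(),
                                       [](size_t x) { return x != 1; });
    // non_unit_dims == 2, so a MatMul with such a Constant/FakeQuantize weight input
    // would be replaced with FullyConnected rather than Gemm.
    return non_unit_dims <= 2 ? 0 : 1;
}
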
@@ -131,8 +131,18 @@ ngraph::pass::ConvertMatMulToFC::ConvertMatMulToFC() {
 
         // Weights normalization
         if (!matmul->get_transpose_b()) {
-            fc_input_b = create_transpose(fc_input_b, matmul->get_friendly_name() + "/transpose_b");
+            Output<ov::Node> constant = fc_input_b;
+            // transpose the constant itself, not the FQ output, to allow constant folding to apply this transpose
+            if (fq_after_const) {
+                constant = fc_input_b.get_node_shared_ptr()->input_value(0);
+            }
+            fc_input_b = create_transpose(constant, matmul->get_friendly_name() + "/transpose_b");
             new_ops.push_back(fc_input_b.get_node_shared_ptr());
+            if (fq_after_const) {
+                fc_input_b = fq_after_const->clone_with_new_inputs(OutputVector{fc_input_b, fq_after_const->input_value(1),
+                    fq_after_const->input_value(2), fq_after_const->input_value(3), fq_after_const->input_value(4)});
+                new_ops.push_back(fc_input_b.get_node_shared_ptr());
+            }
         }
 
         if (shape_b.size() != 2) {
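
The change above applies the Transpose introduced for non-transposed weights to the constant that feeds the FakeQuantize, not to the FakeQuantize output, so constant folding can materialize the transposed weights; the FakeQuantize is then rebuilt on top of the transposed constant with its original range inputs. A minimal standalone sketch of that re-wiring with the public ngraph API (the function name, the fixed 2-D permutation, and the use of opset1::Transpose instead of the plugin's create_transpose helper are illustrative assumptions):

#include <memory>
#include <vector>
#include <ngraph/ngraph.hpp>
#include <ngraph/opsets/opset1.hpp>

using namespace ngraph;

// 'weights' is assumed to point at a FakeQuantize whose first input is the weight Constant,
// i.e. the pattern handled by the fq_after_const branch above.
Output<Node> transpose_weights_under_fq(const Output<Node>& weights) {
    auto fq = std::dynamic_pointer_cast<opset1::FakeQuantize>(weights.get_node_shared_ptr());
    // Build the Transpose on the raw constant, not on the FQ output,
    // so constant folding can fold the transpose into the constant.
    auto order = opset1::Constant::create(element::i64, Shape{2}, std::vector<int64_t>{1, 0});
    auto transposed = std::make_shared<opset1::Transpose>(fq->input_value(0), order);
    // Re-create the FQ over the transposed constant, reusing its range inputs (1..4).
    return fq->clone_with_new_inputs(OutputVector{transposed, fq->input_value(1),
        fq->input_value(2), fq->input_value(3), fq->input_value(4)});
}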


@@ -258,6 +258,9 @@ void QuantizationCallback<int8_t, gna_compound_bias_t>::runFakeQuantize() const
             channel_multiplier = scaled_row_max / static_cast<float>(MAX_VAL_1B_WEIGHT);
         }
+        // channel multiplier shouldn't be 0
+        channel_multiplier = channel_multiplier == 0 ? 1 : channel_multiplier;
+
         ptr_int_biases[i].multiplier = static_cast<uint8_t> (channel_multiplier + 0.5f);
 
         if (channel_multiplier > MAX_OUT_MULTIPLIER) {
             THROW_GNA_EXCEPTION << "invalid channel multiplier: " << channel_multiplier;
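
The added clamp targets the failure named in the commit title: if a row of weights yields a per-channel multiplier of exactly 0 (for example, an all-zero weight row), the uint8 multiplier written to ptr_int_biases stays 0 and the whole output channel is silently zeroed. A small self-contained illustration of that arithmetic (the bound mirrors MAX_VAL_1B_WEIGHT above; the input value is made up):

#include <cstdint>
#include <iostream>

int main() {
    const float MAX_VAL_1B_WEIGHT = 127.0f;   // assumed int8 weight bound, as in the plugin code above
    // Hypothetical per-channel maximum that collapsed to zero (e.g. an all-zero weight row).
    float scaled_row_max = 0.0f;
    float channel_multiplier = scaled_row_max / MAX_VAL_1B_WEIGHT;          // 0.0f

    uint8_t without_fix = static_cast<uint8_t>(channel_multiplier + 0.5f);  // 0 -> channel output is zeroed
    channel_multiplier = channel_multiplier == 0 ? 1 : channel_multiplier;  // the added clamp
    uint8_t with_fix = static_cast<uint8_t>(channel_multiplier + 0.5f);     // 1 -> channel survives

    std::cout << "multiplier without fix: " << int(without_fix)
              << ", with fix: " << int(with_fix) << "\n";
    return 0;
}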


@@ -0,0 +1,140 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <vector>
#include <memory>
#include <tuple>
#include <string>
#include <ie_core.hpp>
#include "common_test_utils/common_utils.hpp"
#include "functional_test_utils/plugin_cache.hpp"
#include "shared_test_classes/base/layer_test_utils.hpp"
#include "functional_test_utils/blob_utils.hpp"
#include "ngraph_functions/utils/ngraph_helpers.hpp"
#include "ngraph_functions/builders.hpp"
#include "ngraph_functions/pass/convert_prc.hpp"
typedef std::tuple<
InferenceEngine::Precision, // Network Precision
std::string, // Target Device
std::map<std::string, std::string>, // Configuration
std::pair<float, float>, // Weights values
std::vector<size_t> // Input shapes
> matmulParams;
namespace LayerTestsDefinitions {
class PerchannelQuantTest : public testing::WithParamInterface<matmulParams>,
public LayerTestsUtils::LayerTestsCommon {
public:
static std::string getTestCaseName(testing::TestParamInfo<matmulParams> obj) {
InferenceEngine::Precision netPrecision;
std::string targetDevice;
std::map<std::string, std::string> configuration;
std::pair<float, float> weightsValues;
std::vector<size_t> inputShape;
std::tie(netPrecision, targetDevice, configuration, weightsValues, inputShape) = obj.param;
std::ostringstream result;
result << "netPRC=" << netPrecision.name() << "_";
result << "targetDevice=" << targetDevice << "_";
for (auto const& configItem : configuration) {
result << "_configItem=" << configItem.first << "_" << configItem.second;
}
result << "_range=(" << weightsValues.first << ", " << weightsValues.second << ")";
result << "_IS=(" << CommonTestUtils::vec2str(inputShape) << ")";
return result.str();
}
protected:
void SetUp() override {
InferenceEngine::Precision netPrecision;
std::pair<float, float> weightsValues;
std::vector<size_t> inputShape;
std::tie(netPrecision, targetDevice, configuration, weightsValues, inputShape) = this->GetParam();
auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
const ngraph::Shape constShape = {inputShape.back(), inputShape.back()};
auto params = ngraph::builder::makeParams(ngPrc, {inputShape});
std::vector<float> weights;
std::vector<float> weightsMin, weightsMax;
for (int i = 0; i < constShape.front(); ++i) {
// multiplier to increase weights ranges difference between different channels
float mul = (i % 2 ? 1.0 : 0.001);
float rowMin = weightsValues.first * mul;
float rowMax = weightsValues.second * mul;
auto rowWeights = CommonTestUtils::generate_float_numbers(constShape.back(), rowMin, rowMax);
weights.insert(std::end(weights), std::begin(rowWeights), std::end(rowWeights));
weightsMin.push_back(rowMin);
weightsMax.push_back(rowMax);
}
auto constant = ngraph::builder::makeConstant<float>(ngPrc, constShape, weights);
auto wLowNode = ngraph::builder::makeConstant<float>(ngPrc, {constShape.front()}, { weightsMin });
auto wHighNode = ngraph::builder::makeConstant<float>(ngPrc, {constShape.front()}, { weightsMax });
auto wFq = std::make_shared<ngraph::opset8::FakeQuantize>(constant, wLowNode, wHighNode, wLowNode, wHighNode,
std::numeric_limits<uint8_t>::max() - 1);
auto matmul = std::make_shared<ngraph::opset8::MatMul>(params[0], wFq, false, true);
ngraph::ResultVector results{std::make_shared<ngraph::opset8::Result>(matmul)};
function = std::make_shared<ngraph::Function>(results, params, "PerchannelQuantTest");
}
};
TEST_P(PerchannelQuantTest, CompareWithRefImpl) {
LoadNetwork();
GenerateInputs();
Infer();
auto results = GetOutputs();
size_t size = results.front()->size();
auto memory = InferenceEngine::as<InferenceEngine::MemoryBlob>(results.front());
IE_ASSERT(memory);
const auto lockedMemory = memory->wmap();
const auto actualBuffer = lockedMemory.as<const float*>();
// check that outputs haven't been zeroed out by a channel multiplier
for (size_t i = 0; i < size; ++i) {
if (actualBuffer[i] == 0.0) {
IE_THROW() << "Unexpected 0 output value";
}
}
};
const std::vector<InferenceEngine::Precision> netPrecisions = {
InferenceEngine::Precision::FP32,
InferenceEngine::Precision::FP16
};
const std::vector<std::map<std::string, std::string>> configs = {
{
{"GNA_DEVICE_MODE", "GNA_SW_EXACT"}
}
};
const std::vector<std::pair<float, float>> weightsValues = {
{-0.1, 0.1},
{-1.0, 1.0},
{-10.0, 10.0}
};
const std::vector<std::vector<size_t>> inputShapes = {
{1, 128},
{1, 38},
{1, 8}
};
INSTANTIATE_TEST_SUITE_P(smoke_base, PerchannelQuantTest,
::testing::Combine(
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_GNA),
::testing::ValuesIn(configs),
::testing::ValuesIn(weightsValues),
::testing::ValuesIn(inputShapes)),
PerchannelQuantTest::getTestCaseName);
} // namespace LayerTestsDefinitions