[Snippets] Added broadcast support for FQ weights calculation (#15837)

2023-03-01 11:14:58 +04:00 · 2023-03-01 11:14:58 +04:00 · 7f83c2c72d
commit 7f83c2c72d
parent a7bb54da2d
4 changed files with 99 additions and 28 deletions
--- a/src/common/snippets/src/pass/collapse_subgraph.cpp
+++ b/src/common/snippets/src/pass/collapse_subgraph.cpp
@ -74,7 +74,9 @@ auto is_supported_op(const std::shared_ptr<const Node> &n) -> bool {
               is_type<opset1::Constant>(n->get_input_node_shared_ptr(1)) &&
               is_type<opset1::Constant>(n->get_input_node_shared_ptr(2)) &&
               is_type<opset1::Constant>(n->get_input_node_shared_ptr(3)) &&
-               is_type<opset1::Constant>(n->get_input_node_shared_ptr(4));
+               is_type<opset1::Constant>(n->get_input_node_shared_ptr(4)) &&
+               (fq->get_auto_broadcast() == ov::op::AutoBroadcastType::NUMPY ||
+                fq->get_auto_broadcast() == ov::op::AutoBroadcastType::NONE);
    };

    auto is_supported_ternary_eltwise_op = [](const std::shared_ptr<const Node> &n) -> bool {
--- a/src/common/snippets/src/pass/fq_decomposition.cpp
+++ b/src/common/snippets/src/pass/fq_decomposition.cpp
@ -12,6 +12,8 @@
 #include <ngraph/pattern/op/wrap_type.hpp>
 #include <ngraph/rt_info.hpp>
 #include <ngraph/pass/manager.hpp>
+#include <ngraph/runtime/reference/autobroadcast_binop.hpp>
+#include <ngraph/runtime/reference/broadcast.hpp>
 #include <numeric>

 namespace {
@ -68,6 +70,7 @@ ngraph::snippets::pass::FakeQuantizeDecomposition::FakeQuantizeDecomposition() {
        const Output<Node> output_low{fake_quantize_node->input_value(3)};
        const Output<Node> output_high{fake_quantize_node->input_value(4)};
        auto input_type = data.get_element_type();
+        auto broadcast_type = fake_quantize_node->get_auto_broadcast();

        std::vector<float> out_scales;
        std::vector<float> cl, ch, isc, ish, osc, osh;
@ -107,7 +110,7 @@ ngraph::snippets::pass::FakeQuantizeDecomposition::FakeQuantizeDecomposition() {
            PartialShape scale_shape = input_low.get_partial_shape();
            ngraph::PartialShape::broadcast_merge_into(scale_shape,
                                                       input_high.get_partial_shape(),
-                                                       ov::op::AutoBroadcastType::NUMPY);
+                                                       broadcast_type);
            const auto scales =
                std::make_shared<ngraph::opset1::Constant>(ngraph::element::f32, scale_shape.get_shape(), out_scales);
            decomp_ops.push_back(scales);
@ -205,48 +208,103 @@ bool ngraph::snippets::pass::FakeQuantizeDecomposition::getScalesAndShifts(
    if (!input_low_constant || !input_high_constant || !output_low_constant || !output_high_constant)
        return false;

+    const auto input_low_shape = input_low_constant->get_shape();
+    const auto input_high_shape = input_high_constant->get_shape();
+    const auto output_low_shape = output_low_constant->get_shape();
+    const auto output_high_shape = output_high_constant->get_shape();
+
    auto input_low = input_low_constant->cast_vector<float>();
    auto input_high = input_high_constant->cast_vector<float>();
    auto output_low = output_low_constant->cast_vector<float>();
    auto output_high = output_high_constant->cast_vector<float>();
    auto levels = fq_node->get_levels();
+    auto broadcast_type = fq_node->get_auto_broadcast();

-    const auto input_size = std::max(input_low.size(), input_high.size());
-    const auto output_size = std::max(output_low.size(), output_high.size());
+    // We have two ways for computations of scales and shifts to avoid model compilation time growth
+    // because common function "ngraph::runtime::reference::autobroadcast_binop()" is expensive:
+    //  - A usual case (weights with the same shapes or scalars) - optimal calculations without large broadcasting
+    //  - A rare case ("general broadcasting") - common computations using autobroadcast_binop() call with broadcasting support

-    cl = input_low;
-    ch = input_high;
-    isc.resize(input_size, 0);
-    ish.resize(input_size, 0);
-    osc.resize(output_size, 0);
-    osh.resize(output_size, 0);
+    // Calculations of input scales and shift:
+    //   - isc := (levels-1) / (ih - il)
+    //   - ish := -il * isc
+    if (input_low_shape == input_high_shape || shape_size(input_low_shape) == 1 || shape_size(input_high_shape) == 1) {
+        const auto input_size = std::max(input_low.size(), input_high.size());
+        isc.resize(input_size, 0);
+        ish.resize(input_size, 0);
+        for (size_t i = 0; i < input_size; i++) {
+            float il = input_low[input_low.size() == 1 ? 0 : i];
+            float ih = input_high[input_high.size() == 1 ? 0 : i];

-    for (size_t i = 0; i < input_size; i++) {
-        float il = input_low[input_low.size() == 1 ? 0 : i];
-        float ih = input_high[input_high.size() == 1 ? 0 : i];
-
-        isc[i] = (levels - 1) / (ih - il);
-        ish[i] = -il * isc[i];
+            isc[i] = (levels - 1) / (ih - il);
+            ish[i] = -il * isc[i];
+        }
+        cl = input_low;
+        ch = input_high;
+    } else {  // general broadcasting
+        PartialShape scale_pshape = input_low_constant->get_output_partial_shape(0);
+        PartialShape::broadcast_merge_into(scale_pshape, input_high_shape, broadcast_type);
+        const auto scale_shape = scale_pshape.get_shape();
+        const auto input_size = ngraph::shape_size(scale_shape);
+        isc.resize(input_size, 0);
+        ish.resize(input_size, 0);
+        ngraph::runtime::reference::autobroadcast_binop(input_high.data(), input_low.data(), isc.data(),
+                                                        input_high_shape, input_low_shape, broadcast_type,
+                                                        [levels](float x, float y) -> float { return (levels - 1) / (x - y); });
+        ngraph::runtime::reference::autobroadcast_binop(input_low.data(), isc.data(), ish.data(),
+                                                        input_low_shape, scale_shape, broadcast_type,
+                                                        [](float x, float y) -> float { return -x * y; });
+        auto broadcast = [](const std::vector<float>& original_data, std::vector<float>& out_data,
+                            const ov::Shape& original_shape, const ov::Shape& out_shape, size_t size) -> void {
+            out_data.resize(size, 0);
+            std::vector<size_t> broadcast_axes(out_shape.size() - original_shape.size());
+            std::iota(broadcast_axes.begin(), broadcast_axes.end(), 0);
+            ngraph::runtime::reference::broadcast(reinterpret_cast<const char*>(original_data.data()),
+                                                  reinterpret_cast<char*>(out_data.data()),
+                                                  original_shape,
+                                                  out_shape,
+                                                  broadcast_axes,
+                                                  sizeof(float));
+        };
+        broadcast(input_low, cl, input_low_shape, scale_shape, input_size);
+        broadcast(input_high, ch, input_high_shape, scale_shape, input_size);
    }

-    for (size_t i = 0; i < output_size; i++) {
-        float ol = output_low[output_low.size() == 1 ? 0 : i];
-        float oh = output_high[output_high.size() == 1 ? 0 : i];
+    // Calculations of output scales and shift:
+    //   - osc := (oh - ol) / (levels-1)
+    //   - osh := ol
+    if (output_low_shape == output_high_shape || shape_size(output_low_shape) == 1 || shape_size(output_high_shape) == 1) {
+        const auto output_size = std::max(output_low.size(), output_high.size());
+        osc.resize(output_size, 0);
+        osh.resize(output_size, 0);
+        for (size_t i = 0; i < output_size; i++) {
+            float ol = output_low[output_low.size() == 1 ? 0 : i];
+            float oh = output_high[output_high.size() == 1 ? 0 : i];

-        osc[i] = (oh - ol) / (levels - 1);
-        osh[i] = ol;
+            osc[i] = (oh - ol) / (levels - 1);
+            osh[i] = ol;
+        }
+    } else {  // general broadcasting
+        PartialShape scale_pshape = output_low_constant->get_output_partial_shape(0);
+        PartialShape::broadcast_merge_into(scale_pshape, output_high_constant->get_output_partial_shape(0), broadcast_type);
+        const auto output_size = ngraph::shape_size(scale_pshape.get_shape());
+        osc.resize(output_size, 0);
+        ngraph::runtime::reference::autobroadcast_binop(output_high.data(), output_low.data(), osc.data(),
+                                                        output_high_shape, output_low_shape, broadcast_type,
+                                                        [levels](float x, float y) -> float { return (x - y) / (levels - 1); });
+        osh = output_low;
    }

    return true;
 }

 std::vector<float> ngraph::snippets::pass::FakeQuantizeDecomposition::calculateScales(const ngraph::element::Type& out_type,
-                                                                            const std::vector<float>& cl,
-                                                                            const std::vector<float>& ch,
-                                                                            const std::vector<float>& isc,
-                                                                            const std::vector<float>& ish,
-                                                                            const std::vector<float>& osc,
-                                                                            const std::vector<float>& osh) {
+                                                                                      const std::vector<float>& cl,
+                                                                                      const std::vector<float>& ch,
+                                                                                      const std::vector<float>& isc,
+                                                                                      const std::vector<float>& ish,
+                                                                                      const std::vector<float>& osc,
+                                                                                      const std::vector<float>& osh) {
    std::vector<float> out_scales;
    if (out_type == ngraph::element::u8 &&
        std::all_of(cl.cbegin(),
--- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp
+++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp
@ -190,7 +190,10 @@ std::vector<std::string> disabledTestPatterns() {
        R"(nightly_dynamic_bf16/RNNSequenceCPUTest.*activations=\(relu\).*)",
        R"(smoke_dynamic_BatchSizeOne/RNNSequenceCPUTest.*IS=\(\[1\.\?\.10\]_\[1\.1\.10\]_\[\?\]_\)_TS=\{\(1\.2\.10\)_\(1\.1\.10\)_\(1\)\}_\{\(1\.4\.10\)_\(1\.1\.10\)_\(1\)\}_\{\(1\.8\.10\)_\(1\.1\.10\)_\(1\)\}_seqMode=PURE_SEQ_activations=\(relu\)_clip=0_direction=forward_netPrec=f32__inFmts=ncw\.ntc_outFmts=ncw\.ncw_primitive=ref_any)", // NOLINT
        // 98151. Not valid sorting for slices in reference.
-        R"(.*UniqueLayerTestCPU.*axis.*True.*)"
+        R"(.*UniqueLayerTestCPU.*axis.*True.*)",
+        // Issue: 104402. Incorrect broadcasting in FQ reference implentation
+        R"(.*smoke_FakeQuantizeLayerCPUTest_Decompos.*IS=\[4\.5\.6\.6\]_TS=\(\(4\.5\.6\.6\)\)_RS=\(\(1\.1\.6\.6\)\)_\(\(1\.1\.6\.6\)\)_\(\(1\.5\.6\.1\)\)_\(\(1\.5\.1\.6\)\).*)",
+        R"(.*smoke_FakeQuantizeLayerCPUTest_Decompos.*IS=\[4\.5\.6\.6\]_TS=\(\(4\.5\.6\.6\)\)_RS=\(\(1\.5\.6\.1\)\)_\(\(1\.5\.6\.1\)\)_\(\(1\.5\.6\.1\)\)_\(\(1\.5\.1\.6\)\).*)",
    };

 #define FIX_62820 0
--- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/fake_quantize.cpp
+++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/fake_quantize.cpp
@ -332,6 +332,14 @@ std::vector<inputShapes> decomposeShapes = {
        InputShape{{4, 5, 6, 7}, {{4, 5, 6, 7}}},
        {{1, 1, 6, 1}, {1, 5, 6, 7}, {1, 1, 6, 1}, {1, 1, 6, 1}}
    },
+    inputShapes{
+        InputShape{{4, 5, 6, 6}, {{4, 5, 6, 6}}},
+        {{1, 1, 6, 6}, {1, 1, 6, 6}, {1, 5, 6, 1}, {1, 5, 1, 6}}
+    },
+    inputShapes{
+        InputShape{{4, 5, 6, 6}, {{4, 5, 6, 6}}},
+        {{1, 5, 6, 1}, {1, 5, 6, 1}, {1, 5, 6, 1}, {1, 5, 1, 6}}
+    },
    inputShapes{
        InputShape{{3, 4, 5, 6, 7}, {{3, 4, 5, 6, 7}}},
        {{4, 5, 6, 7}, {4, 5, 6, 7}, {4, 5, 6, 7}, {4, 5, 6, 7}}