SharedOpOptimization (#18622)

* SharedOpOptimization

* PR comments addressed

* Misprint

* Removed unnecessary mode_ref

* Fixed memcmp and modified tests accordingly

* Style

* Comment from Sergey resolved

* Optimize operations in groups

* Removed Transpose fusion per agreement with Dmitry

* Style

* Resolved unnecessary looping of the graph
This commit is contained in:
Evgenya Stepyreva 2023-07-25 15:00:44 +04:00 committed by GitHub
parent d3fdc761f6
commit 6be083d37e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 431 additions and 0 deletions

View File

@ -0,0 +1,27 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <openvino/pass/graph_rewrite.hpp>
#include <transformations_visibility.hpp>
namespace ov {
namespace pass {
class TRANSFORMATIONS_API SharedOpOptimization;
} // namespace pass
} // namespace ov
/**
 * @ingroup ie_transformation_common_api
 * @brief SharedOpOptimization optimizes operations which are
 * sourcing from the same Output<Node> and perform the same action on the same data
 * (e.g. several identical Slice ops reading one tensor are collapsed into one).
 */
class ov::pass::SharedOpOptimization : public ov::pass::ModelPass {
public:
    OPENVINO_RTTI("SharedOpOptimization", "0");
    /// \brief Runs the optimization on the model (including nested sub-graphs);
    ///        returns true if the model was modified.
    bool run_on_model(const std::shared_ptr<ov::Model>& m) override;
};

View File

@ -59,6 +59,7 @@
#include <transformations/common_optimizations/ric_fusion.hpp>
#include <transformations/common_optimizations/select_with_one_value_condition.hpp>
#include <transformations/common_optimizations/sequence_fusion.hpp>
#include <transformations/common_optimizations/shared_ops_optimization.hpp>
#include <transformations/common_optimizations/shuffle_channels_fusion.hpp>
#include <transformations/common_optimizations/simplify_shape_of_sub_graph.hpp>
#include <transformations/common_optimizations/softmax_fusion.hpp>
@ -243,6 +244,7 @@ bool ov::pass::MOCTransformations::run_on_model(const std::shared_ptr<ngraph::Fu
fq_fusions->set_name("ov::pass::FakeQuantizeFusions");
REGISTER_PASS(manager, ReverseInputChannelsFusion)
REGISTER_PASS(manager, AlignEltwiseInputRanks)
REGISTER_PASS(manager, SharedOpOptimization)
REGISTER_PASS(manager, ConstantFolding)
manager.run_passes(f);

View File

@ -0,0 +1,142 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <cstring>  // memcmp — previously pulled in only transitively

#include <openvino/core/validation_util.hpp>
#include <openvino/op/concat.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/gather_elements.hpp>
#include <openvino/op/reshape.hpp>
#include <openvino/op/slice.hpp>
#include <openvino/op/tile.hpp>
#include <openvino/op/util/sub_graph_base.hpp>
#include <transformations/common_optimizations/shared_ops_optimization.hpp>

#include "itt.hpp"
using namespace std;
using namespace ov;
using namespace ov::op;
// Merges equal consumer nodes that read the same Output<Node>.
// `rules` maps an operation type to a predicate deciding whether two sibling
// nodes of that type are interchangeable.  Recurses into sub-graph ops
// (If/Loop/TensorIterator).  Returns true if the model was modified.
bool shared_node_optimization(const shared_ptr<Model>& model,
                              const unordered_map<Node::type_info_t, bool (*)(const Node*, const Node*)>& rules) {
    bool rewritten = false;
    for (const auto& op : model->get_ordered_ops()) {
        // Recursively apply transformation for sub-graph based operations
        if (auto multi_subgraph_op = dynamic_pointer_cast<op::util::MultiSubGraphOp>(op)) {
            for (size_t i = 0; i < multi_subgraph_op->get_internal_subgraphs_size(); i++) {
                if (auto sub_graph = multi_subgraph_op->get_function(i))
                    rewritten |= shared_node_optimization(sub_graph, rules);
            }
        }
        for (auto& output : op->outputs()) {
            const auto& target_inputs = output.get_target_inputs();
            if (target_inputs.size() <= 1)
                continue;  // nothing to optimize
            // Bucket this output's consumers by operation type; only types
            // registered in the rules table are candidates for merging.
            unordered_map<Node::type_info_t, vector<Node*>> type_to_node;
            for (const auto& input : target_inputs) {
                auto node = input.get_node();
                if (node && rules.count(node->get_type_info()))
                    type_to_node[node->get_type_info()].push_back(node);
            }
            for (auto& item : type_to_node) {
                const auto& shared_nodes = item.second;
                if (shared_nodes.size() < 2)
                    continue;
                const auto& ops_type = item.first;
                const auto& are_equal = rules.at(ops_type);
                // Greedy pairwise merge: the first unvisited node of each
                // equivalence group survives; every later node proven equal is
                // rerouted to it.  replace_output_update_name also preserves
                // the friendly name of the replaced node where appropriate.
                std::vector<bool> visited_nodes(shared_nodes.size(), false);
                for (size_t i = 0; i < visited_nodes.size(); ++i) {
                    if (visited_nodes[i])
                        continue;
                    const auto& root_op = shared_nodes[i];
                    visited_nodes[i] = true;
                    for (size_t j = i + 1; j < visited_nodes.size(); ++j) {
                        if (visited_nodes[j])
                            continue;
                        const auto& child_op = shared_nodes[j];
                        if (are_equal(root_op, child_op)) {
                            rewritten |= replace_output_update_name(child_op->output(0), root_op->output(0));
                            visited_nodes[j] = true;
                        }
                    }
                }
            }
        }
    }
    return rewritten;
}
// Returns true if every input of lhs and rhs either comes from the very same
// producer Output<Node>, or both inputs are small Constants with identical
// element type, shape and data.  Used as the equality rule for attribute-less
// operations and as the common input check for attribute-aware rules.
bool inputs_from_same_source_or_equal_constants(const Node* lhs, const Node* rhs) {
    if (lhs->get_input_size() != rhs->get_input_size())
        return false;
    // Constants above this element count are never compared byte-wise: keeps
    // the pass cheap and avoids scanning big weight tensors.
    constexpr size_t max_constant_elements = 10;
    size_t input_size = lhs->get_input_size();
    for (size_t i = 0; i < input_size; ++i) {
        if (lhs->input_value(i) == rhs->input_value(i))
            continue;  // literally the same producer output
        auto lhs_constant = as_type_ptr<v0::Constant>(lhs->get_input_node_shared_ptr(i));
        auto rhs_constant = as_type_ptr<v0::Constant>(rhs->get_input_node_shared_ptr(i));
        if (!lhs_constant || !rhs_constant)
            return false;
        if (lhs_constant->get_element_type() != rhs_constant->get_element_type())
            return false;
        const auto& lhs_shape = lhs_constant->get_shape();
        if (lhs_shape != rhs_constant->get_shape() || shape_size(lhs_shape) > max_constant_elements)
            return false;
        // Equal element type and shape imply equal byte sizes, so the memcmp
        // bound is safe for both buffers.
        if (memcmp(lhs_constant->get_data_ptr(), rhs_constant->get_data_ptr(), lhs_constant->get_byte_size()) != 0)
            return false;
    }
    return true;
}
// Two Concat nodes are interchangeable when both really are Concats, their
// axes match, and their inputs come from the same sources / equal constants.
bool concats_are_equal(const Node* lhs, const Node* rhs) {
    const auto lhs_concat = as_type<const v0::Concat>(lhs);
    const auto rhs_concat = as_type<const v0::Concat>(rhs);
    if (lhs_concat == nullptr || rhs_concat == nullptr)
        return false;
    if (lhs_concat->get_axis() != rhs_concat->get_axis())
        return false;
    return inputs_from_same_source_or_equal_constants(lhs, rhs);
}
// Two GatherElements nodes are interchangeable when both casts succeed, the
// axes agree, and all inputs are shared or equal small constants.
bool gather_elements_are_equal(const Node* lhs, const Node* rhs) {
    const auto lhs_gather = as_type<const v6::GatherElements>(lhs);
    const auto rhs_gather = as_type<const v6::GatherElements>(rhs);
    if (lhs_gather == nullptr || rhs_gather == nullptr)
        return false;
    if (lhs_gather->get_axis() != rhs_gather->get_axis())
        return false;
    return inputs_from_same_source_or_equal_constants(lhs, rhs);
}
// Two Reshape nodes are interchangeable when both casts succeed, their
// special-zero flags agree, and all inputs are shared or equal small constants.
bool reshapes_are_equal(const Node* lhs, const Node* rhs) {
    const auto lhs_reshape = as_type<const v1::Reshape>(lhs);
    const auto rhs_reshape = as_type<const v1::Reshape>(rhs);
    if (lhs_reshape == nullptr || rhs_reshape == nullptr)
        return false;
    if (lhs_reshape->get_special_zero() != rhs_reshape->get_special_zero())
        return false;
    return inputs_from_same_source_or_equal_constants(lhs, rhs);
}
// Entry point of the pass: builds the per-type equality rules table and
// delegates the actual graph rewriting to shared_node_optimization.
// Returns true if the model (or any nested sub-graph) was modified.
bool pass::SharedOpOptimization::run_on_model(const shared_ptr<Model>& model) {
    RUN_ON_FUNCTION_SCOPE(SharedOpOptimization);
#define RECORD(operation, func) \
    { operation::get_type_info_static(), func }

    const unordered_map<Node::type_info_t, bool (*)(const Node*, const Node*)> rules = {
        // no attributes
        RECORD(v8::Slice, inputs_from_same_source_or_equal_constants),
        RECORD(v0::Tile, inputs_from_same_source_or_equal_constants),

        // with attributes
        RECORD(v0::Concat, concats_are_equal),
        RECORD(v6::GatherElements, gather_elements_are_equal),
        RECORD(v1::Reshape, reshapes_are_equal),
    };  // TODO: use visit_attributes to uniformly perform attributes check in the future and get rid of rules table
#undef RECORD  // keep the helper macro local to this function

    return shared_node_optimization(model, rules);
}

View File

@ -0,0 +1,260 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <gtest/gtest.h>
#include <transformations/common_optimizations/shared_ops_optimization.hpp>
#include "common_test_utils/ngraph_test_utils.hpp"
#include "openvino/op/concat.hpp"
#include "openvino/op/parameter.hpp"
#include "openvino/op/reshape.hpp"
#include "openvino/op/slice.hpp"
#include "openvino/op/tile.hpp"
using namespace ov;
using namespace ov::op;
// Test fixture for SharedOpOptimization.  On top of the graph comparison done
// by TransformationTestsF::TearDown, it checks that the transformed model and
// the reference model contain the same number of operations, so no test can
// pass while leaving redundant (or too few) nodes behind.
class SharedTransformationTestsF : public TransformationTestsF {
public:
    void TearDown() override {
        TransformationTestsF::TearDown();
        size_t op_count = model->get_ops().size(), op_count_ref = model_ref->get_ops().size();
        EXPECT_EQ(op_count, op_count_ref) << "Number of operations differ between models: model op count = " << op_count
                                          << " ref_model op count = " << op_count_ref;
    }

    // Builds a v8::Slice with one-element i64 constants for start/stop/step/axis.
    static Output<Node> make_slice(const Output<Node>& out,
                                   const int64_t& start,
                                   const int64_t& stop,
                                   const int64_t& step,
                                   const int64_t& axis) {
        return std::make_shared<v8::Slice>(out,
                                           v0::Constant::create(element::i64, Shape{1}, {start}),
                                           v0::Constant::create(element::i64, Shape{1}, {stop}),
                                           v0::Constant::create(element::i64, Shape{1}, {step}),
                                           v0::Constant::create(element::i64, Shape{1}, {axis}));
    }

    // Builds a v0::Tile with a constant repeats vector.
    static Output<Node> make_tile(const Output<Node>& out, const std::vector<int64_t>& repeats) {
        return std::make_shared<v0::Tile>(out, v0::Constant::create(element::i64, Shape{repeats.size()}, repeats));
    }

    // Builds a v1::Reshape (special_zero = true) with a constant target shape.
    static Output<Node> make_reshape(const Output<Node>& out, const std::vector<int64_t>& target_shape) {
        return std::make_shared<v1::Reshape>(out,
                                             v0::Constant::create(element::i64, Shape{target_shape.size()}, target_shape),
                                             true);
    }
};
// Four identical Slice ops over one source collapse into a single Slice; the
// one with a different `stop` value must survive untouched.
TEST_F(SharedTransformationTestsF, SharedSlice) {
    {
        auto input = std::make_shared<v0::Parameter>(element::f32, PartialShape{-1, -1, -1, -1});
        auto kept_slice = make_slice(input, 1, 2, 3, 3);
        auto duplicate_a = make_slice(input, 1, 2, 3, 3);
        auto unique_slice = make_slice(input, 1, 3, 3, 3);
        auto duplicate_b = make_slice(input, 1, 2, 3, 3);
        auto duplicate_c = make_slice(input, 1, 2, 3, 3);
        OutputVector branches{kept_slice, duplicate_a, unique_slice, duplicate_b, duplicate_c};
        auto concat = std::make_shared<v0::Concat>(branches, 0);
        model = std::make_shared<ov::Model>(OutputVector{concat}, ParameterVector{input});
        manager.register_pass<ov::pass::SharedOpOptimization>();
    }
    {
        auto input = std::make_shared<v0::Parameter>(element::f32, PartialShape{-1, -1, -1, -1});
        auto kept_slice = make_slice(input, 1, 2, 3, 3);
        auto unique_slice = make_slice(input, 1, 3, 3, 3);
        OutputVector branches{kept_slice, kept_slice, unique_slice, kept_slice, kept_slice};
        auto concat = std::make_shared<v0::Concat>(branches, 0);
        model_ref = std::make_shared<ov::Model>(OutputVector{concat}, ParameterVector{input});
    }
}
// Two-level sharing: first the duplicated Slices are merged, then the
// Tile/Reshape consumers hanging off the merged Slices become siblings of one
// output and are merged as well — exercising repeated application within a
// single pass run.  Only slice_2 (different stop) keeps its own consumers.
TEST_F(SharedTransformationTestsF, SharedRecursively) {
    {
        auto data = std::make_shared<v0::Parameter>(element::f32, PartialShape{-1, -1, -1, -1});
        auto slice_0 = make_slice(data, 1, 2, 3, 3);
        auto slice_1 = make_slice(data, 1, 2, 3, 3);  // equal to slice_0
        auto slice_2 = make_slice(data, 1, 3, 3, 3);  // differs in `stop`
        // Three identical Tile/Reshape pairs per slice.
        auto tile_0_0 = make_tile(slice_0, {1, 2, 3, 4});
        auto transpose_0_0 = make_reshape(slice_0, {0, 0, 0, -1});
        auto tile_0_1 = make_tile(slice_0, {1, 2, 3, 4});
        auto transpose_0_1 = make_reshape(slice_0, {0, 0, 0, -1});
        auto tile_0_2 = make_tile(slice_0, {1, 2, 3, 4});
        auto transpose_0_2 = make_reshape(slice_0, {0, 0, 0, -1});
        auto tile_1_0 = make_tile(slice_1, {1, 2, 3, 4});
        auto transpose_1_0 = make_reshape(slice_1, {0, 0, 0, -1});
        auto tile_1_1 = make_tile(slice_1, {1, 2, 3, 4});
        auto transpose_1_1 = make_reshape(slice_1, {0, 0, 0, -1});
        auto tile_1_2 = make_tile(slice_1, {1, 2, 3, 4});
        auto transpose_1_2 = make_reshape(slice_1, {0, 0, 0, -1});
        auto tile_2_0 = make_tile(slice_2, {1, 2, 3, 4});
        auto transpose_2_0 = make_reshape(slice_2, {0, 0, 0, -1});
        auto tile_2_1 = make_tile(slice_2, {1, 2, 3, 4});
        auto transpose_2_1 = make_reshape(slice_2, {0, 0, 0, -1});
        auto tile_2_2 = make_tile(slice_2, {1, 2, 3, 4});
        auto transpose_2_2 = make_reshape(slice_2, {0, 0, 0, -1});
        auto concat = std::make_shared<v0::Concat>(
            OutputVector{// source from slice 0
                         tile_0_0,
                         transpose_0_0,
                         tile_0_1,
                         transpose_0_1,
                         tile_0_2,
                         transpose_0_2,
                         // source from slice 1
                         tile_1_0,
                         transpose_1_0,
                         tile_1_1,
                         transpose_1_1,
                         tile_1_2,
                         transpose_1_2,
                         // source from slice 2
                         tile_2_0,
                         transpose_2_0,
                         tile_2_1,
                         transpose_2_1,
                         tile_2_2,
                         transpose_2_2},
            0);
        model = std::make_shared<ov::Model>(OutputVector{concat}, ParameterVector{data});
        manager.register_pass<ov::pass::SharedOpOptimization>();
    }
    {
        // Reference: one Slice per distinct configuration, one Tile and one
        // Reshape per surviving Slice; Concat arity stays the same.
        auto data = std::make_shared<v0::Parameter>(element::f32, PartialShape{-1, -1, -1, -1});
        auto slice_0 = make_slice(data, 1, 2, 3, 3);
        auto slice_2 = make_slice(data, 1, 3, 3, 3);
        auto tile_0_0 = make_tile(slice_0, {1, 2, 3, 4});
        auto transpose_0_0 = make_reshape(slice_0, {0, 0, 0, -1});
        auto tile_2_0 = make_tile(slice_2, {1, 2, 3, 4});
        auto transpose_2_0 = make_reshape(slice_2, {0, 0, 0, -1});
        auto concat = std::make_shared<v0::Concat>(
            OutputVector{// source from slice 0
                         tile_0_0,
                         transpose_0_0,
                         tile_0_0,
                         transpose_0_0,
                         tile_0_0,
                         transpose_0_0,
                         // source from slice 0
                         tile_0_0,
                         transpose_0_0,
                         tile_0_0,
                         transpose_0_0,
                         tile_0_0,
                         transpose_0_0,
                         // source from slice 2
                         tile_2_0,
                         transpose_2_0,
                         tile_2_1,
                         transpose_2_0,
                         tile_2_0,
                         transpose_2_0},
            0);
        model_ref = std::make_shared<ov::Model>(OutputVector{concat}, ParameterVector{data});
    }
}
// Two Concats reading distinct-but-equal Constants must be merged: the pass
// compares small constants element-wise, not only by identity.
TEST_F(SharedTransformationTestsF, SharedConcat) {
    {
        auto first_prefix = v0::Constant::create(element::f32, Shape{4}, std::vector<float>{3.14f, 42.f, 0.f, 14.f});
        auto second_prefix = v0::Constant::create(element::f32, Shape{4}, std::vector<float>{3.14f, 42.f, 0.f, 14.f});
        auto input = std::make_shared<v0::Parameter>(element::f32, PartialShape{-1});
        auto suffix = v0::Constant::create(element::f32, Shape{1}, std::vector<float>{3.14f});
        auto branch_0 = std::make_shared<v0::Concat>(OutputVector{first_prefix, input, suffix}, 0);
        auto branch_1 = std::make_shared<v0::Concat>(OutputVector{second_prefix, input, suffix}, 0);
        auto concat = std::make_shared<v0::Concat>(OutputVector{branch_0, branch_1}, 0);
        model = std::make_shared<ov::Model>(OutputVector{concat}, ParameterVector{input});
        manager.register_pass<ov::pass::SharedOpOptimization>();
    }
    {
        auto prefix = v0::Constant::create(element::f32, Shape{4}, std::vector<float>{3.14f, 42.f, 0.f, 14.f});
        auto input = std::make_shared<v0::Parameter>(element::f32, PartialShape{-1});
        auto suffix = v0::Constant::create(element::f32, Shape{1}, std::vector<float>{3.14f});
        auto branch = std::make_shared<v0::Concat>(OutputVector{prefix, input, suffix}, 0);
        auto concat = std::make_shared<v0::Concat>(OutputVector{branch, branch}, 0);
        model_ref = std::make_shared<ov::Model>(OutputVector{concat}, ParameterVector{input});
    }
}
// Three distinct Slice configurations, each instantiated three times: exactly
// one representative per configuration must survive, in original order.
TEST_F(SharedTransformationTestsF, SharedSliceInThreeGroups) {
    {
        auto input = std::make_shared<v0::Parameter>(element::f32, PartialShape::dynamic(10));
        auto group_a_0 = make_slice(input, 1, 2, 3, 4);
        auto group_b_0 = make_slice(input, 2, 3, 4, 5);
        auto group_c_0 = make_slice(input, 3, 4, 5, 6);
        auto group_a_1 = make_slice(input, 1, 2, 3, 4);
        auto group_b_1 = make_slice(input, 2, 3, 4, 5);
        auto group_c_1 = make_slice(input, 3, 4, 5, 6);
        auto group_a_2 = make_slice(input, 1, 2, 3, 4);
        auto group_b_2 = make_slice(input, 2, 3, 4, 5);
        auto group_c_2 = make_slice(input, 3, 4, 5, 6);
        OutputVector branches{group_a_0,
                              group_b_0,
                              group_c_0,
                              group_a_1,
                              group_b_1,
                              group_c_1,
                              group_a_2,
                              group_b_2,
                              group_c_2};
        auto concat = std::make_shared<v0::Concat>(branches, 0);
        model = std::make_shared<ov::Model>(OutputVector{concat}, ParameterVector{input});
        manager.register_pass<ov::pass::SharedOpOptimization>();
    }
    {
        auto input = std::make_shared<v0::Parameter>(element::f32, PartialShape::dynamic(10));
        auto group_a = make_slice(input, 1, 2, 3, 4);
        auto group_b = make_slice(input, 2, 3, 4, 5);
        auto group_c = make_slice(input, 3, 4, 5, 6);
        OutputVector branches{group_a,
                              group_b,
                              group_c,
                              group_a,
                              group_b,
                              group_c,
                              group_a,
                              group_b,
                              group_c};
        auto concat = std::make_shared<v0::Concat>(branches, 0);
        model_ref = std::make_shared<ov::Model>(OutputVector{concat}, ParameterVector{input});
    }
}
// Both equal Concats feed model Results directly; per the test name, the pass
// must not replace an op whose output is a Result.  No model_ref is set here,
// so the base fixture's default comparison applies (presumably against an
// untouched clone of the input model — behaviour inherited from
// TransformationTestsF).
TEST_F(SharedTransformationTestsF, SharedConcatCheckOpWithResultIsntReplaced) {
    {
        auto first_prefix = v0::Constant::create(element::f32, Shape{4}, std::vector<float>{3.14f, 42.f, 0.f, 14.f});
        auto second_prefix = v0::Constant::create(element::f32, Shape{4}, std::vector<float>{3.14f, 42.f, 0.f, 14.f});
        auto input = std::make_shared<v0::Parameter>(element::f32, PartialShape{-1});
        auto suffix = v0::Constant::create(element::f32, Shape{1}, std::vector<float>{3.14f});
        auto result_concat_0 = std::make_shared<v0::Concat>(OutputVector{first_prefix, input, suffix}, 0);
        auto result_concat_1 = std::make_shared<v0::Concat>(OutputVector{second_prefix, input, suffix}, 0);
        model = std::make_shared<ov::Model>(OutputVector{result_concat_0, result_concat_1}, ParameterVector{input});
        manager.register_pass<ov::pass::SharedOpOptimization>();
    }
}