[GPU] Optimize permute for acdb format (#15139)

* [GPU] Optimize permute for acdb format

Target subgraphs to be optimized out:
- input(bfyx) - permute(byxf) - conv
- conv(byxf) - permute(bfyx) - output
+ Fix test_device_mem_usage_estimation unit test failure.
Author: Jade Cho (committed by GitHub), 2023-01-31 17:32:57 +09:00
Parent: 1ae0b2796e
Commit: 06063201d5
4 changed files with 221 additions and 3 deletions
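
For context, a minimal sketch of the first target subgraph, written with the same cldnn topology helpers that appear in the new unit tests below; in_layout and the ids are illustrative, not part of the commit:

// Illustrative sketch only: the permute-conv pattern this commit optimizes out.
// in_layout: a static bfyx layout such as [b:1, f:416, y:416, x:3] (see the
// layout_optimizer comments below); construction elided.
topology t(
    input_layout("input", in_layout),
    permute("permute", input_info("input"), { 0, 3, 1, 2 }),  // bfyx -> byxf rotation
    convolution("conv_prim", input_info("permute"), { "weights" }, { "bias" },
                1, { 1, 1 }, { 0, 0 }, { 1, 1 }));
// After select_preferred_formats_for_onednn() runs, the permute is marked
// can_be_optimized(true) and executes as a zero-cost buffer reinterpretation.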

--- File 1 of 4 ---

@@ -35,6 +35,20 @@ public:
}
return true;
}
bool is_reverse_rotating_except_batch() const {
// Target transform: rotate the feature dim to the front so that it is taken as the second outermost axis
// ex) 0(b), 4(f), 1(x), 2(y), 3(z)
// ex) 0(b), 3(f), 1(x), 2(y)
auto& order = get_primitive()->permute_order;
if ((int32_t) order[1] != (int32_t) order.size() - 1) return false;
if ((int32_t) order[0] != 0) return false;
for (int32_t i = 2; i < (int32_t) order.size(); ++i) {
if ((int32_t)order[i] != i - 1) return false;
}
return true;
}
std::vector<size_t> get_shape_infer_dependencies() const override { return {}; }
};
@@ -53,6 +67,11 @@ public:
public:
typed_primitive_inst(network& network, permute_node const& node);
void update_output_memory() override;
private:
void on_execute() override;
void reuse_input();
};
using permute_inst = typed_primitive_inst<permute>;
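
As a sanity check on the new predicate, a standalone mirror of is_reverse_rotating_except_batch; the free function reverse_rotates_except_batch is hypothetical, not part of the commit:

#include <cassert>
#include <cstdint>
#include <vector>

// Batch stays at the front, the innermost input axis rotates up to position 1,
// and every remaining axis shifts down by one: {0, 3, 1, 2} or {0, 4, 1, 2, 3}.
static bool reverse_rotates_except_batch(const std::vector<uint16_t>& order) {
    if (order.size() < 2 || order[0] != 0) return false;
    if (static_cast<int32_t>(order[1]) != static_cast<int32_t>(order.size()) - 1) return false;
    for (int32_t i = 2; i < static_cast<int32_t>(order.size()); ++i)
        if (static_cast<int32_t>(order[i]) != i - 1) return false;
    return true;
}

int main() {
    assert(reverse_rotates_except_batch({0, 3, 1, 2}));      // 4D example from the comment
    assert(reverse_rotates_except_batch({0, 4, 1, 2, 3}));   // 5D example from the comment
    assert(!reverse_rotates_except_batch({0, 2, 3, 1}));     // forward rotation is rejected here
}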

--- File 2 of 4 ---

@@ -1872,7 +1872,28 @@ void layout_optimizer::select_preferred_formats_for_onednn(program_node& node, d
// WA: shallow convolution needs to set input format by bfyx.
// onednn recommended byxf for input format. It will insert reorder before shallow conv.
if (node.is_type<convolution>() && node.get_input_layouts()[0].feature() == 3) {
-    src_fmt = format::get_default_format(node.get_input_layouts()[0].get_rank(), false, false);
+    bool can_optimize_permute = false;
+    // In the permute-conv pattern, check whether the permute can be optimized out
+    // when the input memory of the permute is already aligned like byxf format.
+    // ex) pattern: input (bfyx) -> permute (byxf) -> oneDNN convolution
+    //     input layout of permute:  bfyx [b:1, f:416, y:416, x:3]
+    //     output layout of permute: byxf [b:1, f:3, y:416, x:416]
+    // In this case, it can be handled by changing only the shape of the permute, without executing its kernel.
+    if (node.get_output_layout().get_rank() == 4 && node.get_dependency(0).is_type<permute>()) {
+        auto& pnode = node.get_dependency(0).as<permute>();
+        can_optimize_permute = pnode.get_users().size() == 1 && pnode.get_dependencies().size() == 1
+            && !pnode.is_output() && pnode.get_dependency(0).get_output_layout().is_static()
+            && pnode.is_reverse_rotating_except_batch();
+    }
+    if (!can_optimize_permute) {
+        src_fmt = format::get_default_format(node.get_input_layouts()[0].get_rank(), false, false);
+    } else {
+        // The sizes of dependencies and users must each be 1 (checked above).
+        // In the permute-conv pattern, the preferred format of the permute should follow its previous node.
+        node.get_dependency(0).init_preferred_fmt(1, 1);
+        node.get_dependency(0).set_preferred_input_fmt(0, format::bfyx);
+        node.get_dependency(0).can_be_optimized(true);
+    }
}
node.set_preferred_input_fmt(idx, src_fmt);
@@ -1887,6 +1908,26 @@ void layout_optimizer::select_preferred_formats_for_onednn(program_node& node, d
}
}
// In the conv-permute pattern, set the output format of the conv to byxf so that the permute can be optimized out.
// ex) oneDNN convolution -> (byxf) -> permute -> (bfyx) -> output
//     output layout of convolution: byxf [b:1, f:128, y:2, x:2]
//     output layout of permute:     bfyx [b:1, f:2, y:2, x:128]
// In this case, it can be handled by changing only the shape of the permute, without executing its kernel.
if (node.get_output_layout().get_rank() == 4
&& node.get_users().size() == 1 && node.get_users().front()->is_type<permute>()) {
auto& pnode = node.get_users().front()->as<permute>();
auto can_optimize_permute = pnode.get_dependencies().size() == 1
&& !pnode.is_output() && pnode.get_dependency(0).get_output_layout().is_static()
&& pnode.is_rotating_except_batch();
if (can_optimize_permute) {
dst_fmt = format::byxf;
pnode.init_preferred_fmt(1, 1);
pnode.set_preferred_input_fmt(0, cldnn::format::byxf);
pnode.set_preferred_output_fmt(0, cldnn::format::bfyx);
pnode.can_be_optimized(true);
}
}
if (node.get_preferred_output_fmt() == format::any) {
for (size_t usr = 0; usr < std::max<size_t>(1, node.get_users().size()); usr++)
node.set_preferred_output_fmt(usr, dst_fmt);
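
Why both patterns reduce to a reinterpretation: with the layouts in the comments above, every element keeps the same linear offset, so only the layout descriptor needs to change. A self-contained arithmetic check (shapes from the permute-conv example; plain C++, no cldnn API):

#include <cassert>

int main() {
    // Input:  bfyx [b:1, f:416, y:416, x:3], offset = ((b*F + f)*Y + y)*X + x.
    // Output: byxf [b:1, f:3, y:416, x:416] after permute order {0, 3, 1, 2},
    //         i.e. out_f = in_x, out_y = in_f, out_x = in_y.
    const int F = 416, Y = 416, X = 3;
    for (int f = 0; f < F; ++f)
        for (int y = 0; y < Y; ++y)
            for (int x = 0; x < X; ++x) {
                int in_off = (f * Y + y) * X + x;          // bfyx offset, b = 0
                int of = x, oy = f, ox = y;                // permuted coordinates
                const int OF = X, OX = Y;                  // output sizes (OY = F unused for b = 0)
                int out_off = (oy * OX + ox) * OF + of;    // byxf physical order: b, y, x, f
                assert(in_off == out_off);                 // same byte: zero-copy permute
            }
    return 0;
}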

--- File 3 of 4 ---

@@ -38,7 +38,13 @@ layout permute_inst::calc_output_layout(permute_node const& node, kernel_impl_pa
input_layout.data_type = impl_param.get_fused_output_layout().data_type;
}
-    return layout(input_layout.data_type, input_layout.format, output_size, op);
+    // Adjust the output format so that the transpose related to acdb format can be optimized out.
+    auto out_fmt = input_layout.format;
+    if (node.get_preferred_output_fmt() != format::any) {
+        out_fmt = node.get_preferred_output_fmt();
+    }
+    return layout(input_layout.data_type, out_fmt, output_size, op);
}
template<typename ShapeType>
@@ -101,7 +107,8 @@ std::string permute_inst::to_string(permute_node const& node) {
return primitive_description.str();
}
-permute_inst::typed_primitive_inst(network& network, permute_node const& node) : parent(network, node) {
+permute_inst::typed_primitive_inst(network& network, permute_node const& node) :
+    parent(network, node, (!node.can_be_optimized() && node.get_output_layout().is_static()) ? true : false) {
auto permute_order = argument->permute_order;
auto required_order_values_size = static_cast<uint32_t>(permute_order.size());
@@ -110,5 +117,36 @@ permute_inst::typed_primitive_inst(network& network, permute_node const& node) :
if (!(std::find(permute_order.begin(), permute_order.end(), i) != permute_order.end()))
CLDNN_ERROR_MESSAGE(node.id(), "Permute order does not contain all of required values.");
}
if (node.can_be_optimized()) {
reuse_input();
}
}
void permute_inst::on_execute() {
if (can_be_optimized())
reuse_input();
}
void permute_inst::reuse_input() {
update_output_memory();
}
void permute_inst::update_output_memory() {
if (!can_be_optimized())
return;
if (_outputs.size() > 0 && static_cast<bool>(_outputs[0])
&& _network.get_engine().is_the_same_buffer(output_memory(), input_memory()))
return;
if (_node != nullptr)
build_deps();
_outputs = {_network.get_engine().reinterpret_buffer(input_memory(), _impl_params->get_output_layout())};
_mem_allocated = false;
}
} // namespace cldnn
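
The reuse path above republishes the input allocation under the output layout instead of running a kernel, and clears _mem_allocated so the memory pool does not count the aliased buffer twice (presumably what fixed the test_device_mem_usage_estimation failure mentioned in the commit message). A hypothetical post-condition in the style of the unit tests below:

// Hypothetical check (not in the commit): after an optimized-out permute
// executes, its output memory must alias its input memory.
ASSERT_TRUE(engine.is_the_same_buffer(inst.output_memory(), inst.input_memory()));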

--- File 4 of 4 ---

@@ -4054,4 +4054,124 @@ INSTANTIATE_TEST_SUITE_P(implicit_crop_concat_conv_fusings_gpu, implicit_crop_co
implicit_crop_concat_convolution_test_params{ CASE_CROP_FQ_CONCAT_1, 5, 9 },
}));
class PermuteOptimizingTestOnednn : public BaseFusingTest<convolution_test_params> {
public:
void execute(convolution_test_params& p) {
if (!engine.get_device_info().supports_immad)
return;
p.expected_fused_primitives = p.expected_fused_primitives_onednn;
cldnn::memory::ptr input_prim = get_mem(get_input_layout(p));
cfg_fused.set_property(ov::intel_gpu::queue_type(QueueTypes::in_order));
cfg_not_fused.set_property(ov::intel_gpu::queue_type(QueueTypes::in_order));
network network_not_fused(this->engine, this->topology_non_fused, cfg_not_fused);
network network_fused(this->engine, this->topology_fused, cfg_fused);
network_fused.set_input_data("input", input_prim);
network_not_fused.set_input_data("input", input_prim);
compare(network_not_fused, network_fused, p);
auto find_conv = [](primitive_info& p) -> bool {
if (p.original_id == "conv_prim")
return true;
return false;
};
auto pi_fused = network_fused.get_primitives_info();
auto info_fused = std::find_if(pi_fused.begin(), pi_fused.end(), find_conv);
if (info_fused != pi_fused.end())
std::cout << "kernel: " << info_fused->kernel_id << std::endl;
auto permute_prim = std::find_if(pi_fused.begin(), pi_fused.end(), [](primitive_info& p) -> bool {
if (p.original_id == "permute")
return true;
return false;
});
ASSERT_TRUE(permute_prim != pi_fused.end());
ASSERT_TRUE(permute_prim->kernel_id == "undef");
}
layout get_input_layout(convolution_test_params& p) {
auto pad = p.pad;
std::vector<int> pad_ = { 0, 0, static_cast<int>(pad[1]), static_cast<int>(pad[0]) };
return layout{ p.data_type, p.input_format, p.in_shape, padding{ pad_ } };
}
layout get_per_channel_layout(convolution_test_params& p) {
return layout{ p.default_type, p.default_format, tensor{1, p.out_shape.feature[0], 1, 1} };
}
layout get_prelu_slope_layout(convolution_test_params& p) {
return layout{ p.default_type, p.input_format, tensor{1, p.out_shape.feature[0], p.out_shape.spatial[0], 1} };
}
};
#define CASE_CONV_FP16_PERMUTE_1 { 1, 4, 3, 5 }, { 1, 30, 2, 3 }, { 1, 1, 3, 3 }, { 1, 1 }, { 0, 0 }, { 1, 1 }, 1, data_types::f16, format::bfyx, data_types::f16, format::bfyx, data_types::f16, format::bfyx
#define CASE_CONV_FP16_PERMUTE_2 { 1, 15, 4, 5 }, { 1, 30, 2, 3 }, { 1, 1, 3, 3 }, { 1, 1 }, { 0, 0 }, { 1, 1 }, 1, data_types::f16, format::bfyx, data_types::f16, format::bfyx, data_types::f16, format::bfyx
class conv_after_permute_optimizing : public PermuteOptimizingTestOnednn {};
TEST_P(conv_after_permute_optimizing, basic) {
if (!engine.get_device_info().supports_immad)
return;
auto p = GetParam();
auto weights_layout = cldnn::layout { p.weights_type, p.weights_format,
cldnn::tensor(batch(p.out_shape.feature[0]), feature(p.in_shape.spatial[0]),
spatial(p.kernel.spatial[0], p.kernel.spatial[1], p.kernel.spatial[2])) };
auto bias_layout = cldnn::layout{ p.default_type, format::bfyx, tensor{1, p.out_shape.feature[0], 1, 1} };
create_topologies(
input_layout("input", get_input_layout(p)),
data("weights", get_mem(weights_layout)),
data("bias", get_mem(bias_layout)),
permute("permute", input_info("input"), {0, 3, 1, 2}),
convolution("conv_prim", input_info("permute"), { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation),
activation("activation", input_info("conv_prim"), activation_func::abs),
reorder("reorder_bfyx", input_info("activation"), p.default_format, data_types::f32)
);
tolerance = default_tolerance(p.default_type);
execute(p);
}
INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_after_permute_optimizing, ::testing::ValuesIn(std::vector<convolution_test_params>{
convolution_test_params{ CASE_CONV_FP16_PERMUTE_1, 3, 2, 4 },
}));
class conv_before_permute_optimizing : public PermuteOptimizingTestOnednn {};
TEST_P(conv_before_permute_optimizing, basic) {
if (!engine.get_device_info().supports_immad)
return;
auto p = GetParam();
ov::intel_gpu::ImplementationDesc conv_impl = { cldnn::format::type::any, "", impl_types::onednn };
cfg_fused.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ { "conv_prim", conv_impl } }));
create_topologies(
input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
convolution("conv_prim", input_info("input"), { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation),
activation("activation", input_info("conv_prim"), activation_func::abs),
permute("permute", input_info("activation"), {0, 2, 3, 1}),
reorder("reorder_bfyx", input_info("permute"), p.default_format, data_types::f32)
);
tolerance = default_tolerance(p.default_type);
execute(p);
}
INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_before_permute_optimizing, ::testing::ValuesIn(std::vector<convolution_test_params>{
convolution_test_params{ CASE_CONV_FP16_PERMUTE_2, 3, 2, 4 },
}));
#endif // ENABLE_ONEDNN_FOR_GPU