From 1501e29cfec7d21a219d9928d426fae612a599d6 Mon Sep 17 00:00:00 2001
From: Andrew Kwangwoong Park
Date: Thu, 3 Aug 2023 14:50:26 +0900
Subject: [PATCH] [GPU] Allow StridedSlice as predecessor for in place concat
 (#18836)

* Allow StridedSlice as predecessor for in place concat

* Enable padding support for strided slice

Signed-off-by: Andrew Park

* Add prepare_buffer_fusing TC for ov_gpu_unit_tests

---------

Signed-off-by: Andrew Park
---
 .../graph_optimizer/prepare_buffer_fusing.cpp |   4 +-
 .../src/graph/include/strided_slice_inst.h    |   3 +
 .../passes/prepare_buffer_fusing_test.cpp     |  87 ++++++++++++++
 .../test_cases/strided_slice_gpu_test.cpp     | 113 ++++++++++++++++++
 4 files changed, 206 insertions(+), 1 deletion(-)

diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp
index 964986147f7..fe9296db3cc 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp
@@ -12,6 +12,7 @@
 #include "depth_to_space_inst.h"
 #include "resample_inst.h"
 #include "loop_inst.h"
+#include "strided_slice_inst.h"
 #include "non_max_suppression_inst.h"
 #include "experimental_detectron_roi_feature_extractor_inst.hpp"
 #include "border_inst.h"
@@ -58,7 +59,8 @@ auto available_pred = [](const program_node& input) {
     if (!input.is_type<pooling>() && !input.is_type<convolution>() && !input.is_type<quantize>() &&
         !input.is_type<activation>() && !input.is_type<deconvolution>() && !input.is_type<concatenation>() &&
         !input.is_type<crop>() && !input.is_type<eltwise>() && !input.is_type<resample>() &&
-        !input.is_type<reorder>() && !(input.is_type<permute>() && !input.as<permute>().is_rotating_except_batch()))
+        !input.is_type<reorder>() && !(input.is_type<permute>() && !input.as<permute>().is_rotating_except_batch()) &&
+        !input.is_type<strided_slice>())
         return false;
     return true;
 };
diff --git a/src/plugins/intel_gpu/src/graph/include/strided_slice_inst.h b/src/plugins/intel_gpu/src/graph/include/strided_slice_inst.h
index dfcd6bb36aa..d51f7a3c783 100644
--- a/src/plugins/intel_gpu/src/graph/include/strided_slice_inst.h
+++ b/src/plugins/intel_gpu/src/graph/include/strided_slice_inst.h
@@ -15,6 +15,9 @@ namespace cldnn {
 template <>
 struct typed_program_node<strided_slice> : public typed_program_node_base<strided_slice> {
     using parent = typed_program_node_base<strided_slice>;
+    typed_program_node(const std::shared_ptr<strided_slice> prim, program& prog) : parent(prim, prog) {
+        support_padding_all(true);
+    }
 
 public:
     using parent::parent;
diff --git a/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp b/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp
index df15781c72b..eddfb396177 100644
--- a/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp
@@ -18,6 +18,7 @@
 #include "reorder_inst.h"
 #include "shape_of_inst.h"
 #include "gather_inst.h"
+#include "strided_slice_inst.h"
 #include "intel_gpu/graph/network.hpp"
 #include "pass_manager.h"
 #include "to_string_utils.h"
@@ -264,6 +265,92 @@ TEST(prepare_buffer_fusing, in_place_concat_dynamic) {
     }
 }
 
+TEST(prepare_buffer_fusing, in_place_concat_strided_slice_dyn) {
+    auto& engine = get_test_engine();
+    auto in_layout1_0 = layout{ ov::PartialShape::dynamic(4), data_types::f32, format::bfyx };
+    auto in_layout2_0 = layout{ ov::PartialShape::dynamic(4), data_types::f32, format::bfyx };
+    auto in_layout3_0 = layout{ ov::PartialShape::dynamic(4), data_types::f32, format::bfyx };
+    auto in_layout1 = layout{ ov::PartialShape{2, 2, 2, 2}, data_types::f32, format::bfyx };
+    auto in_layout2 = layout{ ov::PartialShape{2, 2, 2, 2}, data_types::f32, format::bfyx };
+    auto in_layout3 = layout{ ov::PartialShape{2, 2, 2, 2}, data_types::f32, format::bfyx };
+    auto begin = engine.allocate_memory({ ov::PartialShape{4}, data_types::i64, format::bfyx });
+    auto end = engine.allocate_memory({ ov::PartialShape{4}, data_types::i64, format::bfyx });
+    auto strides = engine.allocate_memory({ ov::PartialShape{4}, data_types::i64, format::bfyx });
+    set_values<int64_t>(begin, {0, 0, 0, 0});
+    set_values<int64_t>(end, {2, 2, 2, 2});
+    set_values<int64_t>(strides, {1, 1, 1, 1});
+
+    topology topology;
+    topology.add(input_layout("input1", in_layout1_0));
+    topology.add(input_layout("input2", in_layout2_0));
+    topology.add(input_layout("input3", in_layout3_0));
+    topology.add(data("input4", begin));
+    topology.add(data("input5", end));
+    topology.add(data("input6", strides));
+    topology.add(reorder("reorder1", input_info("input1"), format::bfyx, data_types::f16));
+    topology.add(reorder("reorder2", input_info("input2"), format::bfyx, data_types::f16));
+    topology.add(reorder("reorder3", input_info("input3"), format::bfyx, data_types::f16));
+    topology.add(eltwise("eltwise", { input_info("reorder1"), input_info("reorder2") }, eltwise_mode::prod));
+    topology.add(strided_slice("strided_slice", input_info("reorder3"), input_info("input4"),
+                               input_info("input5"), input_info("input6"), {}, {}, {}, {}, {}, {}));
+    topology.add(concatenation("concat", { input_info("eltwise"), input_info("strided_slice") }, 0));
+    topology.add(reorder("output", input_info("concat"), format::bfyx, data_types::f32));
+
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::optimize_data(true));
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+    auto prog = program::build_program(engine, topology, config, false, false);
+    ASSERT_NE(prog, nullptr);
+    cldnn::network net(prog, 0);
+
+    auto input_memory1 = engine.allocate_memory(in_layout1);
+    auto input_memory2 = engine.allocate_memory(in_layout2);
+    auto input_memory3 = engine.allocate_memory(in_layout3);
+    set_values(input_memory1, {
+        1.f, 0.f, 5.f, 1.f, 2.f, 0.f, 6.f, 3.f,
+        3.f, 0.5f, 7.f, 12.f, 4.f, -0.5f, 8.f, 7.5f
+    });
+    set_values(input_memory2, {
+        0.5f, 5.f, 15.f, 6.f, 0.5f, 2.f, 8.f, -0.5f,
+        2.5f, 7.f, 17.f, 8.f, 2.5f, 4.f, 10.f, -2.5f
+    });
+    set_values(input_memory3, {
+        0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f,
+        8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f
+    });
+
+    net.set_input_data("input1", input_memory1);
+    net.set_input_data("input2", input_memory2);
+    net.set_input_data("input3", input_memory3);
+
+    std::map<cldnn::primitive_id, cldnn::network_output> output;
+    EXPECT_NO_THROW(output = net.execute());
+
+    const auto& concat_node = net.get_primitive("concat")->get_node();
+    auto concat_mem = net.get_primitive("concat")->output_memory_ptr();
+    auto eltwise_mem = net.get_primitive("eltwise")->output_memory_ptr();
+    auto strided_slice_mem = net.get_primitive("strided_slice")->output_memory_ptr();
+
+    ASSERT_TRUE(concat_node.can_be_optimized());
+    ASSERT_EQ(concat_mem, eltwise_mem);
+    ASSERT_EQ(concat_mem, strided_slice_mem);
+
+    auto out_lay = net.get_output_layout("output");
+    auto out_mem = output.at("output").get_memory();
+    cldnn::mem_lock<float> output_ptr(out_mem, get_test_stream());
+
+    std::vector<float> ref_output = {
+        0.5f, 0.0f, 75.f, 6.0f, 1.0f, 0.0f, 48.f, -1.5f,
+        7.5f, 3.5f, 119.f, 96.0f, 10.0f, -2.0f, 80.f, -18.75f,
+        0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f,
+        8.f, 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f
+    };
+
+    for (size_t x = 0; x < out_lay.count(); ++x) {
+        ASSERT_EQ(ref_output[x], output_ptr[x]);
+    }
+}
+
 TEST(prepare_buffer_fusing, in_place_concat_dynamic_onednn_batch1) {
     auto& engine = get_test_engine();
     if (!engine.get_device_info().supports_immad)
diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/strided_slice_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/strided_slice_gpu_test.cpp
index 4555395228a..71e193b47ab 100644
--- a/src/plugins/intel_gpu/tests/unit/test_cases/strided_slice_gpu_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/strided_slice_gpu_test.cpp
@@ -118,6 +118,55 @@ public:
         }
     }
 
+    void test_2x2x2x2_full_pad(bool is_caching_test) {
+        // Input (BFYX): 2x2x2x2
+        // Begin (BFYX): 0x0x0x0
+        // End (BFYX): 2x2x2x2
+        // Stride (BFYX): 1x1x1x1
+        // Output (BFYX): 2x2x2x2
+
+        auto& engine = get_test_engine();
+        auto input = engine.allocate_memory({ ov::PartialShape{ 2, 2, 2, 2 }, data_types::f32, format::bfyx });
+
+        set_values(input, {
+            0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f,
+            8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f
+        });
+        std::vector<int64_t> begin_data = { 0, 0, 0, 0 };
+        std::vector<int64_t> end_data = { 2, 2, 2, 2 };
+        std::vector<int64_t> strides_data = { 1, 1, 1, 1 };
+
+        padding in_pad({0, 0, 1, 1}, {0, 0, 1, 1});
+        auto padded_layout = input->get_layout().with_padding(in_pad);
+
+        topology topology;
+        topology.add(input_layout("input", input->get_layout()));
+        topology.add(reorder("input_reorder", input_info("input"), padded_layout));
+        topology.add(strided_slice("strided_slice", input_info("input_reorder"), begin_data, end_data, strides_data, {}, {}, {}, {}, {}, {2, 2, 2, 2}));
+
+        cldnn::network::ptr network = get_network(engine, topology, get_test_default_config(engine), get_test_stream_ptr(), is_caching_test);
+
+        network->set_input_data("input", input);
+
+        auto outputs = network->execute();
+
+        ASSERT_EQ(outputs.size(), size_t(1));
+        ASSERT_EQ(outputs.begin()->first, "strided_slice");
+
+        auto output = outputs.at("strided_slice").get_memory();
+
+        std::vector<float> answers = {
+            0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f };
+
+        cldnn::mem_lock<float> output_ptr(output, get_test_stream());
+
+        ASSERT_EQ(output_ptr.size(), answers.size());
+        for (size_t i = 0; i < answers.size(); ++i)
+        {
+            ASSERT_TRUE(are_equal(answers[i], output_ptr[i]));
+        }
+    }
+
     void test_2x2x2x2_ignore(bool is_caching_test) {
         // Input (BFYX): 2x2x2x2
         // Begin (BFYX): 1x1x1x1
@@ -597,6 +646,54 @@ public:
         }
     }
 
+    void test_2x2x2x2_full_negative_stride_pad(bool is_caching_test) {
+        // Input (BFYX): 2x2x2x2
+        // Begin (BFYX): 0x0x0x0
+        // End (BFYX): 2x2x2x2
+        // Stride (BFYX): -1x-1x1x1
+        // Output (BFYX): 2x2x2x2
+
+        auto& engine = get_test_engine();
+        auto input = engine.allocate_memory({ ov::PartialShape{ 2, 2, 2, 2 }, data_types::f32, format::bfyx });
+
+        set_values(input, {
+            0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f,
+            8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f
+        });
+        std::vector<int64_t> begin_data = { 0, 0, 0, 0 };
+        std::vector<int64_t> end_data = { 2, 2, 2, 2 };
+        std::vector<int64_t> strides_data = { -1, -1, 1, 1 };
+
+        padding in_pad({0, 0, 1, 1}, {0, 0, 1, 1});
+        auto padded_layout = input->get_layout().with_padding(in_pad);
+
+        topology topology;
+        topology.add(input_layout("input", input->get_layout()));
+        topology.add(reorder("input_reorder", input_info("input"), padded_layout));
+        topology.add(strided_slice("strided_slice", input_info("input_reorder"), begin_data, end_data, strides_data, {}, {}, {}, {}, {}, {2, 2, 2, 2}));
+
+        cldnn::network::ptr network = get_network(engine, topology, get_test_default_config(engine), get_test_stream_ptr(), is_caching_test);
+
+        network->set_input_data("input", input);
+
+        auto outputs = network->execute();
+
+        ASSERT_EQ(outputs.size(), size_t(1));
+        ASSERT_EQ(outputs.begin()->first, "strided_slice");
+
+        auto output = outputs.at("strided_slice").get_memory();
+
+        std::vector<float> answers = {
+            12.f, 13.f, 14.f, 15.f, 8.f, 9.f, 10.f, 11.f, 4.f, 5.f, 6.f, 7.f, 0.f, 1.f, 2.f, 3.f };
+
+        cldnn::mem_lock<float> output_ptr(output, get_test_stream());
+
+        for (size_t i = 0; i < answers.size(); ++i)
+        {
+            ASSERT_TRUE(are_equal(answers[i], output_ptr[i]));
+        }
+    }
+
     void test_2x2x2x1x1_2_negative_all(bool is_caching_test) {
         // Input (BFZYX): 2x2x2x1x1
         // Output (BFZYX): 2x1x1x1x1
@@ -1902,6 +1999,10 @@ TEST_F(strided_slice_gpu_constants, test_2x2x2x2_full) {
     this->test_2x2x2x2_full(false);
 }
 
+TEST_F(strided_slice_gpu, test_2x2x2x2_full_pad) {
+    this->test_2x2x2x2_full_pad(false);
+}
+
 TEST_F(strided_slice_gpu, test_2x2x2x2_ignore) {
     this->test_2x2x2x2_ignore(false);
 }
@@ -1998,6 +2099,10 @@ TEST_F(strided_slice_gpu_constants, test_2x2x2x2_full_negative_stride) {
     this->test_2x2x2x2_full_negative_stride(false);
 }
 
+TEST_F(strided_slice_gpu, test_2x2x2x2_full_negative_stride_pad) {
+    this->test_2x2x2x2_full_negative_stride_pad(false);
+}
+
 TEST_F(strided_slice_gpu_constants, test_2x2x2x2_full_negative_stride_f_axis) {
     this->test_2x2x2x2_full_negative_stride_f_axis(false);
 }
@@ -2081,6 +2186,10 @@ TEST_F(strided_slice_gpu_constants, test_2x2x2x2_full_cached) {
     this->test_2x2x2x2_full(true);
 }
 
+TEST_F(strided_slice_gpu, test_2x2x2x2_full_pad_cached) {
+    this->test_2x2x2x2_full_pad(true);
+}
+
 TEST_F(strided_slice_gpu, test_2x2x2x2_ignore_cached) {
     this->test_2x2x2x2_ignore(true);
 }
@@ -2177,6 +2286,10 @@ TEST_F(strided_slice_gpu_constants, test_2x2x2x2_full_negative_stride_cached) {
     this->test_2x2x2x2_full_negative_stride(true);
 }
 
+TEST_F(strided_slice_gpu, test_2x2x2x2_full_negative_stride_pad_cached) {
+    this->test_2x2x2x2_full_negative_stride_pad(true);
+}
+
 TEST_F(strided_slice_gpu_constants, test_2x2x2x2_full_negative_stride_f_axis_cached) {
     this->test_2x2x2x2_full_negative_stride_f_axis(true);
}
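
Background (not part of the patch itself): the in-place concat optimization removes the concatenation node and lets every predecessor write straight into its slot of the shared output buffer, with the slot described as lower/upper padding along the concat axis. That is why strided_slice must now advertise padded-output support via support_padding_all(true) in its program_node constructor before it can pass available_pred. The self-contained sketch below only illustrates that padding arithmetic, using the 2x2x2x2 shapes from the new unit test; the struct and variable names are illustrative and do not come from the cldnn API.

    // Two 2x2x2x2 inputs concatenated over the batch axis into one shared 4x2x2x2 buffer.
    #include <cstddef>
    #include <iostream>
    #include <vector>

    struct slot {
        std::size_t lower_pad;  // batches written by earlier concat inputs
        std::size_t batches;    // batches owned by this input
        std::size_t upper_pad;  // batches written by later concat inputs
    };

    int main() {
        const std::size_t per_batch = 2 * 2 * 2;        // f * y * x elements per batch
        const std::vector<slot> slots = {{0, 2, 2},     // eltwise branch
                                         {2, 2, 0}};    // strided_slice branch
        for (const auto& s : slots) {
            const std::size_t offset = s.lower_pad * per_batch;
            const std::size_t total = (s.lower_pad + s.batches + s.upper_pad) * per_batch;
            std::cout << "writes " << s.batches * per_batch << " elements at offset "
                      << offset << " of a shared " << total << "-element buffer\n";
        }
        return 0;
    }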