[GPU] Allow StridedSlice as predecessor for in place concat (#18836)

* Allow StridedSlice as predecessor for in place concat

* Enable padding support for strided slice

Signed-off-by: Andrew Park <andrew.park@intel.com>

* Add prepare_buffer_fusing TC for ov_gpu_unit_tests

---------

Signed-off-by: Andrew Park <andrew.park@intel.com>
Andrew Kwangwoong Park 2023-08-03 14:50:26 +09:00, committed by GitHub
parent 8846b5ddd0
commit 1501e29cfe
4 changed files with 206 additions and 1 deletion


@@ -12,6 +12,7 @@
#include "depth_to_space_inst.h"
#include "resample_inst.h"
#include "loop_inst.h"
#include "strided_slice_inst.h"
#include "non_max_suppression_inst.h"
#include "experimental_detectron_roi_feature_extractor_inst.hpp"
#include "border_inst.h"
@@ -58,7 +59,8 @@ auto available_pred = [](const program_node& input) {
if (!input.is_type<pooling>() && !input.is_type<convolution>() && !input.is_type<quantize>() &&
!input.is_type<activation>() && !input.is_type<deconvolution>() && !input.is_type<concatenation>() &&
!input.is_type<crop>() && !input.is_type<eltwise>() && !input.is_type<resample>() &&
- !input.is_type<reorder>() && !(input.is_type<permute>() && !input.as<permute>().is_rotating_except_batch()))
+ !input.is_type<reorder>() && !(input.is_type<permute>() && !input.as<permute>().is_rotating_except_batch()) &&
+ !input.is_type<strided_slice>())
return false;
return true;
};
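For readers outside the codebase: available_pred is the whitelist that prepare_buffer_fusing consults before folding a concatenation into its producers' output buffers, and this commit adds strided_slice to it. A self-contained sketch of the idea (abridged whitelist, stand-in types rather than the real cldnn classes; the real code also carries an extra rotation condition for permute):

#include <vector>

// Illustrative stand-ins, not cldnn types.
enum class op_kind { pooling, convolution, eltwise, reorder, strided_slice, other };

struct node {
    op_kind kind;
    std::vector<const node*> predecessors;
};

bool available_pred(const node& n) {
    switch (n.kind) {
    case op_kind::pooling: case op_kind::convolution:
    case op_kind::eltwise: case op_kind::reorder:
    case op_kind::strided_slice:   // newly whitelisted by this commit
        return true;
    default:
        return false;              // anything else blocks the optimization
    }
}

// A concat can only be done in place if every producer passes the whitelist.
bool concat_can_be_in_place(const node& concat) {
    for (const node* pred : concat.predecessors)
        if (!available_pred(*pred))
            return false;
    return true;
}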


@@ -15,6 +15,9 @@ namespace cldnn {
template <>
struct typed_program_node<strided_slice> : public typed_program_node_base<strided_slice> {
using parent = typed_program_node_base<strided_slice>;
typed_program_node(const std::shared_ptr<strided_slice> prim, program& prog) : parent(prim, prog) {
support_padding_all(true);
}
public:
using parent::parent;
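The added constructor is the "enable padding support" half of the commit: support_padding_all(true) declares that strided_slice tolerates padded layouts on every axis, which matters because an in-place concat presents each producer's output as a padded sub-region of the shared buffer. A minimal sketch of padded bfyx addressing, with an illustrative padded_dim type rather than the real cldnn layout/padding API:

#include <cstddef>

struct padded_dim { std::size_t size, pad_lo, pad_hi; };

// d[0]=b, d[1]=f, d[2]=y, d[3]=x; x is the innermost axis in bfyx.
// Each pitch spans the full padded extent of the inner axes.
std::size_t linear_offset(const padded_dim d[4],
                          std::size_t b, std::size_t f,
                          std::size_t y, std::size_t x) {
    std::size_t x_pitch = 1;
    std::size_t y_pitch = d[3].pad_lo + d[3].size + d[3].pad_hi;
    std::size_t f_pitch = y_pitch * (d[2].pad_lo + d[2].size + d[2].pad_hi);
    std::size_t b_pitch = f_pitch * (d[1].pad_lo + d[1].size + d[1].pad_hi);
    return (b + d[0].pad_lo) * b_pitch + (f + d[1].pad_lo) * f_pitch +
           (y + d[2].pad_lo) * y_pitch + (x + d[3].pad_lo) * x_pitch;
}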


@@ -18,6 +18,7 @@
#include "reorder_inst.h"
#include "shape_of_inst.h"
#include "gather_inst.h"
#include "strided_slice_inst.h"
#include "intel_gpu/graph/network.hpp"
#include "pass_manager.h"
#include "to_string_utils.h"
@@ -264,6 +265,92 @@ TEST(prepare_buffer_fusing, in_place_concat_dynamic) {
}
}
TEST(prepare_buffer_fusing, in_place_concat_strided_slice_dyn) {
auto& engine = get_test_engine();
auto in_layout1_0 = layout{ ov::PartialShape::dynamic(4), data_types::f32, format::bfyx };
auto in_layout2_0 = layout{ ov::PartialShape::dynamic(4), data_types::f32, format::bfyx };
auto in_layout3_0 = layout{ ov::PartialShape::dynamic(4), data_types::f32, format::bfyx };
auto in_layout1 = layout{ ov::PartialShape{2, 2, 2, 2}, data_types::f32, format::bfyx };
auto in_layout2 = layout{ ov::PartialShape{2, 2, 2, 2}, data_types::f32, format::bfyx };
auto in_layout3 = layout{ ov::PartialShape{2, 2, 2, 2}, data_types::f32, format::bfyx };
auto begin = engine.allocate_memory({ ov::PartialShape{4}, data_types::i64, format::bfyx });
auto end = engine.allocate_memory({ ov::PartialShape{4}, data_types::i64, format::bfyx });
auto strides = engine.allocate_memory({ ov::PartialShape{4}, data_types::i64, format::bfyx });
set_values<int64_t>(begin, {0, 0, 0, 0});
set_values<int64_t>(end, {2, 2, 2, 2 });
set_values<int64_t>(strides, {1, 1, 1, 1});
topology topology;
topology.add(input_layout("input1", in_layout1_0));
topology.add(input_layout("input2", in_layout2_0));
topology.add(input_layout("input3", in_layout3_0));
topology.add(data("input4", begin));
topology.add(data("input5", end));
topology.add(data("input6", strides));
topology.add(reorder("reorder1", input_info("input1"), format::bfyx, data_types::f16));
topology.add(reorder("reorder2", input_info("input2"), format::bfyx, data_types::f16));
topology.add(reorder("reorder3", input_info("input3"), format::bfyx, data_types::f16));
topology.add(eltwise("eltwise", { input_info("reorder1"), input_info("reorder2") }, eltwise_mode::prod));
topology.add(strided_slice("strided_slice", input_info("reorder3"), input_info("input4"),
input_info("input5"), input_info("input6"), {}, {}, {}, {}, {}, {}));
topology.add(concatenation("concat", { input_info("eltwise"), input_info("strided_slice") }, 0));
topology.add(reorder("output", input_info("concat"), format::bfyx, data_types::f32));
ExecutionConfig config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::optimize_data(true));
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
auto prog = program::build_program(engine, topology, config, false, false);
ASSERT_NE(prog, nullptr);
cldnn::network net(prog, 0);
auto input_memory1 = engine.allocate_memory(in_layout1);
auto input_memory2 = engine.allocate_memory(in_layout2);
auto input_memory3 = engine.allocate_memory(in_layout3);
set_values<float>(input_memory1, {
1.f, 0.f, 5.f, 1.f, 2.f, 0.f, 6.f, 3.f,
3.f, 0.5f, 7.f, 12.f, 4.f, -0.5f, 8.f, 7.5f
});
set_values<float>(input_memory2, {
0.5f, 5.f, 15.f, 6.f, 0.5f, 2.f, 8.f, -0.5f,
2.5f, 7.f, 17.f, 8.f, 2.5f, 4.f, 10.f, -2.5f
});
set_values<float>(input_memory3, {
0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f,
8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f
});
net.set_input_data("input1", input_memory1);
net.set_input_data("input2", input_memory2);
net.set_input_data("input3", input_memory3);
std::map<cldnn::primitive_id, cldnn::network_output> output;
EXPECT_NO_THROW(output = net.execute());
const auto& concat_node = net.get_primitive("concat")->get_node();
auto concat_mem = net.get_primitive("concat")->output_memory_ptr();
auto eltwise_mem = net.get_primitive("eltwise")->output_memory_ptr();
auto strided_slice_mem = net.get_primitive("strided_slice")->output_memory_ptr();
ASSERT_TRUE(concat_node.can_be_optimized());
ASSERT_EQ(concat_mem, eltwise_mem);
ASSERT_EQ(concat_mem, strided_slice_mem);
auto out_lay = net.get_output_layout("output");
auto out_mem = output.at("output").get_memory();
cldnn::mem_lock<float> output_ptr(out_mem, get_test_stream());
std::vector<float> ref_output = {
0.5f, 0.0f, 75.f, 6.0f, 1.0f, 0.0f, 48.f, -1.5f,
7.5f, 3.5f, 119.f, 96.0f, 10.0f, -2.0f, 80.f, -18.75f,
0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f,
9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f
};
for (size_t x = 0; x < out_lay.count(); ++x) {
ASSERT_EQ(ref_output[x], output_ptr[x]);
}
}
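The expected output is easy to reproduce by hand: the concat's first half is the element-wise product of input_memory1 and input_memory2 (the network computes it in f16, but every product here is exactly representable), and the second half is input_memory3 unchanged, since the slice spans the whole tensor with stride 1. A standalone check:

#include <cstdio>
#include <vector>

int main() {
    std::vector<float> a = {1, 0, 5, 1, 2, 0, 6, 3, 3, 0.5f, 7, 12, 4, -0.5f, 8, 7.5f};
    std::vector<float> b = {0.5f, 5, 15, 6, 0.5f, 2, 8, -0.5f, 2.5f, 7, 17, 8, 2.5f, 4, 10, -2.5f};
    for (int i = 0; i < 16; ++i)
        std::printf("%g ", a[i] * b[i]);   // 0.5 0 75 6 1 0 48 -1.5 7.5 3.5 119 96 10 -2 80 -18.75
    for (int i = 0; i < 16; ++i)
        std::printf("%d ", i);             // 0 1 2 ... 15 (full-tensor slice, stride 1)
    std::printf("\n");
}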
TEST(prepare_buffer_fusing, in_place_concat_dynamic_onednn_batch1) {
auto& engine = get_test_engine();
if (!engine.get_device_info().supports_immad)


@@ -118,6 +118,55 @@ public:
}
}
void test_2x2x2x2_full_pad(bool is_caching_test) {
// Input (BFYX): 2x2x2x2
// Begin (BFYX): 0x0x0x0
// End (BFYX): 2x2x2x2
// Stride (BFYX): 1x1x1x1
// Output (BFYX): 2x2x2x2
auto& engine = get_test_engine();
auto input = engine.allocate_memory({ ov::PartialShape{ 2, 2, 2, 2 }, data_types::f32, format::bfyx });
set_values(input, {
0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f,
9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f
});
std::vector<int64_t> begin_data = { 0, 0, 0, 0 };
std::vector<int64_t> end_data = { 2, 2, 2, 2 };
std::vector<int64_t> strides_data = { 1, 1, 1, 1 };
padding in_pad({0, 0, 1, 1}, {0, 0, 1, 1});
auto padded_layout = input->get_layout().with_padding(in_pad);
topology topology;
topology.add(input_layout("input", input->get_layout()));
topology.add(reorder("input_reorder", input_info("input"), padded_layout));
topology.add(strided_slice("strided_slice", input_info("input_reorder"), begin_data, end_data, strides_data, {}, {}, {}, {}, {}, {2, 2, 2, 2}));
cldnn::network::ptr network = get_network(engine, topology, get_test_default_config(engine), get_test_stream_ptr(), is_caching_test);
network->set_input_data("input", input);
auto outputs = network->execute();
ASSERT_EQ(outputs.size(), size_t(1));
ASSERT_EQ(outputs.begin()->first, "strided_slice");
auto output = outputs.at("strided_slice").get_memory();
std::vector<float> answers = {
0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f };
cldnn::mem_lock<float> output_ptr(output, get_test_stream());
ASSERT_EQ(output_ptr.size(), answers.size());
for (size_t i = 0; i < answers.size(); ++i)
{
ASSERT_TRUE(are_equal(answers[i], output_ptr[i]));
}
}
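Here the reorder gives the slice's input real physical padding of one element on each side of y and x, so the kernel has to step through padded pitches rather than assume a dense buffer. The size of the padded allocation makes the point:

#include <cstdio>

int main() {
    const int b = 2, f = 2, y = 2, x = 2;
    const int pad_y = 1, pad_x = 1;        // lower == upper pad in this test
    const int phys_y = y + 2 * pad_y;
    const int phys_x = x + 2 * pad_x;
    std::printf("physical slots: %d, real values: %d\n",
                b * f * phys_y * phys_x, b * f * y * x);   // 64 vs 16
}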
void test_2x2x2x2_ignore(bool is_caching_test) {
// Input (BFYX): 2x2x2x2
// Begin (BFYX): 1x1x1x1
@@ -597,6 +646,54 @@ public:
}
}
void test_2x2x2x2_full_negative_stride_pad(bool is_caching_test) {
// Input (BFYX): 2x2x2x2
// Begin (BFYX): 0x0x0x0
// End (BFYX): 2x2x2x2
// Stride (BFYX): -1x-1x1x1
// Output (BFYX): 2x2x2x2
auto& engine = get_test_engine();
auto input = engine.allocate_memory({ ov::PartialShape{ 2, 2, 2, 2 }, data_types::f32, format::bfyx });
set_values(input, {
0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f,
9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f
});
std::vector<int64_t> begin_data = { 0, 0, 0, 0 };
std::vector<int64_t> end_data = { 2, 2, 2, 2 };
std::vector<int64_t> strides_data = { -1, -1, 1, 1 };
padding in_pad({0, 0, 1, 1}, {0, 0, 1, 1});
auto padded_layout = input->get_layout().with_padding(in_pad);
topology topology;
topology.add(input_layout("input", input->get_layout()));
topology.add(reorder("input_reorder", input_info("input"), padded_layout));
topology.add(strided_slice("strided_slice", input_info("input_reorder"), begin_data, end_data, strides_data, {}, {}, {}, {}, {}, {2, 2, 2, 2}));
cldnn::network::ptr network = get_network(engine, topology, get_test_default_config(engine), get_test_stream_ptr(), is_caching_test);
network->set_input_data("input", input);
auto outputs = network->execute();
ASSERT_EQ(outputs.size(), size_t(1));
ASSERT_EQ(outputs.begin()->first, "strided_slice");
auto output = outputs.at("strided_slice").get_memory();
std::vector<float> answers = {
12.f, 13.f, 14.f, 15.f, 8.f, 9.f, 10.f, 11.f, 4.f, 5.f, 6.f, 7.f, 0.f, 1.f, 2.f, 3.f };
cldnn::mem_lock<float> output_ptr(output, get_test_stream());
for (size_t i = 0; i < answers.size(); ++i)
{
ASSERT_TRUE(are_equal(answers[i], output_ptr[i]));
}
}
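The expected values follow directly from the stride semantics: a stride of -1 on b and f walks those axes from the last index back to the first, so the output is the input with its two outer axes reversed. A standalone reference computation, independent of the GPU kernel:

#include <cstdio>
#include <vector>

int main() {
    const int B = 2, F = 2, Y = 2, X = 2;
    std::vector<float> in(B * F * Y * X);
    for (int i = 0; i < B * F * Y * X; ++i)
        in[i] = static_cast<float>(i);
    for (int b = 0; b < B; ++b)
        for (int f = 0; f < F; ++f)
            for (int y = 0; y < Y; ++y)
                for (int x = 0; x < X; ++x) {
                    // negative stride: start from the last index of the axis
                    int sb = B - 1 - b, sf = F - 1 - f;
                    std::printf("%g ", in[((sb * F + sf) * Y + y) * X + x]);
                }
    std::printf("\n");   // 12 13 14 15 8 9 10 11 4 5 6 7 0 1 2 3
}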
void test_2x2x2x1x1_2_negative_all(bool is_caching_test) {
// Input (BFZYX): 2x2x2x1x1
// Output (BFZYX): 2x1x1x1x1
@@ -1902,6 +1999,10 @@ TEST_F(strided_slice_gpu_constants, test_2x2x2x2_full) {
this->test_2x2x2x2_full(false);
}
TEST_F(strided_slice_gpu, test_2x2x2x2_full_pad) {
this->test_2x2x2x2_full_pad(false);
}
TEST_F(strided_slice_gpu, test_2x2x2x2_ignore) {
this->test_2x2x2x2_ignore(false);
}
@@ -1998,6 +2099,10 @@ TEST_F(strided_slice_gpu_constants, test_2x2x2x2_full_negative_stride) {
this->test_2x2x2x2_full_negative_stride(false);
}
TEST_F(strided_slice_gpu, test_2x2x2x2_full_negative_stride_pad) {
this->test_2x2x2x2_full_negative_stride_pad(false);
}
TEST_F(strided_slice_gpu_constants, test_2x2x2x2_full_negative_stride_f_axis) {
this->test_2x2x2x2_full_negative_stride_f_axis(false);
}
@@ -2081,6 +2186,10 @@ TEST_F(strided_slice_gpu_constants, test_2x2x2x2_full_cached) {
this->test_2x2x2x2_full(true);
}
TEST_F(strided_slice_gpu, test_2x2x2x2_full_pad_cached) {
this->test_2x2x2x2_full_pad(true);
}
TEST_F(strided_slice_gpu, test_2x2x2x2_ignore_cached) {
this->test_2x2x2x2_ignore(true);
}
@@ -2177,6 +2286,10 @@ TEST_F(strided_slice_gpu_constants, test_2x2x2x2_full_negative_stride_cached) {
this->test_2x2x2x2_full_negative_stride(true);
}
TEST_F(strided_slice_gpu, test_2x2x2x2_full_negative_stride_pad_cached) {
this->test_2x2x2x2_full_negative_stride_pad(true);
}
TEST_F(strided_slice_gpu_constants, test_2x2x2x2_full_negative_stride_f_axis_cached) {
this->test_2x2x2x2_full_negative_stride_f_axis(true);
}