[GPU] Support FC 6d output by compressing as 4d in kernel for dynamic (#17996)

This commit is contained in:
Kelvin Choi
2023-07-07 03:07:28 +09:00
committed by GitHub
parent 14292b8da5
commit f0ec940fc1
4 changed files with 130 additions and 5 deletions

View File

@@ -158,9 +158,9 @@ std::vector<layout> fully_connected_inst::calc_output_layouts(fully_connected_no
ov::op::v0::shape_infer(&op, input_shapes, output_shapes);
bool is_static = input_layout.is_static() && weights_layout.is_static();
format::type output_format = is_static ? get_preferred_format(node, impl_param) :
input_layout.format.value;
bool allow_new_shape_infer = impl_param.get_program().get_config().get_property(ov::intel_gpu::allow_new_shape_infer);
format::type output_format = is_static && !allow_new_shape_infer ? get_preferred_format(node, impl_param) :
input_layout.format.value;
return { layout{output_shapes[0], output_type, output_format} };
}

View File

@@ -37,7 +37,7 @@ public:
static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) {
const auto& primitive = impl_param.typed_desc<fully_connected>();
auto get_fc_input_layouts = [primitive](const std::vector<layout>& input_layouts) {
auto get_fc_input_layouts = [primitive](const std::vector<layout>& input_layouts, bool allow_new_shape_infer) {
auto reshape_to_2d = [](const ov::PartialShape& shape, const ov::Dimension& feature, size_t rank) {
if (shape.is_static()) {
auto static_shape = shape.to_shape();
@@ -56,15 +56,21 @@ public:
auto input1_pshape = input1_layout.get_partial_shape();
ov::Dimension feature = input0_pshape[std::min(primitive->input_size, static_cast<size_t>(4)) - 1ul];
if (allow_new_shape_infer) {
feature = input0_pshape[primitive->input_size - 1ul];
}
if (primitive->input_size > 3) {
input0_layout.set_partial_shape(reshape_to_2d(input0_pshape, feature, primitive->input_size));
input0_layout.format = format::bfyx;
}
if (input1_pshape.size() != 2) {
input1_layout.set_partial_shape(reshape_to_2d(input1_pshape, feature, primitive->weights_rank));
input1_layout.format = format::bfyx;
}
std::vector<layout> layouts{input0_layout, input1_layout};
return layouts;
};
@@ -83,9 +89,10 @@ public:
return updated_out_layout;
};
bool allow_new_shape_infer = impl_param.get_program().get_config().get_property(ov::intel_gpu::allow_new_shape_infer);
auto updated_impl_param = impl_param;
const auto input_layouts = get_fc_input_layouts(impl_param.input_layouts);
const auto input_layouts = get_fc_input_layouts(impl_param.input_layouts, allow_new_shape_infer);
updated_impl_param.input_layouts[0] = input_layouts[0];
updated_impl_param.input_layouts[1] = input_layouts[1];
updated_impl_param.weights_layout = input_layouts[1];
@@ -137,6 +144,10 @@ attach_fully_connected_impl::attach_fully_connected_impl() {
std::make_tuple(data_types::f16, format::yxfb),
std::make_tuple(data_types::f32, format::bfyx),
std::make_tuple(data_types::f16, format::bfyx),
std::make_tuple(data_types::f32, format::bfzyx),
std::make_tuple(data_types::f16, format::bfzyx),
std::make_tuple(data_types::f32, format::bfwzyx),
std::make_tuple(data_types::f16, format::bfwzyx),
std::make_tuple(data_types::f32, format::byxf),
std::make_tuple(data_types::f16, format::byxf),
std::make_tuple(data_types::i8, format::bfyx),

View File

@@ -1727,6 +1727,15 @@ format layout_optimizer::get_preferred_format(program_node& node) {
if (use_onednn_impls) {
expected = node.get_preferred_output_fmt();
}
if (!allow_new_shape_infer && node.is_type<fully_connected>()) {
auto& fc_node = node.as<fully_connected>();
auto input_layout = fc_node.input().get_output_layout();
if (input_layout.format.dimension() > 4) {
expected = format::bfyx;
node.set_preferred_input_fmt(0, format::bfyx);
}
}
}
if (allow_new_shape_infer && node.get_preferred_input_fmt() != format::any) {

View File

@@ -1892,6 +1892,111 @@ TEST(fully_connected_gpu, dynamic) {
ASSERT_EQ(3.0f, output_ptr[3]);
}
TEST(fully_connected_gpu, dynamic_6d_input) {
    // FC fed with a 6d (bfwzyx) input whose batch is dynamic: with
    // allow_new_shape_infer enabled the plugin must keep the 6d output shape
    // (the kernel compresses it to 4d internally, but the reported layout stays 6d).
    auto& engine = get_test_engine();

    const int32_t batch = 1, feature = 3, dim_w = 2, dim_z = 1, dim_y = 1, dim_x = 4;
    const int32_t out_features = 2;

    auto dyn_layout = layout{ov::PartialShape{ov::Dimension(-1), feature, dim_w, dim_z, dim_y, dim_x},
                             data_types::f32, format::bfwzyx};
    auto static_layout = layout{ov::PartialShape{batch, feature, dim_w, dim_z, dim_y, dim_x},
                                data_types::f32, format::bfwzyx};

    auto input_mem = engine.allocate_memory(static_layout);
    auto weights_mem = engine.allocate_memory({ov::PartialShape{out_features, dim_x}, data_types::f32, format::bfyx });

    // 6 logical rows of 4 values each (b*f*w*z*y = 6, x = 4).
    set_values(input_mem, {-0.5f, 2.0f, 0.5f, 1.f, -1.5f, 2.0f, 0.5f, 1.f,
                           -0.5f, 2.5f, 0.5f, 1.f, -0.5f, 3.0f, 0.5f, 1.f,
                           -0.5f, 2.0f, 0.5f, 1.f, -0.5f, 2.0f, 2.5f, 1.f});
    set_values(weights_mem, {1.5f, 1.0f, -1.0f, 0.0f,
                             0.5f, -0.5f, -0.5f, 1.0f, });

    cldnn::topology topology{
        input_layout("input", dyn_layout),
        data("weights", weights_mem),
        fully_connected("fc", input_info("input"), "weights", "", cldnn::padding(), dyn_layout.get_rank())
    };

    ExecutionConfig config = get_test_default_config(engine);
    config.set_property(ov::intel_gpu::optimize_data(true));
    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));

    network net(engine, topology, config);
    net.set_input_data("input", input_mem);
    auto outputs = net.execute();

    ASSERT_EQ(outputs.size(), size_t(1));
    ASSERT_EQ(outputs.begin()->first, "fc");

    auto out_mem = outputs.begin()->second.get_memory();
    auto out_layout = net.get_output_layout(outputs.begin()->first);

    // Output keeps the full 6d shape {1, 3, 2, 1, 1, 2} (bfwzyx):
    // spatial(0)=x, spatial(1)=y, spatial(2)=z, spatial(3)=w.
    ASSERT_EQ(out_mem->get_layout().batch(), 1);
    ASSERT_EQ(out_layout.batch(), 1);
    ASSERT_EQ(out_layout.feature(), 3);
    ASSERT_EQ(out_layout.spatial(0), 2);
    ASSERT_EQ(out_layout.spatial(1), 1);
    ASSERT_EQ(out_layout.spatial(2), 1);
    ASSERT_EQ(out_layout.spatial(3), 2);

    // Hand-computed row-by-row dot products against the two weight rows.
    std::vector<float> expected = {
        0.75, -0.5, -0.75, -1, 1.25, -0.75, 1.75, -1, 0.75, -0.5, -1.25, -1.5
    };
    cldnn::mem_lock<float> out_ptr(out_mem, get_test_stream());
    for (size_t i = 0; i < out_layout.get_linear_size(); ++i) {
        ASSERT_EQ(expected[i], out_ptr[i]);
    }
}
TEST(fully_connected_gpu, static_6d_input) {
    // Same 6d FC scenario as dynamic_6d_input but fully static and without
    // new shape infer: here the 6d input is legitimately collapsed, so the
    // output layout is 2d-like {6, 2} (batch = b*f*w*z*y = 6).
    auto& engine = get_test_engine();

    const int32_t batch = 1, feature = 3, dim_w = 2, dim_z = 1, dim_y = 1, dim_x = 4;
    const int32_t out_features = 2;

    auto in_layout = layout{ov::PartialShape{batch, feature, dim_w, dim_z, dim_y, dim_x},
                            data_types::f32, format::bfwzyx};

    auto input_mem = engine.allocate_memory(in_layout);
    auto weights_mem = engine.allocate_memory({ov::PartialShape{out_features, dim_x}, data_types::f32, format::bfyx });

    // 6 logical rows of 4 values each (b*f*w*z*y = 6, x = 4).
    set_values(input_mem, {-0.5f, 2.0f, 0.5f, 1.f, -1.5f, 2.0f, 0.5f, 1.f,
                           -0.5f, 2.5f, 0.5f, 1.f, -0.5f, 3.0f, 0.5f, 1.f,
                           -0.5f, 2.0f, 0.5f, 1.f, -0.5f, 2.0f, 2.5f, 1.f});
    set_values(weights_mem, {1.5f, 1.0f, -1.0f, 0.0f,
                             0.5f, -0.5f, -0.5f, 1.0f, });

    cldnn::topology topology{
        input_layout("input", in_layout),
        data("weights", weights_mem),
        fully_connected("fc", input_info("input"), "weights", "", cldnn::padding(), in_layout.get_rank()),
    };

    ExecutionConfig config = get_test_default_config(engine);
    config.set_property(ov::intel_gpu::optimize_data(true));

    network net(engine, topology, config);
    net.set_input_data("input", input_mem);
    auto outputs = net.execute();

    ASSERT_EQ(outputs.size(), size_t(1));
    ASSERT_EQ(outputs.begin()->first, "fc");

    auto out_mem = outputs.begin()->second.get_memory();
    auto out_layout = net.get_output_layout(outputs.begin()->first);

    ASSERT_EQ(out_mem->get_layout().batch(), 6);
    ASSERT_EQ(out_layout.batch(), 6);
    ASSERT_EQ(out_layout.feature(), 2);
    ASSERT_EQ(out_layout.spatial(0), 1);
    ASSERT_EQ(out_layout.spatial(1), 1);

    // Hand-computed row-by-row dot products against the two weight rows.
    std::vector<float> expected = {
        0.75, -0.5, -0.75, -1, 1.25, -0.75, 1.75, -1, 0.75, -0.5, -1.25, -1.5
    };
    cldnn::mem_lock<float> out_ptr(out_mem, get_test_stream());
    for (size_t i = 0; i < out_layout.get_linear_size(); ++i) {
        ASSERT_EQ(expected[i], out_ptr[i]);
    }
}
TEST(fully_connected_gpu, dynamic_multi_inference_same_shape) {
auto& engine = get_test_engine();
const int32_t input_f = 3, input_b = 1, weight_b = 4;