[GPU] Support FC 6d output by compressing as 4d in kernel for dynamic (#17996)
This commit is contained in:
@@ -158,9 +158,9 @@ std::vector<layout> fully_connected_inst::calc_output_layouts(fully_connected_no
|
||||
ov::op::v0::shape_infer(&op, input_shapes, output_shapes);
|
||||
|
||||
bool is_static = input_layout.is_static() && weights_layout.is_static();
|
||||
|
||||
format::type output_format = is_static ? get_preferred_format(node, impl_param) :
|
||||
input_layout.format.value;
|
||||
bool allow_new_shape_infer = impl_param.get_program().get_config().get_property(ov::intel_gpu::allow_new_shape_infer);
|
||||
format::type output_format = is_static && !allow_new_shape_infer ? get_preferred_format(node, impl_param) :
|
||||
input_layout.format.value;
|
||||
|
||||
return { layout{output_shapes[0], output_type, output_format} };
|
||||
}
|
||||
|
||||
@@ -37,7 +37,7 @@ public:
|
||||
static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) {
|
||||
const auto& primitive = impl_param.typed_desc<fully_connected>();
|
||||
|
||||
auto get_fc_input_layouts = [primitive](const std::vector<layout>& input_layouts) {
|
||||
auto get_fc_input_layouts = [primitive](const std::vector<layout>& input_layouts, bool allow_new_shape_infer) {
|
||||
auto reshape_to_2d = [](const ov::PartialShape& shape, const ov::Dimension& feature, size_t rank) {
|
||||
if (shape.is_static()) {
|
||||
auto static_shape = shape.to_shape();
|
||||
@@ -56,15 +56,21 @@ public:
|
||||
auto input1_pshape = input1_layout.get_partial_shape();
|
||||
|
||||
ov::Dimension feature = input0_pshape[std::min(primitive->input_size, static_cast<size_t>(4)) - 1ul];
|
||||
if (allow_new_shape_infer) {
|
||||
feature = input0_pshape[primitive->input_size - 1ul];
|
||||
}
|
||||
|
||||
if (primitive->input_size > 3) {
|
||||
input0_layout.set_partial_shape(reshape_to_2d(input0_pshape, feature, primitive->input_size));
|
||||
input0_layout.format = format::bfyx;
|
||||
}
|
||||
if (input1_pshape.size() != 2) {
|
||||
input1_layout.set_partial_shape(reshape_to_2d(input1_pshape, feature, primitive->weights_rank));
|
||||
input1_layout.format = format::bfyx;
|
||||
}
|
||||
|
||||
std::vector<layout> layouts{input0_layout, input1_layout};
|
||||
|
||||
return layouts;
|
||||
};
|
||||
|
||||
@@ -83,9 +89,10 @@ public:
|
||||
return updated_out_layout;
|
||||
};
|
||||
|
||||
bool allow_new_shape_infer = impl_param.get_program().get_config().get_property(ov::intel_gpu::allow_new_shape_infer);
|
||||
auto updated_impl_param = impl_param;
|
||||
|
||||
const auto input_layouts = get_fc_input_layouts(impl_param.input_layouts);
|
||||
const auto input_layouts = get_fc_input_layouts(impl_param.input_layouts, allow_new_shape_infer);
|
||||
updated_impl_param.input_layouts[0] = input_layouts[0];
|
||||
updated_impl_param.input_layouts[1] = input_layouts[1];
|
||||
updated_impl_param.weights_layout = input_layouts[1];
|
||||
@@ -137,6 +144,10 @@ attach_fully_connected_impl::attach_fully_connected_impl() {
|
||||
std::make_tuple(data_types::f16, format::yxfb),
|
||||
std::make_tuple(data_types::f32, format::bfyx),
|
||||
std::make_tuple(data_types::f16, format::bfyx),
|
||||
std::make_tuple(data_types::f32, format::bfzyx),
|
||||
std::make_tuple(data_types::f16, format::bfzyx),
|
||||
std::make_tuple(data_types::f32, format::bfwzyx),
|
||||
std::make_tuple(data_types::f16, format::bfwzyx),
|
||||
std::make_tuple(data_types::f32, format::byxf),
|
||||
std::make_tuple(data_types::f16, format::byxf),
|
||||
std::make_tuple(data_types::i8, format::bfyx),
|
||||
|
||||
@@ -1727,6 +1727,15 @@ format layout_optimizer::get_preferred_format(program_node& node) {
|
||||
if (use_onednn_impls) {
|
||||
expected = node.get_preferred_output_fmt();
|
||||
}
|
||||
|
||||
if (!allow_new_shape_infer && node.is_type<fully_connected>()) {
|
||||
auto& fc_node = node.as<fully_connected>();
|
||||
auto input_layout = fc_node.input().get_output_layout();
|
||||
if (input_layout.format.dimension() > 4) {
|
||||
expected = format::bfyx;
|
||||
node.set_preferred_input_fmt(0, format::bfyx);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (allow_new_shape_infer && node.get_preferred_input_fmt() != format::any) {
|
||||
|
||||
@@ -1892,6 +1892,111 @@ TEST(fully_connected_gpu, dynamic) {
|
||||
ASSERT_EQ(3.0f, output_ptr[3]);
|
||||
}
|
||||
|
||||
TEST(fully_connected_gpu, dynamic_6d_input) {
    auto& engine = get_test_engine();

    // 6d input (bfwzyx): batch is dynamic at graph-build time, concrete shape is 1x3x2x1x1x4.
    const int32_t in_b = 1, in_f = 3, in_w = 2, in_z = 1, in_y = 1, in_x = 4;
    const int32_t wei_b = 2;

    auto input_dyn_layout = layout{ov::PartialShape{ov::Dimension(-1), in_f, in_w, in_z, in_y, in_x}, data_types::f32, format::bfwzyx};
    auto input_data = engine.allocate_memory(layout{ov::PartialShape{in_b, in_f, in_w, in_z, in_y, in_x}, data_types::f32, format::bfwzyx});
    auto weights_data = engine.allocate_memory({ov::PartialShape{wei_b, in_x}, data_types::f32, format::bfyx });

    // Six rows of four input values; weights are two rows of four.
    set_values(input_data, {-0.5f, 2.0f, 0.5f, 1.f, -1.5f, 2.0f, 0.5f, 1.f,
                            -0.5f, 2.5f, 0.5f, 1.f, -0.5f, 3.0f, 0.5f, 1.f,
                            -0.5f, 2.0f, 0.5f, 1.f, -0.5f, 2.0f, 2.5f, 1.f});
    set_values(weights_data, {1.5f, 1.0f, -1.0f, 0.0f,
                              0.5f, -0.5f, -0.5f, 1.0f, });

    cldnn::topology topology;
    topology.add(input_layout("input", input_dyn_layout));
    topology.add(data("weights", weights_data));
    topology.add(fully_connected("fc", input_info("input"), "weights", "", cldnn::padding(), input_dyn_layout.get_rank()));

    ExecutionConfig config = get_test_default_config(engine);
    config.set_property(ov::intel_gpu::optimize_data(true));
    // New shape inference keeps the 6d rank on the FC output instead of collapsing to 2d.
    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
    network network(engine, topology, config);
    network.set_input_data("input", input_data);

    auto outputs = network.execute();
    ASSERT_EQ(outputs.size(), size_t(1));
    ASSERT_EQ(outputs.begin()->first, "fc");

    auto output_prim_mem = outputs.begin()->second.get_memory();

    // Expected output shape: 1x3x2x1x1x2 (last dim replaced by the weight batch).
    auto out_l = network.get_output_layout(outputs.begin()->first);
    ASSERT_EQ(output_prim_mem->get_layout().batch(), 1);
    ASSERT_EQ(out_l.batch(), 1);
    ASSERT_EQ(out_l.feature(), 3);
    ASSERT_EQ(out_l.spatial(0), 2);
    ASSERT_EQ(out_l.spatial(1), 1);
    ASSERT_EQ(out_l.spatial(2), 1);
    ASSERT_EQ(out_l.spatial(3), 2);

    // Dot products of each input row against the two weight rows.
    std::vector<float> expected_output = {
        0.75, -0.5, -0.75, -1, 1.25, -0.75, 1.75, -1, 0.75, -0.5, -1.25, -1.5
    };

    cldnn::mem_lock<float> output_ptr(output_prim_mem, get_test_stream());

    for (size_t idx = 0; idx < out_l.get_linear_size(); ++idx) {
        ASSERT_EQ(expected_output[idx], output_ptr[idx]);
    }
}
|
||||
|
||||
TEST(fully_connected_gpu, static_6d_input) {
    auto& engine = get_test_engine();

    // Fully static 6d input (bfwzyx), shape 1x3x2x1x1x4.
    const int32_t in_b = 1, in_f = 3, in_w = 2, in_z = 1, in_y = 1, in_x = 4;
    const int32_t wei_b = 2;

    auto input_static_layout = layout{ov::PartialShape{in_b, in_f, in_w, in_z, in_y, in_x}, data_types::f32, format::bfwzyx};
    auto input_data = engine.allocate_memory(input_static_layout);
    auto weights_data = engine.allocate_memory({ov::PartialShape{wei_b, in_x}, data_types::f32, format::bfyx });

    // Six rows of four input values; weights are two rows of four.
    set_values(input_data, {-0.5f, 2.0f, 0.5f, 1.f, -1.5f, 2.0f, 0.5f, 1.f,
                            -0.5f, 2.5f, 0.5f, 1.f, -0.5f, 3.0f, 0.5f, 1.f,
                            -0.5f, 2.0f, 0.5f, 1.f, -0.5f, 2.0f, 2.5f, 1.f});
    set_values(weights_data, {1.5f, 1.0f, -1.0f, 0.0f,
                              0.5f, -0.5f, -0.5f, 1.0f, });

    cldnn::topology topology;
    topology.add(input_layout("input", input_static_layout));
    topology.add(data("weights", weights_data));
    topology.add(fully_connected("fc", input_info("input"), "weights", "", cldnn::padding(), input_static_layout.get_rank()));

    ExecutionConfig config = get_test_default_config(engine);
    config.set_property(ov::intel_gpu::optimize_data(true));
    network network(engine, topology, config);
    network.set_input_data("input", input_data);

    auto outputs = network.execute();
    ASSERT_EQ(outputs.size(), size_t(1));
    ASSERT_EQ(outputs.begin()->first, "fc");

    auto output_prim_mem = outputs.begin()->second.get_memory();

    // Without new shape inference the 6d input is collapsed to 2d: 6x2 output.
    auto out_l = network.get_output_layout(outputs.begin()->first);
    ASSERT_EQ(output_prim_mem->get_layout().batch(), 6);
    ASSERT_EQ(out_l.batch(), 6);
    ASSERT_EQ(out_l.feature(), 2);
    ASSERT_EQ(out_l.spatial(0), 1);
    ASSERT_EQ(out_l.spatial(1), 1);

    // Same products as the dynamic test — only the reported shape differs.
    std::vector<float> expected_output = {
        0.75, -0.5, -0.75, -1, 1.25, -0.75, 1.75, -1, 0.75, -0.5, -1.25, -1.5
    };

    cldnn::mem_lock<float> output_ptr(output_prim_mem, get_test_stream());

    for (size_t idx = 0; idx < out_l.get_linear_size(); ++idx) {
        ASSERT_EQ(expected_output[idx], output_ptr[idx]);
    }
}
|
||||
|
||||
TEST(fully_connected_gpu, dynamic_multi_inference_same_shape) {
|
||||
auto& engine = get_test_engine();
|
||||
const int32_t input_f = 3, input_b = 1, weight_b = 4;
|
||||
|
||||
Reference in New Issue
Block a user