From f0ec940fc15ed3a3a8283e5c61f48a449d89fac0 Mon Sep 17 00:00:00 2001 From: Kelvin Choi Date: Fri, 7 Jul 2023 03:07:28 +0900 Subject: [PATCH] [GPU] Support FC 6d output by compressing as 4d in kernel for dynamic (#17996) --- .../intel_gpu/src/graph/fully_connected.cpp | 6 +- .../src/graph/impls/ocl/fully_connected.cpp | 15 ++- .../intel_gpu/src/graph/layout_optimizer.cpp | 9 ++ .../test_cases/fully_connected_gpu_test.cpp | 105 ++++++++++++++++++ 4 files changed, 130 insertions(+), 5 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/fully_connected.cpp b/src/plugins/intel_gpu/src/graph/fully_connected.cpp index 281fa1b4d86..9755296009a 100644 --- a/src/plugins/intel_gpu/src/graph/fully_connected.cpp +++ b/src/plugins/intel_gpu/src/graph/fully_connected.cpp @@ -158,9 +158,9 @@ std::vector fully_connected_inst::calc_output_layouts(fully_connected_no ov::op::v0::shape_infer(&op, input_shapes, output_shapes); bool is_static = input_layout.is_static() && weights_layout.is_static(); - - format::type output_format = is_static ? get_preferred_format(node, impl_param) : - input_layout.format.value; + bool allow_new_shape_infer = impl_param.get_program().get_config().get_property(ov::intel_gpu::allow_new_shape_infer); + format::type output_format = is_static && !allow_new_shape_infer ? 
get_preferred_format(node, impl_param) : + input_layout.format.value; return { layout{output_shapes[0], output_type, output_format} }; } diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/fully_connected.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/fully_connected.cpp index a1e9081e0e6..68f56f2379e 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/fully_connected.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/fully_connected.cpp @@ -37,7 +37,7 @@ public: static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) { const auto& primitive = impl_param.typed_desc(); - auto get_fc_input_layouts = [primitive](const std::vector& input_layouts) { + auto get_fc_input_layouts = [primitive](const std::vector& input_layouts, bool allow_new_shape_infer) { auto reshape_to_2d = [](const ov::PartialShape& shape, const ov::Dimension& feature, size_t rank) { if (shape.is_static()) { auto static_shape = shape.to_shape(); @@ -56,15 +56,21 @@ public: auto input1_pshape = input1_layout.get_partial_shape(); ov::Dimension feature = input0_pshape[std::min(primitive->input_size, static_cast(4)) - 1ul]; + if (allow_new_shape_infer) { + feature = input0_pshape[primitive->input_size - 1ul]; + } if (primitive->input_size > 3) { input0_layout.set_partial_shape(reshape_to_2d(input0_pshape, feature, primitive->input_size)); + input0_layout.format = format::bfyx; } if (input1_pshape.size() != 2) { input1_layout.set_partial_shape(reshape_to_2d(input1_pshape, feature, primitive->weights_rank)); + input1_layout.format = format::bfyx; } std::vector layouts{input0_layout, input1_layout}; + return layouts; }; @@ -83,9 +89,10 @@ public: return updated_out_layout; }; + bool allow_new_shape_infer = impl_param.get_program().get_config().get_property(ov::intel_gpu::allow_new_shape_infer); auto updated_impl_param = impl_param; - const auto input_layouts = get_fc_input_layouts(impl_param.input_layouts); + const auto input_layouts = 
get_fc_input_layouts(impl_param.input_layouts, allow_new_shape_infer); updated_impl_param.input_layouts[0] = input_layouts[0]; updated_impl_param.input_layouts[1] = input_layouts[1]; updated_impl_param.weights_layout = input_layouts[1]; @@ -137,6 +144,10 @@ attach_fully_connected_impl::attach_fully_connected_impl() { std::make_tuple(data_types::f16, format::yxfb), std::make_tuple(data_types::f32, format::bfyx), std::make_tuple(data_types::f16, format::bfyx), + std::make_tuple(data_types::f32, format::bfzyx), + std::make_tuple(data_types::f16, format::bfzyx), + std::make_tuple(data_types::f32, format::bfwzyx), + std::make_tuple(data_types::f16, format::bfwzyx), std::make_tuple(data_types::f32, format::byxf), std::make_tuple(data_types::f16, format::byxf), std::make_tuple(data_types::i8, format::bfyx), diff --git a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp index 3b43b021739..4cb42270a54 100644 --- a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp +++ b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp @@ -1727,6 +1727,15 @@ format layout_optimizer::get_preferred_format(program_node& node) { if (use_onednn_impls) { expected = node.get_preferred_output_fmt(); } + + if (!allow_new_shape_infer && node.is_type()) { + auto& fc_node = node.as(); + auto input_layout = fc_node.input().get_output_layout(); + if (input_layout.format.dimension() > 4) { + expected = format::bfyx; + node.set_preferred_input_fmt(0, format::bfyx); + } + } } if (allow_new_shape_infer && node.get_preferred_input_fmt() != format::any) { diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp index b7ed06ce9a2..2511d317853 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp @@ -1892,6 +1892,111 @@ 
TEST(fully_connected_gpu, dynamic) {
     ASSERT_EQ(3.0f, output_ptr[3]);
 }
 
+// Runs FC on a rank-6 f32 input whose batch dimension is dynamic, with
+// allow_new_shape_infer(true), then checks the six asserted output dims and exact values.
+TEST(fully_connected_gpu, dynamic_6d_input) {
+    auto& engine = get_test_engine();
+
+    const int32_t input_b = 1, input_f = 3, input_w = 2, input_z = 1, input_y = 1, input_x = 4;
+    const int32_t weight_b = 2;
+
+    auto input_dyn_layout = layout{ov::PartialShape{ov::Dimension(-1), input_f, input_w, input_z, input_y, input_x}, data_types::f32, format::bfwzyx};
+    auto input_data = engine.allocate_memory(layout{ov::PartialShape{input_b, input_f, input_w, input_z, input_y, input_x}, data_types::f32, format::bfwzyx});
+    auto weights_data = engine.allocate_memory({ov::PartialShape{weight_b, input_x}, data_types::f32, format::bfyx });
+
+    set_values(input_data, {-0.5f, 2.0f, 0.5f, 1.f, -1.5f, 2.0f, 0.5f, 1.f,
+                            -0.5f, 2.5f, 0.5f, 1.f, -0.5f, 3.0f, 0.5f, 1.f,
+                            -0.5f, 2.0f, 0.5f, 1.f, -0.5f, 2.0f, 2.5f, 1.f});
+    set_values(weights_data, {1.5f, 1.0f, -1.0f, 0.0f,
+                              0.5f, -0.5f, -0.5f, 1.0f, });
+
+    cldnn::topology topology{
+        input_layout("input", input_dyn_layout),
+        data("weights", weights_data),
+        fully_connected("fc", input_info("input"), "weights", "", cldnn::padding(), input_dyn_layout.get_rank())
+    };
+
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::optimize_data(true));
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+    network network(engine, topology, config);
+    network.set_input_data("input", input_data);
+
+    auto outputs = network.execute();
+    ASSERT_EQ(outputs.size(), size_t(1));
+    ASSERT_EQ(outputs.begin()->first, "fc");
+
+    auto output_prim_mem = outputs.begin()->second.get_memory();
+    auto out_l = network.get_output_layout(outputs.begin()->first);
+    ASSERT_EQ(output_prim_mem->get_layout().batch(), 1);
+    ASSERT_EQ(out_l.batch(), 1);
+    ASSERT_EQ(out_l.feature(), 3);
+    ASSERT_EQ(out_l.spatial(0), 2);
+    ASSERT_EQ(out_l.spatial(1), 1);
+    ASSERT_EQ(out_l.spatial(2), 1);
+    ASSERT_EQ(out_l.spatial(3), 2);
+
+    std::vector<float> expected_output = {0.75f, -0.5f, -0.75f, -1.0f, 1.25f, -0.75f, 1.75f, -1.0f, 0.75f, -0.5f, -1.25f, -1.5f};
+    // Fail with a clear assertion, not an out-of-bounds read, if the output size differs.
+    ASSERT_EQ(expected_output.size(), out_l.get_linear_size());
+
+    cldnn::mem_lock<float> output_ptr(output_prim_mem, get_test_stream());
+    for (size_t i = 0; i < expected_output.size(); i++) {
+        ASSERT_EQ(expected_output[i], output_ptr[i]);
+    }
+}
+
+// Same input and weights with a fully static rank-6 shape and no allow_new_shape_infer;
+// the asserted output dims are batch = 6, feature = 2, spatial(0) = spatial(1) = 1.
+TEST(fully_connected_gpu, static_6d_input) {
+    auto& engine = get_test_engine();
+
+    const int32_t input_b = 1, input_f = 3, input_w = 2, input_z = 1, input_y = 1, input_x = 4;
+    const int32_t weight_b = 2;
+
+    auto input_static_layout = layout{ov::PartialShape{input_b, input_f, input_w, input_z, input_y, input_x}, data_types::f32, format::bfwzyx};
+    auto input_data = engine.allocate_memory(input_static_layout);
+    auto weights_data = engine.allocate_memory({ov::PartialShape{weight_b, input_x}, data_types::f32, format::bfyx });
+
+    set_values(input_data, {-0.5f, 2.0f, 0.5f, 1.f, -1.5f, 2.0f, 0.5f, 1.f,
+                            -0.5f, 2.5f, 0.5f, 1.f, -0.5f, 3.0f, 0.5f, 1.f,
+                            -0.5f, 2.0f, 0.5f, 1.f, -0.5f, 2.0f, 2.5f, 1.f});
+    set_values(weights_data, {1.5f, 1.0f, -1.0f, 0.0f,
+                              0.5f, -0.5f, -0.5f, 1.0f, });
+
+    cldnn::topology topology{
+        input_layout("input", input_static_layout),
+        data("weights", weights_data),
+        fully_connected("fc", input_info("input"), "weights", "", cldnn::padding(), input_static_layout.get_rank()),
+    };
+
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::optimize_data(true));
+    network network(engine, topology, config);
+    network.set_input_data("input", input_data);
+
+    auto outputs = network.execute();
+    ASSERT_EQ(outputs.size(), size_t(1));
+    ASSERT_EQ(outputs.begin()->first, "fc");
+
+    auto output_prim_mem = outputs.begin()->second.get_memory();
+    auto out_l = network.get_output_layout(outputs.begin()->first);
+    ASSERT_EQ(output_prim_mem->get_layout().batch(), 6);
+    ASSERT_EQ(out_l.batch(), 6);
+    ASSERT_EQ(out_l.feature(), 2);
+    ASSERT_EQ(out_l.spatial(0), 1);
+    ASSERT_EQ(out_l.spatial(1), 1);
+
+    std::vector<float> expected_output = {0.75f, -0.5f, -0.75f, -1.0f, 1.25f, -0.75f, 1.75f, -1.0f, 0.75f, -0.5f, -1.25f, -1.5f};
+    // Fail with a clear assertion, not an out-of-bounds read, if the output size differs.
+    ASSERT_EQ(expected_output.size(), out_l.get_linear_size());
+
+    cldnn::mem_lock<float> output_ptr(output_prim_mem, get_test_stream());
+    for (size_t i = 0; i < expected_output.size(); i++) {
+        ASSERT_EQ(expected_output[i], output_ptr[i]);
+    }
+}
+
 TEST(fully_connected_gpu, dynamic_multi_inference_same_shape) {
     auto& engine = get_test_engine();
     const int32_t input_f = 3, input_b = 1, weight_b = 4;