diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/concat_input_order.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/concat_input_order.cpp
index 8f8614df116..c47167e5dbd 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/concat_input_order.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/concat_input_order.cpp
@@ -131,10 +131,12 @@ void concat_input_order::run(program& p) {
         bool no_fusing = !concat_node.has_fused_primitives() && concat_node.get_dependencies().size() == inputs_count;
 
         auto out_format = concat_node.get_output_layout().format;
-        bool correct_format = out_format == format::b_fs_yx_fsv16;
+        bool correct_format = (out_format == format::b_fs_yx_fsv16) || (out_format == format::b_fs_yx_fsv32);
         tensor::value_type alignment = 1;
         if (out_format == format::b_fs_yx_fsv16)
             alignment = 16;
+        else if (out_format == format::b_fs_yx_fsv32)
+            alignment = 32;
 
         bool single_format = true;
         std::vector<tensor::value_type> feature_sizes;
diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp
index a9154dbc194..a2bd0420a39 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp
@@ -120,10 +120,13 @@ bool concat_in_place_optimization::match(concatenation_node& node) {
     }
 
     // For in place concatenation input layouts and data types must match.
+    // Also check whether the data along the f-axis is properly aligned for implicit concat;
+    // otherwise fall back to explicit concat.
     auto output_format = node.get_output_layout().format;
     auto output_datatype = node.get_output_layout().data_type;
     auto concat_axis = node.get_primitive()->axis;
 
+    size_t idx = 0;
     for (auto& input : node.get_dependencies()) {
         if (input->is_type<reshape>())
             // reshapes should be optimized out.
@@ -134,24 +137,25 @@ bool concat_in_place_optimization::match(concatenation_node& node) {
         if (output_format != l.format || output_datatype != l.data_type)
             return false;
 
+        if (l.format.block_sizes().size() > 1)
+            return false;
+
         // TODO: Below condition should be moved to program_node::supports_padding.
-        // This hovewer will need updating the algorithm as it may make cascade adjustment impossible in some cases.
-        // It hovewer would make normal optimizations possible in others, so this is a trade-off to be investigated.
-        if (l.format == format::b_fs_yx_fsv16 && (l.size.feature[0] % 16 != 0 || node.get_primitive()->axis != concatenation::along_f))
-            return false;
+        // This however will need updating the algorithm as it may make cascade adjustment impossible in some cases.
+        // It however would make normal optimizations possible in others, so this is a trade-off to be investigated.
+        if (idx != node.get_dependencies().size() - 1) {
+            if ((l.format == format::b_fs_yx_fsv16 || l.format == format::b_fs_zyx_fsv16) &&
+                (l.size.feature[0] % 16 != 0 || node.get_primitive()->axis != concatenation::along_f))
+                return false;
 
-        if (l.format == format::b_fs_zyx_fsv16 && (l.size.feature[0] % 16 != 0 || node.get_primitive()->axis != concatenation::along_f))
-            return false;
+            if ((l.format == format::b_fs_yx_fsv32 || l.format == format::b_fs_zyx_fsv32) &&
+                (l.size.feature[0] % 32 != 0 || node.get_primitive()->axis != concatenation::along_f))
+                return false;
 
-        if ((l.format == format::b_fs_yx_fsv32 || l.format == format::b_fs_zyx_fsv32) &&
-            (l.size.feature[0] % 32 != 0 || node.get_primitive()->axis != concatenation::along_f))
-            return false;
-
-        if (l.format == format::bs_fs_yx_bsv16_fsv16)
-            return false;
-
-        if (l.format == format::b_fs_yx_fsv4 && (l.size.feature[0] != 8 || node.get_primitive()->axis != concatenation::along_f))
-            return false;
+            if (l.format == format::b_fs_yx_fsv4 && (l.size.feature[0] != 4 || node.get_primitive()->axis != concatenation::along_f))
+                return false;
+        }
+        idx++;
     }
 
     auto lower_padd_in_axis = node.get_output_layout().data_padding.lower_size().raw[concat_axis];
@@ -159,7 +163,7 @@ bool concat_in_place_optimization::match(concatenation_node& node) {
                                   node.get_dependency(0).get_output_layout().data_padding.lower_size().raw[concat_axis]);
 
     // check if concatenation in place can be applied for inputs set
-    size_t idx = 0;
+    idx = 0;
    for (auto input : node.get_dependencies()) {
         // reverted condition - if any of this node's inputs is used by more than one primitive
         // and is not optimized concatenation then do not fuse buffers
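Taken together, these two hunks relax the in-place (implicit) concat condition for blocked layouts: when concatenating along the f axis in b_fs_yx_fsv16/fsv32 (and their zyx variants), only the inputs before the last one must have feature counts that are multiples of the block size, because each input buffer is aliased into the shared output at the running feature offset. A minimal, self-contained sketch of that predicate; can_use_implicit_concat is a hypothetical helper for illustration, not a clDNN API:

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // Mirrors the check added to concat_in_place_optimization::match():
    // inputs are laid out back to back along f, so every input except the
    // last must keep its successor's start offset a multiple of the feature
    // block size (fsv).
    bool can_use_implicit_concat(const std::vector<std::size_t>& feature_counts,
                                 std::size_t fsv) {
        for (std::size_t i = 0; i + 1 < feature_counts.size(); ++i) {
            if (feature_counts[i] % fsv != 0)
                return false;  // a non-last input would misalign its successor
        }
        return true;  // the last input may be unaligned; nothing starts after it
    }

    int main() {
        assert(can_use_implicit_concat({16, 8}, 16));   // only the tail is unaligned
        assert(!can_use_implicit_concat({8, 16}, 16));  // the head misaligns the tail
    }

This is also where the concat_input_order change pays off: for an input order like {8, 16} that pass can move the aligned input to the front (shuffling the consumer's weights accordingly), turning it into the {16, 8} case that the relaxed check accepts.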
diff --git a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
index 0862f4fba7b..e732095bdb3 100644
--- a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
+++ b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
@@ -1494,10 +1494,12 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format preferred_format) {
         for (auto& dep : node.get_dependencies()) {
             if (dep->is_in_data_flow() && dep->get_preferred_impl_type() == impl_types::onednn) {
-                preferred_impl = impl_types::onednn;
-                break;
+                return impl_types::onednn;
             }
         }
+        if (format::is_blocked(node.get_output_layout().format)) {
+            return impl_types::onednn;
+        }
 
         // TODO: uncomment this code when onednn gemm implementations will have real perf improvements vs cldnn
     } else if (node.is_type<fully_connected>()/* || node.is_type<gemm>()*/) {
         if (!_optimization_attributes.use_onednn_impls)
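The rewritten control flow reads best as early returns: a node handled by this branch prefers oneDNN either when any data-flow dependency has already picked oneDNN, or, new in this hunk, when its output layout is blocked. A condensed, self-contained restatement with stand-in types (toy_node, impl_kind, and the ocl fallback are simplifications, not the actual layout_optimizer code):

    #include <vector>

    enum class impl_kind { ocl, onednn };

    struct toy_node {
        impl_kind preferred = impl_kind::ocl;
        bool in_data_flow = true;
        bool output_is_blocked = false;  // e.g. b_fs_yx_fsv16 / b_fs_yx_fsv32
        std::vector<const toy_node*> deps;
    };

    impl_kind pick_impl(const toy_node& n) {
        // Early return replaces the old preferred_impl/break pattern.
        for (const auto* dep : n.deps)
            if (dep->in_data_flow && dep->preferred == impl_kind::onednn)
                return impl_kind::onednn;
        // New rule: a blocked output layout also selects oneDNN.
        if (n.output_is_blocked)
            return impl_kind::onednn;
        return impl_kind::ocl;  // assumed fallback; the real code keeps preferred_impl
    }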
diff --git a/src/plugins/intel_gpu/tests/test_cases/concatenation_gpu_test.cpp b/src/plugins/intel_gpu/tests/test_cases/concatenation_gpu_test.cpp
index f7080df32c0..84da3f44b90 100644
--- a/src/plugins/intel_gpu/tests/test_cases/concatenation_gpu_test.cpp
+++ b/src/plugins/intel_gpu/tests/test_cases/concatenation_gpu_test.cpp
@@ -784,6 +784,152 @@ INSTANTIATE_TEST_SUITE_P(smoke_low_precision,
                          ),
                          concat_gpu::PrintToStringParamName);
 
+template <typename Type>
+struct concat_gpu_4d_implicit : public concat_gpu {
+public:
+    cldnn::memory::ptr run_concat_network(std::vector<std::vector<std::vector<std::vector<std::vector<Type>>>>> input, format::type fmt, build_options options) {
+        auto data_type = type_to_data_type<Type>::value;
+        auto& engine = get_test_engine();
+        const size_t batch_num = testing::get<0>(GetParam());
+        const std::vector<size_t> in_features = testing::get<1>(GetParam());
+        const size_t input_y = testing::get<2>(GetParam());
+        const size_t input_x = testing::get<3>(GetParam());
+        size_t output_f = 0;
+        for (auto& f : in_features)
+            output_f += f;
+
+        topology topology;
+
+        std::vector<memory::ptr> in_memory;
+        std::vector<primitive_id> input_ids;
+        std::vector<primitive_id> pooling_ids;
+
+        for (size_t i = 0; i < in_features.size(); i++) {
+            auto size = tensor(static_cast<int32_t>(batch_num),
+                               static_cast<int32_t>(in_features[i]),
+                               static_cast<int32_t>(input_x),
+                               static_cast<int32_t>(input_y));
+            auto data = input[i];
+            auto in_lay = layout(data_type, fmt, size);
+            auto data_flat = std::vector<Type>(in_lay.get_linear_size(), 0);
+
+            for (size_t bi = 0; bi < batch_num; ++bi) {
+                for (size_t fi = 0; fi < in_features[i]; ++fi) {
+                    for (size_t yi = 0; yi < input_y; ++yi) {
+                        for (size_t xi = 0; xi < input_x; ++xi) {
+                            auto coords = tensor(batch(bi), feature(fi), spatial(xi, yi, 0, 0));
+                            auto in_offset = in_lay.get_linear_offset(coords);
+                            data_flat[in_offset] = data[bi][fi][yi][xi];
+                        }
+                    }
+                }
+            }
+
+            auto in_mem = engine.allocate_memory(in_lay);
+            set_values(in_mem, data_flat);
+            in_memory.push_back(in_mem);
+
+            topology.add(input_layout("input" + std::to_string(i), in_lay));
+            topology.add(pooling("pool" + std::to_string(i), "input" + std::to_string(i), pooling_mode::max, {1, 1, 1, 1}, {1, 1, 1, 1}));
+
+            input_ids.push_back("input" + std::to_string(i));
+            pooling_ids.push_back("pool" + std::to_string(i));
+        }
+
+        topology.add(concatenation("concat", pooling_ids, concatenation::concatenation_axis::along_f));
+
+        auto weights_lay = cldnn::layout(data_type, cldnn::format::bfyx, tensor(batch(output_f), feature(output_f)));
+        auto weights_mem = engine.allocate_memory(weights_lay);
+        weights_mem->fill(get_test_stream());
+        get_test_stream().finish();
+        {
+            cldnn::mem_lock<Type> weights_ptr(weights_mem, get_test_stream());
+            for (size_t fi = 0; fi < output_f; ++fi) {
+                auto coords = tensor(batch(fi), feature(fi), spatial(0, 0, 0, 0));
+                auto offset = weights_lay.get_linear_offset(coords);
+                weights_ptr[offset] = static_cast<Type>(1.f);
+            }
+        }
+        topology.add(data("weights", weights_mem));
+        topology.add(convolution("conv", "concat", { "weights" }));
+        topology.add(pooling("pool_final", "conv", pooling_mode::max, {1, 1, 1, 1}, {1, 1, 1, 1}));
+        topology.add(reorder("reorder", "pool_final", layout(data_type, format::bfyx, {(int32_t)batch_num, (int32_t)output_f, (int32_t)input_y, (int32_t)input_x})));
+
+        network concat_network(engine, topology, options);
+        for (size_t i = 0; i < in_features.size(); i++) {
+            concat_network.set_input_data(input_ids[i], in_memory[i]);
+        }
+        concat_network.execute();
+
+        for (auto i : concat_network.get_primitives_info()) {
+            // std::cout << " " << i.original_id << " " << i.kernel_id << std::endl;
+            if (i.original_id == "concat") {
+                if (options.get<build_option_type::optimize_data>()->enabled()) {
+                    EXPECT_TRUE(i.kernel_id == "undef");
+                } else {
+                    EXPECT_FALSE(i.kernel_id == "undef");
+                }
+            }
+        }
+
+        return concat_network.get_output("reorder").get_memory();
+    }
+
+    std::vector<std::vector<std::vector<std::vector<std::vector<Type>>>>> generate_input() {
+        const size_t batch_num = testing::get<0>(GetParam());
+        const std::vector<size_t> in_features = testing::get<1>(GetParam());
+        const size_t input_y = testing::get<2>(GetParam());
+        const size_t input_x = testing::get<3>(GetParam());
+
+        std::vector<std::vector<std::vector<std::vector<std::vector<Type>>>>> input(in_features.size());
+        for (size_t i = 0; i < in_features.size(); ++i) {
+            input[i] = generate_random_4d<Type>(batch_num, in_features[i], input_y, input_x, -1, 1);
+        }
+        return input;
+    }
+
+    void test(format::type fmt) {
+        auto input = generate_input();
+
+        // implicit concat
+        build_options options1;
+        options1.set_option(build_option::optimize_data(true));
+        auto out_mem1 = run_concat_network(input, fmt, options1);
+        cldnn::mem_lock<Type> out_ptr1(out_mem1, get_test_stream());
+
+        // explicit concat
+        build_options options2;
+        options2.set_option(build_option::optimize_data(false));
+        auto out_mem2 = run_concat_network(input, fmt, options2);
+        cldnn::mem_lock<Type> out_ptr2(out_mem2, get_test_stream());
+
+        EXPECT_EQ(out_ptr1.size(), out_ptr2.size());
+        size_t diff_count = 0;
+        for (size_t i = 0; i < out_ptr1.size(); ++i) {
+            if (out_ptr1[i] != out_ptr2[i]) diff_count++;
+        }
+        EXPECT_EQ(diff_count, 0);
+    }
+};
+
+using concat_implicit_gpu_4d_f16 = concat_gpu_4d_implicit<FLOAT16>;
+using concat_implicit_gpu_4d_i8 = concat_gpu_4d_implicit<int8_t>;
+
+TEST_P(concat_implicit_gpu_4d_f16, input_order_opt_b_fs_yx_fsv16) {
+    ASSERT_NO_FATAL_FAILURE(test(format::b_fs_yx_fsv16));
+}
+
+INSTANTIATE_TEST_SUITE_P(smoke,
+                         concat_implicit_gpu_4d_f16,
+                         ::testing::Values(
+                            TestParamType_concat(1, { 16, 16 }, 2, 2),
+                            TestParamType_concat(1, { 16, 8 }, 2, 2),
+                            TestParamType_concat(1, { 8, 16 }, 2, 2)
+                         ),
+                         concat_gpu::PrintToStringParamName);
+
+TEST_P(concat_implicit_gpu_4d_i8, input_order_opt_b_fs_yx_fsv32) {
+    ASSERT_NO_FATAL_FAILURE(test(format::b_fs_yx_fsv32));
+}
 #ifdef ENABLE_ONEDNN_FOR_GPU
 
 TEST(concat_gpu_onednn, basic_input_types) {
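The fixture builds concat -> convolution -> pooling -> reorder rather than testing concat alone: the convolution consumes the blocked concat result directly, so an aliasing or offset bug in the implicit path corrupts the final output instead of being masked, and the kernel_id == "undef" check against get_primitives_info() is what distinguishes an optimized-out (implicit) concat from an executed one. The convolution weights are one-hot per output feature, which makes it an identity over the f axis; a sketch of that pattern with plain vectors standing in for cldnn::memory (illustration only):

    #include <cstddef>
    #include <vector>

    // weights[f][f] = 1 and 0 elsewhere: the convolution copies feature f of
    // the concatenated tensor straight to output feature f, so every element
    // of the implicit concat buffer is observable in the network output.
    std::vector<std::vector<float>> identity_weights(std::size_t output_f) {
        std::vector<std::vector<float>> w(output_f,
                                          std::vector<float>(output_f, 0.f));
        for (std::size_t f = 0; f < output_f; ++f)
            w[f][f] = 1.f;
        return w;
    }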
@@ -856,4 +1002,163 @@ TEST(concat_gpu_onednn, basic_input_types) {
         EXPECT_EQ(output_vec[x], output_ptr[x]);
     }
 }
+
+template <typename Type>
+struct concat_gpu_4d_implicit_onednn : public concat_gpu {
+public:
+    cldnn::memory::ptr run_concat_network(std::vector<std::vector<std::vector<std::vector<std::vector<Type>>>>> input, format::type fmt, build_options options) {
+        auto data_type = type_to_data_type<Type>::value;
+        auto& engine = get_onednn_test_engine();
+        const size_t batch_num = testing::get<0>(GetParam());
+        const std::vector<size_t> in_features = testing::get<1>(GetParam());
+        const size_t input_y = testing::get<2>(GetParam());
+        const size_t input_x = testing::get<3>(GetParam());
+        size_t output_f = 0;
+        for (auto& f : in_features)
+            output_f += f;
+
+        topology topology;
+
+        std::vector<memory::ptr> in_memory;
+        std::vector<primitive_id> input_ids;
+        std::vector<primitive_id> pooling_ids;
+
+        for (size_t i = 0; i < in_features.size(); i++) {
+            auto size = tensor(static_cast<int32_t>(batch_num),
+                               static_cast<int32_t>(in_features[i]),
+                               static_cast<int32_t>(input_x),
+                               static_cast<int32_t>(input_y));
+            auto data = input[i];
+            auto in_lay = layout(data_type, fmt, size);
+            auto data_flat = std::vector<Type>(in_lay.get_linear_size(), 0);
+
+            for (size_t bi = 0; bi < batch_num; ++bi) {
+                for (size_t fi = 0; fi < in_features[i]; ++fi) {
+                    for (size_t yi = 0; yi < input_y; ++yi) {
+                        for (size_t xi = 0; xi < input_x; ++xi) {
+                            auto coords = tensor(batch(bi), feature(fi), spatial(xi, yi, 0, 0));
+                            auto in_offset = in_lay.get_linear_offset(coords);
+                            data_flat[in_offset] = data[bi][fi][yi][xi];
+                        }
+                    }
+                }
+            }
+
+            auto in_mem = engine.allocate_memory(in_lay);
+            set_values(in_mem, data_flat);
+            in_memory.push_back(in_mem);
+
+            topology.add(input_layout("input" + std::to_string(i), in_lay));
+            topology.add(pooling("pool" + std::to_string(i), "input" + std::to_string(i), pooling_mode::max, {1, 1, 1, 1}, {1, 1, 1, 1}));
+
+            input_ids.push_back("input" + std::to_string(i));
+            pooling_ids.push_back("pool" + std::to_string(i));
+        }
+
+        topology.add(concatenation("concat", pooling_ids, concatenation::concatenation_axis::along_f));
+
+        auto weights_lay = cldnn::layout(data_type, cldnn::format::bfyx, tensor(batch(output_f), feature(output_f)));
+        auto weights_mem = engine.allocate_memory(weights_lay);
+        weights_mem->fill(get_test_stream());
+        get_test_stream().finish();
+        {
+            cldnn::mem_lock<Type> weights_ptr(weights_mem, get_test_stream());
+            for (size_t fi = 0; fi < output_f; ++fi) {
+                auto coords = tensor(batch(fi), feature(fi), spatial(0, 0, 0, 0));
+                auto offset = weights_lay.get_linear_offset(coords);
+                weights_ptr[offset] = static_cast<Type>(1.f);
+            }
+        }
+        topology.add(data("weights", weights_mem));
+        topology.add(convolution("conv", "concat", { "weights" }));
+        topology.add(pooling("pool_final", "conv", pooling_mode::max, {1, 1, 1, 1}, {1, 1, 1, 1}));
+        topology.add(reorder("reorder", "pool_final", layout(data_type, format::bfyx, {(int32_t)batch_num, (int32_t)output_f, (int32_t)input_y, (int32_t)input_x})));
+
+        network concat_network(engine, topology, options);
+        for (size_t i = 0; i < in_features.size(); i++) {
+            concat_network.set_input_data(input_ids[i], in_memory[i]);
+        }
+        concat_network.execute();
+
+        for (auto i : concat_network.get_primitives_info()) {
+            // std::cout << " " << i.original_id << " " << i.kernel_id << std::endl;
+            if (i.original_id == "concat") {
+                if (options.get<build_option_type::optimize_data>()->enabled()) {
+                    EXPECT_TRUE(i.kernel_id == "undef");
+                } else {
+                    EXPECT_FALSE(i.kernel_id == "undef");
+                }
+            }
+        }
+
+        return concat_network.get_output("reorder").get_memory();
+    }
+
+    std::vector<std::vector<std::vector<std::vector<std::vector<Type>>>>> generate_input() {
+        const size_t batch_num = testing::get<0>(GetParam());
+        const std::vector<size_t> in_features = testing::get<1>(GetParam());
+        const size_t input_y = testing::get<2>(GetParam());
+        const size_t input_x = testing::get<3>(GetParam());
+
+        std::vector<std::vector<std::vector<std::vector<std::vector<Type>>>>> input(in_features.size());
+        for (size_t i = 0; i < in_features.size(); ++i) {
+            input[i] = generate_random_4d<Type>(batch_num, in_features[i], input_y, input_x, -1, 1);
+        }
+        return input;
+    }
+
+    void test(format::type fmt) {
+        auto input = generate_input();
+
+        // implicit concat
+        build_options options1;
+        options1.set_option(build_option::optimize_data(true));
+        implementation_desc impl = { fmt, std::string(""), impl_types::onednn };
+        options1.set_option(build_option::force_implementations({ {"conv", impl} }));
+        auto out_mem1 = run_concat_network(input, fmt, options1);
+        cldnn::mem_lock<Type> out_ptr1(out_mem1, get_test_stream());
+
+        // explicit concat
+        build_options options2;
+        options2.set_option(build_option::optimize_data(false));
+        auto out_mem2 = run_concat_network(input, fmt, options2);
+        cldnn::mem_lock<Type> out_ptr2(out_mem2, get_test_stream());
+
+        EXPECT_EQ(out_ptr1.size(), out_ptr2.size());
+        size_t diff_count = 0;
+        for (size_t i = 0; i < out_ptr1.size(); ++i) {
+            if (out_ptr1[i] != out_ptr2[i]) diff_count++;
+        }
+        EXPECT_EQ(diff_count, 0);
+    }
+};
+
+
+using concat_implicit_gpu_onednn_4d_f16 = concat_gpu_4d_implicit_onednn<FLOAT16>;
+using concat_implicit_gpu_onednn_4d_i8 = concat_gpu_4d_implicit_onednn<int8_t>;
+
+TEST_P(concat_implicit_gpu_onednn_4d_f16, input_order_opt_b_fs_yx_fsv16) {
+    ASSERT_NO_FATAL_FAILURE(test(format::b_fs_yx_fsv16));
+}
+
+INSTANTIATE_TEST_SUITE_P(smoke,
+                         concat_implicit_gpu_onednn_4d_f16,
+                         ::testing::Values(
+                            TestParamType_concat(1, { 16, 16 }, 2, 2),
+                            TestParamType_concat(1, { 16, 8 }, 2, 2),
+                            TestParamType_concat(1, { 8, 16 }, 2, 2)
+                         ),
+                         concat_gpu::PrintToStringParamName);
+
+TEST_P(concat_implicit_gpu_onednn_4d_i8, input_order_opt_b_fs_yx_fsv32) {
+    ASSERT_NO_FATAL_FAILURE(test(format::b_fs_yx_fsv32));
+}
+
+INSTANTIATE_TEST_SUITE_P(smoke,
+                         concat_implicit_gpu_onednn_4d_i8,
+                         ::testing::Values(
+                            TestParamType_concat(1, { 32, 32 }, 2, 2),
+                            TestParamType_concat(1, { 32, 8 }, 2, 2),
+                            TestParamType_concat(1, { 8, 32 }, 2, 2)
+                         ),
+                         concat_gpu::PrintToStringParamName);
 #endif
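The oneDNN variant repeats the same implicit-vs-explicit comparison but pins the convolution to the oneDNN implementation via force_implementations, so the implicitly concatenated blocked buffer is consumed by a oneDNN kernel. The buffer-level invariant both variants exercise: input i of the fused concat writes at a feature offset equal to the sum of the preceding feature counts. A small sketch of that offset computation (hypothetical helper, not clDNN code):

    #include <cstddef>
    #include <numeric>
    #include <vector>

    // Feature offset of input i inside the fused concat buffer: the sum of
    // the feature counts of all inputs before it. Every such offset is a
    // multiple of fsv precisely when each earlier input's feature count is,
    // which is why only the last input may have an unaligned count.
    std::size_t concat_feature_offset(const std::vector<std::size_t>& feature_counts,
                                      std::size_t i) {
        return std::accumulate(feature_counts.begin(),
                               feature_counts.begin() + i, std::size_t{0});
    }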