From be786ee7afdda05742eab3e7108a42f9c7695020 Mon Sep 17 00:00:00 2001 From: Sergey Shlyapnikov Date: Thu, 18 Nov 2021 16:48:30 +0300 Subject: [PATCH] [GPU] Do not add paddings for OneDNN primitives (#8619) * Fix review comments for Convolutions * [GPU] Do not add paddings for OneDNN primitives * Add unit test for correct padding between onednn and cldnn primitives --- .../src/graph_optimizer/prepare_padding.cpp | 44 ++++++----- .../thirdparty/clDNN/src/layout_optimizer.cpp | 2 +- .../thirdparty/clDNN/src/program.cpp | 16 ++-- .../tests/test_cases/convolution_gpu_test.cpp | 77 +++++++++++++++++++ 4 files changed, 110 insertions(+), 29 deletions(-) diff --git a/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_padding.cpp b/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_padding.cpp index ea373488b70..8c7b7bf2c3e 100644 --- a/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_padding.cpp +++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_padding.cpp @@ -17,10 +17,30 @@ void prepare_padding::run(program& p) { if (output_size_handling_enabled) { // Prepare upper padding for primitives that support output_size parameter. for (const auto& node : p.get_processing_order()) { + if (node->get_dependencies().empty()) + continue; + + if (node->get_dependency(0).is_type()) + continue; + // Padded offsets aren't supported by onednn kernels if (node->get_preferred_impl_type() == impl_types::onednn) continue; + auto add_required_padding = [&p](program_node& node, padding& needed_padding) { + // Add extra reorder for cldnn primitive to handle required padding if needed + auto& input = node.get_dependency(0); + if (input.get_preferred_impl_type() == impl_types::onednn && + node.get_preferred_impl_type() == impl_types::ocl && + static_cast(needed_padding)) { + auto new_reorder = std::make_shared(node.id() + "_padding_reorder_for_" + input.id(), input.id(), input.get_output_layout()); + auto& new_reorder_node = p.get_or_create(new_reorder); + p.add_intermediate(new_reorder_node, node, input); + } + + p.apply_needed_padding(node, node.get_dependency(0), needed_padding); + }; + if (node->is_type()) { auto& prim_node = node->as(); const auto& prim = prim_node.get_primitive(); @@ -36,10 +56,6 @@ void prepare_padding::run(program& p) { format == format::b_fs_zyx_fsv32) continue; - if (prim_node.input().is_type()) { - continue; - } - auto filter_size = prim_node.weights(0).get_output_layout().size; auto needed_padding = calc_sliding_window_needed_input_padding(prim_node.input().get_output_layout(), @@ -51,7 +67,7 @@ void prepare_padding::run(program& p) { false, 1); - p.apply_needed_padding(prim_node, prim_node.input(), needed_padding); + add_required_padding(prim_node, needed_padding); } else if (node->is_type()) { auto& prim_node = node->as(); const auto& prim = prim_node.get_primitive(); @@ -59,10 +75,6 @@ void prepare_padding::run(program& p) { if (!prim->with_output_size) continue; - if (prim_node.input().is_type()) { - continue; - } - auto filter_size = prim_node.weights(0).get_output_layout().size; auto needed_padding = calc_sliding_window_needed_input_padding(prim_node.input().get_output_layout(), @@ -74,7 +86,7 @@ void prepare_padding::run(program& p) { true, 1); - p.apply_needed_padding(prim_node, prim_node.input(), needed_padding); + add_required_padding(prim_node, needed_padding); } else if (node->is_type()) { auto& prim_node = node->as(); const auto& prim = prim_node.get_primitive(); @@ -82,10 +94,6 @@ void prepare_padding::run(program& p) { if (!prim->with_output_size) continue; - if (prim_node.input().is_type()) { - continue; - } - padding needed_padding; // WA for this format. sliding window needs to be fixed --perf degradation for IncepctionV1 type models if (node->get_output_layout().format == format::b_fs_yx_fsv16) @@ -100,17 +108,13 @@ void prepare_padding::run(program& p) { else needed_padding = prim_node.input().get_output_layout().data_padding; - p.apply_needed_padding(prim_node, prim_node.input(), needed_padding); + add_required_padding(prim_node, needed_padding); } else if (node->is_type()) { auto& prim_node = node->as(); - if (prim_node.input().is_type()) { - continue; - } - auto needed_padding = prim_node.input().get_output_layout().data_padding; - p.apply_needed_padding(prim_node, prim_node.input(), needed_padding); + add_required_padding(prim_node, needed_padding); } } } diff --git a/inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp b/inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp index c2b8aed63f4..e94a43591be 100644 --- a/inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp +++ b/inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp @@ -828,7 +828,6 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout, int ofm_per_group = output_layout.size.feature[0] / prim->groups; int ifm_per_group = input_layout.size.feature[0] / prim->groups; int compute_block = 32; - bool valid_grouped = !is_dw && prim->groups > 1 && (ofm_per_group % compute_block == 0 && ifm_per_group % compute_block == 0); bool valid_int8_dw = is_dw && output_layout.size.batch[0] % 16 == 0; bool non_grouped = prim->groups == 1; bool is_2d = input_layout.format.spatial_num() == 2; @@ -854,6 +853,7 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout, if (use_onednn_impls) { /* ***************************** OneDNN impls format selection part ****************************** */ + bool valid_grouped = !is_dw && prim->groups > 1 && (ofm_per_group % compute_block == 0 && ifm_per_group % compute_block == 0); if (i8_u8_input) { if ((non_grouped || valid_grouped || valid_int8_dw) && onednn_valid_post_ops && is_2d) { if (input_layout.size.batch[0] % 16 == 0) { diff --git a/inference-engine/thirdparty/clDNN/src/program.cpp b/inference-engine/thirdparty/clDNN/src/program.cpp index 74f07313392..51a95234764 100644 --- a/inference-engine/thirdparty/clDNN/src/program.cpp +++ b/inference-engine/thirdparty/clDNN/src/program.cpp @@ -774,18 +774,18 @@ void program::add_intermediate(program_node& node, } void program::add_intermediate(std::shared_ptr prim, - program_node& next, - size_t prev_idx, - bool connect_int_node_with_old_dep, - bool move_usrs_of_prev_to_node) { + program_node& next, + size_t prev_idx, + bool connect_int_node_with_old_dep, + bool move_usrs_of_prev_to_node) { add_intermediate(get_or_create(prim), next, prev_idx, connect_int_node_with_old_dep, move_usrs_of_prev_to_node); } void program::add_intermediate(program_node& node, - program_node& next, - program_node& prev, - bool connect_int_node_with_old_dep, - bool move_usrs_of_prev_to_node) { + program_node& next, + program_node& prev, + bool connect_int_node_with_old_dep, + bool move_usrs_of_prev_to_node) { bool node_found = false; size_t idx = 0; for (size_t i = 0; i < next.get_dependencies().size(); i++) { diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp index 9723b067564..ae4202cba16 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp @@ -9066,4 +9066,81 @@ TEST_P(convolution_gpu_onednn, conv_onednn_cases) { } } +TEST(convolution_gpu_onednn, padding_for_cldnn_kernel_after_onednn) { + auto& engine = get_onednn_test_engine(); + + int input_b = 1, input_f = 16, input_y = 3, input_x = 3; + int output_b = 1, output_f = 16, output_y = 6, output_x = 6; + + auto input_size = tensor(input_b, input_f, input_x, input_y); + auto input_data = generate_random_4d(input_b, input_f, input_y, input_x, -1, 1); + auto input_data_bfyx = flatten_4d(format::bfyx, input_data); + auto input_mem = engine.allocate_memory({ data_types::f16, format::bfyx, input_size }); + set_values(input_mem, input_data_bfyx); + + auto weights_size = tensor(16, 16, 1, 1, 1); + auto weights_data = generate_random_4d(output_f, input_f, 1, 1, -1, 1); + auto weights_data_bfyx = flatten_4d(format::bfyx, weights_data); + auto weights_mem = engine.allocate_memory({data_types::f16, format::bfyx, weights_size}); + set_values(weights_mem, weights_data_bfyx); + + auto input = input_layout("input", input_mem->get_layout()); + auto weights = data("weights", weights_mem); + auto input_reorder = reorder("input_fsv", "input", {data_types::f16, format::b_fs_yx_fsv16, input_size}); + auto conv1 = convolution("conv1", "input_fsv", { "weights" }); + auto conv2 = convolution("conv2", "conv1", { "weights" }, {1, 1, 1, 1}, {0, 0, -1, -1}, {1, 1, 1, 1}, {output_b, output_f, output_x, output_x}); + auto output_reorder = reorder("reorder", "conv2", {data_types::f32, format::bfyx, {output_b, output_f, output_x, output_x}}); + + topology topology_test(input, weights, input_reorder, conv1, conv2, output_reorder); + topology topology_ref(input, weights, input_reorder, conv1, conv2, output_reorder); + + build_options options_test; + implementation_desc conv1_impl_test = { format::b_fs_yx_fsv16, "", impl_types::onednn }; + implementation_desc conv2_impl_test = { format::b_fs_yx_fsv16, "convolution_gpu_bfyx_f16", impl_types::ocl }; + options_test.set_option(build_option::force_implementations({ {"conv1", conv1_impl_test}, {"conv2", conv2_impl_test} })); + options_test.set_option(build_option::optimize_data(true)); + + build_options options_ref; + implementation_desc conv1_impl_ref = { format::bfyx, "", impl_types::ocl }; + implementation_desc conv2_impl_ref = { format::bfyx, "", impl_types::ocl }; + options_ref.set_option(build_option::force_implementations({ {"conv1", conv1_impl_ref}, {"conv2", conv2_impl_ref} })); + options_ref.set_option(build_option::optimize_data(true)); + + network network_test(engine, topology_test, options_test); + network network_ref(engine, topology_ref, options_ref); + + network_test.set_input_data("input", input_mem); + network_ref.set_input_data("input", input_mem); + + auto outputs_test = network_test.execute(); + auto outputs_ref = network_ref.execute(); + + EXPECT_EQ(outputs_test.size(), size_t(1)); + EXPECT_EQ(outputs_test.begin()->first, "reorder"); + EXPECT_EQ(outputs_ref.size(), size_t(1)); + EXPECT_EQ(outputs_ref.begin()->first, "reorder"); + + auto output_memory_test = outputs_test.at("reorder").get_memory(); + auto output_layout_test = output_memory_test->get_layout(); + cldnn::mem_lock output_ptr_test(output_memory_test, get_test_stream()); + + auto output_memory_ref = outputs_ref.at("reorder").get_memory(); + auto output_layout_ref = output_memory_ref->get_layout(); + cldnn::mem_lock output_ptr_ref(output_memory_ref, get_test_stream()); + + EXPECT_EQ(output_layout_test.size.spatial[0], output_x); + EXPECT_EQ(output_layout_test.size.spatial[1], output_y); + EXPECT_EQ(output_layout_test.size.feature[0], output_f); + EXPECT_EQ(output_layout_test.size.batch[0], output_b); + + EXPECT_EQ(output_layout_ref.size.spatial[0], output_x); + EXPECT_EQ(output_layout_ref.size.spatial[1], output_y); + EXPECT_EQ(output_layout_ref.size.feature[0], output_f); + EXPECT_EQ(output_layout_ref.size.batch[0], output_b); + + for (size_t i = 0; i < output_memory_ref->count(); i++) { + ASSERT_EQ(output_ptr_ref.data()[i], output_ptr_test.data()[i]); + } +} + #endif // ENABLE_ONEDNN_FOR_GPU