From 9062b81edb7f99994a9ad3ecb634f6ca84742393 Mon Sep 17 00:00:00 2001
From: Taylor Yeonbok Lee
Date: Wed, 3 May 2023 16:43:34 -0700
Subject: [PATCH] [GPU] Fix bug in reorder_redundant_reorder (#17329)

* Fixed bugs:
  1) A reshape with fused primitives should not be optimized out
  2) Wrong usage of sliced mem / concat mem in loop
  3) LWS not set in lstm_elt

* Added unit test
---
 .../remove_redundant_reorders.cpp             |  2 +-
 .../intel_gpu/src/graph/impls/common/loop.cpp |  2 +-
 .../intel_gpu/src/graph/include/loop_inst.h   |  4 +-
 src/plugins/intel_gpu/src/graph/loop.cpp      |  9 ++--
 .../kernels/lstm/lstm_elt_kernel_base.cpp     |  1 +
 .../remove_redundant_reorders_tests.cpp       | 43 ++++++++++++++++++-
 6 files changed, 52 insertions(+), 9 deletions(-)

diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp
index c118e1ca9d7..8d9c0a1a9a8 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp
@@ -637,7 +637,7 @@ void remove_redundant_reorders::run(program& p) {
                                   !reshape_input_node.has_fused_primitives();
         bool remove_current = remove_dep && !reshape_input_node.get_dependencies().empty() &&
                               reshape_input_node.get_dependency(0).get_output_layout() == reshape_node.get_output_layout() &&
-                              reshape_node.has_fused_primitives();
+                              !reshape_node.has_fused_primitives();
 
         if (remove_dep) {
             LOG_NODE_REMOVAL(reshape_input_node.id());
diff --git a/src/plugins/intel_gpu/src/graph/impls/common/loop.cpp b/src/plugins/intel_gpu/src/graph/impls/common/loop.cpp
index eda9ed6f775..71a5e163a51 100644
--- a/src/plugins/intel_gpu/src/graph/impls/common/loop.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/common/loop.cpp
@@ -124,7 +124,7 @@ struct loop_impl : typed_primitive_impl<loop> {
 
             // Set sliced output memory
             for (const auto& concat_output_mem_mapping : concatenated_output_mem_mappings) {
-                concat_output_mem_mapping.setup_concatenated_output_memory(current_iteration_idx);
+                concat_output_mem_mapping.setup_sliced_output_memory(current_iteration_idx);
             }
 
             // execute body network
diff --git a/src/plugins/intel_gpu/src/graph/include/loop_inst.h b/src/plugins/intel_gpu/src/graph/include/loop_inst.h
index 15f18f4b6c7..074941539eb 100644
--- a/src/plugins/intel_gpu/src/graph/include/loop_inst.h
+++ b/src/plugins/intel_gpu/src/graph/include/loop_inst.h
@@ -470,9 +470,9 @@ private:
         }
     }
 
-    void setup_concatenated_output_memory(uint64_t iteration) const {
+    void setup_sliced_output_memory(uint64_t iteration) const {
         const auto& sliced_output_mem = sliced_mems.at(iteration);
-        concat_data_prim->set_output_memory(sliced_output_mem);
+        sliced_data_prim->set_output_memory(sliced_output_mem);
     }
 
     memory::ptr get_sliced_mem(int64_t iteration) const {
diff --git a/src/plugins/intel_gpu/src/graph/loop.cpp b/src/plugins/intel_gpu/src/graph/loop.cpp
index 38ed946c836..304cfb6b789 100644
--- a/src/plugins/intel_gpu/src/graph/loop.cpp
+++ b/src/plugins/intel_gpu/src/graph/loop.cpp
@@ -232,7 +232,7 @@ void loop_inst::update_mapped_memory() {
             body_network->get_primitive(internal_id)->set_output_memory(to_mem);
         } else {
             for (auto& mem_mapping : concatenated_output_mem_mappings) {
-                if (mem_mapping.concat_data_prim->id() == internal_id) {
+                if (mem_mapping.sliced_data_prim->id() == internal_id) {
                     mem_mapping.concatenated_mem = to_mem;
                     break;
                 }
@@ -339,7 +339,7 @@ void loop_inst::preprocess_output_memory() {
         const int64_t max_iteration = _max_iteration;
        std::vector<memory::ptr> sliced_mems;
        sliced_mems.reserve(max_iteration);
-        for (int j=0; j < max_iteration; ++j) {
+        for (int32_t j = 0; j < max_iteration; ++j) {
            memory::ptr sliced_mem = engine.allocate_memory(sliced_layout, 0);
            sliced_mems.push_back(sliced_mem);
        }
@@ -351,7 +351,8 @@ void loop_inst::preprocess_output_memory() {
            concatenated_memory_mapping memory_mapping_info(
                output_mapping.axis, to_mem, sliced_mems, _network.get_stream(),
                num_elements_iteration, output_mapping.stride, start);
-            memory_mapping_info.concat_data_prim = body_network->get_primitive(internal_id);
+            memory_mapping_info.sliced_data_prim = body_network->get_primitive(internal_id);
+            memory_mapping_info.concat_data_prim = get_network().get_primitive(external_id);
            concatenated_output_mem_mappings.push_back(memory_mapping_info);
        }
    }
@@ -467,7 +468,7 @@ std::vector<memory::ptr> loop_inst::get_sliced_mem(const primitive_id& internal_
        }
    }
    for (const auto& mem_mapping : concatenated_output_mem_mappings) {
-        if (mem_mapping.concat_data_prim->id() == internal_id) {
+        if (mem_mapping.sliced_data_prim->id() == internal_id) {
            return mem_mapping.sliced_mems;
        }
    }
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/lstm/lstm_elt_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/lstm/lstm_elt_kernel_base.cpp
index 3413da99630..c94361a90b4 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/lstm/lstm_elt_kernel_base.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/lstm/lstm_elt_kernel_base.cpp
@@ -81,6 +81,7 @@ KernelsData LSTMEltKernelBase::GetCommonKernelsData(const Params& params, const
        auto jit = CreateJit(kernelName, cldnnJit, entryPoint);
 
        kernel.params.workGroups.global = {out.X().v, out.Batch().v, 1};
+        kernel.params.workGroups.local = GetOptimalLocalWorkGroupSizes(kernel.params.workGroups.global, params.engineInfo);
        kernel.code.kernelString = GetKernelString(kernelName, jit, entryPoint, params.engineInfo);
        kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 0});
        kernel.params.arguments.push_back({ArgumentDescriptor::Types::OUTPUT, 0});
diff --git a/src/plugins/intel_gpu/tests/unit/passes/remove_redundant_reorders_tests.cpp b/src/plugins/intel_gpu/tests/unit/passes/remove_redundant_reorders_tests.cpp
index 435db49fec2..ee4b1293d86 100644
--- a/src/plugins/intel_gpu/tests/unit/passes/remove_redundant_reorders_tests.cpp
+++ b/src/plugins/intel_gpu/tests/unit/passes/remove_redundant_reorders_tests.cpp
@@ -16,7 +16,7 @@
 #include "fully_connected_inst.h"
 #include "convolution_inst.h"
 #include "permute_inst.h"
-
+#include "reshape_inst.h"
 #include "pass_manager.h"
 #include "to_string_utils.h"
 
@@ -137,3 +137,44 @@ TEST(remove_redundant_reorders, skip_reorder_fusing_when_sibling_not_support_pad
 
     ASSERT_EQ(prog->get_node("convolution").get_output_layout().data_padding, padding());
 }
+
+TEST(remove_redundant_reorders, not_to_fuse_reshape_with_fused_prims) {
+    auto& engine = get_test_engine();
+    auto data0_layout = engine.allocate_memory({ ov::PartialShape{1, 32, 2, 2}, data_types::f16, format::bfyx });
+    auto in_layout = layout{ ov::PartialShape{1, 32, 2, 2}, data_types::f16, format::bfyx };
+
+    topology topology;
+    topology.add(input_layout("input", in_layout));
+    topology.add(data("data0", data0_layout));
+    topology.add(eltwise("elt", input_info("input"), input_info("data0"), eltwise_mode::sum));
+    topology.add(reorder("reorder", input_info("elt"), { data_types::f16, format::bfzyx, {1, 1, 32, 2, 2}}));
+    topology.add(reshape("reshape1", input_info("reorder"), {1, 4, 16, 2}));
topology.add(reorder("reorder2", input_info("reshape1"), { data_types::f16, format::bfzyx, {1, 1, 32, 2, 2}})); + topology.add(reshape("reshape2", input_info("reorder2"), {1, 32, 2, 2, 1})); + topology.add(activation("activation", input_info("reshape2"), activation_func::relu)); + topology.add(reorder("reorder4", input_info("activation"), { data_types::f32, format::bfyx, {1, 4, 32, 1}})); + ExecutionConfig config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::optimize_data(true)); + auto prog = program::build_program(engine, topology, config, false, true); + + layout_optimizer lo(true); + + program_wrapper::apply_opt_pass(*prog, lo); + bool optimize_data = config.get_property(ov::intel_gpu::optimize_data); + program_wrapper::apply_opt_pass(*prog, lo, optimize_data); + + ASSERT_NE(prog, nullptr); + ASSERT_TRUE(has_node_with_type(*prog)); + network network(engine, topology, config); + + auto input = engine.allocate_memory(in_layout); + VVVVF input_all_neg = generate_random_4d(1, 32, 2, 2, -10.f, 0.f); + set_values(input, input_all_neg); + network.set_input_data("input", input); + auto outputs = network.execute(); + auto output_prim = outputs.begin()->second.get_memory(); + cldnn::mem_lock output_ptr(output_prim, get_test_stream()); + for (size_t i = 0; i < output_ptr.size(); ++i) { + ASSERT_GE(output_ptr[i], 0); + } +}