[GPU] Fix bug in remove_redundant_reorders (#17329)

* Fix bugs:
1) A reshape with fused primitives should not be optimized out
2) Wrong usage of sliced memory vs. concatenated memory in the loop primitive
3) Local work-group size (LWS) was not set in the lstm_elt kernel

* Added a unit test
Author: Taylor Yeonbok Lee, 2023-05-03 16:43:34 -07:00 (committed by GitHub)
parent 7464b4d396
commit 9062b81edb
6 changed files with 52 additions and 9 deletions


@ -637,7 +637,7 @@ void remove_redundant_reorders::run(program& p) {
!reshape_input_node.has_fused_primitives();
bool remove_current = remove_dep && !reshape_input_node.get_dependencies().empty() &&
reshape_input_node.get_dependency(0).get_output_layout() == reshape_node.get_output_layout() &&
-reshape_node.has_fused_primitives();
+!reshape_node.has_fused_primitives();
if (remove_dep) {
LOG_NODE_REMOVAL(reshape_input_node.id());
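
As a side note on fix (1): read as a predicate, the corrected removal condition for the current reshape node looks roughly like the sketch below. This helper does not exist in the pass; it only restates the condition from the hunk above with the reasoning spelled out, and the parameter names are illustrative.

    // Sketch only: the current reshape can be removed when the producer + reshape
    // pair is an overall no-op -- the layout feeding reshape_input_node already
    // equals the reshape's output layout -- and the reshape carries no fused
    // primitives, since removing the node would otherwise drop those fused ops.
    static bool can_remove_current(const program_node& reshape_input_node,
                                   const reshape_node& cur_reshape,
                                   bool remove_dep) {
        return remove_dep &&
               !reshape_input_node.get_dependencies().empty() &&
               reshape_input_node.get_dependency(0).get_output_layout() == cur_reshape.get_output_layout() &&
               !cur_reshape.has_fused_primitives();
    }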


@ -124,7 +124,7 @@ struct loop_impl : typed_primitive_impl<loop> {
// Set sliced output memory
for (const auto& concat_output_mem_mapping : concatenated_output_mem_mappings) {
-concat_output_mem_mapping.setup_concatenated_output_memory(current_iteration_idx);
+concat_output_mem_mapping.setup_sliced_output_memory(current_iteration_idx);
}
// execute body network


@ -470,9 +470,9 @@ private:
}
}
-void setup_concatenated_output_memory(uint64_t iteration) const {
+void setup_sliced_output_memory(uint64_t iteration) const {
const auto& sliced_output_mem = sliced_mems.at(iteration);
-concat_data_prim->set_output_memory(sliced_output_mem);
+sliced_data_prim->set_output_memory(sliced_output_mem);
}
memory::ptr get_sliced_mem(int64_t iteration) const {


@ -232,7 +232,7 @@ void loop_inst::update_mapped_memory() {
body_network->get_primitive(internal_id)->set_output_memory(to_mem);
} else {
for (auto& mem_mapping : concatenated_output_mem_mappings) {
-if (mem_mapping.concat_data_prim->id() == internal_id) {
+if (mem_mapping.sliced_data_prim->id() == internal_id) {
mem_mapping.concatenated_mem = to_mem;
break;
}
@ -339,7 +339,7 @@ void loop_inst::preprocess_output_memory() {
const int64_t max_iteration = _max_iteration;
std::vector<memory::ptr> sliced_mems;
sliced_mems.reserve(max_iteration);
-for (int j=0; j < max_iteration; ++j) {
+for (int32_t j = 0; j < max_iteration; ++j) {
memory::ptr sliced_mem = engine.allocate_memory(sliced_layout, 0);
sliced_mems.push_back(sliced_mem);
}
@ -351,7 +351,8 @@ void loop_inst::preprocess_output_memory() {
concatenated_memory_mapping memory_mapping_info(
output_mapping.axis, to_mem, sliced_mems, _network.get_stream(),
num_elements_iteration, output_mapping.stride, start);
-memory_mapping_info.concat_data_prim = body_network->get_primitive(internal_id);
+memory_mapping_info.sliced_data_prim = body_network->get_primitive(internal_id);
+memory_mapping_info.concat_data_prim = get_network().get_primitive(external_id);
concatenated_output_mem_mappings.push_back(memory_mapping_info);
}
}
@ -467,7 +468,7 @@ std::vector<memory::ptr> loop_inst::get_sliced_mem(const primitive_id& internal_
}
}
for (const auto& mem_mapping : concatenated_output_mem_mappings) {
-if (mem_mapping.concat_data_prim->id() == internal_id) {
+if (mem_mapping.sliced_data_prim->id() == internal_id) {
return mem_mapping.sliced_mems;
}
}
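
Taken together, the loop_inst changes keep the two primitive handles in concatenated_memory_mapping pointed at different networks. A rough sketch of the intended roles, using only the fields and calls visible in the hunks above (illustrative, not the real class definition):

    // Roles after the fix:
    //   sliced_data_prim : primitive of the body network (internal_id); its output
    //                      is rebound to sliced_mems[iteration] on every iteration.
    //   concat_data_prim : primitive of the outer network (external_id) that owns
    //                      the final concatenated output buffer.
    struct concatenated_memory_mapping_sketch {
        std::shared_ptr<primitive_inst> concat_data_prim;   // external_id primitive
        std::shared_ptr<primitive_inst> sliced_data_prim;   // internal_id primitive
        memory::ptr concatenated_mem;                        // whole output tensor
        std::vector<memory::ptr> sliced_mems;                // one buffer per iteration

        // Mirrors setup_sliced_output_memory above: only the body-network primitive
        // is redirected to the per-iteration slice; the outer concat primitive is not.
        void setup_sliced_output_memory(uint64_t iteration) const {
            sliced_data_prim->set_output_memory(sliced_mems.at(iteration));
        }
    };

Lookups by internal_id (update_mapped_memory, get_sliced_mem) accordingly compare against sliced_data_prim->id(), which is exactly what the two if-condition changes above switch to.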


@ -81,6 +81,7 @@ KernelsData LSTMEltKernelBase::GetCommonKernelsData(const Params& params, const
auto jit = CreateJit(kernelName, cldnnJit, entryPoint);
kernel.params.workGroups.global = {out.X().v, out.Batch().v, 1};
+kernel.params.workGroups.local = GetOptimalLocalWorkGroupSizes(kernel.params.workGroups.global, params.engineInfo);
kernel.code.kernelString = GetKernelString(kernelName, jit, entryPoint, params.engineInfo);
kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 0});
kernel.params.arguments.push_back({ArgumentDescriptor::Types::OUTPUT, 0});
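
For context on fix (3): the hunk adds the previously missing assignment of workGroups.local, using the existing GetOptimalLocalWorkGroupSizes helper on the global sizes computed just above. As a rough illustration of the general idea only (this is not the actual helper), a local work-group size is typically chosen from divisors of the global size so the work-group fits the device limit:

    #include <algorithm>
    #include <array>
    #include <cstddef>

    // Illustrative sketch: per dimension, take the largest divisor of the global
    // size that still fits in the remaining work-group budget, so gws % lws == 0
    // in every dimension and lws[0] * lws[1] * lws[2] <= max_wg_size.
    std::array<size_t, 3> pick_local_ws(const std::array<size_t, 3>& gws, size_t max_wg_size) {
        std::array<size_t, 3> lws{1, 1, 1};
        size_t budget = std::max<size_t>(max_wg_size, 1);
        for (size_t d = 0; d < 3; ++d) {
            size_t best = 1;
            for (size_t cand = 1; cand <= std::min(gws[d], budget); ++cand) {
                if (gws[d] % cand == 0)
                    best = cand;
            }
            lws[d] = best;
            budget /= best;
        }
        return lws;
    }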


@ -16,7 +16,7 @@
#include "fully_connected_inst.h"
#include "convolution_inst.h"
#include "permute_inst.h"
#include "reshape_inst.h"
#include "pass_manager.h"
#include "to_string_utils.h"
@ -137,3 +137,44 @@ TEST(remove_redundant_reorders, skip_reorder_fusing_when_sibling_not_support_pad
ASSERT_EQ(prog->get_node("convolution").get_output_layout().data_padding, padding());
}

TEST(remove_redundant_reorders, not_to_fuse_reshape_with_fused_prims) {
    auto& engine = get_test_engine();
    auto data0_layout = engine.allocate_memory({ ov::PartialShape{1, 32, 2, 2}, data_types::f16, format::bfyx });
    auto in_layout = layout{ ov::PartialShape{1, 32, 2, 2}, data_types::f16, format::bfyx };

    topology topology;
    topology.add(input_layout("input", in_layout));
    topology.add(data("data0", data0_layout));
    topology.add(eltwise("elt", input_info("input"), input_info("data0"), eltwise_mode::sum));
    topology.add(reorder("reorder", input_info("elt"), { data_types::f16, format::bfzyx, {1, 1, 32, 2, 2}}));
    topology.add(reshape("reshape1", input_info("reorder"), {1, 4, 16, 2}));
    topology.add(reorder("reorder2", input_info("reshape1"), { data_types::f16, format::bfzyx, {1, 1, 32, 2, 2}}));
    topology.add(reshape("reshape2", input_info("reorder2"), {1, 32, 2, 2, 1}));
    topology.add(activation("activation", input_info("reshape2"), activation_func::relu));
    topology.add(reorder("reorder4", input_info("activation"), { data_types::f32, format::bfyx, {1, 4, 32, 1}}));

    ExecutionConfig config = get_test_default_config(engine);
    config.set_property(ov::intel_gpu::optimize_data(true));
    auto prog = program::build_program(engine, topology, config, false, true);

    layout_optimizer lo(true);
    program_wrapper::apply_opt_pass<prepare_primitive_fusing>(*prog, lo);
    bool optimize_data = config.get_property(ov::intel_gpu::optimize_data);
    program_wrapper::apply_opt_pass<remove_redundant_reorders>(*prog, lo, optimize_data);

    ASSERT_NE(prog, nullptr);
    ASSERT_TRUE(has_node_with_type<reshape>(*prog));

    network network(engine, topology, config);
    auto input = engine.allocate_memory(in_layout);
    VVVVF<float> input_all_neg = generate_random_4d<float>(1, 32, 2, 2, -10.f, 0.f);
    set_values(input, input_all_neg);
    network.set_input_data("input", input);
    auto outputs = network.execute();
    auto output_prim = outputs.begin()->second.get_memory();
    cldnn::mem_lock<float> output_ptr(output_prim, get_test_stream());
    for (size_t i = 0; i < output_ptr.size(); ++i) {
        ASSERT_GE(output_ptr[i], 0);
    }
}
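
The new case can be run in isolation with the usual gtest filter, e.g. --gtest_filter=remove_redundant_reorders.not_to_fuse_reshape_with_fused_prims, passed to the GPU plugin's unit-test binary (the exact binary name depends on how the tests are built).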