[GPU] Fix bug in reorder_redundant_reorder (#17329)
* Fix bugs: 1) a reshape with fused primitives should not be optimized out, 2) wrong usage of sliced memory / concatenated memory in loop, 3) LWS not set in lstm_elt
* Added unit test
parent 7464b4d396
commit 9062b81edb
@@ -637,7 +637,7 @@ void remove_redundant_reorders::run(program& p) {
                               !reshape_input_node.has_fused_primitives();
         bool remove_current = remove_dep && !reshape_input_node.get_dependencies().empty() &&
                               reshape_input_node.get_dependency(0).get_output_layout() == reshape_node.get_output_layout() &&
-                              reshape_node.has_fused_primitives();
+                              !reshape_node.has_fused_primitives();

         if (remove_dep) {
             LOG_NODE_REMOVAL(reshape_input_node.id());
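Fix 1) in a nutshell: the removal condition tested has_fused_primitives() where it needed the negation, so a reshape that still carried fused post-ops could be dropped together with its pending work. A minimal self-contained sketch of the corrected guard (the Node type and its fields are illustrative stand-ins, not the cldnn API):

    #include <string>
    #include <vector>

    // Toy stand-in for a program node; not the cldnn type.
    struct Node {
        std::string layout;                 // output layout, compared by equality
        std::vector<std::string> fused_ops; // non-empty => node has fused primitives
        const Node* input = nullptr;
        bool has_fused_primitives() const { return !fused_ops.empty(); }
    };

    // A node may be dropped only if it is a true no-op: same layout as its
    // input AND nothing fused into it. The bug was a missing '!' on the
    // last test.
    bool can_remove(const Node& n) {
        return n.input != nullptr &&
               n.input->layout == n.layout &&
               !n.has_fused_primitives();
    }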
@@ -124,7 +124,7 @@ struct loop_impl : typed_primitive_impl<loop> {

             // Set sliced output memory
             for (const auto& concat_output_mem_mapping : concatenated_output_mem_mappings) {
-                concat_output_mem_mapping.setup_concatenated_output_memory(current_iteration_idx);
+                concat_output_mem_mapping.setup_sliced_output_memory(current_iteration_idx);
             }

             // execute body network
@@ -470,9 +470,9 @@ private:
         }
     }

-    void setup_concatenated_output_memory(uint64_t iteration) const {
+    void setup_sliced_output_memory(uint64_t iteration) const {
         const auto& sliced_output_mem = sliced_mems.at(iteration);
-        concat_data_prim->set_output_memory(sliced_output_mem);
+        sliced_data_prim->set_output_memory(sliced_output_mem);
     }

     memory::ptr get_sliced_mem(int64_t iteration) const {
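Fix 2) shows up in the two hunks above: the per-iteration setter was misnamed and bound the iteration's slice buffer to concat_data_prim (the outer network's owner of the whole concatenated output) instead of sliced_data_prim (the body network's per-iteration writer). A self-contained toy model of the corrected roles (types and names here are illustrative, not the cldnn API):

    #include <cstdint>
    #include <memory>
    #include <vector>

    struct Buffer {};  // stand-in for device memory

    struct Primitive {
        std::shared_ptr<Buffer> output;
        void set_output_memory(std::shared_ptr<Buffer> mem) { output = std::move(mem); }
    };

    // One mapping per loop output: the body network fills one slice per
    // iteration; the outer network sees the whole concatenated buffer.
    struct ConcatMapping {
        std::shared_ptr<Primitive> concat_data_prim;       // outer network
        std::shared_ptr<Primitive> sliced_data_prim;       // body network
        std::vector<std::shared_ptr<Buffer>> sliced_mems;  // one per iteration

        void setup_sliced_output_memory(uint64_t iteration) const {
            // The fix: an iteration's slice belongs to the body-network
            // primitive, not to the concat owner.
            sliced_data_prim->set_output_memory(sliced_mems.at(iteration));
        }
    };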
@@ -232,7 +232,7 @@ void loop_inst::update_mapped_memory() {
         body_network->get_primitive(internal_id)->set_output_memory(to_mem);
     } else {
         for (auto& mem_mapping : concatenated_output_mem_mappings) {
-            if (mem_mapping.concat_data_prim->id() == internal_id) {
+            if (mem_mapping.sliced_data_prim->id() == internal_id) {
                 mem_mapping.concatenated_mem = to_mem;
                 break;
             }
@@ -339,7 +339,7 @@ void loop_inst::preprocess_output_memory() {
         const int64_t max_iteration = _max_iteration;
         std::vector<memory::ptr> sliced_mems;
         sliced_mems.reserve(max_iteration);
-        for (int j=0; j < max_iteration; ++j) {
+        for (int32_t j = 0; j < max_iteration; ++j) {
             memory::ptr sliced_mem = engine.allocate_memory(sliced_layout, 0);
             sliced_mems.push_back(sliced_mem);
         }
@@ -351,7 +351,8 @@ void loop_inst::preprocess_output_memory() {
         concatenated_memory_mapping memory_mapping_info(
             output_mapping.axis, to_mem, sliced_mems, _network.get_stream(),
             num_elements_iteration, output_mapping.stride, start);
-        memory_mapping_info.concat_data_prim = body_network->get_primitive(internal_id);
+        memory_mapping_info.sliced_data_prim = body_network->get_primitive(internal_id);
+        memory_mapping_info.concat_data_prim = get_network().get_primitive(external_id);
         concatenated_output_mem_mappings.push_back(memory_mapping_info);
     }
 }
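The hunk above completes the role split: when the mapping is built, sliced_data_prim must come from the body network (resolved via internal_id) and concat_data_prim from the outer network (resolved via external_id); previously the body primitive landed in concat_data_prim and sliced_data_prim was never set. A standalone sketch of the corrected assignment (struct and function names are the sketch's own, not the cldnn API):

    #include <memory>
    #include <string>

    struct Primitive { std::string id; };

    struct Mapping {
        std::shared_ptr<Primitive> sliced_data_prim;  // writes one slice per iteration
        std::shared_ptr<Primitive> concat_data_prim;  // owns the concatenated output
    };

    Mapping make_mapping(std::shared_ptr<Primitive> body_prim,     // from internal_id
                         std::shared_ptr<Primitive> outer_prim) {  // from external_id
        Mapping m;
        m.sliced_data_prim = std::move(body_prim);   // was wrongly stored in concat_data_prim
        m.concat_data_prim = std::move(outer_prim);  // now taken from the outer network
        return m;
    }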
@@ -467,7 +468,7 @@ std::vector<memory::ptr> loop_inst::get_sliced_mem(const primitive_id& internal_
         }
     }
     for (const auto& mem_mapping : concatenated_output_mem_mappings) {
-        if (mem_mapping.concat_data_prim->id() == internal_id) {
+        if (mem_mapping.sliced_data_prim->id() == internal_id) {
             return mem_mapping.sliced_mems;
         }
     }
@@ -81,6 +81,7 @@ KernelsData LSTMEltKernelBase::GetCommonKernelsData(const Params& params, const
     auto jit = CreateJit(kernelName, cldnnJit, entryPoint);

     kernel.params.workGroups.global = {out.X().v, out.Batch().v, 1};
+    kernel.params.workGroups.local = GetOptimalLocalWorkGroupSizes(kernel.params.workGroups.global, params.engineInfo);
     kernel.code.kernelString = GetKernelString(kernelName, jit, entryPoint, params.engineInfo);
     kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 0});
     kernel.params.arguments.push_back({ArgumentDescriptor::Types::OUTPUT, 0});
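Fix 3): the LSTM elt kernel set only the global work size and left the local work size unset; the added line derives one from the global size. The general OpenCL constraint is that each global dimension must be divisible by the chosen local dimension and the product of local dimensions must not exceed the device's work-group limit. A toy stand-in for such a selection routine (not the actual GetOptimalLocalWorkGroupSizes implementation):

    #include <algorithm>
    #include <array>
    #include <cstddef>

    // Per dimension, pick the largest divisor of the global size that still
    // fits the remaining work-group budget, so global % local == 0 holds.
    std::array<size_t, 3> pick_local_ws(const std::array<size_t, 3>& gws,
                                        size_t max_wg_size = 256) {
        std::array<size_t, 3> lws{1, 1, 1};
        size_t budget = max_wg_size;
        for (size_t d = 0; d < 3; ++d) {
            for (size_t cand = std::min(gws[d], budget); cand >= 1; --cand) {
                if (gws[d] % cand == 0) {
                    lws[d] = cand;  // largest divisor within budget
                    break;
                }
            }
            budget /= lws[d];  // spend the budget consumed by this dimension
        }
        return lws;
    }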
@@ -16,7 +16,7 @@
 #include "fully_connected_inst.h"
 #include "convolution_inst.h"
 #include "permute_inst.h"
-
+#include "reshape_inst.h"
 #include "pass_manager.h"
 #include "to_string_utils.h"
@@ -137,3 +137,44 @@ TEST(remove_redundant_reorders, skip_reorder_fusing_when_sibling_not_support_pad

     ASSERT_EQ(prog->get_node("convolution").get_output_layout().data_padding, padding());
 }
+
+TEST(remove_redundant_reorders, not_to_fuse_reshape_with_fused_prims) {
+    auto& engine = get_test_engine();
+    auto data0_layout = engine.allocate_memory({ ov::PartialShape{1, 32, 2, 2}, data_types::f16, format::bfyx });
+    auto in_layout = layout{ ov::PartialShape{1, 32, 2, 2}, data_types::f16, format::bfyx };
+
+    topology topology;
+    topology.add(input_layout("input", in_layout));
+    topology.add(data("data0", data0_layout));
+    topology.add(eltwise("elt", input_info("input"), input_info("data0"), eltwise_mode::sum));
+    topology.add(reorder("reorder", input_info("elt"), { data_types::f16, format::bfzyx, {1, 1, 32, 2, 2}}));
+    topology.add(reshape("reshape1", input_info("reorder"), {1, 4, 16, 2}));
+    topology.add(reorder("reorder2", input_info("reshape1"), { data_types::f16, format::bfzyx, {1, 1, 32, 2, 2}}));
+    topology.add(reshape("reshape2", input_info("reorder2"), {1, 32, 2, 2, 1}));
+    topology.add(activation("activation", input_info("reshape2"), activation_func::relu));
+    topology.add(reorder("reorder4", input_info("activation"), { data_types::f32, format::bfyx, {1, 4, 32, 1}}));
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::optimize_data(true));
+    auto prog = program::build_program(engine, topology, config, false, true);
+
+    layout_optimizer lo(true);
+
+    program_wrapper::apply_opt_pass<prepare_primitive_fusing>(*prog, lo);
+    bool optimize_data = config.get_property(ov::intel_gpu::optimize_data);
+    program_wrapper::apply_opt_pass<remove_redundant_reorders>(*prog, lo, optimize_data);
+
+    ASSERT_NE(prog, nullptr);
+    ASSERT_TRUE(has_node_with_type<reshape>(*prog));
+    network network(engine, topology, config);
+
+    auto input = engine.allocate_memory(in_layout);
+    VVVVF<float> input_all_neg = generate_random_4d<float>(1, 32, 2, 2, -10.f, 0.f);
+    set_values(input, input_all_neg);
+    network.set_input_data("input", input);
+    auto outputs = network.execute();
+    auto output_prim = outputs.begin()->second.get_memory();
+    cldnn::mem_lock<float> output_ptr(output_prim, get_test_stream());
+    for (size_t i = 0; i < output_ptr.size(); ++i) {
+        ASSERT_GE(output_ptr[i], 0);
+    }
+}
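Reading the test: prepare_primitive_fusing fuses the ReLU activation into the preceding reshape, so remove_redundant_reorders must keep that reshape alive, which is what ASSERT_TRUE(has_node_with_type<reshape>(*prog)) checks. The functional check at the end appears to lean on the same fix: since the network output passes through the fused ReLU, every element must come out non-negative, which would not hold if the reshape carrying the fused activation had been optimized out.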