[GPU] Fix scatter update axis name (#21398)

* fix scatter update axis name
* Remove _exec_deps control for backedge_from because this is not required after PR-21333
  * Previously, we ordered execution based on the dependency required for input-output buffer sharing from the loop body
  * With PR-21333, we no longer share a buffer between input and output when the loop input has multiple outputs.
  * Therefore, there is no need to adjust exec order.
This commit is contained in:
Wilson Seok 2023-12-04 13:49:58 +09:00 committed by GitHub
parent 1e2b7c66f7
commit e3988cd6a8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 88 additions and 31 deletions

View File

@ -326,8 +326,6 @@ public:
// num_iteration is used for slicing input memory
int64_t get_num_iterations();
void update_backedge_exec_deps(const cldnn::program_node& node, const cldnn::primitive_id& backedge_from_prim_id);
std::vector<event::ptr> preprocess_memory_for_body_network(int64_t current_iteration_idx);
std::vector<event::ptr> postprocess_memory_for_body_network(int64_t current_iteration_idx);

View File

@ -591,35 +591,6 @@ void loop_inst::preprocess_backedge_memory() {
GPU_DEBUG_LOG << idx << ") add back_edge mapping with SINGLE_SHARED type, backedge_mem("
<< backedge_mem << "), initial_mem(" << initial_mem << ")" << std::endl;
}
if (backedge_to_prim->_node != nullptr) {
update_backedge_exec_deps(backedge_to_prim->get_node(), backedge_from_prim->id());
}
}
}
// Register the (non-optimized) users of `node` as execution dependencies of the backedge-source
// primitive, so that the backedge-source runs only after every consumer of the shared input has
// executed.
//
// Rationale: input and result share memory because they are backedge_to/backedge_from.
// If op2 executes earlier than op1, input is overwritten with op2's result and op1 reads a
// wrong input value unless proper _exec_deps are in place.
//      input(backedge_to) ------> op1 ----->
//                |
//                L-----> op2 -----> result (backedge_from)
//
// @param node                   body-network node whose users must precede the backedge source
// @param backedge_from_prim_id  id of the backedge-source primitive in the body network
void loop_inst::update_backedge_exec_deps(const cldnn::program_node& node, const cldnn::primitive_id& backedge_from_prim_id) {
    for (auto& user : node.get_users()) {
        if (user->can_be_optimized()) {
            // Optimized-out users are transparent at runtime: recurse until a real
            // (non-optimized) user is found.
            update_backedge_exec_deps(*user, backedge_from_prim_id);
        } else {
            auto user_primitive_id = user->get_primitive()->id;
            auto user_primitive = body_network->get_primitive(user_primitive_id);
            const auto backedge_from_prim = body_network->get_primitive(backedge_from_prim_id);
            // BUGFIX: the guard was inverted ('!= end()'), which appended a dependency only
            // when it was already registered (duplicating it) and never added a missing one.
            // Add the dependency only when it is not present yet.
            if (std::find(backedge_from_prim->_exec_deps.begin(), backedge_from_prim->_exec_deps.end(), user_primitive)
                    == backedge_from_prim->_exec_deps.end()) {
                backedge_from_prim->_exec_dep_ids.push_back(user_primitive_id);
                backedge_from_prim->_exec_deps.push_back(user_primitive);
            }
        }
    }
}

View File

@ -492,6 +492,8 @@ enum class ScatterUpdateAxis {
Y,
Z,
W,
U,
V,
FEATURE,
BATCH,
};

View File

@ -1872,3 +1872,89 @@ TEST(scatter_update_gpu_fp32, output_padding) {
}
}
}
// Checks a batch-axis scatter_update with a single index/update element, run over every
// 2D blocked format in formats2D, with a user-provided (pre-filled) output buffer.
// NOTE(review): the name suggests this targets the first-iteration kernel path — confirm
// against the kernel selector; the visible code only exercises one execute() call.
//
// Dictionary : 8x1x1x1
// Indexes    : 1x1x1x1
// Updates    : 1x1x1x1
// Axis       : 0 (batch)
// Output     : 8x1x1x1
// Input values in fp32
// Indexes:
// 4.f
//
// Updates:
// 9.f
//
// Dictionary:
// 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f
//
// Output (element at batch index 4 replaced by the update):
// 1.f, 2.f, 3.f, 4.f, 9.f, 6.f, 7.f, 8.f
TEST(scatter_update_gpu_fp32, d8111_axisB_first_iteration_kernel_check) {
auto& engine = get_test_engine();
for(const auto target_format : formats2D) {
auto input1 = engine.allocate_memory({data_types::f32, plain_2d_format, tensor{8, 1, 1, 1}}); // Dictionary
auto input2 = engine.allocate_memory({data_types::f32, plain_2d_format, tensor{1, 1, 1, 1}}); // Indexes
auto input3 = engine.allocate_memory({data_types::f32, plain_2d_format, tensor{1, 1, 1, 1}}); // Updates
auto axis = 0;
set_values(input1, {
1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f
});
set_values(input2, {
4.f
});
set_values(input3, {
9.0f
});
topology topology;
topology.add(input_layout("InputDictionary", input1->get_layout()));
topology.add(input_layout("InputText", input2->get_layout()));
topology.add(input_layout("InputUpdates", input3->get_layout()));
// Reorder plain inputs into the blocked format under test so scatter_update runs on it.
topology.add(reorder("DictionaryReordered", input_info("InputDictionary"), target_format, data_types::f32));
topology.add(reorder("TextReordered", input_info("InputText"), target_format, data_types::f32));
topology.add(reorder("UpdatesReordered", input_info("InputUpdates"), target_format, data_types::f32));
topology.add(
scatter_update("scatter_update", input_info("DictionaryReordered"), input_info("TextReordered"), input_info("UpdatesReordered"), axis)
);
// Reorder back to the plain format for result comparison.
topology.add(reorder("out", input_info("scatter_update"), plain_2d_format, data_types::f32));
network network(engine, topology, get_test_default_config(engine));
network.set_input_data("InputDictionary", input1);
network.set_input_data("InputText", input2);
network.set_input_data("InputUpdates", input3);
// allocate new output memory, pre-filled with sentinel values so stale data is detectable
layout out_l = network.get_output_memory("out")->get_layout();
//auto output_mem = engine.allocate_memory({data_types::f32, plain_2d_format, tensor{8, 1, 1, 1}});
auto output_mem = engine.allocate_memory(out_l);
set_values(output_mem, {
-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f
});
network.set_output_memory("out", output_mem);
auto outputs = network.execute();
auto output = outputs.at("out").get_memory();
// The network must write into the externally supplied buffer, not an internal one.
ASSERT_TRUE(engine.is_the_same_buffer(*output_mem, *output));
cldnn::mem_lock<float> output_ptr(output, get_test_stream());
std::vector<float> expected_results = {
1.0f, 2.0f, 3.0f, 4.0f, 9.0f, 6.0f, 7.0f, 8.0f
};
for (size_t i = 0; i < expected_results.size(); ++i) {
ASSERT_EQ(expected_results[i], output_ptr[i])
<< "i=" << i << ", target_format=" << target_format;
}
}
}