[GPU] Fix scatter update axis name (#21398)

* fix scatter update axis name
* Remove _exec_deps control for backedge_from because this is not required after PR-21333
  * Previously, we ordered execution based on the dependency required for input-output buffer sharing from the loop body
  * With PR-21333, we no longer share a buffer between input and output when the loop input has multiple outputs.
  * Therefore, there is no need to adjust exec order.
This commit is contained in:
Wilson Seok 2023-12-04 13:49:58 +09:00 committed by GitHub
parent 1e2b7c66f7
commit e3988cd6a8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 88 additions and 31 deletions

View File

@ -326,8 +326,6 @@ public:
// num_iteration is used for slicing input memory
int64_t get_num_iterations();
void update_backedge_exec_deps(const cldnn::program_node& node, const cldnn::primitive_id& backedge_from_prim_id);
std::vector<event::ptr> preprocess_memory_for_body_network(int64_t current_iteration_idx);
std::vector<event::ptr> postprocess_memory_for_body_network(int64_t current_iteration_idx);

View File

@ -591,35 +591,6 @@ void loop_inst::preprocess_backedge_memory() {
GPU_DEBUG_LOG << idx << ") add back_edge mapping with SINGLE_SHARED type, backedge_mem("
<< backedge_mem << "), initial_mem(" << initial_mem << ")" << std::endl;
}
if (backedge_to_prim->_node != nullptr) {
update_backedge_exec_deps(backedge_to_prim->get_node(), backedge_from_prim->id());
}
}
}
// Register the (non-optimized) users of `node` as execution dependencies of the backedge-source
// primitive, so that the backedge-source runs only after every consumer of the shared input has
// executed.
//
// Rationale: input and result share memory because they are backedge_to/backedge_from.
// If op2 executes earlier than op1, input is overwritten with op2's result and op1 reads a
// wrong input value unless proper _exec_deps are in place.
//      input(backedge_to) ------> op1 ----->
//                |
//                L-----> op2 -----> result (backedge_from)
//
// @param node                   body-network node whose users must precede the backedge source
// @param backedge_from_prim_id  id of the backedge-source primitive in the body network
void loop_inst::update_backedge_exec_deps(const cldnn::program_node& node, const cldnn::primitive_id& backedge_from_prim_id) {
    for (auto& user : node.get_users()) {
        if (user->can_be_optimized()) {
            // Optimized-out users are transparent at runtime: recurse until a real
            // (non-optimized) user is found.
            update_backedge_exec_deps(*user, backedge_from_prim_id);
        } else {
            auto user_primitive_id = user->get_primitive()->id;
            auto user_primitive = body_network->get_primitive(user_primitive_id);
            const auto backedge_from_prim = body_network->get_primitive(backedge_from_prim_id);
            // BUGFIX: the guard was inverted ('!= end()'), which appended a dependency only
            // when it was already registered (duplicating it) and never added a missing one.
            // Add the dependency only when it is not present yet.
            if (std::find(backedge_from_prim->_exec_deps.begin(), backedge_from_prim->_exec_deps.end(), user_primitive)
                    == backedge_from_prim->_exec_deps.end()) {
                backedge_from_prim->_exec_dep_ids.push_back(user_primitive_id);
                backedge_from_prim->_exec_deps.push_back(user_primitive);
            }
        }
    }
}

View File

@ -492,6 +492,8 @@ enum class ScatterUpdateAxis {
Y,
Z,
W,
U,
V,
FEATURE,
BATCH,
};

View File

@ -1872,3 +1872,89 @@ TEST(scatter_update_gpu_fp32, output_padding) {
}
}
}
// Checks a batch-axis scatter_update with a single index/update element, run over every
// 2D blocked format in formats2D, with a user-provided (pre-filled) output buffer.
// NOTE(review): the name suggests this targets the first-iteration kernel path — confirm
// against the kernel selector; the visible code only exercises one execute() call.
//
// Dictionary : 8x1x1x1
// Indexes    : 1x1x1x1
// Updates    : 1x1x1x1
// Axis       : 0 (batch)
// Output     : 8x1x1x1
// Input values in fp32
// Indexes:
// 4.f
//
// Updates:
// 9.f
//
// Dictionary:
// 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f
//
// Output (element at batch index 4 replaced by the update):
// 1.f, 2.f, 3.f, 4.f, 9.f, 6.f, 7.f, 8.f
TEST(scatter_update_gpu_fp32, d8111_axisB_first_iteration_kernel_check) {
auto& engine = get_test_engine();
for(const auto target_format : formats2D) {
auto input1 = engine.allocate_memory({data_types::f32, plain_2d_format, tensor{8, 1, 1, 1}}); // Dictionary
auto input2 = engine.allocate_memory({data_types::f32, plain_2d_format, tensor{1, 1, 1, 1}}); // Indexes
auto input3 = engine.allocate_memory({data_types::f32, plain_2d_format, tensor{1, 1, 1, 1}}); // Updates
auto axis = 0;
set_values(input1, {
1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f
});
set_values(input2, {
4.f
});
set_values(input3, {
9.0f
});
topology topology;
topology.add(input_layout("InputDictionary", input1->get_layout()));
topology.add(input_layout("InputText", input2->get_layout()));
topology.add(input_layout("InputUpdates", input3->get_layout()));
// Reorder plain inputs into the blocked format under test so scatter_update runs on it.
topology.add(reorder("DictionaryReordered", input_info("InputDictionary"), target_format, data_types::f32));
topology.add(reorder("TextReordered", input_info("InputText"), target_format, data_types::f32));
topology.add(reorder("UpdatesReordered", input_info("InputUpdates"), target_format, data_types::f32));
topology.add(
scatter_update("scatter_update", input_info("DictionaryReordered"), input_info("TextReordered"), input_info("UpdatesReordered"), axis)
);
// Reorder back to the plain format for result comparison.
topology.add(reorder("out", input_info("scatter_update"), plain_2d_format, data_types::f32));
network network(engine, topology, get_test_default_config(engine));
network.set_input_data("InputDictionary", input1);
network.set_input_data("InputText", input2);
network.set_input_data("InputUpdates", input3);
// allocate new output memory, pre-filled with sentinel values so stale data is detectable
layout out_l = network.get_output_memory("out")->get_layout();
//auto output_mem = engine.allocate_memory({data_types::f32, plain_2d_format, tensor{8, 1, 1, 1}});
auto output_mem = engine.allocate_memory(out_l);
set_values(output_mem, {
-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f
});
network.set_output_memory("out", output_mem);
auto outputs = network.execute();
auto output = outputs.at("out").get_memory();
// The network must write into the externally supplied buffer, not an internal one.
ASSERT_TRUE(engine.is_the_same_buffer(*output_mem, *output));
cldnn::mem_lock<float> output_ptr(output, get_test_stream());
std::vector<float> expected_results = {
1.0f, 2.0f, 3.0f, 4.0f, 9.0f, 6.0f, 7.0f, 8.0f
};
for (size_t i = 0; i < expected_results.size(); ++i) {
ASSERT_EQ(expected_results[i], output_ptr[i])
<< "i=" << i << ", target_format=" << target_format;
}
}
}