[GPU] Fix loop issues (#21091)

* [GPU] Fix loop issues
* If the loop does not run the inner body, return an output layout whose dynamic dimensions are replaced with zero

* Support dynamically shaped sliced input in the inner body in create_concat_memory_map

* Modify update_shape to call reset_shape_change() when the loop's inputs are not changed
* Add unit test

* Follow-up to code review feedback
Paul Youngsoo Ahn 2023-11-17 17:16:58 +09:00 committed by GitHub
parent 60730f61a4
commit 8b5b7a627b
3 changed files with 54 additions and 10 deletions


@@ -23,7 +23,6 @@ std::map<size_t, memory::ptr> loop_node::get_memory_deps() const {
    auto memory_deps = get_const_memory_deps();
    for (auto& i : get_shape_infer_dependencies()) {
        auto& dep = get_dependency(i);
-        auto dep_id = dep.id();
        if (memory_deps.count(i) > 0 || i >= get_dependencies().size()) {
            continue;
        }
@@ -91,6 +90,17 @@ static std::vector<layout> get_output_layouts(kernel_impl_params const& impl_par
            auto shape = loop_output_layout.get_partial_shape();
            shape[axis_to_iterate_through] = static_cast<int32_t>(num_iterations);
            loop_output_layout.set_partial_shape(shape);
+        } else {
+            // If num_iterations is zero, the loop does not run the inner body network.
+            // In that case, dynamic dimensions of a dynamic output layout are replaced with zero.
+            if (num_iterations == 0) {
+                auto shape = loop_output_layout.get_partial_shape();
+                for (size_t i = 0; i < shape.size(); i++) {
+                    if (shape[i].is_dynamic())
+                        shape[i] = 0;
+                }
+                loop_output_layout.set_partial_shape(shape);
+            }
        }
        output_layouts.push_back(loop_output_layout);
    }
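For readers outside the plugin, the zero-iteration handling added above can be sketched in isolation with the public ov::PartialShape API. This is a minimal sketch; the helper name is illustrative and not part of the plugin:

#include <cstdint>
#include "openvino/core/partial_shape.hpp"

// Minimal sketch: when the loop body never runs (num_iterations == 0), pin every
// dynamic dimension of the output shape to 0 so a valid, empty layout can be reported.
ov::PartialShape resolve_empty_loop_output_shape(ov::PartialShape shape, int64_t num_iterations) {
    if (num_iterations == 0) {
        for (size_t i = 0; i < shape.size(); i++) {
            if (shape[i].is_dynamic())
                shape[i] = 0;
        }
    }
    return shape;
}

For example, a shape of { 1, ov::Dimension::dynamic(), 4 } with num_iterations == 0 becomes { 1, 0, 4 }.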
@@ -348,6 +358,7 @@ event::ptr loop_inst::set_output_memory(memory::ptr mem, bool check, size_t idx)
loop_inst::concatenated_memory_mapping::ptr loop_inst::create_concat_memory_map(const cldnn::loop::io_primitive_map& io_prim_map,
                                                                                 memory::ptr mem_ptr,
                                                                                 const int64_t num_iterations) {
+    OPENVINO_ASSERT(io_prim_map.axis >= 0, "axis should not be negative");
    const auto& external_id = io_prim_map.external_id;
    const auto& internal_id = io_prim_map.internal_id;
    auto& engine = body_network->get_engine();
@@ -360,16 +371,37 @@ loop_inst::concatenated_memory_mapping::ptr loop_inst::create_concat_memory_map(
    // In a dynamic model, we can't calculate num_element_iteration, start, and sliced_layout here.
    // Those parameters will be recalculated in the backedge preprocessing map after the first execution.
    if (mem_ptr != nullptr) {
-        auto& out_mem = prim->output_memory(internal_id.idx);
-        layout sliced_layout = out_mem.get_layout();
+        layout sliced_layout = prim->get_output_layout(internal_id.idx);
+        auto out_mem_ptr = prim->output_memory_ptr(internal_id.idx);
+        if (out_mem_ptr != nullptr) {
+            sliced_layout = out_mem_ptr->get_layout();
+        } else {
+            // If the inner body prim has no output memory because its shape is dynamic,
+            // calculate the inner body prim's layout from concat_mem's layout.
+            auto updated_sliced_layout = sliced_layout.get_partial_shape();
+            OPENVINO_ASSERT(updated_sliced_layout[io_prim_map.axis].is_static() || num_iterations > 0,
+                            "Dynamic dimension is not allowed for axis when num_iterations is negative");
+            auto concat_mem_pshape = mem_ptr->get_layout().get_partial_shape();
+            const auto shape_size = concat_mem_pshape.size();
+            for (size_t i = 0; i < shape_size; i++) {
+                if (updated_sliced_layout[i].is_dynamic()) {
+                    updated_sliced_layout[i] = concat_mem_pshape[i];
+                }
+            }
+            GPU_DEBUG_LOG << "output pshape for [" << prim->id() << "] is changed from "
+                          << sliced_layout.get_partial_shape().to_string()
+                          << " to " << updated_sliced_layout.to_string() << std::endl;
+            sliced_layout.set_partial_shape(updated_sliced_layout);
+            out_mem_ptr = engine.allocate_memory(sliced_layout);
+        }

-        // When trip_count is -1, allocate first sliced_mem and allocate sliced memory if additional sliced mem is required
+        // When num_iterations is -1, allocate the first sliced_mem here and allocate additional sliced memory later if required
        if (num_iterations < 0) {
-            memory::ptr sliced_mem = engine.allocate_memory(sliced_layout);
-            sliced_mems.push_back(sliced_mem);
+            sliced_mems.push_back(out_mem_ptr);
        } else {
            sliced_mems.reserve(num_iterations);
-            for (int j=0; j < num_iterations; ++j) {
+            sliced_mems.push_back(out_mem_ptr);
+            for (int j=1; j < num_iterations; ++j) {
                memory::ptr sliced_mem = engine.allocate_memory(sliced_layout);
                sliced_mems.push_back(sliced_mem);
            }
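The core of the new path above is filling the dynamic dimensions of the sliced (per-iteration) shape from the concatenated tensor's shape before allocating memory; in the hunk this is guarded by an OPENVINO_ASSERT that the axis dimension is static or num_iterations is known. A minimal standalone sketch of just that shape-filling step (the function name is illustrative; the real code operates on cldnn::layout and engine memory):

#include "openvino/core/partial_shape.hpp"

// Minimal sketch: any dimension of the sliced shape that is still dynamic is taken
// from the corresponding dimension of the concatenated input's shape.
ov::PartialShape fill_sliced_shape_from_concat(ov::PartialShape sliced, const ov::PartialShape& concat) {
    for (size_t i = 0; i < concat.size(); i++) {
        if (sliced[i].is_dynamic())
            sliced[i] = concat[i];
    }
    return sliced;
}

With the layout resolved, the first slice reuses the freshly allocated out_mem_ptr and only the remaining num_iterations - 1 slices are allocated separately, which is what the loop starting at j=1 above reflects.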


@@ -17,6 +17,7 @@
#include "reshape_inst.h"
#include "reorder_inst.h"
#include "eltwise_inst.h"
+#include "loop_inst.h"
#include "deconvolution_inst.h"
#include "shape_of_inst.h"
#include "softmax_inst.h"
@@ -272,6 +273,13 @@ void primitive_inst::update_shape() {
        return;
    }

+    // If the input shapes are not changed, the loop doesn't need to update anything here,
+    // because the actual output layout will be calculated after the body network finishes execution.
+    if (_node->is_type<loop>() && !input_shape_changed) {
+        reset_shape_change();
+        return;
+    }
+
    // Do not update shapes in shape_of subgraph if shape_of's input shape is not changed
    if (_node->is_in_shape_of_subgraph()) {
        bool subgraph_input_changed = false;


@@ -432,11 +432,11 @@ TEST(loop_gpu, basic_concat_nested_cached) {
    test_loop_gpu_basic_concat_nested<float>(true);
}

-static void test_loop_gpu_wo_trip_count(bool is_caching_test) {
+static void test_loop_gpu_wo_trip_count(ov::PartialShape body_input_layout, bool is_caching_test = false) {
    auto& engine = get_test_engine();

    auto e_input_layout = cldnn::layout{ { 1, 1, 5, 4 }, data_types::f32, format::bfyx };
-    auto b_input_layout = cldnn::layout{ { 1, 1, 1, 4}, data_types::f32, format::bfyx };
+    auto b_input_layout = cldnn::layout{ body_input_layout, data_types::f32, format::bfyx };
    auto const_layout = cldnn::layout{ {}, data_types::i64, format::bfyx };

    auto e_input_mem = engine.allocate_memory(e_input_layout); // b,f,x,y
@@ -547,5 +547,9 @@ static void test_loop_gpu_wo_trip_count(bool is_caching_test) {
}

TEST(loop_gpu, support_dynamic_tensoriterator) {
-    test_loop_gpu_wo_trip_count(false);
+    test_loop_gpu_wo_trip_count({ 1, 1, 1, 4 });
}
+
+TEST(loop_gpu, support_loop_w_dynamic_body_input) {
+    test_loop_gpu_wo_trip_count({ 1, -1, 1, 4 });
+}
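Note on the new test variant: in an ov::PartialShape initializer, -1 denotes a dynamic dimension, so the second test builds a dynamic body input layout and exercises the new allocation path in create_concat_memory_map. A minimal sketch of that assumption:

#include "openvino/core/partial_shape.hpp"

int main() {
    ov::PartialShape static_shape{1, 1, 1, 4};   // as used by support_dynamic_tensoriterator
    ov::PartialShape dynamic_shape{1, -1, 1, 4}; // -1 -> dynamic dim, as used by support_loop_w_dynamic_body_input
    return (!static_shape.is_dynamic() && dynamic_shape.is_dynamic()) ? 0 : 1;
}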