[GPU] Fix loop issues (#21091)

* [GPU] Fix loop issues
* If the loop does not run the inner body, return an output layout whose dynamic dimensions are replaced with zero

* Support dynamically shaped sliced input in the inner body in create_concat_memory_map

* Modify update_shape to call reset_shape_change() when the loop's inputs are not changed
* Add unit test

* Follow-up to code review feedback
Paul Youngsoo Ahn 2023-11-17 17:16:58 +09:00 committed by GitHub
parent 60730f61a4
commit 8b5b7a627b
3 changed files with 54 additions and 10 deletions


@@ -23,7 +23,6 @@ std::map<size_t, memory::ptr> loop_node::get_memory_deps() const {
    auto memory_deps = get_const_memory_deps();
    for (auto& i : get_shape_infer_dependencies()) {
        auto& dep = get_dependency(i);
-        auto dep_id = dep.id();
        if (memory_deps.count(i) > 0 || i >= get_dependencies().size()) {
            continue;
        }
@@ -91,6 +90,17 @@ static std::vector<layout> get_output_layouts(kernel_impl_params const& impl_par
            auto shape = loop_output_layout.get_partial_shape();
            shape[axis_to_iterate_through] = static_cast<int32_t>(num_iterations);
            loop_output_layout.set_partial_shape(shape);
+        } else {
+            // If num_iterations is zero, the loop does not run the inner body network.
+            // In that case, dynamic dimensions of a dynamic output layout are replaced with zero.
+            if (num_iterations == 0) {
+                auto shape = loop_output_layout.get_partial_shape();
+                for (size_t i = 0; i < shape.size(); i++) {
+                    if (shape[i].is_dynamic())
+                        shape[i] = 0;
+                }
+                loop_output_layout.set_partial_shape(shape);
+            }
        }
        output_layouts.push_back(loop_output_layout);
    }
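For readers outside the plugin, the zero-iteration handling added above can be sketched in isolation with the public ov::PartialShape API. This is a minimal sketch; the helper name is illustrative and not part of the plugin:

#include <cstdint>
#include "openvino/core/partial_shape.hpp"

// Minimal sketch: when the loop body never runs (num_iterations == 0), pin every
// dynamic dimension of the output shape to 0 so a valid, empty layout can be reported.
ov::PartialShape resolve_empty_loop_output_shape(ov::PartialShape shape, int64_t num_iterations) {
    if (num_iterations == 0) {
        for (size_t i = 0; i < shape.size(); i++) {
            if (shape[i].is_dynamic())
                shape[i] = 0;
        }
    }
    return shape;
}

For example, a shape of { 1, ov::Dimension::dynamic(), 4 } with num_iterations == 0 becomes { 1, 0, 4 }.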
@@ -348,6 +358,7 @@ event::ptr loop_inst::set_output_memory(memory::ptr mem, bool check, size_t idx)
loop_inst::concatenated_memory_mapping::ptr loop_inst::create_concat_memory_map(const cldnn::loop::io_primitive_map& io_prim_map,
                                                                                 memory::ptr mem_ptr,
                                                                                 const int64_t num_iterations) {
+    OPENVINO_ASSERT(io_prim_map.axis >= 0, "axis should not be negative");
    const auto& external_id = io_prim_map.external_id;
    const auto& internal_id = io_prim_map.internal_id;
    auto& engine = body_network->get_engine();
@@ -360,16 +371,37 @@ loop_inst::concatenated_memory_mapping::ptr loop_inst::create_concat_memory_map(
    // In a dynamic model, we can't calculate num_element_iteration, start, and sliced_layout here.
    // Those parameters will be recalculated in the backedge preprocessing map after the first execution.
    if (mem_ptr != nullptr) {
-        auto& out_mem = prim->output_memory(internal_id.idx);
-        layout sliced_layout = out_mem.get_layout();
+        layout sliced_layout = prim->get_output_layout(internal_id.idx);
+        auto out_mem_ptr = prim->output_memory_ptr(internal_id.idx);
+        if (out_mem_ptr != nullptr) {
+            sliced_layout = out_mem_ptr->get_layout();
+        } else {
+            // If the inner body prim has no output memory because its shape is dynamic,
+            // calculate the inner body prim's layout from concat_mem's layout.
+            auto updated_sliced_layout = sliced_layout.get_partial_shape();
+            OPENVINO_ASSERT(updated_sliced_layout[io_prim_map.axis].is_static() || num_iterations > 0,
+                            "Dynamic dimension is not allowed for axis when num_iterations is negative");
+            auto concat_mem_pshape = mem_ptr->get_layout().get_partial_shape();
+            const auto shape_size = concat_mem_pshape.size();
+            for (size_t i = 0; i < shape_size; i++) {
+                if (updated_sliced_layout[i].is_dynamic()) {
+                    updated_sliced_layout[i] = concat_mem_pshape[i];
+                }
+            }
+            GPU_DEBUG_LOG << "output pshape for [" << prim->id() << "] is changed from "
+                          << sliced_layout.get_partial_shape().to_string()
+                          << " to " << updated_sliced_layout.to_string() << std::endl;
+            sliced_layout.set_partial_shape(updated_sliced_layout);
+            out_mem_ptr = engine.allocate_memory(sliced_layout);
+        }

-        // When trip_count is -1, allocate first sliced_mem and allocate sliced memory if additional sliced mem is required
+        // When num_iterations is -1, allocate the first sliced_mem here and allocate additional sliced memory later if required
        if (num_iterations < 0) {
-            memory::ptr sliced_mem = engine.allocate_memory(sliced_layout);
-            sliced_mems.push_back(sliced_mem);
+            sliced_mems.push_back(out_mem_ptr);
        } else {
            sliced_mems.reserve(num_iterations);
-            for (int j=0; j < num_iterations; ++j) {
+            sliced_mems.push_back(out_mem_ptr);
+            for (int j=1; j < num_iterations; ++j) {
                memory::ptr sliced_mem = engine.allocate_memory(sliced_layout);
                sliced_mems.push_back(sliced_mem);
            }
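The core of the new path above is filling the dynamic dimensions of the sliced (per-iteration) shape from the concatenated tensor's shape before allocating memory; in the hunk this is guarded by an OPENVINO_ASSERT that the axis dimension is static or num_iterations is known. A minimal standalone sketch of just that shape-filling step (the function name is illustrative; the real code operates on cldnn::layout and engine memory):

#include "openvino/core/partial_shape.hpp"

// Minimal sketch: any dimension of the sliced shape that is still dynamic is taken
// from the corresponding dimension of the concatenated input's shape.
ov::PartialShape fill_sliced_shape_from_concat(ov::PartialShape sliced, const ov::PartialShape& concat) {
    for (size_t i = 0; i < concat.size(); i++) {
        if (sliced[i].is_dynamic())
            sliced[i] = concat[i];
    }
    return sliced;
}

With the layout resolved, the first slice reuses the freshly allocated out_mem_ptr and only the remaining num_iterations - 1 slices are allocated separately, which is what the loop starting at j=1 above reflects.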


@@ -17,6 +17,7 @@
#include "reshape_inst.h"
#include "reorder_inst.h"
#include "eltwise_inst.h"
+#include "loop_inst.h"
#include "deconvolution_inst.h"
#include "shape_of_inst.h"
#include "softmax_inst.h"
@@ -272,6 +273,13 @@ void primitive_inst::update_shape() {
        return;
    }

+    // If the input shapes are not changed, the loop doesn't need to update anything here,
+    // because the actual output layout will be calculated after the body network finishes execution.
+    if (_node->is_type<loop>() && !input_shape_changed) {
+        reset_shape_change();
+        return;
+    }
+
    // Do not update shapes in shape_of subgraph if shape_of's input shape is not changed
    if (_node->is_in_shape_of_subgraph()) {
        bool subgraph_input_changed = false;


@@ -432,11 +432,11 @@ TEST(loop_gpu, basic_concat_nested_cached) {
    test_loop_gpu_basic_concat_nested<float>(true);
}

-static void test_loop_gpu_wo_trip_count(bool is_caching_test) {
+static void test_loop_gpu_wo_trip_count(ov::PartialShape body_input_layout, bool is_caching_test = false) {
    auto& engine = get_test_engine();

    auto e_input_layout = cldnn::layout{ { 1, 1, 5, 4 }, data_types::f32, format::bfyx };
-    auto b_input_layout = cldnn::layout{ { 1, 1, 1, 4}, data_types::f32, format::bfyx };
+    auto b_input_layout = cldnn::layout{ body_input_layout, data_types::f32, format::bfyx };
    auto const_layout = cldnn::layout{ {}, data_types::i64, format::bfyx };

    auto e_input_mem = engine.allocate_memory(e_input_layout); // b,f,x,y
@@ -547,5 +547,9 @@ static void test_loop_gpu_wo_trip_count(bool is_caching_test) {
}

TEST(loop_gpu, support_dynamic_tensoriterator) {
-    test_loop_gpu_wo_trip_count(false);
+    test_loop_gpu_wo_trip_count({ 1, 1, 1, 4 });
}
+
+TEST(loop_gpu, support_loop_w_dynamic_body_input) {
+    test_loop_gpu_wo_trip_count({ 1, -1, 1, 4 });
+}
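Note on the new test variant: in an ov::PartialShape initializer, -1 denotes a dynamic dimension, so the second test builds a dynamic body input layout and exercises the new allocation path in create_concat_memory_map. A minimal sketch of that assumption:

#include "openvino/core/partial_shape.hpp"

int main() {
    ov::PartialShape static_shape{1, 1, 1, 4};   // as used by support_dynamic_tensoriterator
    ov::PartialShape dynamic_shape{1, -1, 1, 4}; // -1 -> dynamic dim, as used by support_loop_w_dynamic_body_input
    return (!static_shape.is_dynamic() && dynamic_shape.is_dynamic()) ? 0 : 1;
}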