[GPU] Fix acc issue for LSTMSequence w/ -1 seq_length (#21054)

* [GPU] Fix acc issue for LSTMSequence w/ -1 seq_length
* add output port for multiple outputs of node
* add functional test for lstm_sequence

* Fix CI test failures
Paul Youngsoo Ahn 2023-11-15 03:47:03 +09:00 committed by GitHub
parent a720b43041
commit da2a886477
15 changed files with 151 additions and 73 deletions
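
The common thread of the patch: a dependency edge inside the GPU program now records which output port of the producer it consumes, so multi-output nodes such as LSTMSequence (outputs Y, Ho, Co) are no longer collapsed onto port 0 during graph transformations. Below is a minimal stand-in sketch of that data model, not the plugin's real classes (those are cldnn::program_node and cldnn::input_info, and the real per-port check is is_dynamic_output_layout(port)):

    #include <cstdint>
    #include <string>
    #include <utility>
    #include <vector>

    // Stand-in for a program node: a dependency edge is a (producer, output port)
    // pair rather than just a producer pointer.
    struct node {
        std::string id;
        std::vector<std::pair<const node*, int32_t>> dependencies;
        std::vector<bool> output_is_dynamic;   // one flag per output port of this node

        // Mirrors the program_node::is_dynamic() change in this commit: only the
        // port that is actually consumed decides whether an input is dynamic.
        bool has_dynamic_input() const {
            for (const auto& dep : dependencies)
                if (dep.first->output_is_dynamic[dep.second])   // assumes one flag per port
                    return true;
            return false;
        }
    };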


@ -273,10 +273,6 @@ struct loop : public primitive_base<loop> {
protected:
std::vector<std::reference_wrapper<const primitive_id>> get_dependencies() const override {
std::vector<std::reference_wrapper<const primitive_id>> ret;
ret.push_back(std::ref(num_iteration_id));
if (!trip_count_id.empty()) ret.push_back(std::ref(trip_count_id));
if (!first_execution_condition_id.empty()) ret.push_back(std::ref(first_execution_condition_id));
// add external_id in dependencies if not exist
for (const auto& mapping : input_primitive_maps) {
auto target = std::find_if(input.begin(), input.end(),


@ -83,7 +83,9 @@ void handle_reshape::run(program& p) {
for (const auto& node : p.get_processing_order()) {
if (node->is_type<reshape>()) {
auto& input_node = node->get_dependency(0);
const auto& dep = node->get_dependency_with_port(0);
auto& input_node = *dep.first;
auto& input_port = dep.second;
if (input_node.is_type<reorder>())
continue;
@ -162,8 +164,10 @@ void handle_reshape::run(program& p) {
if (std::find(reorder_node_to_split.begin(), reorder_node_to_split.end(), user) !=
reorder_node_to_split.end()) {
auto new_reshape = std::make_shared<reshape>("reorder:_reshape_split_" + user->id() + "_" + node->id(),
input_node.id(),
cldnn::input_info(input_node.id(), input_port),
output_shape);
GPU_DEBUG_LOG << "reshape_handler: " << new_reshape->id
<< " input_info : " << new_reshape->dependencies().front().to_string() << std::endl;
new_reshape->special_zero = prim->special_zero;
new_reshape->output_partial_shape = prim->output_partial_shape;
new_reshape->output_pattern = prim->output_pattern;
@ -192,9 +196,12 @@ void handle_reshape::run(program& p) {
auto format = cldnn::format::get_default_format(dims);
auto reshape_input = std::make_shared<reorder>(
"reorder:_reshape_input_" + reorder_node->id() + "_" + reorder_reshape_node->id(),
input_node.id(),
cldnn::input_info(input_node.id(), input_port),
format,
reshape_in_layout.data_type);
GPU_DEBUG_LOG << "reshape_handler: " << reshape_input->id
<< " input_info : " << reshape_input->dependencies().front().to_string() << std::endl;
auto& reshape_input_node = p.get_or_create(reshape_input);
p.add_intermediate(reshape_input_node,
*reorder_reshape_node,
@ -214,9 +221,11 @@ void handle_reshape::run(program& p) {
// in reshape stage we assume user provides the input vector in bfyx
if (!reshape_layout.compatible(target_layout)) {
auto reshape_input = std::make_shared<reorder>("reorder:_reshape_input_" + node->id(),
input_node.id(),
cldnn::input_info(input_node.id(), input_port),
target_format,
reshape_layout.data_type);
GPU_DEBUG_LOG << "reshape_handler: " << reshape_input->id
<< " input_info : " << reshape_input->dependencies().front().to_string() << std::endl;
auto& reshape_input_node = p.get_or_create(reshape_input);
p.add_intermediate(reshape_input_node, *node, 0, reshape_input_node.get_dependencies().empty());
reshape_input_node.recalc_output_layout();
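
The handle_reshape pass above now fetches the dependency together with its port (get_dependency_with_port(0)) and threads both into the replacement primitives via cldnn::input_info(id, port). A hedged sketch of that pattern with hypothetical simplified types; the real input_info likewise pairs a primitive id with an output port index:

    #include <cstdint>
    #include <memory>
    #include <string>
    #include <utility>

    struct input_info {                 // stand-in for cldnn::input_info
        std::string pid;                // producer primitive id
        int32_t idx = 0;                // producer output port
    };

    struct reshape_desc {               // stand-in for the reshape primitive descriptor
        std::string id;
        input_info input;
    };

    // What the pass now does: take the (producer id, port) pair reported by
    // get_dependency_with_port(0) and forward the port to the new primitive,
    // instead of rebuilding the edge against port 0.
    std::shared_ptr<reshape_desc> rebuild_reshape(const std::string& new_id,
                                                  const std::pair<std::string, int32_t>& dep_with_port) {
        return std::make_shared<reshape_desc>(
            reshape_desc{new_id, input_info{dep_with_port.first, dep_with_port.second}});
    }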


@ -315,7 +315,8 @@ void prepare_primitive_fusing::fuse_bias(program &p) {
p.replace(prev_node, new_node);
// Insert bias_node into the 3rd position in the dependencies vector to get correct order in case of asymmetric quantization
// which means that node can have > 2 dependencies even without bias
new_node.dependencies.insert(new_node.dependencies.begin() + 2, {&bias_node, 0});
auto port_idx = new_node.get_port_from_deps(bias_node.id());
new_node.dependencies.insert(new_node.dependencies.begin() + 2, {&bias_node, port_idx});
bias_node.users.push_back(&new_node);
// Remove all edges connected with peer node


@ -350,8 +350,11 @@ void remove_redundant_reorders::run(program& p) {
!user->has_fused_primitives()) {
auto l1 = node->get_output_layout();
auto l2 = user->get_output_layout();
// For nodes with multiple outputs, removing a redundant reorder is only allowed when both use the same output port idx
auto l1_port_idx = node->get_dependency_with_port(0).second;
auto l2_port_idx = user->get_dependency_with_port(0).second;
if (l1.identical(l2))
if (l1.identical(l2) && (l1_port_idx == l2_port_idx))
r_nodes_to_remove.push_back(user);
}
}


@ -118,18 +118,35 @@ struct loop_impl : typed_primitive_impl<loop> {
auto body_network = instance.get_body_network();
int64_t current_iteration_idx = 0;
auto ev = stream.create_user_event(false);
const auto is_dynamic = instance.is_dynamic();
if (is_dynamic) {
instance.update_shape();
if (instance.shape_changed()) {
instance.preproc_memories_done = false;
instance.reset_memory();
}
}
body_network->set_shape_predictor(outer_network.get_shape_predictor());
OPENVINO_ASSERT(!primitive->num_iteration_id.empty(), "loop operation should have num_iteration_id");
// shortcut of execution_condition memory in body network
memory::ptr body_execution_condition_mem = nullptr;
if (!primitive->body_execution_condition_id.empty()) {
body_execution_condition_mem = body_network->get_primitive(primitive->body_execution_condition_id)->output_memory_ptr();
}
// shortcut of current_iteration memory in body network
if (!primitive->body_current_iteration_id.empty()) {
memory::ptr body_current_iteration_mem = body_network->get_primitive(primitive->body_current_iteration_id)->output_memory_ptr();
write_scalar_value(body_current_iteration_mem, body_network->get_stream(), 0);
}
auto num_iterations = instance.get_num_iterations();
GPU_DEBUG_LOG << "num_iterations : " << num_iterations << std::endl;
//////////////////////////////////////////
// memory pointers for outer network
//////////////////////////////////////////
// read trip_count from outer network
int64_t trip_count = -1;
if (!primitive->trip_count_id.empty()) {
@ -166,30 +183,6 @@ struct loop_impl : typed_primitive_impl<loop> {
return ev;
}
//////////////////////////////////////////
// memory pointers for body network
//////////////////////////////////////////
// shortcut of execution_condition memory in body network
memory::ptr body_execution_condition_mem = nullptr;
if (!primitive->body_execution_condition_id.empty()) {
body_execution_condition_mem = body_network->get_primitive(primitive->body_execution_condition_id)->output_memory_ptr();
}
// shortcut of current_iteration memory in body network
if (!primitive->body_current_iteration_id.empty()) {
memory::ptr body_current_iteration_mem = body_network->get_primitive(primitive->body_current_iteration_id)->output_memory_ptr();
write_scalar_value(body_current_iteration_mem, body_network->get_stream(), 0);
}
const auto is_dynamic = instance.is_dynamic();
if (is_dynamic) {
instance.update_shape();
if (instance.shape_changed()) {
instance.preproc_memories_done = false;
instance.reset_memory();
}
}
if (!instance.preproc_memories_done) {
instance.preprocess_output_memory(num_iterations);
instance.preprocess_input_memory(num_iterations);
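
The accuracy fix itself is largely an ordering change in the loop implementation above: for dynamic loops (sequence length -1) the shape update and memory reset now run before get_num_iterations() and before the body-network memory shortcuts are taken, so the iteration count is derived from the freshly inferred shapes rather than from the previous inference request. A self-contained sketch of the resulting order, using a hypothetical stand-in type rather than the real loop_inst:

    #include <cstdint>

    // Minimal stand-in capturing only the members used by the ordering below.
    struct loop_instance {
        bool preproc_memories_done = false;
        bool dynamic = false;
        bool shape_dirty = false;
        int64_t iterations_from_shape = 0;

        bool is_dynamic() const { return dynamic; }
        void update_shape() { /* shape inference for the current request */ }
        bool shape_changed() const { return shape_dirty; }
        void reset_memory() { /* drop memory mappings built for the old shapes */ }
        int64_t get_num_iterations() const { return iterations_from_shape; }
        void preprocess_output_memory(int64_t) {}
        void preprocess_input_memory(int64_t) {}
    };

    // The order that matters: shape update / reset first, iteration count second.
    inline void prepare_dynamic_loop(loop_instance& instance) {
        if (instance.is_dynamic()) {
            instance.update_shape();
            if (instance.shape_changed()) {
                instance.preproc_memories_done = false;
                instance.reset_memory();
            }
        }
        const int64_t num_iterations = instance.get_num_iterations();  // sees the fresh shapes
        if (!instance.preproc_memories_done) {
            instance.preprocess_output_memory(num_iterations);
            instance.preprocess_input_memory(num_iterations);
        }
    }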


@ -446,6 +446,18 @@ public:
void set_preferred_input_fmt(size_t idx, format::type type);
void set_preferred_output_fmt(size_t idx, format::type type);
int32_t get_port_from_deps(primitive_id target_id) const {
auto deps = get_primitive()->dependencies();
auto iter = std::find_if(deps.begin(), deps.end(), [&](input_info& info) {
return target_id == info.pid;
});
if (iter != deps.end()) {
return iter->idx;
} else {
return 0;
}
}
protected:
size_t unique_id = 0;
static thread_local size_t cur_id;
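
get_port_from_deps (above) is what the other graph passes now call whenever they rebuild an edge by hand: fuse_bias, fuse_nodes, add_connection and the LSTM timeloop rewiring all insert {&dep, port_idx} instead of the previous hard-coded {&dep, 0}. A self-contained sketch of the same lookup over a plain vector of (pid, port) pairs, mirroring the helper's logic:

    #include <algorithm>
    #include <cstdint>
    #include <string>
    #include <vector>

    struct input_info {     // stand-in: producer primitive id + its output port
        std::string pid;
        int32_t idx = 0;
    };

    // Same behaviour as program_node::get_port_from_deps: search the primitive's
    // declared inputs for `target_id` and return the attached port, or 0.
    int32_t get_port_from_deps(const std::vector<input_info>& declared_inputs,
                               const std::string& target_id) {
        auto iter = std::find_if(declared_inputs.begin(), declared_inputs.end(),
                                 [&](const input_info& info) { return info.pid == target_id; });
        return iter != declared_inputs.end() ? iter->idx : 0;
    }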


@ -655,17 +655,26 @@ void loop_inst::postprocess_output_memory(bool is_dynamic, int64_t current_itera
OPENVINO_ASSERT(internal_mem != nullptr, "internal_mem should not be nullptr");
if (!output_allocated) {
external_outputs[external_id.idx] = internal_mem;
GPU_DEBUG_LOG << "[Internal: " << internal_id.to_string() << ", External: " << external_id.to_string() << " ] "
<< "Set internal memory(" << internal_mem << ") to external output because external output memory is nullptr." << std::endl;
} else {
auto external_mem = _outputs[external_id.idx];
if (external_mem != internal_mem) {
if (external_mem->get_layout() != internal_mem->get_layout()) {
external_outputs[external_id.idx] = internal_mem;
GPU_DEBUG_LOG << "[Internal: " << internal_id.to_string() << ", External: " << external_id.to_string() << " ] "
<< "Set internal memory(" << internal_mem
<< ") to external output for different layout between external_mem and internal_mem." << std::endl;
} else {
external_mem->copy_from(get_network().get_stream(), *internal_mem);
external_outputs[external_id.idx] = external_mem;
GPU_DEBUG_LOG << "[Internal: " << internal_id.to_string() << ", External: " << external_id.to_string() << " ] "
<< "Copy internal memory data to external memory data." << std::endl;
}
} else {
external_outputs[external_id.idx] = external_mem;
GPU_DEBUG_LOG << "[Internal: " << internal_id.to_string() << ", External: " << external_id.to_string() << " ] "
<< " Have same memory pointer." << std::endl;
}
}
} else {
@ -680,9 +689,17 @@ void loop_inst::postprocess_output_memory(bool is_dynamic, int64_t current_itera
});
if (iter != concatenated_output_mem_mappings.end()) {
(*iter)->update_concatenated_mem(concat_mem);
GPU_DEBUG_LOG << "[Internal: " << internal_id.to_string() << ", External: " << external_id.to_string() << " ]"
<< " Update concat_mem" << std::endl;
}
GPU_DEBUG_IF(iter == concatenated_output_mem_mappings.end()) {
GPU_DEBUG_LOG << "[Internal: " << internal_id.to_string() << ", External: " << external_id.to_string() << " ]"
<< " Can't find concatenated_memory_mapping" << std::endl;
}
} else {
external_outputs[external_id.idx] = _outputs[external_id.idx];
GPU_DEBUG_LOG << "[Internal: " << internal_id.to_string() << ", External: " << external_id.to_string() << " ]"
<< " No update concat_mem" << std::endl;
}
}
}
@ -696,6 +713,7 @@ void loop_inst::postprocess_output_memory(bool is_dynamic, int64_t current_itera
}
void loop_inst::reset_memory() {
GPU_DEBUG_LOG << "Reset memory" << std::endl;
backedge_memory_mappings.clear();
concatenated_input_mem_mappings.clear();
for (auto concat_mem_map : concatenated_output_mem_mappings) {
@ -882,7 +900,9 @@ int64_t loop_inst::get_num_iterations() {
is_default_num_iter = false;
num_iterations = current_num_iterations;
}
OPENVINO_ASSERT(num_iterations == current_num_iterations,
// only check num_iterations when the shape has not changed.
if (preproc_memories_done)
OPENVINO_ASSERT(num_iterations == current_num_iterations,
"iteration num shuld be same between ", num_iterations, " and ", current_num_iterations);
}
return num_iterations;
@ -928,6 +948,7 @@ std::vector<event::ptr> loop_inst::handle_buffers_for_next_iteration(const loop_
if (mapping.from_mem != nullptr) {
auto ev = mapping.from_mem->copy_from(body_network->get_stream(), *(mapping.initial_mem));
if (ev) event_vec = {ev};
GPU_DEBUG_LOG << iter << ") Copy data from inintal_mem(" << mapping.initial_mem << ")" << std::endl;
}
} else {
// In dynamic model, output memory is not defined before execution.
@ -936,6 +957,7 @@ std::vector<event::ptr> loop_inst::handle_buffers_for_next_iteration(const loop_
mapping.from_mem = mapping.from_primitive->output_memory_ptr();
OPENVINO_ASSERT(mapping.from_mem != nullptr, "from_mem should not be null");
set_memory_in_body_network(body_network, mapping.to_primitive, mapping.from_mem);
GPU_DEBUG_LOG << iter << ") Set memory from from_mem(" << mapping.from_mem << ") to " << mapping.to_primitive->id() << ")" << std::endl;
}
}
} else if (mapping.type == loop_inst::backedge_memory_mapping::SINGLE) {


@ -38,7 +38,8 @@ void lstm_dynamic_timeloop_node::reverse_optional_outputs_connections() {
}));
mutable_data_node.users.push_back(this);
users.remove(&mutable_data_node);
dependencies.insert(dependencies.begin() + index_to_insert, {&mutable_data_node, 0});
auto port_idx = get_port_from_deps(mutable_data_node.id());
dependencies.insert(dependencies.begin() + index_to_insert, {&mutable_data_node, port_idx});
// fix inputs/outputs
if (mutable_data_node.get_dependencies().empty()) {
myprog.get_inputs().push_back(&mutable_data_node);


@ -1011,6 +1011,17 @@ primitive_inst::primitive_inst(network& network, program_node const& node, bool
allocate_memory = false;
}
_mem_allocated = allocate_memory;
if (!_mem_allocated && (node.is_dynamic() && _outputs_memory_count > 1)) {
auto available_allocate_memory = [&](std::vector<cldnn::layout>& layouts) -> bool {
for (auto& l : layouts) {
if (l.is_static())
return true;
}
return false;
};
allocate_memory = _mem_allocated = available_allocate_memory(_impl_params->output_layouts);
}
if (allocate_memory) {
// In case when output is mutable_data primitive, and other users dependencies are only used for
// synchronization, The output memory of such primitive will be fused with mutable_data
@ -1377,23 +1388,28 @@ memory::ptr primitive_inst::allocate_output(engine& _engine,
std::vector<memory::ptr> primitive_inst::allocate_outputs(kernel_impl_params* updated_params, bool reset_mem, bool runtime_alloc) {
std::vector<memory::ptr> outputs;
auto impl_params = updated_params != nullptr ? *updated_params : *_impl_params;
auto& out_layouts = impl_params.output_layouts;
for (size_t i = 0; i < get_node().get_outputs_count() ; ++i) {
auto impl_params = updated_params != nullptr ? *updated_params : *_impl_params;
auto current_memory_ptr = _outputs.size() > i ? output_memory_ptr(i).get() : nullptr;
auto is_output = is_output_buffer(this, runtime_alloc);
if (out_layouts[i].is_dynamic() && !out_layouts[i].has_upper_bound()) {
outputs.push_back(memory::ptr());
} else {
auto current_memory_ptr = _outputs.size() > i ? output_memory_ptr(i).get() : nullptr;
auto is_output = is_output_buffer(this, runtime_alloc);
outputs.push_back(allocate_output(_network.get_engine(),
_network.get_memory_pool(),
*_node,
impl_params,
_runtime_memory_dependencies,
get_network_id(),
_network.is_internal(),
i,
reset_mem,
is_output,
current_memory_ptr,
runtime_alloc));
outputs.push_back(allocate_output(_network.get_engine(),
_network.get_memory_pool(),
*_node,
impl_params,
_runtime_memory_dependencies,
get_network_id(),
_network.is_internal(),
i,
reset_mem,
is_output,
current_memory_ptr,
runtime_alloc));
}
}
return outputs;
}
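
Two allocation rules change in primitive_inst above: a dynamic multi-output node may still pre-allocate when at least one of its output layouts is already static, and allocate_outputs now skips any individual output whose layout is dynamic without an upper bound, leaving a null memory to be filled in later. A condensed sketch of both decisions, with a hypothetical simplified layout type:

    #include <vector>

    struct layout {                    // stand-in for cldnn::layout
        bool static_shape = false;
        bool upper_bounded = false;
        bool is_static() const { return static_shape; }
        bool has_upper_bound() const { return upper_bounded; }
    };

    // Multi-output dynamic node: worth allocating up front if any output layout
    // is already static (the remaining outputs are allocated at runtime).
    bool should_preallocate(const std::vector<layout>& output_layouts) {
        for (const auto& l : output_layouts)
            if (l.is_static())
                return true;
        return false;
    }

    // Per-output rule inside allocate_outputs: a fully unbounded dynamic layout
    // gets a placeholder (no memory yet); anything static or bounded is allocated.
    bool can_allocate_now(const layout& l) {
        return l.is_static() || l.has_upper_bound();
    }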


@ -850,7 +850,8 @@ void program::add_intermediate(program_node& node,
void program::add_connection(program_node& prev, program_node& next) {
prev.users.push_back(&next);
next.dependencies.push_back({&prev, 0});
auto port_idx = next.get_port_from_deps(prev.id());
next.dependencies.push_back({&prev, port_idx});
}
void program::remove_connection(program_node& prev, program_node& next) {
@ -1131,7 +1132,9 @@ void program::fuse_nodes(program_node &fused_node,
continue;
}
}
fused_node.dependencies.push_back({&dep, 0});
auto port_idx = fused_node.get_port_from_deps(dep.id());
fused_node.dependencies.push_back({&dep, port_idx});
local_desc.deps.emplace_back(dep.id(), deps_idx++);
dep.users.push_back(&fused_node);
}


@ -368,7 +368,7 @@ bool program_node::recalc_output_layouts(bool invalidate_users_if_changed) {
bool program_node::is_dynamic() const {
for (const auto& input : get_dependencies()) {
if (input.first->is_dynamic_output_layout())
if (input.first->is_dynamic_output_layout(input.second))
return true;
}
@ -381,7 +381,7 @@ bool program_node::is_dynamic() const {
bool program_node::is_dynamic() {
for (auto& input : get_dependencies()) {
if (input.first->is_dynamic_output_layout())
if (input.first->is_dynamic_output_layout(input.second))
return true;
}


@ -66,19 +66,23 @@ static void SetLoopInputOutputMap(ProgramBuilder& p,
auto& body_input = body_inputs.at(loop_input_desc->m_body_parameter_index);
cldnn::primitive_id internal_id = layer_type_name_ID(body_input);
GPU_DEBUG_LOG << "loop_input_descs[" << layerName << "] = {m_input_index:" << loop_input_desc->m_input_index << "(external_id: "
<< external_id << "), m_body_parameter_index:" << loop_input_desc->m_body_parameter_index
<< "(internal_id: " << internal_id << ")}" << std::endl;
// set input mapping
if (const auto& sliceInfo =
std::dynamic_pointer_cast<ov::op::util::MultiSubGraphOp::SliceInputDescription>(loop_input_desc)) {
// sliced input
input_primitive_maps.emplace_back(external_id, internal_id, sliceInfo->m_axis,
sliceInfo->m_start, sliceInfo->m_end, sliceInfo->m_stride);
GPU_DEBUG_LOG << "loop_input_descs[" << layerName << "][SliceInputDescription] = {m_input_index:"
<< loop_input_desc->m_input_index << "(external_id: "
<< external_id << "), m_body_parameter_index:" << loop_input_desc->m_body_parameter_index
<< "(internal_id: " << internal_id << ")}" << std::endl;
} else {
// input without slicing
input_primitive_maps.emplace_back(external_id, internal_id);
GPU_DEBUG_LOG << "loop_input_descs[" << layerName << "][InputDescription] = {m_input_index:"
<< loop_input_desc->m_input_index << "(external_id: "
<< external_id << "), m_body_parameter_index:" << loop_input_desc->m_body_parameter_index
<< "(internal_id: " << internal_id << ")}" << std::endl;
}
// set back edges
@ -92,6 +96,7 @@ static void SetLoopInputOutputMap(ProgramBuilder& p,
cldnn::primitive_id from_id = layer_type_name_ID(from);
back_edges_maps.emplace_back(from_id, to_id);
GPU_DEBUG_LOG << "back_edge = {" << from_id << " => " << to_id << "}" << std::endl;
}
}
@ -279,7 +284,7 @@ static void CreateCommonLoopOp(ProgramBuilder& p, const std::shared_ptr<ov::op::
config.set_property(ov::intel_gpu::allow_new_shape_infer(is_dynamic));
// get body program from ov::Model
ProgramBuilder prog(ov_model, p.get_engine(), config, false, false, p.get_task_executor(), p.get_compilation_context(), true);
auto body_program = prog.get_compiled_program();
GPU_DEBUG_LOG << "* trip_count_id : " << trip_count_id << std::endl;


@ -169,7 +169,7 @@ static std::shared_ptr<ov::Model> makeLSTMSequence(ov::element::Type_t ngPRC, ov
enum class LSTMType {
LSTMCell = 0,
LSTMSequence = 1 // will be updated at next step.
LSTMSequence = 1
};
using DynamicTensorIteratorParams = typename std::tuple<
@ -288,6 +288,10 @@ TEST_P(DynamicTensorIteratorTest, CompareWithRefs) {
run();
}
std::vector<LSTMType> lstm_types = {
LSTMType::LSTMCell, LSTMType::LSTMSequence
};
std::vector<InputShape> input_shapes = {
InputShape(ov::PartialShape({1, -1, 512}), {{1, 30, 512}, {1, 10, 512}, {1, 5, 512}})
};
@ -319,4 +323,15 @@ INSTANTIATE_TEST_SUITE_P(smoke_DynamicTensorIterator_LSTMCell, DynamicTensorIter
/* data_prc */ testing::ValuesIn(net_precision),
/* configuration */ testing::Values<ov::AnyMap>(net_configuration)),
DynamicTensorIteratorTest::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_DynamicTensorIterator_LSTMSequence, DynamicTensorIteratorTest,
testing::Combine(
/* lstm_type */ testing::ValuesIn({LSTMType::LSTMSequence}),
/* data_shape */ testing::ValuesIn(input_shapes),
/* hidden_size */ testing::ValuesIn(hidden_sizes),
/* direction */ testing::ValuesIn(reccurent_sequence_direction),
/* device */ testing::Values<std::string>(ov::test::utils::DEVICE_GPU),
/* data_prc */ testing::ValuesIn(net_precision),
/* configuration */ testing::Values<ov::AnyMap>(net_configuration)),
DynamicTensorIteratorTest::getTestCaseName);
} // namespace GPULayerTestsDefinitions
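
The new smoke_DynamicTensorIterator_LSTMSequence suite compiles the sequence model with a dynamic time dimension (PartialShape {1, -1, 512}) and then runs it with 30, 10 and 5 time steps. A rough equivalent at the plain OpenVINO API level, outside the test framework, assuming a hypothetical single-input IR whose only parameter is the data tensor (the model path is an assumption; the shapes come from the test above):

    #include <openvino/openvino.hpp>

    int main() {
        ov::Core core;
        // Hypothetical IR containing an LSTMSequence; only the shapes matter here.
        auto model = core.read_model("lstm_sequence.xml");
        model->reshape(ov::PartialShape{1, -1, 512});          // -1: unknown sequence length
        auto compiled = core.compile_model(model, "GPU");
        auto request = compiled.create_infer_request();

        // Same data shapes the functional test feeds: 30, 10 and 5 time steps.
        for (size_t len : {30u, 10u, 5u}) {
            ov::Tensor input(ov::element::f32, ov::Shape{1, len, 512});
            request.set_input_tensor(input);
            request.infer();                                   // accuracy must hold across lengths
        }
        return 0;
    }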


@ -111,7 +111,8 @@ TEST_P(permute_eltwise_loop, basic) {
data("trip_count", trip_count_mem),
data("initial_condition", initial_condition_mem),
mutable_data("num_iteration", num_iteration_mem),
loop("loop", { input_info("num_iteration"), input_info("eltwise"), input_info("loop_eltwise_init_values") }, body_program,
loop("loop", { input_info("num_iteration"), input_info("trip_count"), input_info("initial_condition"),
input_info("eltwise"), input_info("loop_eltwise_init_values") }, body_program,
"trip_count", "initial_condition", "num_iteration",
input_primitive_maps, output_primitive_maps, back_edges, p.loop_trip_count),
reorder("output", input_info("loop"), format::bfyx, p.default_type)


@ -99,7 +99,7 @@ void test_loop_gpu_basic_no_concat(bool is_caching_test)
input_layout("trip_count", trip_count_mem->get_layout()),
input_layout("initial_condition", initial_condition_mem->get_layout()),
mutable_data("num_iteration", num_iteration_mem),
loop("loop", { input_info("num_iteration"), input_info("input") }, body_program,
loop("loop", { input_info("num_iteration"), input_info("trip_count"), input_info("initial_condition"), input_info("input") }, body_program,
"trip_count", "initial_condition", "num_iteration",
input_primitive_maps, output_primitive_maps, back_edges, 8)
);
@ -201,7 +201,7 @@ void test_loop_gpu_basic_concat(bool is_caching_test)
input_layout("trip_count", trip_count_mem->get_layout()),
input_layout("initial_condition", initial_condition_mem->get_layout()),
mutable_data("num_iteration", num_iteration_mem),
loop("loop", { input_info("num_iteration"), input_info("input") }, body_program,
loop("loop", { input_info("num_iteration"), input_info("trip_count"), input_info("initial_condition"), input_info("input") }, body_program,
"trip_count", "initial_condition", "num_iteration",
input_primitive_maps, output_primitive_maps, back_edges, trip_count)
);
@ -316,7 +316,7 @@ void test_loop_gpu_basic_concat_nested(bool is_caching_test)
input_layout("trip_count", inner_trip_count_mem->get_layout()),
input_layout("initial_condition", inner_initial_condition_mem->get_layout()),
mutable_data("inner_num_iteration", inner_num_iteration_mem),
loop("inner_loop", { input_info("inner_num_iteration"), input_info("inner_input"), input_info("trip_count"), input_info("initial_condition") },
loop("inner_loop", { input_info("inner_num_iteration"), input_info("trip_count"), input_info("initial_condition"), input_info("inner_input") },
inner_body_program, "trip_count", "initial_condition", "inner_num_iteration",
inner_input_primitive_maps, inner_output_primitive_maps, inner_back_edges, inner_trip_count)
);
@ -342,9 +342,10 @@ void test_loop_gpu_basic_concat_nested(bool is_caching_test)
mutable_data("num_iteration", num_iteration_mem),
input_layout("inner_trip_count", inner_trip_count_mem->get_layout()),
input_layout("inner_initial_condition", inner_initial_condition_mem->get_layout()),
loop("loop", { input_info("num_iteration"), input_info("input"), input_info("inner_trip_count"), input_info("inner_initial_condition") },
outer_body_program, "trip_count", "initial_condition", "num_iteration",
outer_input_primitive_maps, outer_output_primitive_maps, outer_back_edges, outer_trip_count)
loop("loop", { input_info("num_iteration"), input_info("trip_count"), input_info("initial_condition"),
input_info("input"), input_info("inner_trip_count"), input_info("inner_initial_condition") },
outer_body_program, "trip_count", "initial_condition", "num_iteration",
outer_input_primitive_maps, outer_output_primitive_maps, outer_back_edges, outer_trip_count)
);
/////////////////////////////////
@ -498,7 +499,7 @@ static void test_loop_gpu_wo_trip_count(bool is_caching_test) {
input_layout("input", e_input_layout),
input_layout(initial_condition_id, e_initial_condition_mem->get_layout()),
mutable_data(actual_iteration_count_id, e_num_iteration_mem),
loop("loop", { input_info(actual_iteration_count_id), input_info("input") }, body_program,
loop("loop", { input_info(actual_iteration_count_id), input_info(initial_condition_id), input_info("input") }, body_program,
trip_count_id, initial_condition_id, actual_iteration_count_id,
input_primitive_maps, output_primitive_maps, back_edges,
num_iterations, body_current_iteration_id, body_execution_condition_id, 2),