Support dynamic tensor_iterator (#20869)
* [GPU] Support dynamic tensoriterator with -1 num_iteration - remove redundant codes * [GPU] Refactoring methods for pre_process / post_process for body_network * Add unit test for dynamic tensoriterator wo trip_count_id * Follow-up code review * Set inner network in loading of model cache * Fix legacy loop unit tests
This commit is contained in:
committed by
GitHub
parent
c6ca7865fb
commit
c42a88a190
@@ -114,6 +114,29 @@ struct loop : public primitive_base<loop> {
|
||||
ib >> end;
|
||||
ib >> stride;
|
||||
}
|
||||
|
||||
std::string to_string() const {
|
||||
std::stringstream ss;
|
||||
ss << "io_primitive_map " << std::endl;
|
||||
ss << "* external_id : " << external_id.to_string() << std::endl;
|
||||
ss << "* internal_id : " << internal_id.to_string() << std::endl;
|
||||
ss << "* axis : " << axis << std::endl;
|
||||
ss << "* start : " << start << std::endl;
|
||||
ss << "* end : " << end << std::endl;
|
||||
ss << "* stride : " << stride << std::endl;
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
std::string to_short_string() const {
|
||||
std::stringstream ss;
|
||||
ss << "io_primitive_map[e:" << external_id.to_string();
|
||||
ss << "," << internal_id.to_string();
|
||||
ss << "," << axis;
|
||||
ss << "," << start;
|
||||
ss << "," << end;
|
||||
ss << "," << stride << "]";
|
||||
return ss.str();
|
||||
}
|
||||
};
|
||||
|
||||
struct backedge_mapping {
|
||||
|
||||
@@ -37,6 +37,7 @@ target_link_libraries(${TARGET_NAME} PUBLIC OpenCL::OpenCL openvino::shape_infer
|
||||
target_link_libraries(${TARGET_NAME} PRIVATE openvino_intel_gpu_kernels
|
||||
openvino_intel_gpu_runtime
|
||||
openvino::itt
|
||||
openvino::reference
|
||||
openvino::runtime::dev
|
||||
openvino::runtime)
|
||||
|
||||
|
||||
@@ -110,91 +110,12 @@ struct loop_impl : typed_primitive_impl<loop> {
|
||||
_back_edges = node.get_back_edges();
|
||||
}
|
||||
|
||||
void set_memory_in_body_network(cldnn::network::ptr body_network,
|
||||
const std::shared_ptr<cldnn::primitive_inst>& inst, memory::ptr mem) const {
|
||||
if (inst->is_input()) {
|
||||
body_network->set_input_data(inst->id(), mem);
|
||||
} else if (inst->is_output()) {
|
||||
body_network->set_output_memory(inst->id(), mem);
|
||||
} else {
|
||||
inst->set_output_memory(mem, false);
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<event::ptr> handle_buffers_for_next_iteration(const loop_inst::backedge_memory_mapping& mapping,
|
||||
network::ptr body_network, int64_t iter, bool is_dynamic) const {
|
||||
std::vector<event::ptr> event_vec;
|
||||
OPENVINO_ASSERT(iter >= 0, "iteration should not be negative : ", iter);
|
||||
if (mapping.type == loop_inst::backedge_memory_mapping::CONCAT_OUTPUT) {
|
||||
if (iter == 0) {
|
||||
set_memory_in_body_network(body_network, mapping.to_primitive, mapping.initial_mem);
|
||||
} else if (iter > 0) {
|
||||
if (is_dynamic) {
|
||||
auto from_id = mapping.from_primitive->id();
|
||||
if (body_network->has_event(from_id)) {
|
||||
auto ev = body_network->get_primitive_event(from_id);
|
||||
if (ev) ev->wait();
|
||||
}
|
||||
// In dynamic model, just copy data from inner body output to inner body input in back_edges.
|
||||
memory::ptr mem1 = mapping.to_primitive->output_memory_ptr();
|
||||
memory::ptr mem2 = mapping.from_primitive->output_memory_ptr();
|
||||
auto ev = mem1->copy_from(body_network->get_stream(), *(mem2));
|
||||
if (ev) event_vec = {ev};
|
||||
} else {
|
||||
auto mem = mapping.concat_mem_mapping->get_sliced_mems().at(iter - 1);
|
||||
set_memory_in_body_network(body_network, mapping.to_primitive, mem);
|
||||
}
|
||||
}
|
||||
} else if (mapping.type == loop_inst::backedge_memory_mapping::SINGLE_SHARED) {
|
||||
if (iter == 0) {
|
||||
if (mapping.from_mem != nullptr) {
|
||||
auto ev = mapping.from_mem->copy_from(body_network->get_stream(), *(mapping.initial_mem));
|
||||
if (ev) event_vec = {ev};
|
||||
}
|
||||
} else {
|
||||
// In dynamic model, output memory is not defined before execution.
|
||||
// After body network execution, replace input memory from initial_mem(external input memory) to output memory.
|
||||
if (mapping.from_mem == nullptr) {
|
||||
mapping.from_mem = mapping.from_primitive->output_memory_ptr();
|
||||
OPENVINO_ASSERT(mapping.from_mem != nullptr, "from_mem should not be null");
|
||||
set_memory_in_body_network(body_network, mapping.to_primitive, mapping.from_mem);
|
||||
}
|
||||
}
|
||||
} else if (mapping.type == loop_inst::backedge_memory_mapping::SINGLE) {
|
||||
memory::ptr mem1 = mapping.to_primitive->output_memory_ptr();
|
||||
if (iter == 0) {
|
||||
auto ev = mem1->copy_from(body_network->get_stream(), *(mapping.initial_mem));
|
||||
if (ev) event_vec = {ev};
|
||||
} else {
|
||||
if (is_dynamic) {
|
||||
// In dynamic model, do not set memory buffer between input and output in inner body network.
|
||||
// Just copy data from input buffer memory to output buffer memory.
|
||||
auto from_id = mapping.from_primitive->id();
|
||||
if (body_network->has_event(from_id)) {
|
||||
auto ev = body_network->get_primitive_event(from_id);
|
||||
if (ev) ev->wait();
|
||||
}
|
||||
memory::ptr mem2 = mapping.from_primitive->output_memory_ptr();
|
||||
auto ev = mem1->copy_from(body_network->get_stream(), *(mem2));
|
||||
if (ev) event_vec = {ev};
|
||||
} else {
|
||||
// In static model, swap memory buffer between output and input in inner body network
|
||||
memory::ptr mem2 = mapping.from_primitive->output_memory_ptr();
|
||||
set_memory_in_body_network(body_network, mapping.to_primitive, std::move(mem2));
|
||||
set_memory_in_body_network(body_network, mapping.from_primitive, std::move(mem1));
|
||||
}
|
||||
}
|
||||
}
|
||||
return event_vec;
|
||||
}
|
||||
|
||||
event::ptr execute_impl(const std::vector<event::ptr>& events, loop_inst& instance) override {
|
||||
const auto& impl_params = instance.get_impl_params();
|
||||
const auto& primitive = impl_params->typed_desc<loop>();
|
||||
auto& outer_network = instance.get_network();
|
||||
auto& stream = outer_network.get_stream();
|
||||
|
||||
const auto max_num_iterations = primitive->max_num_iterations;
|
||||
auto body_network = instance.get_body_network();
|
||||
int64_t current_iteration_idx = 0;
|
||||
|
||||
@@ -202,6 +123,9 @@ struct loop_impl : typed_primitive_impl<loop> {
|
||||
|
||||
OPENVINO_ASSERT(!primitive->num_iteration_id.empty(), "loop operation should have num_iteration_id");
|
||||
|
||||
auto num_iterations = instance.get_num_iterations();
|
||||
GPU_DEBUG_LOG << "num_iterations : " << num_iterations << std::endl;
|
||||
|
||||
//////////////////////////////////////////
|
||||
// memory pointers for outer network
|
||||
//////////////////////////////////////////
|
||||
@@ -211,8 +135,16 @@ struct loop_impl : typed_primitive_impl<loop> {
|
||||
memory::ptr trip_count_mem = outer_network.get_primitive(primitive->trip_count_id)->output_memory_ptr();
|
||||
trip_count = read_scalar_value(std::move(trip_count_mem), stream);
|
||||
} else {
|
||||
trip_count = max_num_iterations;
|
||||
OPENVINO_ASSERT(!primitive->body_execution_condition_id.empty()
|
||||
|| num_iterations > 0 || primitive->max_num_iterations > 0,
|
||||
"num_iterations should be positive when trip_count_id is not existed");
|
||||
// If trip_count_id is not existed, the original ngraph operation is TensorIterator.
|
||||
// If num_iterations is negative, it means that TensorIterator has no concat input / output memory.
|
||||
// When it has no body_exeuction_conditio_id and num_iterations and primtive->max_num_iteartion,
|
||||
// TensorIterator has no ending condition. So it cannot terminate inner body execution loop.
|
||||
trip_count = num_iterations > 0 ? num_iterations : primitive->max_num_iterations;
|
||||
}
|
||||
GPU_DEBUG_LOG << "trip_count : " << trip_count << std::endl;
|
||||
|
||||
// read initial execution condition from outer network
|
||||
int64_t execution_condition = 1;
|
||||
@@ -220,6 +152,7 @@ struct loop_impl : typed_primitive_impl<loop> {
|
||||
memory::ptr first_execution_condition_mem = outer_network.get_primitive(primitive->first_execution_condition_id)->output_memory_ptr();
|
||||
execution_condition = read_scalar_value(first_execution_condition_mem, stream);
|
||||
}
|
||||
GPU_DEBUG_LOG << "execution_condition: " << execution_condition << std::endl;
|
||||
|
||||
// When execution_condition is false or trip_count is zero, return execute_impl without any body_network execution.
|
||||
if (!execution_condition || trip_count == 0) {
|
||||
@@ -257,17 +190,16 @@ struct loop_impl : typed_primitive_impl<loop> {
|
||||
}
|
||||
|
||||
if (!instance.preproc_memories_done) {
|
||||
instance.preprocess_output_memory(trip_count);
|
||||
instance.preprocess_input_memory(trip_count);
|
||||
instance.preprocess_output_memory(num_iterations);
|
||||
instance.preprocess_input_memory(num_iterations);
|
||||
instance.preprocess_backedge_memory();
|
||||
instance.preproc_memories_done = true;
|
||||
}
|
||||
|
||||
const auto& concatenated_input_mem_mappings = instance.concatenated_input_mem_mappings;
|
||||
const auto& concatenated_output_mem_mappings = instance.concatenated_output_mem_mappings;
|
||||
const auto& backedge_memory_mappings = instance.backedge_memory_mappings;
|
||||
|
||||
// If there are concatenated_output_mem_mappings or backedge_memory_mappings we need to wait for
|
||||
// If there are concatenated_input_mem_mappings or backedge_memory_mappings we need to wait for
|
||||
// previous tasks before accessing memory in get_sliced_mem() and setup_iteration() functions
|
||||
if (!concatenated_input_mem_mappings.empty() || !backedge_memory_mappings.empty()) {
|
||||
for (auto& e : events) {
|
||||
@@ -278,36 +210,18 @@ struct loop_impl : typed_primitive_impl<loop> {
|
||||
// Set sliced input data
|
||||
for (size_t i = 0; i < concatenated_input_mem_mappings.size(); ++i) {
|
||||
const auto& concatenated_input = concatenated_input_mem_mappings.at(i);
|
||||
concatenated_input->slice_mem(num_iterations);
|
||||
memory::ptr mem = concatenated_input->get_sliced_mem(0);
|
||||
OPENVINO_ASSERT(mem != nullptr, instance.id(), "sliced input memory of loop is not allocated properly");
|
||||
body_network->set_input_data(concatenated_input->sliced_data_prim->id(), mem);
|
||||
body_network->set_input_data(concatenated_input->get_sliced_data_prim_id(), mem);
|
||||
}
|
||||
|
||||
std::vector<event::ptr> all_events;
|
||||
std::vector<event::ptr> loop_carried_dep(events.begin(), events.end());
|
||||
while (((trip_count <= 0) || (current_iteration_idx < trip_count)) && execution_condition) {
|
||||
// Copy & Set sliced input memory
|
||||
for (size_t i = 0; i < concatenated_input_mem_mappings.size(); ++i) {
|
||||
const auto& concatenated_input = concatenated_input_mem_mappings.at(i);
|
||||
memory::ptr mem = concatenated_input->get_sliced_mem(current_iteration_idx);
|
||||
OPENVINO_ASSERT(mem != nullptr, instance.id(), " sliced input memory of loop is not allocated properly");
|
||||
concatenated_input->sliced_data_prim->set_output_memory(mem);
|
||||
}
|
||||
|
||||
// Set backedges and output memory
|
||||
for (auto& backedge_memory_mapping : backedge_memory_mappings) {
|
||||
auto event_vec = handle_buffers_for_next_iteration(backedge_memory_mapping, body_network, current_iteration_idx, is_dynamic);
|
||||
for (auto ev : event_vec) {
|
||||
loop_carried_dep.push_back(ev);
|
||||
}
|
||||
}
|
||||
|
||||
if (!is_dynamic) {
|
||||
// Set sliced output memory for static shape model
|
||||
// because body network generate output memory during the body network execution in dynamic model
|
||||
for (const auto& concat_output_mem_mapping : concatenated_output_mem_mappings) {
|
||||
concat_output_mem_mapping->setup_sliced_output_memory(current_iteration_idx);
|
||||
}
|
||||
while (((trip_count < 0) || (current_iteration_idx < trip_count)) && execution_condition) {
|
||||
auto prev_events = instance.preprocess_memory_for_body_network(current_iteration_idx);
|
||||
for (auto& ev : prev_events) {
|
||||
loop_carried_dep.push_back(ev);
|
||||
}
|
||||
|
||||
// execute body network
|
||||
@@ -335,22 +249,10 @@ struct loop_impl : typed_primitive_impl<loop> {
|
||||
// After execution of body network, sliced_data_prim will has output memory buffer
|
||||
// current memory buffer move to sliced_mems and new memory buffer will be allocated in sliced_data_prim
|
||||
if (is_dynamic) {
|
||||
for (const auto& concat_output_mem_mapping : concatenated_output_mem_mappings) {
|
||||
auto sliced_data_prim = concat_output_mem_mapping->sliced_data_prim;
|
||||
auto output_mem_ptr = sliced_data_prim->output_memory_ptr();
|
||||
|
||||
auto sliced_id = sliced_data_prim->id();
|
||||
if (body_network->has_event(sliced_id)) {
|
||||
auto ev = body_network->get_primitive_event(sliced_id);
|
||||
if (ev) ev->wait();
|
||||
}
|
||||
memory::ptr new_sliced_mem = concat_output_mem_mapping->get_or_create_sliced_mem(current_iteration_idx,
|
||||
output_mem_ptr->get_layout());
|
||||
auto ev = new_sliced_mem->copy_from(body_network->get_stream(), *output_mem_ptr);
|
||||
if (ev) {
|
||||
loop_carried_dep.push_back(ev);
|
||||
all_events.push_back(ev);
|
||||
}
|
||||
auto post_events = instance.postprocess_memory_for_body_network(current_iteration_idx);
|
||||
for (auto& ev : post_events) {
|
||||
loop_carried_dep.push_back(ev);
|
||||
all_events.push_back(ev);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -364,7 +266,7 @@ struct loop_impl : typed_primitive_impl<loop> {
|
||||
execution_condition = read_scalar_value(body_execution_condition_mem, body_network->get_stream());
|
||||
}
|
||||
GPU_DEBUG_IF(!execution_condition) {
|
||||
GPU_DEBUG_LOG << "body_exec_condition is false at "<< current_iteration_idx << " iterations" << std::endl;
|
||||
GPU_DEBUG_LOG << "body_exec_condition is false at "<< current_iteration_idx << " iteration idx" << std::endl;
|
||||
}
|
||||
|
||||
current_iteration_idx++;
|
||||
@@ -378,12 +280,12 @@ struct loop_impl : typed_primitive_impl<loop> {
|
||||
// update num_iterations (actual number of iterations)
|
||||
memory::ptr num_actual_iterations_mem = outer_network.get_primitive(primitive->num_iteration_id)->output_memory_ptr();
|
||||
write_scalar_value(num_actual_iterations_mem, stream, current_iteration_idx);
|
||||
GPU_DEBUG_LOG << "current_iteration(" << primitive->num_iteration_id << ", "
|
||||
GPU_DEBUG_LOG << "current_iteration_idx(" << primitive->num_iteration_id << ", "
|
||||
<< num_actual_iterations_mem << ") : " << current_iteration_idx << std::endl;
|
||||
|
||||
if (is_dynamic)
|
||||
instance.update_output_layout();
|
||||
instance.postprocess_output_memory(is_dynamic);
|
||||
instance.postprocess_output_memory(is_dynamic, current_iteration_idx);
|
||||
|
||||
ev->set();
|
||||
return ev;
|
||||
|
||||
@@ -107,137 +107,50 @@ public:
|
||||
struct concatenated_memory_mapping {
|
||||
using ptr = std::shared_ptr<concatenated_memory_mapping>;
|
||||
using cptr = std::shared_ptr<const concatenated_memory_mapping>;
|
||||
concatenated_memory_mapping(int64_t axis,
|
||||
memory::ptr concatenated_mem,
|
||||
std::vector<memory::ptr> sliced_mems, // To change shared ptr vector
|
||||
concatenated_memory_mapping(memory::ptr concatenated_mem,
|
||||
std::vector<memory::ptr> sliced_mems,
|
||||
stream& stream,
|
||||
engine& engine,
|
||||
int64_t iteration_elements = 0,
|
||||
int64_t stride = 0,
|
||||
int64_t initial_offset = 0) :
|
||||
axis(axis),
|
||||
std::shared_ptr<primitive_inst> concat_data_prim,
|
||||
std::shared_ptr<primitive_inst> sliced_data_prim,
|
||||
const cldnn::loop::io_primitive_map& io_prim_map) :
|
||||
concatenated_mem(concatenated_mem),
|
||||
sliced_mems(sliced_mems),
|
||||
stream(stream),
|
||||
engine(engine),
|
||||
iteration_elements(iteration_elements),
|
||||
stride(stride),
|
||||
initial_offset(initial_offset) {
|
||||
calculate_concatenated_mem();
|
||||
}
|
||||
concat_data_prim(std::move(concat_data_prim)),
|
||||
sliced_data_prim(std::move(sliced_data_prim)),
|
||||
io_prim_map(io_prim_map) {}
|
||||
|
||||
concatenated_memory_mapping(const concatenated_memory_mapping& o) :
|
||||
axis(o.axis),
|
||||
concat_data_prim(o.concat_data_prim),
|
||||
sliced_data_prim(o.sliced_data_prim),
|
||||
|
||||
concatenated_mem(o.concatenated_mem),
|
||||
sliced_mems(o.sliced_mems),
|
||||
stream(o.stream),
|
||||
engine(o.engine),
|
||||
iteration_elements(o.iteration_elements),
|
||||
stride(o.stride),
|
||||
initial_offset(o.initial_offset),
|
||||
|
||||
bytes_per_element(o.bytes_per_element),
|
||||
batch_size(o.batch_size),
|
||||
bytes_batch_stride(o.bytes_batch_stride),
|
||||
bytes_iteration(o.bytes_iteration),
|
||||
bytes_iteration_stride(o.bytes_iteration_stride),
|
||||
bytes_iteration_initial_offset(o.bytes_iteration_initial_offset) {}
|
||||
|
||||
|
||||
static int64_t get_batch_size(layout mem_layout, int64_t axis) {
|
||||
if (axis < 0) {
|
||||
throw std::runtime_error("axis should be positive integer or zero");
|
||||
}
|
||||
|
||||
if (mem_layout.is_dynamic()) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
int64_t batch_size = 1;
|
||||
for (int64_t i = 0; i < axis; ++i) {
|
||||
batch_size *= mem_layout.get_tensor().raw[i];
|
||||
}
|
||||
for (int64_t i = axis-1; i >= 2; --i) {
|
||||
batch_size *= mem_layout.get_tensor().raw[i];
|
||||
}
|
||||
return batch_size;
|
||||
}
|
||||
|
||||
void calculate_concatenated_mem() const {
|
||||
if (!sliced_mems.empty() && concatenated_mem != nullptr) {
|
||||
auto& sliced_layout = sliced_mems.front()->get_layout();
|
||||
const int64_t num_elements_batch = get_batch_size(sliced_layout, axis);
|
||||
iteration_elements = sliced_layout.count() / num_elements_batch;
|
||||
bytes_per_element = data_type_traits::size_of(concatenated_mem->get_layout().data_type);
|
||||
batch_size = get_batch_size(concatenated_mem->get_layout(), axis);
|
||||
bytes_batch_stride = (static_cast<int64_t>(concatenated_mem->get_layout().count()) / batch_size) * bytes_per_element;
|
||||
bytes_iteration = iteration_elements * bytes_per_element;
|
||||
bytes_iteration_stride = stride * bytes_iteration;
|
||||
bytes_iteration_initial_offset = initial_offset * bytes_iteration;
|
||||
}
|
||||
}
|
||||
concat_data_prim(o.concat_data_prim),
|
||||
sliced_data_prim(o.sliced_data_prim),
|
||||
io_prim_map(o.io_prim_map) {}
|
||||
|
||||
void update_concatenated_mem(memory::ptr mem) {
|
||||
if (concatenated_mem != nullptr && concatenated_mem->get_layout() == mem->get_layout()) {
|
||||
concatenated_mem = mem;
|
||||
} else {
|
||||
concatenated_mem = mem;
|
||||
calculate_concatenated_mem();
|
||||
}
|
||||
concatenated_mem = mem;
|
||||
}
|
||||
|
||||
void restore_concatenated_mem() const {
|
||||
OPENVINO_ASSERT(concatenated_mem != nullptr, "concatenated_mem should not be nullptr");
|
||||
mem_lock<uint8_t> concat_mem_lock{ concatenated_mem, stream };
|
||||
int64_t iteration_offset = bytes_iteration_initial_offset;
|
||||
for (const auto& sliced_mem : sliced_mems) {
|
||||
// To support multi-batch, just repeat memcpy for each batch
|
||||
for (int64_t batch = 0; batch < batch_size; ++batch) {
|
||||
const int64_t src_offset = batch * bytes_iteration;
|
||||
const int64_t dst_offset = batch * bytes_batch_stride + iteration_offset;
|
||||
mem_lock<uint8_t> sliced_mem_lock{ sliced_mem, stream };
|
||||
uint8_t* src = sliced_mem_lock.data() + src_offset;
|
||||
uint8_t* dst = concat_mem_lock.data() + dst_offset;
|
||||
std::copy(src, src + bytes_iteration, dst);
|
||||
}
|
||||
iteration_offset += bytes_iteration_stride;
|
||||
}
|
||||
}
|
||||
void slice_mem(const int64_t num_iteration) const;
|
||||
void concat_mem(const int64_t curent_iterations) const;
|
||||
|
||||
// Get sliced mem for the iteration idx and copy data from external input to sliced mem
|
||||
// In the case of dynamic model, concatenated_mem is always non nullptr.
|
||||
memory::ptr get_sliced_mem(int64_t iteration) const {
|
||||
OPENVINO_ASSERT(!sliced_mems.empty(), "For input data, sliced_mems should not be empty");
|
||||
mem_lock<uint8_t, mem_lock_type::read> from_lock{ concatenated_mem, stream };
|
||||
int64_t batch_offset = 0;
|
||||
auto sliced_mem = get_or_create_sliced_mem(iteration, sliced_mems.front()->get_layout());
|
||||
const int64_t iteration_offset = bytes_iteration_initial_offset +
|
||||
bytes_iteration_stride * iteration;
|
||||
// To support multi-batch, just repeat memcpy for each batch
|
||||
for (int64_t batch = 0; batch < batch_size; ++batch) {
|
||||
const int64_t src_offset = batch_offset + iteration_offset;
|
||||
const int64_t dst_offset = batch * bytes_iteration;
|
||||
mem_lock<uint8_t> to_lock{ sliced_mem, stream };
|
||||
const auto src = from_lock.begin() + src_offset;
|
||||
const auto dst = to_lock.begin() + dst_offset;
|
||||
std::copy(src, src + bytes_iteration, dst);
|
||||
batch_offset += bytes_batch_stride;
|
||||
}
|
||||
return sliced_mem;
|
||||
OPENVINO_ASSERT(static_cast<size_t>(iteration) < sliced_mems.size(), "invalid itertion(", iteration,
|
||||
") for sliced_mes(", sliced_mems.size(), ")");
|
||||
return sliced_mems.at(iteration);;
|
||||
}
|
||||
|
||||
memory::ptr get_or_create_sliced_mem(int64_t idx, const layout& mem_layout) const {
|
||||
bool recalc_data = !sliced_mems.empty();
|
||||
while (sliced_mems.size() <= static_cast<size_t>(idx)) {
|
||||
memory::ptr sliced_mem = engine.allocate_memory(mem_layout, 0);
|
||||
sliced_mems.push_back(sliced_mem);
|
||||
}
|
||||
if (recalc_data) {
|
||||
calculate_concatenated_mem();
|
||||
}
|
||||
return sliced_mems.at(idx);
|
||||
}
|
||||
|
||||
@@ -252,78 +165,48 @@ public:
|
||||
std::vector<memory::ptr>& get_sliced_mems() const { return sliced_mems; }
|
||||
|
||||
void reset_data_for_shape_changed() {
|
||||
bytes_per_element = 0;
|
||||
batch_size = 0;
|
||||
bytes_batch_stride = 0;
|
||||
bytes_iteration = 0;
|
||||
bytes_iteration_stride = 0;
|
||||
bytes_iteration_initial_offset = 0;
|
||||
if (concatenated_mem) concatenated_mem = nullptr;
|
||||
iteration_elements = 0;
|
||||
sliced_mems.clear();
|
||||
}
|
||||
|
||||
const input_info& get_external_id() {
|
||||
return io_prim_map.external_id;
|
||||
}
|
||||
|
||||
std::string to_string() const {
|
||||
std::stringstream ss;
|
||||
ss << "concatenated_memory_mapping [" << std::endl;
|
||||
ss << "* axis : " << axis << std::endl;
|
||||
ss << "* bytes_per_element : " << bytes_per_element << std::endl;
|
||||
ss << "* batch_size : " << batch_size << std::endl;
|
||||
if (concatenated_mem != nullptr && concatenated_mem->get_layout().is_static()) {
|
||||
ss << "* bytes_batch_stride : " << bytes_batch_stride << " = (static_cast<int64_t>("
|
||||
<< concatenated_mem->get_layout().count() << ") / batch_size:" << batch_size << ") * bytes_per_element:" << bytes_per_element << std::endl;
|
||||
} else {
|
||||
ss << "* bytes_batch_stride : " << bytes_batch_stride << std::endl;
|
||||
}
|
||||
ss << "* bytes_iteration : " << bytes_iteration << " = (iteration_elements:"
|
||||
<< iteration_elements << " * bytes_per_element:" << bytes_per_element << ")" << std::endl;
|
||||
ss << "* bytes_iteration_stride : " << bytes_iteration_stride << std::endl;
|
||||
ss << "* bytes_iteration_initial_offset : " << bytes_iteration_initial_offset << std::endl;
|
||||
ss << "* concat_data_prim : " << ((concat_data_prim != nullptr)? concat_data_prim->id() : "nullptr") << std::endl;
|
||||
ss << "* sliced_data_prim : " << ((sliced_data_prim != nullptr)? sliced_data_prim->id() : "nullptr") << std::endl;
|
||||
if (concatenated_mem) {
|
||||
ss << "* concatenated_mem : " << concatenated_mem->get_layout().to_short_string() << std::endl;
|
||||
} else {
|
||||
ss << "* concatenated_mem : nullptr" << std::endl;
|
||||
}
|
||||
ss << "* iteration_elements : " << iteration_elements << std::endl;
|
||||
ss << "* stride : " << stride << std::endl;
|
||||
ss << "* initial_offset : " << initial_offset << std::endl;
|
||||
ss << "* input_info : " << concat_data_id.to_string() << std::endl;
|
||||
ss << "* concatenated_mem : "
|
||||
<< ((concatenated_mem != nullptr)? concatenated_mem->get_layout().to_short_string() : "nullptr") << std::endl;
|
||||
ss << "* sliced_mems :{ ";
|
||||
for (auto mem : sliced_mems) {
|
||||
ss << mem->get_layout().to_short_string() << ",";
|
||||
}
|
||||
ss << "* io_prim_map : " << io_prim_map.to_string() << std::endl;
|
||||
ss << "}]" << std::endl;
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
const int64_t axis;
|
||||
std::shared_ptr<primitive_inst> concat_data_prim;
|
||||
std::shared_ptr<primitive_inst> sliced_data_prim;
|
||||
cldnn::input_info concat_data_id;
|
||||
std::shared_ptr<primitive_inst> get_sliced_data_prim() {
|
||||
OPENVINO_ASSERT(sliced_data_prim != nullptr, "sliced_data_prim should not be nullptr");
|
||||
return sliced_data_prim;
|
||||
}
|
||||
|
||||
primitive_id get_sliced_data_prim_id() {
|
||||
OPENVINO_ASSERT(sliced_data_prim != nullptr, "sliced_data_prim should not be nullptr");
|
||||
return sliced_data_prim->id();
|
||||
}
|
||||
|
||||
private:
|
||||
mutable memory::ptr concatenated_mem;
|
||||
mutable std::vector<memory::ptr> sliced_mems;
|
||||
cldnn::stream& stream;
|
||||
cldnn::engine& engine;
|
||||
mutable int64_t iteration_elements = 0;
|
||||
const int64_t stride = 0;
|
||||
const int64_t initial_offset = 0;
|
||||
|
||||
// element size
|
||||
mutable int64_t bytes_per_element;
|
||||
// number of higher level of dimension of slicing axis
|
||||
mutable int64_t batch_size;
|
||||
// stride of batch in concatenated memory
|
||||
mutable int64_t bytes_batch_stride;
|
||||
// byte size of each iteration per batch in a sliced memory
|
||||
mutable int64_t bytes_iteration;
|
||||
// byte size of each iteration (bytes_iteration * batch_size) in a sliced memory
|
||||
mutable int64_t bytes_iteration_stride;
|
||||
// byte offset of 1st iteration in a batch in a sliced memory
|
||||
mutable int64_t bytes_iteration_initial_offset;
|
||||
std::shared_ptr<primitive_inst> concat_data_prim;
|
||||
std::shared_ptr<primitive_inst> sliced_data_prim;
|
||||
const cldnn::loop::io_primitive_map& io_prim_map;
|
||||
};
|
||||
|
||||
struct backedge_memory_mapping {
|
||||
@@ -420,18 +303,17 @@ private:
|
||||
public:
|
||||
typed_primitive_inst(network& network, const loop_node& node);
|
||||
network::ptr get_body_network() const { return body_network; }
|
||||
void preprocess_input_memory(const int64_t trip_count);
|
||||
void preprocess_output_memory(const int64_t trip_count);
|
||||
void preprocess_input_memory(const int64_t num_iteration);
|
||||
void preprocess_output_memory(const int64_t num_iteration);
|
||||
void preprocess_backedge_memory();
|
||||
void update_mapped_memory();
|
||||
void update_input_mapped_memory();
|
||||
void update_output_mapped_memory();
|
||||
void update_backedge_mapped_memory();
|
||||
void postprocess_output_memory(bool is_dynamic);
|
||||
concatenated_memory_mapping::ptr create_concat_memory_map(const input_info& id,
|
||||
const cldnn::loop::io_primitive_map& io_prim_map,
|
||||
void postprocess_output_memory(bool is_dynamic, int64_t current_iteration);
|
||||
concatenated_memory_mapping::ptr create_concat_memory_map(const cldnn::loop::io_primitive_map& io_prim_map,
|
||||
memory::ptr mem_ptr,
|
||||
const int64_t trip_count);
|
||||
const int64_t num_iteration);
|
||||
event::ptr set_output_memory(memory::ptr mem, bool check = true, size_t idx = 0) override;
|
||||
void reset_memory();
|
||||
|
||||
@@ -442,11 +324,23 @@ public:
|
||||
void update_shape() override { primitive_inst::update_shape(); }
|
||||
void update_output_layout();
|
||||
|
||||
// num_iteration is used for slicing input memory
|
||||
int64_t get_num_iterations();
|
||||
|
||||
std::vector<event::ptr> preprocess_memory_for_body_network(int64_t current_iteration_idx);
|
||||
std::vector<event::ptr> postprocess_memory_for_body_network(int64_t current_iteration_idx);
|
||||
|
||||
private:
|
||||
network::ptr body_network;
|
||||
memory::ptr get_external_memory(const primitive_id& external_id, size_t mem_idx = 0) const;
|
||||
layout get_external_output_layout(const primitive_id& external_id, size_t mem_idx = 0) const;
|
||||
std::shared_ptr<concatenated_memory_mapping> get_sliced_mem(const primitive_id& internal_id) const;
|
||||
int64_t calculate_num_iterations(const cldnn::loop::io_primitive_map& io_prim_map, ov::PartialShape& pshape);
|
||||
std::vector<event::ptr> handle_buffers_for_next_iteration(const backedge_memory_mapping& mapping,
|
||||
network::ptr body_network, int64_t iter);
|
||||
void set_memory_in_body_network(cldnn::network::ptr body_network, const std::shared_ptr<cldnn::primitive_inst>& inst,
|
||||
memory::ptr mem);
|
||||
|
||||
std::vector<loop::io_primitive_map> _input_primitive_maps;
|
||||
std::vector<loop::io_primitive_map> _output_primitive_maps;
|
||||
std::vector<loop::backedge_mapping> _back_edges;
|
||||
|
||||
@@ -13,6 +13,8 @@
|
||||
#include <string>
|
||||
#include <exception>
|
||||
#include <algorithm>
|
||||
#include "openvino/reference/concat.hpp"
|
||||
#include "openvino/reference/split.hpp"
|
||||
|
||||
namespace cldnn {
|
||||
GPU_DEFINE_PRIMITIVE_TYPE_ID(loop)
|
||||
@@ -40,61 +42,6 @@ std::map<size_t, memory::ptr> loop_node::get_memory_deps() const {
|
||||
return memory_deps;
|
||||
}
|
||||
|
||||
static size_t convert_to_raw_axis(size_t axis, size_t ndim) {
|
||||
// convert between bfyx, bfzyx, bfzyxw and tensor.size.raw
|
||||
if (axis >= ndim) {
|
||||
throw std::runtime_error("axis should be less than ndim");
|
||||
}
|
||||
|
||||
if (axis < 2) {
|
||||
return axis;
|
||||
}
|
||||
return (ndim - 1) - (axis - 2);
|
||||
}
|
||||
|
||||
static bool check_if_axis_is_set_properly(loop_node const & node) {
|
||||
const auto& input_primitive_maps = node.get_input_primitive_maps();
|
||||
|
||||
std::vector<std::reference_wrapper<const loop::io_primitive_map>> input_with_axis_iteration;
|
||||
for (const auto& input : input_primitive_maps) {
|
||||
if (input.axis >= 0) {
|
||||
input_with_axis_iteration.push_back(std::cref(input));
|
||||
}
|
||||
}
|
||||
|
||||
// check all iteration axis has the same size
|
||||
const std::vector<std::pair<program_node*, int32_t>>& dependencies = node.get_dependencies();
|
||||
int32_t iteration_size = -1;
|
||||
for (const auto& pm : input_with_axis_iteration) {
|
||||
auto found = std::find_if(dependencies.begin(), dependencies.end(),
|
||||
[&pm](const std::pair<program_node*, int32_t>& dep){ return dep.first->id() == pm.get().external_id.pid; });
|
||||
assert(found != dependencies.end());
|
||||
const layout input_layout = (*found).first->get_output_layout();
|
||||
const auto shape = input_layout.get_tensor().sizes(input_layout.format);
|
||||
const size_t iteration_axis = convert_to_raw_axis(pm.get().axis, static_cast<int32_t>(shape.size()));
|
||||
if (iteration_size < 0) {
|
||||
iteration_size = shape[iteration_axis];
|
||||
} else {
|
||||
if (iteration_size != shape[iteration_axis]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// check if size of iteration axis is 1
|
||||
for (const auto& input_ref : input_with_axis_iteration) {
|
||||
const loop::io_primitive_map& input = input_ref.get();
|
||||
auto dep = std::find_if(dependencies.begin(), dependencies.end(),
|
||||
[&input](const std::pair<program_node*, int>& dep) { return input.external_id.pid == dep.first->id(); });
|
||||
|
||||
// if corresponding external id is not found
|
||||
if (dep == dependencies.end()) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
layout loop_inst::calc_output_layout(loop_node const& /*node*/, kernel_impl_params const& impl_param) {
|
||||
auto prim = impl_param.typed_desc<loop>();
|
||||
|
||||
@@ -293,7 +240,7 @@ void loop_inst::update_input_mapped_memory() {
|
||||
bool is_concatenated_input = (input_map->axis >= 0);
|
||||
if (is_concatenated_input) {
|
||||
for (auto& mem_mapping : concatenated_input_mem_mappings) {
|
||||
if (mem_mapping->sliced_data_prim->id() == input_map->internal_id.pid) {
|
||||
if (mem_mapping->get_sliced_data_prim_id() == input_map->internal_id.pid) {
|
||||
mem_mapping->update_concatenated_mem(memory);
|
||||
break;
|
||||
}
|
||||
@@ -320,7 +267,7 @@ void loop_inst::update_output_mapped_memory() {
|
||||
body_network->get_primitive(internal_id)->set_output_memory(to_mem, true, internal_mem_idx);
|
||||
} else {
|
||||
for (auto& mem_mapping : concatenated_output_mem_mappings) {
|
||||
if (mem_mapping->sliced_data_prim->id() == internal_id) {
|
||||
if (mem_mapping->get_sliced_data_prim_id() == internal_id) {
|
||||
mem_mapping->update_concatenated_mem(to_mem);
|
||||
break;
|
||||
}
|
||||
@@ -398,49 +345,45 @@ event::ptr loop_inst::set_output_memory(memory::ptr mem, bool check, size_t idx)
|
||||
return ev;
|
||||
}
|
||||
|
||||
loop_inst::concatenated_memory_mapping::ptr loop_inst::create_concat_memory_map(const input_info& internal_id,
|
||||
const cldnn::loop::io_primitive_map& io_prim_map,
|
||||
memory::ptr mem_ptr,
|
||||
const int64_t trip_count) {
|
||||
loop_inst::concatenated_memory_mapping::ptr loop_inst::create_concat_memory_map(const cldnn::loop::io_primitive_map& io_prim_map,
|
||||
memory::ptr mem_ptr,
|
||||
const int64_t num_iterations) {
|
||||
const auto& external_id = io_prim_map.external_id;
|
||||
const auto& internal_id = io_prim_map.internal_id;
|
||||
auto& engine = body_network->get_engine();
|
||||
auto& stream = body_network->get_stream();
|
||||
auto prim = body_network->get_primitive(internal_id.pid);
|
||||
const int64_t start = io_prim_map.start < 0? trip_count - 1: io_prim_map.start;
|
||||
|
||||
std::vector<memory::ptr> sliced_mems;
|
||||
int64_t num_elements_iteration = 0;
|
||||
|
||||
// if memory is nullptr, that means memory is not allocated yet because current network is dynamic shape model.
|
||||
// In dynamic model, we can't calculate num_element_iteration, start, and sliced_layout.
|
||||
// will recalculate that parameters in backedge preprocessing map after first execution.
|
||||
if (mem_ptr != nullptr) {
|
||||
layout sliced_layout = prim->output_memory(internal_id.idx).get_layout();
|
||||
auto& out_mem = prim->output_memory(internal_id.idx);
|
||||
layout sliced_layout = out_mem.get_layout();
|
||||
|
||||
// When trip_count is -1, allocate first sliced_mem and allocate sliced memory if additional sliced mem is required
|
||||
if (trip_count < 0) {
|
||||
memory::ptr sliced_mem = engine.allocate_memory(sliced_layout, 0);
|
||||
if (num_iterations < 0) {
|
||||
memory::ptr sliced_mem = engine.allocate_memory(sliced_layout);
|
||||
sliced_mems.push_back(sliced_mem);
|
||||
} else {
|
||||
sliced_mems.reserve(trip_count);
|
||||
for (int j=0; j < trip_count; ++j) {
|
||||
memory::ptr sliced_mem = engine.allocate_memory(sliced_layout, 0);
|
||||
sliced_mems.reserve(num_iterations);
|
||||
for (int j=0; j < num_iterations; ++j) {
|
||||
memory::ptr sliced_mem = engine.allocate_memory(sliced_layout);
|
||||
sliced_mems.push_back(sliced_mem);
|
||||
}
|
||||
}
|
||||
|
||||
const int64_t num_elements_batch = concatenated_memory_mapping::get_batch_size(
|
||||
sliced_layout, io_prim_map.axis);
|
||||
num_elements_iteration = sliced_layout.count() / num_elements_batch;
|
||||
}
|
||||
|
||||
auto concat_memory_mapping = std::make_shared<concatenated_memory_mapping>(
|
||||
io_prim_map.axis, mem_ptr, sliced_mems, stream,
|
||||
engine, num_elements_iteration, io_prim_map.stride, start);
|
||||
concat_memory_mapping->sliced_data_prim = body_network->get_primitive(internal_id.pid);
|
||||
return concat_memory_mapping;
|
||||
auto sliced_data_prim = body_network->get_primitive(internal_id.pid);
|
||||
auto concat_data_prim = get_network().get_primitive(external_id.pid);
|
||||
auto concat_data_id = external_id;
|
||||
return std::make_shared<concatenated_memory_mapping>(mem_ptr, sliced_mems, stream, engine,
|
||||
concat_data_prim, sliced_data_prim, io_prim_map);
|
||||
}
|
||||
|
||||
void loop_inst::preprocess_output_memory(const int64_t trip_count) {
|
||||
void loop_inst::preprocess_output_memory(const int64_t num_iterations) {
|
||||
if (concatenated_output_mem_mappings.empty())
|
||||
concatenated_output_mem_mappings.reserve(_output_primitive_maps.size());
|
||||
for (size_t i = 0; i < _output_primitive_maps.size(); ++i) {
|
||||
@@ -459,12 +402,10 @@ void loop_inst::preprocess_output_memory(const int64_t trip_count) {
|
||||
} else {
|
||||
auto iter = std::find_if(concatenated_output_mem_mappings.begin(), concatenated_output_mem_mappings.end(),
|
||||
[&](loop_inst::concatenated_memory_mapping::ptr concat_mem_map) -> bool {
|
||||
return concat_mem_map->sliced_data_prim->id() == internal_id.pid;
|
||||
return concat_mem_map->get_sliced_data_prim_id() == internal_id.pid;
|
||||
});
|
||||
if (iter == concatenated_output_mem_mappings.end()) {
|
||||
auto memory_mapping_info = create_concat_memory_map(internal_id, output_mapping, memory, trip_count);
|
||||
memory_mapping_info->concat_data_prim = get_network().get_primitive(external_id.pid);
|
||||
memory_mapping_info->concat_data_id = external_id;
|
||||
auto memory_mapping_info = create_concat_memory_map(output_mapping, memory, num_iterations);
|
||||
concatenated_output_mem_mappings.push_back(memory_mapping_info);
|
||||
GPU_DEBUG_LOG << i << ") generate concat output memory mapping: " << memory_mapping_info->to_string() << std::endl;
|
||||
}
|
||||
@@ -475,7 +416,7 @@ void loop_inst::preprocess_output_memory(const int64_t trip_count) {
|
||||
}
|
||||
}
|
||||
|
||||
void loop_inst::preprocess_input_memory(const int64_t trip_count) {
|
||||
void loop_inst::preprocess_input_memory(const int64_t num_iterations) {
|
||||
for (size_t memory_num = 0; memory_num < inputs_memory_count(); memory_num++) {
|
||||
const primitive_id& input_external_id = dependencies().at(memory_num).first->id();
|
||||
auto input_map_ptrs = find_io_primitive_maps(_input_primitive_maps,
|
||||
@@ -499,13 +440,7 @@ void loop_inst::preprocess_input_memory(const int64_t trip_count) {
|
||||
GPU_DEBUG_LOG << i << ") input mapping - external " << external_id.to_string() << std::endl;
|
||||
GPU_DEBUG_LOG << i << ") input mapping - internal " << internal_id.to_string() << std::endl;
|
||||
|
||||
if (input_map->axis >= 0) {
|
||||
OPENVINO_ASSERT(trip_count > 0, "In preprocessing concat input mapping, trip_count should be positive");
|
||||
OPENVINO_ASSERT(memory != nullptr, "In preprocessing concat input mapping, concat memory should be allocated");
|
||||
auto memory_mapping_info = create_concat_memory_map(internal_id, *input_map, memory, trip_count);
|
||||
concatenated_input_mem_mappings.push_back(memory_mapping_info);
|
||||
GPU_DEBUG_LOG << i << ") generate concat input memory mapping: " << memory_mapping_info->to_string() << std::endl;
|
||||
} else {
|
||||
if (input_map->axis < 0) {
|
||||
auto input_inst = body_network->get_primitive(internal_id.pid);
|
||||
if (memory->get_layout() != input_inst->get_output_layout()) {
|
||||
input_inst->set_output_layout(memory->get_layout());
|
||||
@@ -514,6 +449,11 @@ void loop_inst::preprocess_input_memory(const int64_t trip_count) {
|
||||
<< " to " << memory->get_layout().to_short_string() << std::endl;
|
||||
}
|
||||
body_network->set_input_data(internal_id.pid, memory);
|
||||
} else {
|
||||
OPENVINO_ASSERT(memory != nullptr, "In preprocessing concat input mapping, concat memory should be allocated");
|
||||
auto memory_mapping_info = create_concat_memory_map(*input_map, memory, num_iterations);
|
||||
concatenated_input_mem_mappings.push_back(memory_mapping_info);
|
||||
GPU_DEBUG_LOG << i << ") generate concat input memory mapping: " << memory_mapping_info->to_string() << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -605,12 +545,12 @@ void loop_inst::preprocess_backedge_memory() {
|
||||
|
||||
std::shared_ptr<loop_inst::concatenated_memory_mapping> loop_inst::get_sliced_mem(const primitive_id& internal_id) const {
|
||||
for (const auto& mem_mapping : concatenated_input_mem_mappings) {
|
||||
if (mem_mapping->sliced_data_prim->id() == internal_id) {
|
||||
if (mem_mapping->get_sliced_data_prim_id() == internal_id) {
|
||||
return mem_mapping;
|
||||
}
|
||||
}
|
||||
for (const auto& mem_mapping : concatenated_output_mem_mappings) {
|
||||
if (mem_mapping->sliced_data_prim->id() == internal_id) {
|
||||
if (mem_mapping->get_sliced_data_prim_id() == internal_id) {
|
||||
return mem_mapping;
|
||||
}
|
||||
}
|
||||
@@ -625,7 +565,10 @@ void loop_inst::validate_backedges(loop_node const & node) const {
|
||||
for (const auto& back_edge : back_edges) {
|
||||
for (const auto& mapping : input_primitive_maps) {
|
||||
OPENVINO_ASSERT((mapping.internal_id.pid != back_edge.to || mapping.axis < 0),
|
||||
node.id(), ": input with iteration axis should not have backedges");
|
||||
node.id(), ": input with iteration axis should not have backedges external_id: ",
|
||||
mapping.external_id.to_string(), ", internal_id: ", mapping.internal_id.to_string(),
|
||||
", back_edge.to: ", back_edge.to, ", back_edge.from ", back_edge.from,
|
||||
", mapping.axis: ", std::to_string(mapping.axis));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -653,8 +596,6 @@ loop_inst::typed_primitive_inst(network & network, loop_node const & node)
|
||||
const primitive_id& num_iterations_id = node.get_num_iterations_id();
|
||||
OPENVINO_ASSERT(node.get_program().get_node(num_iterations_id).is_type<mutable_data>(),
|
||||
node.id(), ": num_iterations is not mutable_data");
|
||||
OPENVINO_ASSERT(check_if_axis_is_set_properly(node), node.id(), ": axis is not set properly");
|
||||
|
||||
set_inner_networks({body_network});
|
||||
validate_backedges(node);
|
||||
validate_mappings(node);
|
||||
@@ -694,9 +635,11 @@ void loop_inst::load(BinaryInputBuffer& ib) {
|
||||
ib >> _condition_id;
|
||||
ib >> _num_iterations_id;
|
||||
body_network = std::make_shared<cldnn::network>(ib, get_network().get_stream_ptr(), get_network().get_engine(), get_network().is_primary_stream(), 0);
|
||||
// set inner network to the new loaded _impl_params from cache.
|
||||
set_inner_networks({body_network});
|
||||
}
|
||||
|
||||
void loop_inst::postprocess_output_memory(bool is_dynamic) {
|
||||
void loop_inst::postprocess_output_memory(bool is_dynamic, int64_t current_iteration) {
|
||||
if (is_dynamic) {
|
||||
std::vector<cldnn::memory::ptr> external_outputs;
|
||||
external_outputs.resize(outputs_memory_count());
|
||||
@@ -733,7 +676,7 @@ void loop_inst::postprocess_output_memory(bool is_dynamic) {
|
||||
auto iter = std::find_if(concatenated_output_mem_mappings.begin(),
|
||||
concatenated_output_mem_mappings.end(),
|
||||
[&](std::shared_ptr<loop_inst::concatenated_memory_mapping> &concat_output){
|
||||
return concat_output->concat_data_id == external_id;
|
||||
return concat_output->get_external_id() == external_id;
|
||||
});
|
||||
if (iter != concatenated_output_mem_mappings.end()) {
|
||||
(*iter)->update_concatenated_mem(concat_mem);
|
||||
@@ -748,7 +691,7 @@ void loop_inst::postprocess_output_memory(bool is_dynamic) {
|
||||
|
||||
for (size_t i = 0; i < concatenated_output_mem_mappings.size(); ++i) {
|
||||
const auto& concat_output = concatenated_output_mem_mappings.at(i);
|
||||
concat_output->restore_concatenated_mem();
|
||||
concat_output->concat_mem(current_iteration);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -793,4 +736,282 @@ void loop_inst::update_output_layout() {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void loop_inst::concatenated_memory_mapping::slice_mem(const int64_t num_iterations) const {
|
||||
size_t num_iters = static_cast<size_t>(num_iterations);
|
||||
OPENVINO_ASSERT(num_iters > 0 && num_iters == sliced_mems.size(), "num_iterations(", num_iters,
|
||||
") should be same with sliced_mems.size(", sliced_mems.size(), ")");
|
||||
OPENVINO_ASSERT(concatenated_mem != nullptr, "concatenated_mem should not be nullptr");
|
||||
|
||||
auto elem_size = ov::element::Type(concatenated_mem->get_layout().data_type).size();
|
||||
auto concat_mem_shape = concatenated_mem->get_layout().get_shape();
|
||||
auto sliced_mem_shape = sliced_mems.front()->get_layout().get_shape();
|
||||
const auto stride = io_prim_map.stride;
|
||||
const auto axis = io_prim_map.axis;
|
||||
const auto step = std::abs(stride);
|
||||
OPENVINO_ASSERT((static_cast<size_t>(step) == sliced_mem_shape[axis])
|
||||
&& (concat_mem_shape[axis] >= num_iterations * sliced_mem_shape[axis]),
|
||||
"slice_mem: concat_mem_shape[axis(", axis, "),step(", step, ")](",
|
||||
concat_mem_shape.to_string(), ") != num_iterations(",
|
||||
num_iterations, ") * sliced_mem_shape[axis](", sliced_mem_shape.to_string(), ")");
|
||||
std::vector<char*> pointers_to_data(num_iters);
|
||||
for (size_t i = 0; i < num_iters; i++) {
|
||||
auto mem = sliced_mems[i];
|
||||
pointers_to_data[stride > 0 ? i : (num_iters - i - 1)] = reinterpret_cast<char*>(mem->lock(stream));
|
||||
}
|
||||
char* concat_data = reinterpret_cast<char*>(concatenated_mem->lock(stream, cldnn::mem_lock_type::read));
|
||||
ov::reference::split(concat_data, concat_mem_shape, elem_size, axis, num_iters, pointers_to_data.data());
|
||||
|
||||
for (size_t i = 0; i < num_iters; i++) {
|
||||
sliced_mems[i]->unlock(stream);
|
||||
}
|
||||
concatenated_mem->unlock(stream);
|
||||
GPU_DEBUG_LOG << "slice memory [" << io_prim_map.to_short_string() << "] from concat_mem["
|
||||
<< concatenated_mem->get_layout().to_short_string()
|
||||
<< "], current_iteration: " << num_iterations << ", stride: " << stride
|
||||
<< " to sliced_mems[" << sliced_mems.front()->get_layout().to_short_string() << "]" << std::endl;
|
||||
}
|
||||
|
||||
void loop_inst::concatenated_memory_mapping::concat_mem(const int64_t curent_iterations) const {
|
||||
size_t curr_iters = static_cast<size_t>(curent_iterations);
|
||||
OPENVINO_ASSERT(sliced_mems.size() >= curr_iters, "curent_iterations(", curr_iters,
|
||||
") should be less than the number of sliced_mems(", sliced_mems.size(), ")");
|
||||
OPENVINO_ASSERT(concatenated_mem != nullptr, "concatenated_mem should not be nullptr");
|
||||
|
||||
auto elem_size = ov::element::Type(concatenated_mem->get_layout().data_type).size();
|
||||
auto concat_mem_shape = concatenated_mem->get_layout().get_shape();
|
||||
auto sliced_mem_shape = sliced_mems.front()->get_layout().get_shape();
|
||||
const auto stride = io_prim_map.stride;
|
||||
const auto axis = io_prim_map.axis;
|
||||
const auto step = std::abs(stride);
|
||||
OPENVINO_ASSERT((static_cast<size_t>(step) == sliced_mem_shape[axis])
|
||||
&& (concat_mem_shape[axis] >= curent_iterations * sliced_mem_shape[axis]),
|
||||
"concat_mem: concat_mem_shape[axis(", axis, "),step(", step, ")](",
|
||||
concat_mem_shape.to_string(), ") != curent_iterations(",
|
||||
curent_iterations, ") * sliced_mem_shape[axis](", sliced_mem_shape.to_string(), ")");
|
||||
std::vector<ov::Shape> shapes_to_concat(curr_iters, sliced_mem_shape);
|
||||
std::vector<const char*> pointers_to_data(curr_iters);
|
||||
for (size_t i = 0; i < curr_iters; i++) {
|
||||
auto mem = sliced_mems[i];
|
||||
pointers_to_data[stride > 0 ? i : (curr_iters - i - 1)] = reinterpret_cast<const char*>(mem->lock(stream));
|
||||
}
|
||||
|
||||
char* concat_data = reinterpret_cast<char*>(concatenated_mem->lock(stream));
|
||||
ov::reference::concat(pointers_to_data, concat_data, shapes_to_concat, concat_mem_shape, axis, elem_size);
|
||||
|
||||
for (size_t i = 0; i < curr_iters; i++) {
|
||||
sliced_mems[i]->unlock(stream);
|
||||
}
|
||||
concatenated_mem->unlock(stream);
|
||||
GPU_DEBUG_LOG << "concatenate memory [" << io_prim_map.to_short_string() << "] from sliced_mems["
|
||||
<< sliced_mems.front()->get_layout().to_short_string() << "], current_iteration: "
|
||||
<< curent_iterations << ", stride: " << stride << " to concat_mem["
|
||||
<< concatenated_mem->get_layout().to_short_string() << "]" << std::endl;
|
||||
}
|
||||
|
||||
int64_t loop_inst::calculate_num_iterations(const cldnn::loop::io_primitive_map& io_prim_map,
|
||||
ov::PartialShape& pshape) {
|
||||
OPENVINO_ASSERT(io_prim_map.stride != 0, "stride should not be zero");
|
||||
const auto space = pshape[io_prim_map.axis].get_length();
|
||||
const auto start = (io_prim_map.start < 0? (space + 1) : 0) + io_prim_map.start;
|
||||
const auto end = (io_prim_map.end < 0? (space + 1) : 0) + io_prim_map.end;
|
||||
const auto step = std::abs(io_prim_map.stride);
|
||||
const auto src = io_prim_map.stride < 0 ? end : start;
|
||||
const auto dst = io_prim_map.stride < 0 ? start : end;
|
||||
const auto len = dst - src;
|
||||
OPENVINO_ASSERT(src >= 0 && dst > src && dst <= space && len >= static_cast<long>(step),
|
||||
"invalid values in an iteration component start:",
|
||||
io_prim_map.start, ", end: ", io_prim_map.end, ", stride:",
|
||||
io_prim_map.stride, ", axis: ", io_prim_map.axis, ", dst: ",
|
||||
dst, ", src: ", src, ", space: ", space, ", len: ",
|
||||
len, ", step: ", step, ", pshape: ", pshape.to_string());
|
||||
OPENVINO_ASSERT(len % step == 0, "Each iteration should have same size: length(", len, ") % step(", step, ")");
|
||||
int64_t num_iterations = static_cast<int64_t>(len / step);
|
||||
{
|
||||
GPU_DEBUG_LOG << "Caculate num_iterations ..." << std::endl;
|
||||
GPU_DEBUG_LOG << "* io_prim_map.{start:" << io_prim_map.start << ", end:" << io_prim_map.end
|
||||
<< ", stride: " << io_prim_map.stride << ", axis: " << io_prim_map.axis << "}" << std::endl;
|
||||
GPU_DEBUG_LOG << "* pshape : " << pshape.to_string() << std::endl;
|
||||
GPU_DEBUG_LOG << "* space : " << space << std::endl;
|
||||
GPU_DEBUG_LOG << "* start : " << start << std::endl;
|
||||
GPU_DEBUG_LOG << "* end : " << end << std::endl;
|
||||
GPU_DEBUG_LOG << "* step : " << step << std::endl;
|
||||
GPU_DEBUG_LOG << "* src : " << src << std::endl;
|
||||
GPU_DEBUG_LOG << "* dst : " << dst << std::endl;
|
||||
GPU_DEBUG_LOG << "* len : " << len << std::endl;
|
||||
GPU_DEBUG_LOG << "* num_iterations : " << num_iterations << std::endl;
|
||||
}
|
||||
return num_iterations;
|
||||
}
|
||||
|
||||
int64_t loop_inst::get_num_iterations() {
|
||||
int64_t num_iterations = -1;
|
||||
bool is_default_num_iter = true;
|
||||
for (auto& input_map : _input_primitive_maps) {
|
||||
if (input_map.axis == -1)
|
||||
continue;
|
||||
const auto& external_id = input_map.external_id;
|
||||
auto exteranl_input_inst = get_network().get_primitive(external_id.pid);
|
||||
auto concat_shape = exteranl_input_inst->get_output_layout(external_id.idx).get_partial_shape();
|
||||
|
||||
if (concat_shape[input_map.axis].get_length() == 0)
|
||||
continue;
|
||||
|
||||
const auto current_num_iterations = calculate_num_iterations(input_map, concat_shape);
|
||||
if (is_default_num_iter) {
|
||||
is_default_num_iter = false;
|
||||
num_iterations = current_num_iterations;
|
||||
}
|
||||
OPENVINO_ASSERT(num_iterations == current_num_iterations,
|
||||
"iteration num shuld be same between ", num_iterations, " and ", current_num_iterations);
|
||||
}
|
||||
|
||||
for (auto& output_map : _output_primitive_maps) {
|
||||
if (output_map.axis == -1)
|
||||
continue;
|
||||
|
||||
const auto& external_id = output_map.external_id;
|
||||
auto exteranl_output_inst = get_network().get_primitive(external_id.pid);
|
||||
auto concat_shape = exteranl_output_inst->get_output_layout(external_id.idx).get_partial_shape();
|
||||
|
||||
if (concat_shape[output_map.axis].is_dynamic() || concat_shape[output_map.axis].get_length() == 0)
|
||||
continue;
|
||||
|
||||
const auto current_num_iterations = calculate_num_iterations(output_map, concat_shape);
|
||||
if (is_default_num_iter) {
|
||||
is_default_num_iter = false;
|
||||
num_iterations = current_num_iterations;
|
||||
}
|
||||
OPENVINO_ASSERT(num_iterations == current_num_iterations,
|
||||
"iteration num shuld be same between ", num_iterations, " and ", current_num_iterations);
|
||||
}
|
||||
return num_iterations;
|
||||
}
|
||||
|
||||
void loop_inst::set_memory_in_body_network(cldnn::network::ptr body_network,
|
||||
const std::shared_ptr<cldnn::primitive_inst>& inst, memory::ptr mem) {
|
||||
if (inst->is_input()) {
|
||||
body_network->set_input_data(inst->id(), mem);
|
||||
} else if (inst->is_output()) {
|
||||
body_network->set_output_memory(inst->id(), mem);
|
||||
} else {
|
||||
inst->set_output_memory(mem, false);
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<event::ptr> loop_inst::handle_buffers_for_next_iteration(const loop_inst::backedge_memory_mapping& mapping,
|
||||
network::ptr body_network, int64_t iter) {
|
||||
std::vector<event::ptr> event_vec;
|
||||
OPENVINO_ASSERT(iter >= 0, "iteration should not be negative : ", iter);
|
||||
if (mapping.type == loop_inst::backedge_memory_mapping::CONCAT_OUTPUT) {
|
||||
if (iter == 0) {
|
||||
set_memory_in_body_network(body_network, mapping.to_primitive, mapping.initial_mem);
|
||||
} else if (iter > 0) {
|
||||
if (is_dynamic()) {
|
||||
auto from_id = mapping.from_primitive->id();
|
||||
if (body_network->has_event(from_id)) {
|
||||
auto ev = body_network->get_primitive_event(from_id);
|
||||
if (ev) ev->wait();
|
||||
}
|
||||
// In dynamic model, just copy data from inner body output to inner body input in back_edges.
|
||||
memory::ptr to_mem = mapping.to_primitive->output_memory_ptr();
|
||||
memory::ptr from_mem = mapping.from_primitive->output_memory_ptr();
|
||||
auto ev = to_mem->copy_from(body_network->get_stream(), *(from_mem));
|
||||
if (ev) event_vec = {ev};
|
||||
} else {
|
||||
auto mem = mapping.concat_mem_mapping->get_sliced_mems().at(iter - 1);
|
||||
set_memory_in_body_network(body_network, mapping.to_primitive, mem);
|
||||
}
|
||||
}
|
||||
} else if (mapping.type == loop_inst::backedge_memory_mapping::SINGLE_SHARED) {
|
||||
if (iter == 0) {
|
||||
if (mapping.from_mem != nullptr) {
|
||||
auto ev = mapping.from_mem->copy_from(body_network->get_stream(), *(mapping.initial_mem));
|
||||
if (ev) event_vec = {ev};
|
||||
}
|
||||
} else {
|
||||
// In dynamic model, output memory is not defined before execution.
|
||||
// After body network execution, replace input memory from initial_mem(external input memory) to output memory.
|
||||
if (mapping.from_mem == nullptr) {
|
||||
mapping.from_mem = mapping.from_primitive->output_memory_ptr();
|
||||
OPENVINO_ASSERT(mapping.from_mem != nullptr, "from_mem should not be null");
|
||||
set_memory_in_body_network(body_network, mapping.to_primitive, mapping.from_mem);
|
||||
}
|
||||
}
|
||||
} else if (mapping.type == loop_inst::backedge_memory_mapping::SINGLE) {
|
||||
memory::ptr to_mem = mapping.to_primitive->output_memory_ptr();
|
||||
if (iter == 0) {
|
||||
auto ev = to_mem->copy_from(body_network->get_stream(), *(mapping.initial_mem));
|
||||
if (ev) event_vec = {ev};
|
||||
} else {
|
||||
if (is_dynamic()) {
|
||||
// In dynamic model, do not set memory buffer between input and output in inner body network.
|
||||
// Just copy data from input buffer memory to output buffer memory.
|
||||
auto from_id = mapping.from_primitive->id();
|
||||
if (body_network->has_event(from_id)) {
|
||||
auto ev = body_network->get_primitive_event(from_id);
|
||||
if (ev) ev->wait();
|
||||
}
|
||||
memory::ptr from_mem = mapping.from_primitive->output_memory_ptr();
|
||||
auto ev = to_mem->copy_from(body_network->get_stream(), *(from_mem));
|
||||
if (ev) event_vec = {ev};
|
||||
} else {
|
||||
// In static model, swap memory buffer between output and input in inner body network
|
||||
memory::ptr from_mem = mapping.from_primitive->output_memory_ptr();
|
||||
set_memory_in_body_network(body_network, mapping.to_primitive, std::move(from_mem));
|
||||
set_memory_in_body_network(body_network, mapping.from_primitive, std::move(to_mem));
|
||||
}
|
||||
}
|
||||
}
|
||||
return event_vec;
|
||||
}
|
||||
|
||||
std::vector<event::ptr> loop_inst::preprocess_memory_for_body_network(int64_t current_iteration_idx) {
|
||||
std::vector<event::ptr> events;
|
||||
// Copy & Set sliced input memory
|
||||
for (size_t i = 0; i < concatenated_input_mem_mappings.size(); ++i) {
|
||||
const auto& concatenated_input = concatenated_input_mem_mappings.at(i);
|
||||
memory::ptr mem = concatenated_input->get_sliced_mem(current_iteration_idx);
|
||||
OPENVINO_ASSERT(mem != nullptr, id(), " sliced input memory of loop is not allocated properly");
|
||||
concatenated_input->get_sliced_data_prim()->set_output_memory(mem);
|
||||
}
|
||||
|
||||
// Set backedges and output memory
|
||||
for (auto& backedge_memory_mapping : backedge_memory_mappings) {
|
||||
auto event_vec = handle_buffers_for_next_iteration(backedge_memory_mapping, body_network, current_iteration_idx);
|
||||
for (auto ev : event_vec) {
|
||||
events.push_back(ev);
|
||||
}
|
||||
}
|
||||
|
||||
if (!is_dynamic()) {
|
||||
// Set sliced output memory for static shape model
|
||||
// because body network generate output memory during the body network execution in dynamic model
|
||||
for (const auto& concat_output_mem_mapping : concatenated_output_mem_mappings) {
|
||||
concat_output_mem_mapping->setup_sliced_output_memory(current_iteration_idx);
|
||||
}
|
||||
}
|
||||
return events;
|
||||
}
|
||||
|
||||
std::vector<event::ptr> loop_inst::postprocess_memory_for_body_network(int64_t current_iteration_idx) {
|
||||
std::vector<event::ptr> events;
|
||||
for (const auto& concat_output_mem_mapping : concatenated_output_mem_mappings) {
|
||||
auto sliced_data_prim = concat_output_mem_mapping->get_sliced_data_prim();
|
||||
auto output_mem_ptr = sliced_data_prim->output_memory_ptr();
|
||||
|
||||
auto sliced_id = sliced_data_prim->id();
|
||||
if (body_network->has_event(sliced_id)) {
|
||||
auto ev = body_network->get_primitive_event(sliced_id);
|
||||
if (ev) ev->wait();
|
||||
}
|
||||
memory::ptr new_sliced_mem = concat_output_mem_mapping->get_or_create_sliced_mem(current_iteration_idx,
|
||||
output_mem_ptr->get_layout());
|
||||
auto ev = new_sliced_mem->copy_from(body_network->get_stream(), *output_mem_ptr);
|
||||
if (ev) {
|
||||
events.push_back(ev);
|
||||
}
|
||||
}
|
||||
return events;
|
||||
}
|
||||
} // namespace cldnn
|
||||
|
||||
@@ -199,7 +199,6 @@ static void CreateCommonLoopOp(ProgramBuilder& p, const std::shared_ptr<ov::op::
|
||||
bool is_dynamic = p.use_new_shape_infer() || op->is_dynamic();
|
||||
|
||||
int64_t num_iterations = op->get_num_iterations();
|
||||
OPENVINO_ASSERT((is_dynamic || num_iterations > 0), "loop's num_iteration should be positive on static shape model");
|
||||
|
||||
auto num_outputs = is_dynamic? op->get_output_size() : 1;
|
||||
auto ov_model = op->get_function();
|
||||
@@ -280,7 +279,7 @@ static void CreateCommonLoopOp(ProgramBuilder& p, const std::shared_ptr<ov::op::
|
||||
config.set_property(ov::intel_gpu::allow_new_shape_infer(is_dynamic));
|
||||
|
||||
// get body program from ov::Model
|
||||
ProgramBuilder prog(ov_model, p.get_engine(), config, false, false, p.get_task_executor(), p.get_compilation_context(), true);
|
||||
ProgramBuilder prog(ov_model, p.get_engine(), config, false, false, p.get_task_executor(), p.get_compilation_context(), true);
|
||||
auto body_program = prog.get_compiled_program();
|
||||
|
||||
GPU_DEBUG_LOG << "* trip_count_id : " << trip_count_id << std::endl;
|
||||
|
||||
@@ -28,7 +28,8 @@ static program::ptr build_program(engine& engine,
|
||||
topology& body_topology,
|
||||
primitive_id execution_condition_id,
|
||||
std::vector<loop::io_primitive_map> output_primitive_maps,
|
||||
std::vector<loop::backedge_mapping> back_edges) {
|
||||
std::vector<loop::backedge_mapping> back_edges,
|
||||
bool allow_new_shape_infer = false) {
|
||||
std::vector<cldnn::primitive_id> output_names_vec;
|
||||
for (auto out_map : output_primitive_maps) {
|
||||
output_names_vec.push_back(out_map.internal_id.pid);
|
||||
@@ -48,6 +49,7 @@ static program::ptr build_program(engine& engine,
|
||||
config.set_property(ov::intel_gpu::optimize_data(true));
|
||||
config.set_property(ov::intel_gpu::custom_outputs(output_names_vec));
|
||||
config.set_property(ov::intel_gpu::max_dynamic_batch(1));
|
||||
config.set_property(ov::intel_gpu::allow_new_shape_infer(allow_new_shape_infer));
|
||||
|
||||
return program::build_program(engine, body_topology, config, false, false, true);
|
||||
}
|
||||
@@ -296,7 +298,7 @@ void test_loop_gpu_basic_concat_nested(bool is_caching_test)
|
||||
// set inner loop body
|
||||
/////////////////////////////////
|
||||
topology inner_loop_body(
|
||||
input_layout("inner_input", input_mem->get_layout()),
|
||||
input_layout("inner_input", { { 1, 1, 1, 4 }, data_types::f32, format::bfyx }),
|
||||
data("inner_eltwise_operand", inner_operand_mem),
|
||||
eltwise("inner_eltwise", input_info("inner_input"), input_info("inner_eltwise_operand"), eltwise_mode::sum)
|
||||
);
|
||||
@@ -428,3 +430,121 @@ TEST(loop_gpu, basic_concat_cached) {
|
||||
TEST(loop_gpu, basic_concat_nested_cached) {
|
||||
test_loop_gpu_basic_concat_nested<float>(true);
|
||||
}
|
||||
|
||||
static void test_loop_gpu_wo_trip_count(bool is_caching_test) {
|
||||
auto& engine = get_test_engine();
|
||||
|
||||
auto e_input_layout = cldnn::layout{ { 1, 1, 5, 4 }, data_types::f32, format::bfyx };
|
||||
auto b_input_layout = cldnn::layout{ { 1, 1, 1, 4}, data_types::f32, format::bfyx };
|
||||
auto const_layout = cldnn::layout{ {}, data_types::i64, format::bfyx };
|
||||
|
||||
auto e_input_mem = engine.allocate_memory(e_input_layout); // b,f,x,y
|
||||
auto e_initial_condition_mem = engine.allocate_memory(const_layout);
|
||||
auto e_num_iteration_mem = engine.allocate_memory(const_layout);
|
||||
auto b_exit_value_mem = engine.allocate_memory(const_layout);
|
||||
auto b_index_inc_mem = engine.allocate_memory(const_layout);
|
||||
|
||||
std::vector<float> input_data{
|
||||
1.0f, 2.0f, -15.f, 3.0f,
|
||||
4.0f, -15.f, 5.0f, 6.0f,
|
||||
-15.f, 7.0f, -15.f, 0.0f,
|
||||
0.0f, -15.f, 0.5f, -0.5f,
|
||||
-15.f, 8.0f, 1.5f, 5.2f
|
||||
};
|
||||
|
||||
const int64_t exit_value = 3;
|
||||
|
||||
// initialize input buffers
|
||||
set_values(e_input_mem, input_data);
|
||||
set_values(e_initial_condition_mem, {1});
|
||||
set_values(b_exit_value_mem, {exit_value});
|
||||
set_values(b_index_inc_mem, {1});
|
||||
|
||||
primitive_id body_current_iteration_id = "b_index";
|
||||
primitive_id body_execution_condition_id = "b_cond_exit_value";
|
||||
|
||||
cldnn::topology body(
|
||||
input_layout(body_current_iteration_id, const_layout),
|
||||
input_layout("b_add_data", b_input_layout),
|
||||
input_layout("b_mul_data", b_input_layout),
|
||||
data("b_exit_value", b_exit_value_mem),
|
||||
data("b_index_inc", b_index_inc_mem),
|
||||
eltwise("b_index_update", input_info(body_current_iteration_id), input_info("b_index_inc"), eltwise_mode::sum),
|
||||
reorder("b_index_cast", input_info("b_index_update"),
|
||||
cldnn::format::any, data_types::f32, {}, cldnn::reorder_mean_mode::subtract, cldnn::padding(), true),
|
||||
eltwise(body_execution_condition_id, input_info("b_index"), input_info("b_exit_value"), eltwise_mode::lt),
|
||||
eltwise("b_add", input_info("b_add_data"), input_info("b_index_cast"), eltwise_mode::sum),
|
||||
eltwise("b_mul", input_info("b_mul_data"), input_info("b_index_cast"), eltwise_mode::prod)
|
||||
);
|
||||
|
||||
primitive_id trip_count_id = "";
|
||||
primitive_id actual_iteration_count_id = "actual_iteration_count";
|
||||
primitive_id initial_condition_id = "initial_condition";
|
||||
int64_t num_iterations = -1;
|
||||
|
||||
std::vector<loop::io_primitive_map> input_primitive_maps {
|
||||
loop::io_primitive_map("input", "b_add_data", 2),
|
||||
loop::io_primitive_map("input", "b_mul_data", 2),
|
||||
loop::io_primitive_map(actual_iteration_count_id, body_current_iteration_id) };
|
||||
std::vector<loop::io_primitive_map> output_primitive_maps {
|
||||
loop::io_primitive_map(cldnn::input_info("loop", 0), cldnn::input_info("b_add", 0), 2),
|
||||
loop::io_primitive_map(cldnn::input_info("loop", 1), cldnn::input_info("b_mul", 0), 2) };
|
||||
std::vector<loop::backedge_mapping> back_edges {
|
||||
loop::backedge_mapping("b_index_update", body_current_iteration_id) };
|
||||
|
||||
auto body_program = build_program(engine, body, body_execution_condition_id, output_primitive_maps, back_edges, true);
|
||||
|
||||
cldnn::topology topology(
|
||||
input_layout("input", e_input_layout),
|
||||
input_layout(initial_condition_id, e_initial_condition_mem->get_layout()),
|
||||
mutable_data(actual_iteration_count_id, e_num_iteration_mem),
|
||||
loop("loop", { input_info(actual_iteration_count_id), input_info("input") }, body_program,
|
||||
trip_count_id, initial_condition_id, actual_iteration_count_id,
|
||||
input_primitive_maps, output_primitive_maps, back_edges,
|
||||
num_iterations, body_current_iteration_id, body_execution_condition_id, 2),
|
||||
eltwise("out_sum", input_info("loop", 0), input_info("loop", 1), eltwise_mode::sum)
|
||||
);
|
||||
|
||||
ExecutionConfig config = get_test_default_config(engine);
|
||||
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
|
||||
|
||||
cldnn::network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test);
|
||||
network->set_input_data("input", e_input_mem);
|
||||
network->set_input_data(initial_condition_id, e_initial_condition_mem);
|
||||
|
||||
auto outputs = network->execute();
|
||||
ASSERT_EQ(outputs.size(), 1);
|
||||
|
||||
auto expected_num_iterations = (exit_value + 1);
|
||||
|
||||
auto num_iter_mem = network->get_output_memory(actual_iteration_count_id);
|
||||
if (num_iter_mem != nullptr) {
|
||||
mem_lock<int64_t> num_iter_ptr{ num_iter_mem, get_test_stream() };
|
||||
ASSERT_EQ(num_iter_ptr.data()[0], expected_num_iterations);
|
||||
}
|
||||
|
||||
std::vector<float> expected(input_data.size());
|
||||
for (size_t j = 0; j < input_data.size(); j++) {
|
||||
auto val = static_cast<size_t>(j / 4) + 1;
|
||||
expected[j] = static_cast<float>(input_data[j] + val) + static_cast<float>(input_data[j] * val);
|
||||
}
|
||||
|
||||
auto output_mem = outputs.begin()->second.get_memory();
|
||||
auto output_layout = output_mem->get_layout();
|
||||
|
||||
ASSERT_EQ(output_layout.batch(), 1);
|
||||
ASSERT_EQ(output_layout.feature(), 1);
|
||||
ASSERT_EQ(output_layout.spatial(0), 4);
|
||||
ASSERT_EQ(output_layout.spatial(1), expected_num_iterations);
|
||||
// value check
|
||||
{
|
||||
mem_lock<float> output_ptr{ output_mem, get_test_stream() };
|
||||
for (size_t i = 0, iend = output_layout.count(); i < iend; ++i) {
|
||||
ASSERT_FLOAT_EQ(output_ptr[i], expected.at(i));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Verifies that a dynamic TensorIterator-style loop (no trip_count primitive,
// num_iterations == -1) executes correctly when driven purely by the body's
// execution condition. Delegates to the shared helper with caching disabled.
TEST(loop_gpu, support_dynamic_tensoriterator) {
    test_loop_gpu_wo_trip_count(false);
}
|
||||
|
||||
Reference in New Issue
Block a user