[GPU] Improve OOOQ synchronization between ShapeOf subgraph's CPU impls and GPU kernels (#20595)

* [GPU] Add ITT counters for primitive inst

* [GPU] Improve OOOQ synchronization between ShapeOf subgraph's CPU impls and GPU kernels
This commit is contained in:
Sergey Shlyapnikov 2023-10-24 18:49:41 +04:00 committed by GitHub
parent 20bd58759a
commit 26cab79988
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -242,6 +242,7 @@ event::ptr primitive_inst::set_output_memory(memory::ptr mem_new, bool check, si
}
void primitive_inst::update_shape() {
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("update_shape: " + id()));
GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::shape_inference);
if (update_shape_done_by_other) {
update_shape_done_by_other = false; // reset
@ -341,6 +342,7 @@ void primitive_inst::update_shape() {
}
if (has_runtime_deps) {
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("update_shape_sync: " + id()));
if (!dependencies_events.empty() && queue_type == QueueTypes::out_of_order) {
_network.get_stream().wait_for_events(dependencies_events);
} else if (queue_type == QueueTypes::in_order) {
@ -380,6 +382,7 @@ void primitive_inst::update_shape() {
}
event::ptr primitive_inst::realloc_if_needed() {
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("realloc_if_needed: " + id()));
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::memory_allocation);
@ -493,6 +496,7 @@ bool primitive_inst::use_async_compilation() {
}
bool primitive_inst::update_impl() {
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("update_impl: " + id()));
GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::update_implementation);
auto prev_impl_str = _impl != nullptr ? _impl->get_kernel_name() : "nullptr";
@ -656,6 +660,7 @@ bool primitive_inst::update_impl() {
}
void primitive_inst::do_runtime_skip_reorder() {
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("do_runtime_skip_reorder: " + id()));
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->disable_runtime_skip_reorder) {
return;
@ -713,6 +718,7 @@ void primitive_inst::do_runtime_skip_reorder() {
}
void primitive_inst::do_runtime_in_place_concat() {
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("do_runtime_in_place_concat: " + id()));
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->disable_runtime_buffer_fusing) {
return;
@ -780,6 +786,7 @@ bool primitive_inst::has_inner_networks() const {
}
event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("primitive_inst::execute: " + id()));
const auto primitive_id = id();
OPENVINO_ASSERT(_has_valid_input, primitive_id, " has invalid/unset input");
GPU_DEBUG_GET_INSTANCE(debug_config);
@ -802,6 +809,7 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
}
if (!is_valid_fusion()) {
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("unfused_subgraph_exec: " + id()));
auto subgraph = get_unfused_subgraph();
for (auto& d : _deps) {
@ -859,16 +867,16 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
GPU_DEBUG_TRACE << id() << ": execute " << _impl->get_kernel_name() << " (is_dynamic=" << _impl->is_dynamic() << ", "
<< "can_be_optimized=" << can_be_optimized() << ")" << std::endl;
const bool out_of_order_queue = get_network().get_stream().get_queue_type() == QueueTypes::out_of_order;
if (_exec_deps.empty() && dependencies.empty()) {
dependencies = events;
} else {
auto queue_type = get_network().get_stream().get_queue_type();
// Prepare dependencies events in case of OOO queue, CPU implementation,
// or optimized_out impl which has CPU users (needs_completion_event() && !is_output() condition)
if (queue_type == QueueTypes::out_of_order || _impl->is_cpu() || (can_be_optimized() && needs_completion_event() && !is_output())) {
if (out_of_order_queue || _impl->is_cpu() || (can_be_optimized() && needs_completion_event() && !is_output())) {
dependencies.reserve(dependencies.size() + _exec_deps.size());
for (auto& input : _exec_deps) {
if (input->is_input() && queue_type != QueueTypes::out_of_order)
if (input->is_input() && !out_of_order_queue)
continue;
auto id = input->id();
try {
@ -883,6 +891,13 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
}
}
// Replace multiple events with single grouped event in case of barriers synchronization to prevent `_last_barrier_ev` usage as a dependency
// event of optimized_out instance's users, which may lead to unwanted extra synchronization of CPU impls with GPU kernels
if (_node && _node->is_in_shape_of_subgraph() && can_be_optimized() && dependencies.size() > 1 && out_of_order_queue) {
auto grouped_ev = get_network().get_stream().group_events(dependencies);
dependencies = {grouped_ev};
}
{
GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::inference);
auto ev = _impl->execute(dependencies, *this);
@ -905,6 +920,7 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
}
void primitive_inst::set_arguments() {
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("set_arguments: " + id()));
GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::set_arguments);
OPENVINO_ASSERT(_has_valid_input, id(), " has invalid/unset input");
_impl->set_arguments(*this);
@ -1138,6 +1154,7 @@ void primitive_inst::allocate_internal_buffers(bool reset) {
}
event::ptr primitive_inst::update_weights() {
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("update_weights: " + id()));
GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::update_weights);
if (!_impl)
return nullptr;