[GPU] Improve OOOQ synchronization between ShapeOf subgraph's CPU impls and GPU kernels (#20595)

* [GPU] Add ITT counters for primitive inst * [GPU] Improve OOOQ synchronization between ShapeOf subgraph's CPU impls and GPU kernels
2023-10-24 18:49:41 +04:00 · 2023-10-24 18:49:41 +04:00 · 26cab79988
commit 26cab79988
parent 20bd58759a
1 changed files with 20 additions and 3 deletions
--- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
+++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
@ -242,6 +242,7 @@ event::ptr primitive_inst::set_output_memory(memory::ptr mem_new, bool check, si
 }

 void primitive_inst::update_shape() {
+    OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("update_shape: " + id()));
    GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::shape_inference);
    if (update_shape_done_by_other) {
        update_shape_done_by_other = false; // reset
@ -341,6 +342,7 @@ void primitive_inst::update_shape() {
    }

    if (has_runtime_deps) {
+        OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("update_shape_sync: " + id()));
        if (!dependencies_events.empty() && queue_type == QueueTypes::out_of_order) {
            _network.get_stream().wait_for_events(dependencies_events);
        } else if (queue_type == QueueTypes::in_order) {
@ -380,6 +382,7 @@ void primitive_inst::update_shape() {
 }

 event::ptr primitive_inst::realloc_if_needed() {
+    OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("realloc_if_needed: " + id()));
    GPU_DEBUG_GET_INSTANCE(debug_config);
    GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::memory_allocation);

@ -493,6 +496,7 @@ bool primitive_inst::use_async_compilation() {
 }

 bool primitive_inst::update_impl() {
+    OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("update_impl: " + id()));
    GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::update_implementation);
    auto prev_impl_str =  _impl != nullptr ? _impl->get_kernel_name() : "nullptr";

@ -656,6 +660,7 @@ bool primitive_inst::update_impl() {
 }

 void primitive_inst::do_runtime_skip_reorder() {
+    OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("do_runtime_skip_reorder: " + id()));
    GPU_DEBUG_GET_INSTANCE(debug_config);
    GPU_DEBUG_IF(debug_config->disable_runtime_skip_reorder) {
        return;
@ -713,6 +718,7 @@ void primitive_inst::do_runtime_skip_reorder() {
 }

 void primitive_inst::do_runtime_in_place_concat() {
+    OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("do_runtime_in_place_concat: " + id()));
    GPU_DEBUG_GET_INSTANCE(debug_config);
    GPU_DEBUG_IF(debug_config->disable_runtime_buffer_fusing) {
        return;
@ -780,6 +786,7 @@ bool primitive_inst::has_inner_networks() const {
 }

 event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
+    OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("primitive_inst::execute: " + id()));
    const auto primitive_id = id();
    OPENVINO_ASSERT(_has_valid_input, primitive_id, " has invalid/unset input");
    GPU_DEBUG_GET_INSTANCE(debug_config);
@ -802,6 +809,7 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
        }

        if (!is_valid_fusion()) {
+            OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("unfused_subgraph_exec: " + id()));
            auto subgraph = get_unfused_subgraph();

            for (auto& d : _deps) {
@ -859,16 +867,16 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
    GPU_DEBUG_TRACE << id() << ": execute " << _impl->get_kernel_name() << " (is_dynamic=" << _impl->is_dynamic() << ", "
                    << "can_be_optimized=" << can_be_optimized() << ")" << std::endl;

+    const bool out_of_order_queue = get_network().get_stream().get_queue_type() == QueueTypes::out_of_order;
    if (_exec_deps.empty() && dependencies.empty()) {
        dependencies = events;
    } else {
-        auto queue_type = get_network().get_stream().get_queue_type();
        // Prepare dependencies events in case of OOO queue, CPU implementation,
        // or optimized_out impl which has CPU users (needs_completion_event() && !is_output() condition)
-        if (queue_type == QueueTypes::out_of_order || _impl->is_cpu() || (can_be_optimized() && needs_completion_event() && !is_output())) {
+        if (out_of_order_queue || _impl->is_cpu() || (can_be_optimized() && needs_completion_event() && !is_output())) {
            dependencies.reserve(dependencies.size() + _exec_deps.size());
            for (auto& input : _exec_deps) {
-                if (input->is_input() && queue_type != QueueTypes::out_of_order)
+                if (input->is_input() && !out_of_order_queue)
                    continue;
                auto id = input->id();
                try {
@ -883,6 +891,13 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
        }
    }

+    // Replace multiple events with single grouped event in case of barriers synchronization to prevent `_last_barrier_ev` usage as a dependency
+    // event of optimized_out instance's users, which may lead to unwanted extra synchronization of CPU impls with GPU kernels
+    if (_node && _node->is_in_shape_of_subgraph() && can_be_optimized() && dependencies.size() > 1 && out_of_order_queue) {
+        auto grouped_ev = get_network().get_stream().group_events(dependencies);
+        dependencies = {grouped_ev};
+    }
+
    {
        GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::inference);
        auto ev = _impl->execute(dependencies, *this);
@ -905,6 +920,7 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
 }

 void primitive_inst::set_arguments() {
+    OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("set_arguments: " + id()));
    GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::set_arguments);
    OPENVINO_ASSERT(_has_valid_input, id(), " has invalid/unset input");
    _impl->set_arguments(*this);
@ -1138,6 +1154,7 @@ void primitive_inst::allocate_internal_buffers(bool reset) {
 }

 event::ptr primitive_inst::update_weights() {
+    OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("update_weights: " + id()));
    GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::update_weights);
    if (!_impl)
        return nullptr;