[GPU] Improve OOOQ synchronization between ShapeOf subgraph's CPU impls and GPU kernels (#20595)
* [GPU] Add ITT counters for primitive inst * [GPU] Improve OOOQ synchronization between ShapeOf subgraph's CPU impls and GPU kernels
This commit is contained in:
parent
20bd58759a
commit
26cab79988
@ -242,6 +242,7 @@ event::ptr primitive_inst::set_output_memory(memory::ptr mem_new, bool check, si
|
||||
}
|
||||
|
||||
void primitive_inst::update_shape() {
|
||||
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("update_shape: " + id()));
|
||||
GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::shape_inference);
|
||||
if (update_shape_done_by_other) {
|
||||
update_shape_done_by_other = false; // reset
|
||||
@ -341,6 +342,7 @@ void primitive_inst::update_shape() {
|
||||
}
|
||||
|
||||
if (has_runtime_deps) {
|
||||
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("update_shape_sync: " + id()));
|
||||
if (!dependencies_events.empty() && queue_type == QueueTypes::out_of_order) {
|
||||
_network.get_stream().wait_for_events(dependencies_events);
|
||||
} else if (queue_type == QueueTypes::in_order) {
|
||||
@ -380,6 +382,7 @@ void primitive_inst::update_shape() {
|
||||
}
|
||||
|
||||
event::ptr primitive_inst::realloc_if_needed() {
|
||||
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("realloc_if_needed: " + id()));
|
||||
GPU_DEBUG_GET_INSTANCE(debug_config);
|
||||
GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::memory_allocation);
|
||||
|
||||
@ -493,6 +496,7 @@ bool primitive_inst::use_async_compilation() {
|
||||
}
|
||||
|
||||
bool primitive_inst::update_impl() {
|
||||
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("update_impl: " + id()));
|
||||
GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::update_implementation);
|
||||
auto prev_impl_str = _impl != nullptr ? _impl->get_kernel_name() : "nullptr";
|
||||
|
||||
@ -656,6 +660,7 @@ bool primitive_inst::update_impl() {
|
||||
}
|
||||
|
||||
void primitive_inst::do_runtime_skip_reorder() {
|
||||
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("do_runtime_skip_reorder: " + id()));
|
||||
GPU_DEBUG_GET_INSTANCE(debug_config);
|
||||
GPU_DEBUG_IF(debug_config->disable_runtime_skip_reorder) {
|
||||
return;
|
||||
@ -713,6 +718,7 @@ void primitive_inst::do_runtime_skip_reorder() {
|
||||
}
|
||||
|
||||
void primitive_inst::do_runtime_in_place_concat() {
|
||||
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("do_runtime_in_place_concat: " + id()));
|
||||
GPU_DEBUG_GET_INSTANCE(debug_config);
|
||||
GPU_DEBUG_IF(debug_config->disable_runtime_buffer_fusing) {
|
||||
return;
|
||||
@ -780,6 +786,7 @@ bool primitive_inst::has_inner_networks() const {
|
||||
}
|
||||
|
||||
event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
|
||||
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("primitive_inst::execute: " + id()));
|
||||
const auto primitive_id = id();
|
||||
OPENVINO_ASSERT(_has_valid_input, primitive_id, " has invalid/unset input");
|
||||
GPU_DEBUG_GET_INSTANCE(debug_config);
|
||||
@ -802,6 +809,7 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
|
||||
}
|
||||
|
||||
if (!is_valid_fusion()) {
|
||||
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("unfused_subgraph_exec: " + id()));
|
||||
auto subgraph = get_unfused_subgraph();
|
||||
|
||||
for (auto& d : _deps) {
|
||||
@ -859,16 +867,16 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
|
||||
GPU_DEBUG_TRACE << id() << ": execute " << _impl->get_kernel_name() << " (is_dynamic=" << _impl->is_dynamic() << ", "
|
||||
<< "can_be_optimized=" << can_be_optimized() << ")" << std::endl;
|
||||
|
||||
const bool out_of_order_queue = get_network().get_stream().get_queue_type() == QueueTypes::out_of_order;
|
||||
if (_exec_deps.empty() && dependencies.empty()) {
|
||||
dependencies = events;
|
||||
} else {
|
||||
auto queue_type = get_network().get_stream().get_queue_type();
|
||||
// Prepare dependencies events in case of OOO queue, CPU implementation,
|
||||
// or optimized_out impl which has CPU users (needs_completion_event() && !is_output() condition)
|
||||
if (queue_type == QueueTypes::out_of_order || _impl->is_cpu() || (can_be_optimized() && needs_completion_event() && !is_output())) {
|
||||
if (out_of_order_queue || _impl->is_cpu() || (can_be_optimized() && needs_completion_event() && !is_output())) {
|
||||
dependencies.reserve(dependencies.size() + _exec_deps.size());
|
||||
for (auto& input : _exec_deps) {
|
||||
if (input->is_input() && queue_type != QueueTypes::out_of_order)
|
||||
if (input->is_input() && !out_of_order_queue)
|
||||
continue;
|
||||
auto id = input->id();
|
||||
try {
|
||||
@ -883,6 +891,13 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
|
||||
}
|
||||
}
|
||||
|
||||
// Replace multiple events with single grouped event in case of barriers synchronization to prevent `_last_barrier_ev` usage as a dependency
|
||||
// event of optimized_out instance's users, which may lead to unwanted extra synchronization of CPU impls with GPU kernels
|
||||
if (_node && _node->is_in_shape_of_subgraph() && can_be_optimized() && dependencies.size() > 1 && out_of_order_queue) {
|
||||
auto grouped_ev = get_network().get_stream().group_events(dependencies);
|
||||
dependencies = {grouped_ev};
|
||||
}
|
||||
|
||||
{
|
||||
GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::inference);
|
||||
auto ev = _impl->execute(dependencies, *this);
|
||||
@ -905,6 +920,7 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
|
||||
}
|
||||
|
||||
void primitive_inst::set_arguments() {
|
||||
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("set_arguments: " + id()));
|
||||
GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::set_arguments);
|
||||
OPENVINO_ASSERT(_has_valid_input, id(), " has invalid/unset input");
|
||||
_impl->set_arguments(*this);
|
||||
@ -1138,6 +1154,7 @@ void primitive_inst::allocate_internal_buffers(bool reset) {
|
||||
}
|
||||
|
||||
event::ptr primitive_inst::update_weights() {
|
||||
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("update_weights: " + id()));
|
||||
GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::update_weights);
|
||||
if (!_impl)
|
||||
return nullptr;
|
||||
|
Loading…
Reference in New Issue
Block a user