From cd703580b6082b3ed2934f938bf5c36afc2b15b8 Mon Sep 17 00:00:00 2001 From: Sergey Shlyapnikov Date: Wed, 30 Mar 2022 10:53:53 +0300 Subject: [PATCH] [GPU] Host time optimizations for in order queue (#11255) * [GPU] Host time optimizations * Fix failed fusings_gpu/permute_eltwise_loop.basic/* tests --- .../include/intel_gpu/graph/network.hpp | 13 +++- .../intel_gpu/src/graph/impls/common/loop.cpp | 4 +- .../impls/onednn/primitive_onednn_base.h | 8 --- src/plugins/intel_gpu/src/graph/network.cpp | 65 ++++++++++--------- .../intel_gpu/src/graph/primitive_inst.cpp | 27 ++++---- .../tests/fusions/loop_fusion_test.cpp | 2 +- 6 files changed, 63 insertions(+), 56 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp b/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp index db28b1b0d9f..13d2a16b2eb 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp @@ -102,7 +102,10 @@ public: } network_output get_output(const primitive_id& output_id) { - return network_output(get_primitive_event(output_id), get_output_memory(output_id), get_stream_ptr()); + event::ptr evt; + if (get_stream().get_queue_type() == queue_types::out_of_order) + evt = get_primitive_event(output_id); + return network_output(evt, get_output_memory(output_id), get_stream_ptr()); } memory::ptr get_output_memory(const primitive_id& output_id); @@ -133,8 +136,12 @@ public: } std::map result; for (auto& id : primitive_ids) { - if (std::find(optimized_primitives.begin(), optimized_primitives.end(), id) == optimized_primitives.end()) - result.emplace(id, get_primitive_event(id)); + if (std::find(optimized_primitives.begin(), optimized_primitives.end(), id) == optimized_primitives.end()) { + if (has_event(id)) + result.emplace(id, get_primitive_event(id)); + else + result.emplace(id, nullptr); + } } return result; } diff --git a/src/plugins/intel_gpu/src/graph/impls/common/loop.cpp 
b/src/plugins/intel_gpu/src/graph/impls/common/loop.cpp index d1e93d9f1ce..0a8f05cacb3 100644 --- a/src/plugins/intel_gpu/src/graph/impls/common/loop.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/common/loop.cpp @@ -118,7 +118,9 @@ struct loop_impl : typed_primitive_impl { loop_carried_dep.clear(); for (const auto& backedge : node.get_back_edges()) { - event::ptr body_event = body_network->get_primitive_event(backedge.from); + event::ptr body_event; + if (body_network->has_event(backedge.from)) + body_event = body_network->get_primitive_event(backedge.from); loop_carried_dep.emplace_back(body_event); } diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h b/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h index 653b24bfef5..93bb7acdca6 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h @@ -209,14 +209,6 @@ protected: if (profiling) { stream.finish(); event->set(); - } else { - // Create and set user event as complete - event = stream.create_user_event(true); - } - - if (!event) { - std::string error_msg = "Event was not created properly for " + instance.id(); - throw std::runtime_error(error_msg); } return event; diff --git a/src/plugins/intel_gpu/src/graph/network.cpp b/src/plugins/intel_gpu/src/graph/network.cpp index 4cd38a00eaa..adb84e252b4 100644 --- a/src/plugins/intel_gpu/src/graph/network.cpp +++ b/src/plugins/intel_gpu/src/graph/network.cpp @@ -700,34 +700,39 @@ void network::execute_impl(const std::vector& events) { } } - for (auto& inst : _program->get_processing_order()) { - // Special handling for mutable data. The event should be the same as the user or dependency with highest - // processing_num as the mutable_data can be updated when is both user or dependency. 
- if (inst->is_type()) { - decltype(_program->get_processing_order().get_processing_number(inst)) proc_num = 0; - for (auto& user : inst->get_users()) { - auto user_proc_num = _program->get_processing_order().get_processing_number(user); - if (user_proc_num > proc_num) { - _events[inst->id()] = _events[user->id()]; - proc_num = user_proc_num; + // Store events only in case of OOO queue or enabled Profiling + auto store_events = get_stream().get_queue_type() == queue_types::out_of_order || + get_engine().configuration().enable_profiling; + if (store_events) { + for (auto& inst : _program->get_processing_order()) { + // Special handling for mutable data. The event should be the same as the user or dependency with highest + // processing_num as the mutable_data can be updated when is both user or dependency. + if (inst->is_type()) { + decltype(_program->get_processing_order().get_processing_number(inst)) proc_num = 0; + for (auto& user : inst->get_users()) { + auto user_proc_num = _program->get_processing_order().get_processing_number(user); + if (user_proc_num > proc_num) { + _events[inst->id()] = _events[user->id()]; + proc_num = user_proc_num; + } } - } - if (!inst->get_dependencies().empty()) { - for (auto& dep : inst->get_dependencies()) { - auto dep_proc_num = _program->get_processing_order().get_processing_number(dep); - if (dep_proc_num > proc_num) { - _events[inst->id()] = _events[dep->id()]; - proc_num = dep_proc_num; + if (!inst->get_dependencies().empty()) { + for (auto& dep : inst->get_dependencies()) { + auto dep_proc_num = _program->get_processing_order().get_processing_number(dep); + if (dep_proc_num > proc_num) { + _events[inst->id()] = _events[dep->id()]; + proc_num = dep_proc_num; + } } } } } - } - for (auto& dout : _data_outputs) { // data primitives are not executed so if they are marked as output we need to add - // them valid events manually - _events[dout->id()] = get_stream().create_user_event(true); + for (auto& dout : _data_outputs) { // 
data primitives are not executed so if they are marked as output we need to add + // them valid events manually + _events[dout->id()] = get_stream().create_user_event(true); + } } for (auto& prim : _primitives) { @@ -828,17 +833,15 @@ std::vector> network::get_primitives(const std:: } void network::execute_primitive(const std::shared_ptr& primitive, - const std::vector& events) { - auto id = primitive->id(); - auto it = _events.find(id); - bool found = (it != _events.end()); - CLDNN_ERROR_BOOL(id, - "Invalid primitive call ", - found, - "Primitive " + id + " is tried to be executed for the second time"); - + const std::vector& events) { event::ptr ev = primitive->execute(events); - _events.insert({id, ev}); + + // Collect events only for OOO queue and Profiling mode + if (get_stream().get_queue_type() == queue_types::out_of_order || + get_engine().configuration().enable_profiling) { + auto id = primitive->id(); + _events.insert({id, ev}); + } } void network::allocate_primitive_instance(program_node const& node) { diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index 16aaeaf0f9d..99bc55ac5bc 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -148,18 +148,21 @@ event::ptr primitive_inst::execute(const std::vector& events) { return _impl->execute(events, *this); std::vector dependencies; - dependencies.reserve(_exec_deps.size()); - for (auto& input : _exec_deps) { - auto id = input->id(); - try { - // if the requested event does not exits it means that it has not been executed, so the processing_order is - // wrong or synchronization failed. 
- auto ev = get_network().get_primitive_event(id); - dependencies.emplace_back(ev); - } catch (const std::out_of_range& oor) { - std::string temp = std::string("internal CLDNN error: execution order corrupted.") + std::string("\n") + - std::string(oor.what() + std::string("\n")); - CLDNN_ERROR_MESSAGE(id, temp); + auto queue_type = get_network().get_stream().get_queue_type(); + if (queue_type == queue_types::out_of_order) { + dependencies.reserve(_exec_deps.size()); + for (auto& input : _exec_deps) { + auto id = input->id(); + try { + // if the requested event does not exist it means that it has not been executed, so the processing_order is + // wrong or synchronization failed. + auto ev = get_network().get_primitive_event(id); + dependencies.emplace_back(ev); + } catch (const std::out_of_range& oor) { + std::string temp = std::string("internal CLDNN error: execution order corrupted.") + std::string("\n") + + std::string(oor.what() + std::string("\n")); + CLDNN_ERROR_MESSAGE(id, temp); + } } } return _impl->execute(dependencies, *this); diff --git a/src/plugins/intel_gpu/tests/fusions/loop_fusion_test.cpp b/src/plugins/intel_gpu/tests/fusions/loop_fusion_test.cpp index 64196a1d44f..5166918641e 100644 --- a/src/plugins/intel_gpu/tests/fusions/loop_fusion_test.cpp +++ b/src/plugins/intel_gpu/tests/fusions/loop_fusion_test.cpp @@ -52,7 +52,7 @@ public: }; class permute_eltwise_loop: public LoopFusingTest {}; -TEST_P(permute_eltwise_loop, basic_taylor) { +TEST_P(permute_eltwise_loop, basic) { auto p = GetParam(); auto num_iteration_mem = engine.allocate_memory({data_types::i64, format::bfyx, {1, 1, 1, 1}}); auto trip_count_mem = engine.allocate_memory({data_types::i64, format::bfyx, {1, 1, 1, 1}});