diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp index a03cb307f9a..d6365b69138 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp @@ -125,6 +125,7 @@ public: int disable_runtime_buffer_fusing; // Disable runtime buffer fusing int disable_memory_reuse; // Disable memmory reuse among layers int disable_build_time_weight_reorder_for_dynamic_nodes; // Disable build time weight reordering for dynamic nodes + int disable_runtime_skip_reorder; // Disable runtime skip reorder std::set<int64_t> dump_iteration; // Dump n-th execution of network. std::vector<std::string> load_layers_raw_dump; // List of layers to load dumped raw binary and filenames static const debug_configuration *get_instance(); diff --git a/src/plugins/intel_gpu/src/graph/include/primitive_inst.h b/src/plugins/intel_gpu/src/graph/include/primitive_inst.h index 2d9e66b7ea6..70c3d1daa08 100644 --- a/src/plugins/intel_gpu/src/graph/include/primitive_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/primitive_inst.h @@ -210,6 +210,7 @@ public: void set_shape_change() { _shape_changed = true; } void build_deps(); + void do_runtime_skip_reorder(); void do_runtime_in_place_concat(); void configure_shape_of_dependencies(); diff --git a/src/plugins/intel_gpu/src/graph/include/reorder_inst.h b/src/plugins/intel_gpu/src/graph/include/reorder_inst.h index f04cb7e7f5b..b1637032ffb 100644 --- a/src/plugins/intel_gpu/src/graph/include/reorder_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/reorder_inst.h @@ -86,7 +86,13 @@ public: bool has_mean() const { return !get_typed_desc<reorder>()->mean.empty(); } void update_output_memory() override; - bool requires_reinterpret() const { return _req_reinterpr; } + bool requires_reinterpret() const { + auto req_reinterpr = _req_reinterpr; + if (input_memory().get_layout() != 
_impl_params->get_output_layout()) { + req_reinterpr = true; + } + return req_reinterpr; + } void save(cldnn::BinaryOutputBuffer& ob) const override; void load(cldnn::BinaryInputBuffer& ib) override; diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index e3e62beca9f..2d440831fdc 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -15,6 +15,7 @@ #include "permute_inst.h" #include "resample_inst.h" #include "reshape_inst.h" +#include "reorder_inst.h" #include "eltwise_inst.h" #include "deconvolution_inst.h" #include "shape_of_inst.h" @@ -644,13 +645,54 @@ bool primitive_inst::update_impl() { return true; } +void primitive_inst::do_runtime_skip_reorder() { + GPU_DEBUG_GET_INSTANCE(debug_config); + GPU_DEBUG_IF(debug_config->disable_runtime_skip_reorder) { + return; + } + if (can_be_optimized()) + return; + + if (_impl_params->fused_desc.size() > 0) + return; + + // set successive reorder can_be_optimized if layouts are same + for (auto u : get_user_insts()) { + if (u->get_node().is_type<reorder>()) { + if (is_input() && u->is_output()) + continue; + // TODO: Skipped reorder + in_place concat is not supported yet. To support later. 
+ if (u->get_users().size() == 1 && u->get_users().front()->is_type<concatenation>() && u->get_users().front()->can_be_optimized()) + continue; + auto out_port_idx = u->get_node().get_dependency_with_port(0).second; + // If current node's output_node is not dynamic, the memory is already allocated at build time + auto alloc_type = allocation_type::unknown; + if (!get_node().is_dynamic_output_layout(out_port_idx) && static_cast<int32_t>(_outputs.size()) > out_port_idx) { + alloc_type = _outputs[out_port_idx]->get_allocation_type(); + } + if (alloc_type == allocation_type::usm_device && u->is_output()) + continue; + GPU_DEBUG_TRACE_DETAIL << "[do runtime skip reorder] update shape for user " << u->id() << std::endl; + u->update_shape(); + u->update_shape_done_by_other = true; + if (u->_impl_params->get_input_layout() == u->_impl_params->get_output_layout()) { + u->set_can_be_optimized(true); + GPU_DEBUG_TRACE_DETAIL << "[do runtime skip reorder] set user " << u->id() << " as can_be_optimized" << std::endl; + } else { + GPU_DEBUG_TRACE_DETAIL << "[do runtime skip reorder] user " << u->id() << " cannot be optimized" << std::endl; + } + } + } +} + void primitive_inst::do_runtime_in_place_concat() { GPU_DEBUG_GET_INSTANCE(debug_config); GPU_DEBUG_IF(debug_config->disable_runtime_buffer_fusing) { return; } - if (update_shape_done_by_other) + if (update_shape_done_by_other) { return; + } if (get_users().size() != 1) return; auto concat_inst = _network.get_primitive(get_users().front()->id()); @@ -720,6 +762,11 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) { do_runtime_in_place_concat(); OPENVINO_ASSERT(_node != nullptr, "[GPU] Invalid primitive_inst object for dynamic shapes case: program_node can't be null"); update_shape(); + + // Check successor reorder if layouts are same + // Need to set can_be_optimized for user reorder at predecessor because + // if the user is can_be_optimized and output node then current nodes' output should be allocated to host. 
+ do_runtime_skip_reorder(); if (_impl_params->output_layouts[0].count() == 0) { GPU_DEBUG_TRACE_DETAIL << id() << " : Skipping becuase output data is empty " << std::endl; auto ev = get_network().get_stream().create_user_event(true); @@ -792,6 +839,8 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) { if (queue_type == QueueTypes::out_of_order || _impl->is_cpu() || (can_be_optimized() && needs_completion_event() && !is_output())) { dependencies.reserve(dependencies.size() + _exec_deps.size()); for (auto& input : _exec_deps) { + if (input->is_input() && queue_type != QueueTypes::out_of_order) + continue; auto id = input->id(); try { // if the requested event does not exists it means that it has not been executed, so the processing_order is diff --git a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp index ca678a34382..b7572722895 100644 --- a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp +++ b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp @@ -189,8 +189,9 @@ debug_configuration::debug_configuration() , disable_async_compilation(0) , disable_dynamic_impl(0) , disable_runtime_buffer_fusing(0) , disable_memory_reuse(0) - , disable_build_time_weight_reorder_for_dynamic_nodes(0) { + , disable_build_time_weight_reorder_for_dynamic_nodes(0) + , disable_runtime_skip_reorder(0) { #ifdef GPU_DEBUG_CONFIG get_gpu_debug_env_var("Help", help); get_common_debug_env_var("Verbose", verbose); @@ -226,6 +226,7 @@ debug_configuration::debug_configuration() get_gpu_debug_env_var("DisableRuntimeBufferFusing", disable_runtime_buffer_fusing); get_gpu_debug_env_var("DisableMemoryReuse", disable_memory_reuse); get_gpu_debug_env_var("DisableBuildTimeWeightReorderForDynamicNodes", disable_build_time_weight_reorder_for_dynamic_nodes); + get_gpu_debug_env_var("DisableRuntimeSkipReorder", disable_runtime_skip_reorder); std::string dump_iteration_str; 
get_gpu_debug_env_var("DumpIteration", dump_iteration_str); std::string mem_preallocation_params_str; diff --git a/src/plugins/intel_gpu/tests/unit/dynamic_execution/skip_redundant_reorder_at_runtime.cpp b/src/plugins/intel_gpu/tests/unit/dynamic_execution/skip_redundant_reorder_at_runtime.cpp new file mode 100644 index 00000000000..5ee76f2c2b2 --- /dev/null +++ b/src/plugins/intel_gpu/tests/unit/dynamic_execution/skip_redundant_reorder_at_runtime.cpp @@ -0,0 +1,52 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "test_utils.h" + +#include <intel_gpu/primitives/input_layout.hpp> +#include <intel_gpu/primitives/fully_connected.hpp> +#include <intel_gpu/primitives/data.hpp> +#include <intel_gpu/primitives/reorder.hpp> + +#include "program_wrapper.h" + +#include <cmath> +#include <numeric> + +using namespace cldnn; +using namespace ::tests; + +namespace skip_reorder_tests { +TEST(remove_redundant_reorder, skip_reorder_at_runtime) { + auto& engine = get_test_engine(); + + auto weight_mem = engine.allocate_memory({{2, 32}, data_types::f32, format::bfyx}); + std::vector<float> weight_data(weight_mem->get_layout().count()); + std::iota(weight_data.begin(), weight_data.end(), 1.0f); + set_values(weight_mem, weight_data); + + auto input_l = layout{ov::PartialShape::dynamic(2), data_types::f32, format::bfyx}; + topology topology(input_layout("input", input_l), + data("weight", weight_mem), + fully_connected("fc", input_info("input"), {"weight"}, "", data_types::f32), + reorder("reorder", input_info("fc"), format::bfyx, data_types::f32)); /*output padding*/ + + ExecutionConfig config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + + network network(engine, topology, config); + auto reorder_inst = network.get_primitive("reorder"); + ASSERT_EQ(reorder_inst->can_be_optimized(), false); + + auto input_mem = engine.allocate_memory({{10, 32}, data_types::f32, format::bfyx}); + std::vector<float> input_data(input_mem->get_layout().count()); + std::iota(input_data.begin(), input_data.end(), 0.5f); + set_values(input_mem, input_data); + + 
+ network.set_input_data("input", input_mem); + network.execute(); + ASSERT_EQ(reorder_inst->can_be_optimized(), true); + ASSERT_EQ(network.get_output_memory("reorder")->buffer_ptr(), network.get_primitive("fc")->output_memory_ptr()->buffer_ptr()); +} +} // namespace skip_reorder_tests