[GPU] Skip reorder at runtime if data type and format are not changed (#18859)

* Skip reorder at runtime if data type and format are not changed

* Update the shape of the reorder user at the predecessor node so that the predecessor node's output can be allocated to host memory if needed

* Reinterpret reorder memory at runtime if needed (e.g., when the input is a fake-aligned FC and the reorder reuses that memory)

* Add debug config

* Fix CI test failure

* Do not skip after optimized reshape

* Do not skip the user reorder if it is an output node while the current node is static and its memory is allocated on the device

* Do not skip the user reorder if the current node has fused primitives

* Update src/plugins/intel_gpu/src/graph/include/reorder_inst.h

Co-authored-by: Eddy Kim <eddy.kim@intel.com>

* Minor fix for compilation error

* Do not skip the reorder if its user is an optimizable concat

* Fix CI failures

* No need to wait for input_layout because its events are already resolved on dGPU

* Fixed corner case where only some of the multiple output layouts are static

---------

Co-authored-by: Eddy Kim <eddy.kim@intel.com>
Taylor Yeonbok Lee 2023-08-02 20:59:52 -07:00 committed by GitHub
parent f3bafef128
commit 31e46ea255
6 changed files with 114 additions and 4 deletions

@@ -125,6 +125,7 @@ public:
    int disable_runtime_buffer_fusing;          // Disable runtime buffer fusing
    int disable_memory_reuse;                   // Disable memmory reuse among layers
    int disable_build_time_weight_reorder_for_dynamic_nodes;  // Disable build time weight reordering for dynamic nodes
    int disable_runtime_skip_reorder;           // Disable runtime skip reorder
    std::set<int64_t> dump_iteration;           // Dump n-th execution of network.
    std::vector<std::string> load_layers_raw_dump;  // List of layers to load dumped raw binary and filenames
    static const debug_configuration *get_instance();

@@ -210,6 +210,7 @@ public:
    void set_shape_change() { _shape_changed = true; }
    void build_deps();
    void do_runtime_skip_reorder();
    void do_runtime_in_place_concat();
    void configure_shape_of_dependencies();

@@ -86,7 +86,13 @@ public:
    bool has_mean() const { return !get_typed_desc<reorder>()->mean.empty(); }
    void update_output_memory() override;
    bool requires_reinterpret() const {
        auto req_reinterpr = _req_reinterpr;
        if (input_memory().get_layout() != _impl_params->get_output_layout()) {
            req_reinterpr = true;
        }
        return req_reinterpr;
    }
    void save(cldnn::BinaryOutputBuffer& ob) const override;
    void load(cldnn::BinaryInputBuffer& ib) override;
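
Reading the new getter: the flag decided at build time is now OR-ed with a runtime comparison between the layout of the memory actually backing the input and the output layout this reorder is expected to produce. A minimal standalone sketch of that decision, using simplified stand-in types rather than cldnn's real layout and memory classes:

#include <iostream>

// Simplified stand-in for cldnn's layout, for illustration only.
struct layout {
    int batch, feature;
    bool operator!=(const layout& other) const {
        return batch != other.batch || feature != other.feature;
    }
};

// Mirrors the logic of the new requires_reinterpret(): the build-time flag
// is OR-ed with a runtime layout comparison, so a reorder that aliases
// memory carrying a different layout (e.g. a fake-aligned FC output) is
// still reinterpreted before use.
bool requires_reinterpret(bool build_time_flag,
                          const layout& input_mem_layout,
                          const layout& expected_out_layout) {
    bool req = build_time_flag;
    if (input_mem_layout != expected_out_layout)
        req = true;
    return req;
}

int main() {
    layout fc_padded{16, 32};  // hypothetical fake-aligned allocation from the FC
    layout actual{10, 32};     // shape known only at runtime
    std::cout << std::boolalpha
              << requires_reinterpret(false, fc_padded, actual) << "\n";  // true
}

The design point is that the cached _req_reinterpr alone is no longer trustworthy once a skipped reorder starts aliasing another node's buffer.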

@@ -15,6 +15,7 @@
#include "permute_inst.h"
#include "resample_inst.h"
#include "reshape_inst.h"
#include "reorder_inst.h"
#include "eltwise_inst.h"
#include "deconvolution_inst.h"
#include "shape_of_inst.h"

@@ -644,13 +645,54 @@ bool primitive_inst::update_impl() {
    return true;
}

void primitive_inst::do_runtime_skip_reorder() {
    GPU_DEBUG_GET_INSTANCE(debug_config);
    GPU_DEBUG_IF(debug_config->disable_runtime_skip_reorder) {
        return;
    }
    if (can_be_optimized())
        return;

    if (_impl_params->fused_desc.size() > 0)
        return;

    // set successive reorder can_be_optimized if layouts are same
    for (auto u : get_user_insts()) {
        if (u->get_node().is_type<reorder>()) {
            if (is_input() && u->is_output())
                continue;
            // TODO: Skipped reorder + in_place concat is not supported yet. To support later.
            if (u->get_users().size() == 1 && u->get_users().front()->is_type<concatenation>() && u->get_users().front()->can_be_optimized())
                continue;

            auto out_port_idx = u->get_node().get_dependency_with_port(0).second;
            // If current node's output_node is not dynamic, the memory is already allocated at build time
            auto alloc_type = allocation_type::unknown;
            if (!get_node().is_dynamic_output_layout(out_port_idx) && static_cast<int64_t>(_outputs.size()) > out_port_idx) {
                alloc_type = _outputs[out_port_idx]->get_allocation_type();
            }
            if (alloc_type == allocation_type::usm_device && u->is_output())
                continue;

            GPU_DEBUG_TRACE_DETAIL << "[do runtime skip reorder] update shape for user " << u->id() << std::endl;
            u->update_shape();
            u->update_shape_done_by_other = true;
            if (u->_impl_params->get_input_layout() == u->_impl_params->get_output_layout()) {
                u->set_can_be_optimized(true);
                GPU_DEBUG_TRACE_DETAIL << "[do runtime skip reorder] set user " << u->id() << " as can_be_optimized" << std::endl;
            } else {
                GPU_DEBUG_TRACE_DETAIL << "[do runtime skip reorder] user " << u->id() << " cannot be optimized" << std::endl;
            }
        }
    }
}

void primitive_inst::do_runtime_in_place_concat() {
    GPU_DEBUG_GET_INSTANCE(debug_config);
    GPU_DEBUG_IF(debug_config->disable_runtime_buffer_fusing) {
        return;
    }
    if (update_shape_done_by_other) {
        return;
    }
    if (get_users().size() != 1) return;

    auto concat_inst = _network.get_primitive(get_users().front()->id());

@@ -720,6 +762,11 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
    do_runtime_in_place_concat();
    OPENVINO_ASSERT(_node != nullptr, "[GPU] Invalid primitive_inst object for dynamic shapes case: program_node can't be null");
    update_shape();

    // Check successor reorder if layouts are same
    // Need to set can_be_optimized for user reorder at predesescor because
    // if the user is can_be_optimized and output node then current nodes' output should be allocated to host.
    do_runtime_skip_reorder();

    if (_impl_params->output_layouts[0].count() == 0) {
        GPU_DEBUG_TRACE_DETAIL << id() << " : Skipping becuase output data is empty " << std::endl;
        auto ev = get_network().get_stream().create_user_event(true);

@@ -792,6 +839,8 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
    if (queue_type == QueueTypes::out_of_order || _impl->is_cpu() || (can_be_optimized() && needs_completion_event() && !is_output())) {
        dependencies.reserve(dependencies.size() + _exec_deps.size());
        for (auto& input : _exec_deps) {
            if (input->is_input() && queue_type != QueueTypes::out_of_order)
                continue;
            auto id = input->id();
            try {
                // if the requested event does not exists it means that it has not been executed, so the processing_order is
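
Condensed, do_runtime_skip_reorder() only marks a user reorder as skippable when every guard above passes. A standalone model of that decision table, with a hypothetical flattened struct in place of the real primitive_inst/program_node queries:

#include <iostream>

// Hypothetical flattened view of the properties do_runtime_skip_reorder()
// inspects; the real code reads these from primitive_inst and program_node.
struct skip_query {
    bool cur_has_fused_ops;         // current node has fused primitives
    bool cur_is_input;              // current node is a network input
    bool user_is_output;            // the reorder user is a network output
    bool user_feeds_opt_concat;     // reorder's only user is an optimized concat
    bool cur_output_on_usm_device;  // statically allocated output lives in device memory
    bool same_in_out_layout;        // reorder's runtime input layout == output layout
};

bool can_skip_user_reorder(const skip_query& q) {
    if (q.cur_has_fused_ops) return false;                 // fusion may change type/format
    if (q.cur_is_input && q.user_is_output) return false;  // direct input->output copy must stay
    if (q.user_feeds_opt_concat) return false;             // skipped reorder + in-place concat unsupported
    if (q.user_is_output && q.cur_output_on_usm_device) return false;  // output must stay host-readable
    return q.same_in_out_layout;                           // finally: data type and format unchanged
}

int main() {
    skip_query q{false, false, false, false, false, true};
    std::cout << std::boolalpha << can_skip_user_reorder(q) << "\n";  // true
}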

@@ -189,8 +189,8 @@ debug_configuration::debug_configuration()
    , disable_async_compilation(0)
    , disable_dynamic_impl(0)
    , disable_runtime_buffer_fusing(0)
    , disable_build_time_weight_reorder_for_dynamic_nodes(0)
    , disable_runtime_skip_reorder(0) {
#ifdef GPU_DEBUG_CONFIG
    get_gpu_debug_env_var("Help", help);
    get_common_debug_env_var("Verbose", verbose);

@@ -226,6 +226,7 @@ debug_configuration::debug_configuration()
    get_gpu_debug_env_var("DisableRuntimeBufferFusing", disable_runtime_buffer_fusing);
    get_gpu_debug_env_var("DisableMemoryReuse", disable_memory_reuse);
    get_gpu_debug_env_var("DisableBuildTimeWeightReorderForDynamicNodes", disable_build_time_weight_reorder_for_dynamic_nodes);
    get_gpu_debug_env_var("DisableRuntimeSkipReorder", disable_runtime_skip_reorder);
    std::string dump_iteration_str;
    get_gpu_debug_env_var("DumpIteration", dump_iteration_str);
    std::string mem_preallocation_params_str;
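
Judging by the other entries in this list, the new knob should be reachable through the plugin's debug environment, presumably as OV_GPU_DisableRuntimeSkipReorder=1 (the OV_GPU_ prefix is how these get_gpu_debug_env_var names are exposed in debug builds). Setting it forces reorder kernels to always execute, which is useful when bisecting whether this optimization caused an accuracy or memory regression.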

@@ -0,0 +1,52 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "test_utils.h"

#include <intel_gpu/primitives/input_layout.hpp>
#include <intel_gpu/primitives/reorder.hpp>
#include <intel_gpu/primitives/data.hpp>
#include <intel_gpu/primitives/fully_connected.hpp>

#include "program_wrapper.h"

#include <cmath>
#include <algorithm>
#include <numeric>  // std::iota

using namespace cldnn;
using namespace ::tests;

namespace skip_reorder_tests {
TEST(remove_redundant_reorder, skip_reorder_at_runtime) {
    auto& engine = get_test_engine();
    auto weight_mem = engine.allocate_memory({{2, 32}, data_types::f32, format::bfyx});
    std::vector<float> weight_data(weight_mem->get_layout().count());
    std::iota(weight_data.begin(), weight_data.end(), 1.0f);
    set_values(weight_mem, weight_data);

    auto input_l = layout{ov::PartialShape::dynamic(2), data_types::f32, format::bfyx};
    topology topology(input_layout("input", input_l),
                      data("weight", weight_mem),
                      fully_connected("fc", input_info("input"), {"weight"}, "", data_types::f32),
                      reorder("reorder", input_info("fc"), format::bfyx, data_types::f32)); /*output padding*/

    ExecutionConfig config = get_test_default_config(engine);
    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
    network network(engine, topology, config);

    auto reorder_inst = network.get_primitive("reorder");
    ASSERT_EQ(reorder_inst->can_be_optimized(), false);

    auto input_mem = engine.allocate_memory({{10, 32}, data_types::f32, format::bfyx});
    std::vector<float> input_data(input_mem->get_layout().count());
    std::iota(input_data.begin(), input_data.end(), 0.5f);
    set_values(input_mem, input_data);

    network.set_input_data("input", input_mem);
    network.execute();
    ASSERT_EQ(reorder_inst->can_be_optimized(), true);
    ASSERT_EQ(network.get_output_memory("reorder")->buffer_ptr(), network.get_primitive("fc")->output_memory_ptr()->buffer_ptr());
}
}  // namespace skip_reorder_tests
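
The two final assertions carry the test: after the first execute() resolves the dynamic shape, the reorder must be flagged can_be_optimized at runtime, and its output must be the very same buffer as the fully connected layer's output, i.e. the skip is zero-copy. Assuming the usual GPU unit-test binary layout, it should run with a filter along the lines of --gtest_filter=remove_redundant_reorder.skip_reorder_at_runtime (the binary name is not shown in the diff).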