[GPU] Skip reorder at runtime if data type and format are not changed (#18859)
* Skip reorder at runtime if data type and format are not changed
* Update shape of reorder user at predecessor node so that we can allocate the pred node's output to host memory if needed
* Reinterpret reorder memory at runtime if needed (e.g., input is a fake-aligned fc and the reorder uses that memory)
* Add debug config
* Fix CI test failure
* Do not skip after optimized reshape
* Do not skip user reorder if the user reorder is an output, the current node is static, and the memory is allocated on the device
* Disable skipping the user reorder if the current node has a fused node
* Update src/plugins/intel_gpu/src/graph/include/reorder_inst.h
* Minor fix for compilation error
* Do not skip reorder if the reorder's user is an optimizable concat
* Fix CI failures
* No need to wait for input_layout because the events are already resolved on dGPU
* Fixed corner case where only some of the multiple output layouts are static

Co-authored-by: Eddy Kim <eddy.kim@intel.com>
parent f3bafef128
commit 31e46ea255
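The idea, in one condition: a reorder whose resolved input and output layouts are identical moves no bytes and changes no metadata, so it can be marked as optimized out at runtime. A minimal illustrative sketch follows — not the plugin code (the real pass is primitive_inst::do_runtime_skip_reorder in the diff below); the header path and the helper name are assumptions.

// Sketch only: the decision this PR makes at runtime, reduced to its core.
// cldnn::layout::operator== covers data type, format, shape and padding, so a
// single comparison captures "data type and format are not changed".
// Header path is an assumption based on the intel_gpu runtime tree.
#include "intel_gpu/runtime/layout.hpp"

// Hypothetical helper, for illustration.
static bool reorder_is_skippable(const cldnn::layout& in, const cldnn::layout& out) {
    return in == out;  // identical layouts => reorder degenerates to a no-op
}

When the condition holds, the reorder's output simply aliases its producer's memory; the new unit test at the bottom of the diff checks exactly that by comparing buffer pointers.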
@@ -125,6 +125,7 @@ public:
     int disable_runtime_buffer_fusing;          // Disable runtime buffer fusing
     int disable_memory_reuse;                   // Disable memory reuse among layers
     int disable_build_time_weight_reorder_for_dynamic_nodes; // Disable build time weight reordering for dynamic nodes
+    int disable_runtime_skip_reorder;           // Disable runtime skip reorder
     std::set<int64_t> dump_iteration;           // Dump n-th execution of network.
     std::vector<std::string> load_layers_raw_dump; // List of layers to load dumped raw binary and filenames
     static const debug_configuration *get_instance();
@@ -210,6 +210,7 @@ public:
     void set_shape_change() { _shape_changed = true; }

     void build_deps();
+    void do_runtime_skip_reorder();
     void do_runtime_in_place_concat();
     void configure_shape_of_dependencies();

@@ -86,7 +86,13 @@ public:
     bool has_mean() const { return !get_typed_desc<reorder>()->mean.empty(); }

     void update_output_memory() override;
-    bool requires_reinterpret() const { return _req_reinterpr; }
+    bool requires_reinterpret() const {
+        auto req_reinterpr = _req_reinterpr;
+        if (input_memory().get_layout() != _impl_params->get_output_layout()) {
+            req_reinterpr = true;
+        }
+        return req_reinterpr;
+    }

     void save(cldnn::BinaryOutputBuffer& ob) const override;
     void load(cldnn::BinaryInputBuffer& ib) override;
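The extended requires_reinterpret() above now also fires when the actual input memory layout differs from the node's expected output layout — the fake-aligned fully connected case from the commit message. A hedged sketch of what reinterpretation means here, assuming cldnn::engine::reinterpret_buffer keeps the underlying allocation and only swaps layout metadata (the helper name is hypothetical):

// Sketch, not the plugin code: reuse the producer's buffer under a new layout
// instead of launching a copy kernel. Include paths are assumptions based on
// the intel_gpu runtime headers.
#include "intel_gpu/runtime/engine.hpp"
#include "intel_gpu/runtime/memory.hpp"

static cldnn::memory::ptr forward_or_reinterpret(cldnn::engine& engine,
                                                 cldnn::memory::ptr input,
                                                 const cldnn::layout& out_layout) {
    if (input->get_layout() == out_layout)
        return input;                                       // alias as-is
    return engine.reinterpret_buffer(*input, out_layout);   // same bytes, new view
}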
@@ -15,6 +15,7 @@
 #include "permute_inst.h"
 #include "resample_inst.h"
 #include "reshape_inst.h"
+#include "reorder_inst.h"
 #include "eltwise_inst.h"
 #include "deconvolution_inst.h"
 #include "shape_of_inst.h"
@@ -644,13 +645,54 @@ bool primitive_inst::update_impl() {
     return true;
 }

+void primitive_inst::do_runtime_skip_reorder() {
+    GPU_DEBUG_GET_INSTANCE(debug_config);
+    GPU_DEBUG_IF(debug_config->disable_runtime_skip_reorder) {
+        return;
+    }
+    if (can_be_optimized())
+        return;
+
+    if (_impl_params->fused_desc.size() > 0)
+        return;
+
+    // Set the successive reorder as can_be_optimized if the layouts are the same
+    for (auto u : get_user_insts()) {
+        if (u->get_node().is_type<reorder>()) {
+            if (is_input() && u->is_output())
+                continue;
+            // TODO: Skipped reorder + in_place concat is not supported yet. To be supported later.
+            if (u->get_users().size() == 1 && u->get_users().front()->is_type<concatenation>() && u->get_users().front()->can_be_optimized())
+                continue;
+            auto out_port_idx = u->get_node().get_dependency_with_port(0).second;
+            // If the current node's output is not dynamic, the memory is already allocated at build time
+            auto alloc_type = allocation_type::unknown;
+            if (!get_node().is_dynamic_output_layout(out_port_idx) && static_cast<int64_t>(_outputs.size()) > out_port_idx) {
+                alloc_type = _outputs[out_port_idx]->get_allocation_type();
+            }
+            if (alloc_type == allocation_type::usm_device && u->is_output())
+                continue;
+            GPU_DEBUG_TRACE_DETAIL << "[do runtime skip reorder] update shape for user " << u->id() << std::endl;
+            u->update_shape();
+            u->update_shape_done_by_other = true;
+            if (u->_impl_params->get_input_layout() == u->_impl_params->get_output_layout()) {
+                u->set_can_be_optimized(true);
+                GPU_DEBUG_TRACE_DETAIL << "[do runtime skip reorder] set user " << u->id() << " as can_be_optimized" << std::endl;
+            } else {
+                GPU_DEBUG_TRACE_DETAIL << "[do runtime skip reorder] user " << u->id() << " cannot be optimized" << std::endl;
+            }
+        }
+    }
+}
+
 void primitive_inst::do_runtime_in_place_concat() {
     GPU_DEBUG_GET_INSTANCE(debug_config);
     GPU_DEBUG_IF(debug_config->disable_runtime_buffer_fusing) {
         return;
     }
-    if (update_shape_done_by_other)
+    if (update_shape_done_by_other) {
         return;
+    }
     if (get_users().size() != 1) return;

     auto concat_inst = _network.get_primitive(get_users().front()->id());
@@ -720,6 +762,11 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
     do_runtime_in_place_concat();
     OPENVINO_ASSERT(_node != nullptr, "[GPU] Invalid primitive_inst object for dynamic shapes case: program_node can't be null");
     update_shape();
+
+    // Check whether the successor reorder's layouts are the same.
+    // can_be_optimized must be set on the user reorder from its predecessor because,
+    // if the user is can_be_optimized and an output node, the current node's output should be allocated on the host.
+    do_runtime_skip_reorder();
     if (_impl_params->output_layouts[0].count() == 0) {
         GPU_DEBUG_TRACE_DETAIL << id() << " : Skipping because output data is empty " << std::endl;
         auto ev = get_network().get_stream().create_user_event(true);
@@ -792,6 +839,8 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
     if (queue_type == QueueTypes::out_of_order || _impl->is_cpu() || (can_be_optimized() && needs_completion_event() && !is_output())) {
         dependencies.reserve(dependencies.size() + _exec_deps.size());
         for (auto& input : _exec_deps) {
+            if (input->is_input() && queue_type != QueueTypes::out_of_order)
+                continue;
             auto id = input->id();
             try {
                 // if the requested event does not exist, it means that it has not been executed, so the processing_order is
@@ -189,8 +189,9 @@ debug_configuration::debug_configuration()
     , disable_async_compilation(0)
     , disable_dynamic_impl(0)
     , disable_runtime_buffer_fusing(0)
     , disable_memory_reuse(0)
-    , disable_build_time_weight_reorder_for_dynamic_nodes(0) {
+    , disable_build_time_weight_reorder_for_dynamic_nodes(0)
+    , disable_runtime_skip_reorder(0) {
 #ifdef GPU_DEBUG_CONFIG
     get_gpu_debug_env_var("Help", help);
     get_common_debug_env_var("Verbose", verbose);
@@ -226,6 +226,7 @@ debug_configuration::debug_configuration()
     get_gpu_debug_env_var("DisableRuntimeBufferFusing", disable_runtime_buffer_fusing);
     get_gpu_debug_env_var("DisableMemoryReuse", disable_memory_reuse);
     get_gpu_debug_env_var("DisableBuildTimeWeightReorderForDynamicNodes", disable_build_time_weight_reorder_for_dynamic_nodes);
+    get_gpu_debug_env_var("DisableRuntimeSkipReorder", disable_runtime_skip_reorder);
     std::string dump_iteration_str;
     get_gpu_debug_env_var("DumpIteration", dump_iteration_str);
     std::string mem_preallocation_params_str;
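The knob follows the usual debug_configuration pattern: an integer field, an environment-variable lookup (above), and early-out checks at the use sites. A rough sketch of how such a flag is typically consumed; the OV_GPU_ environment-variable prefix is an assumption, not taken from this diff:

// Sketch only, mirroring the spirit of
// get_gpu_debug_env_var("DisableRuntimeSkipReorder", disable_runtime_skip_reorder).
#include <cstdlib>
#include <string>

static int read_debug_flag(const std::string& name) {
    const std::string full = "OV_GPU_" + name;   // prefix assumed
    const char* v = std::getenv(full.c_str());
    return v ? std::atoi(v) : 0;                 // unset => pass stays enabled
}

Setting the variable to a non-zero value disables the runtime skip-reorder pass, which is handy when bisecting accuracy or memory issues.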
@@ -0,0 +1,52 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "test_utils.h"
+
+#include <intel_gpu/primitives/input_layout.hpp>
+#include <intel_gpu/primitives/reorder.hpp>
+#include <intel_gpu/primitives/data.hpp>
+#include <intel_gpu/primitives/fully_connected.hpp>
+
+#include "program_wrapper.h"
+
+#include <cmath>
+#include <algorithm>
+
+using namespace cldnn;
+using namespace ::tests;
+
+namespace skip_reorder_tests {
+TEST(remove_redundant_reorder, skip_reorder_at_runtime) {
+    auto& engine = get_test_engine();
+
+    auto weight_mem = engine.allocate_memory({{2, 32}, data_types::f32, format::bfyx});
+    std::vector<float> weight_data(weight_mem->get_layout().count());
+    std::iota(weight_data.begin(), weight_data.end(), 1.0f);
+    set_values(weight_mem, weight_data);
+
+    auto input_l = layout{ov::PartialShape::dynamic(2), data_types::f32, format::bfyx};
+    topology topology(input_layout("input", input_l),
+                      data("weight", weight_mem),
+                      fully_connected("fc", input_info("input"), {"weight"}, "", data_types::f32),
+                      reorder("reorder", input_info("fc"), format::bfyx, data_types::f32)); /* output padding */
+
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+
+    network network(engine, topology, config);
+    auto reorder_inst = network.get_primitive("reorder");
+    ASSERT_EQ(reorder_inst->can_be_optimized(), false);
+
+    auto input_mem = engine.allocate_memory({{10, 32}, data_types::f32, format::bfyx});
+    std::vector<float> input_data(input_mem->get_layout().count());
+    std::iota(input_data.begin(), input_data.end(), 0.5f);
+    set_values(input_mem, input_data);
+
+    network.set_input_data("input", input_mem);
+    network.execute();
+    ASSERT_EQ(reorder_inst->can_be_optimized(), true);
+    ASSERT_EQ(network.get_output_memory("reorder")->buffer_ptr(), network.get_primitive("fc")->output_memory_ptr()->buffer_ptr());
+}
+}  // namespace skip_reorder_tests