[GPU] Skip reorder at runtime if data type and format are not changed (#18859)

* Skip reorder at runtime if data type and format are not changed

* Update the shape of the reorder user at the predecessor node so that the predecessor node's output can be allocated to host memory if needed

* Reinterpret reorder memory at runtime if needed (e.g., when the input is a fake-aligned FC and the reorder reuses that memory)

* Add debug config

* Fix CI test failure

* Do not skip after optimized reshape

* Do not skip the user reorder if it is an output node while the current node is static and its memory is allocated on the device

* Do not skip the user reorder if the current node has fused primitives

* Update src/plugins/intel_gpu/src/graph/include/reorder_inst.h

Co-authored-by: Eddy Kim <eddy.kim@intel.com>

* Minor fix for compilation error

* Do not skip the reorder if its user is an optimizable concat

* Fix CI failures

* No need to wait for input_layout because its events are already resolved on dGPU

* Fixed corner case where only some of the multiple output layouts are static

---------

Co-authored-by: Eddy Kim <eddy.kim@intel.com>
Taylor Yeonbok Lee 2023-08-02 20:59:52 -07:00 committed by GitHub
parent f3bafef128
commit 31e46ea255
6 changed files with 114 additions and 4 deletions

@@ -125,6 +125,7 @@ public:
    int disable_runtime_buffer_fusing;          // Disable runtime buffer fusing
    int disable_memory_reuse;                   // Disable memmory reuse among layers
    int disable_build_time_weight_reorder_for_dynamic_nodes;  // Disable build time weight reordering for dynamic nodes
    int disable_runtime_skip_reorder;           // Disable runtime skip reorder
    std::set<int64_t> dump_iteration;           // Dump n-th execution of network.
    std::vector<std::string> load_layers_raw_dump;  // List of layers to load dumped raw binary and filenames
    static const debug_configuration *get_instance();

@@ -210,6 +210,7 @@ public:
    void set_shape_change() { _shape_changed = true; }
    void build_deps();
    void do_runtime_skip_reorder();
    void do_runtime_in_place_concat();
    void configure_shape_of_dependencies();

@@ -86,7 +86,13 @@ public:
    bool has_mean() const { return !get_typed_desc<reorder>()->mean.empty(); }
    void update_output_memory() override;
    bool requires_reinterpret() const {
        auto req_reinterpr = _req_reinterpr;
        if (input_memory().get_layout() != _impl_params->get_output_layout()) {
            req_reinterpr = true;
        }
        return req_reinterpr;
    }
    void save(cldnn::BinaryOutputBuffer& ob) const override;
    void load(cldnn::BinaryInputBuffer& ib) override;
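
Reading the new getter: the flag decided at build time is now OR-ed with a runtime comparison between the layout of the memory actually backing the input and the output layout this reorder is expected to produce. A minimal standalone sketch of that decision, using simplified stand-in types rather than cldnn's real layout and memory classes:

#include <iostream>

// Simplified stand-in for cldnn's layout, for illustration only.
struct layout {
    int batch, feature;
    bool operator!=(const layout& other) const {
        return batch != other.batch || feature != other.feature;
    }
};

// Mirrors the logic of the new requires_reinterpret(): the build-time flag
// is OR-ed with a runtime layout comparison, so a reorder that aliases
// memory carrying a different layout (e.g. a fake-aligned FC output) is
// still reinterpreted before use.
bool requires_reinterpret(bool build_time_flag,
                          const layout& input_mem_layout,
                          const layout& expected_out_layout) {
    bool req = build_time_flag;
    if (input_mem_layout != expected_out_layout)
        req = true;
    return req;
}

int main() {
    layout fc_padded{16, 32};  // hypothetical fake-aligned allocation from the FC
    layout actual{10, 32};     // shape known only at runtime
    std::cout << std::boolalpha
              << requires_reinterpret(false, fc_padded, actual) << "\n";  // true
}

The design point is that the cached _req_reinterpr alone is no longer trustworthy once a skipped reorder starts aliasing another node's buffer.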

@@ -15,6 +15,7 @@
#include "permute_inst.h"
#include "resample_inst.h"
#include "reshape_inst.h"
#include "reorder_inst.h"
#include "eltwise_inst.h"
#include "deconvolution_inst.h"
#include "shape_of_inst.h"

@@ -644,13 +645,54 @@ bool primitive_inst::update_impl() {
    return true;
}

void primitive_inst::do_runtime_skip_reorder() {
    GPU_DEBUG_GET_INSTANCE(debug_config);
    GPU_DEBUG_IF(debug_config->disable_runtime_skip_reorder) {
        return;
    }
    if (can_be_optimized())
        return;

    if (_impl_params->fused_desc.size() > 0)
        return;

    // set successive reorder can_be_optimized if layouts are same
    for (auto u : get_user_insts()) {
        if (u->get_node().is_type<reorder>()) {
            if (is_input() && u->is_output())
                continue;
            // TODO: Skipped reorder + in_place concat is not supported yet. To support later.
            if (u->get_users().size() == 1 && u->get_users().front()->is_type<concatenation>() && u->get_users().front()->can_be_optimized())
                continue;

            auto out_port_idx = u->get_node().get_dependency_with_port(0).second;
            // If current node's output_node is not dynamic, the memory is already allocated at build time
            auto alloc_type = allocation_type::unknown;
            if (!get_node().is_dynamic_output_layout(out_port_idx) && static_cast<int64_t>(_outputs.size()) > out_port_idx) {
                alloc_type = _outputs[out_port_idx]->get_allocation_type();
            }
            if (alloc_type == allocation_type::usm_device && u->is_output())
                continue;

            GPU_DEBUG_TRACE_DETAIL << "[do runtime skip reorder] update shape for user " << u->id() << std::endl;
            u->update_shape();
            u->update_shape_done_by_other = true;
            if (u->_impl_params->get_input_layout() == u->_impl_params->get_output_layout()) {
                u->set_can_be_optimized(true);
                GPU_DEBUG_TRACE_DETAIL << "[do runtime skip reorder] set user " << u->id() << " as can_be_optimized" << std::endl;
            } else {
                GPU_DEBUG_TRACE_DETAIL << "[do runtime skip reorder] user " << u->id() << " cannot be optimized" << std::endl;
            }
        }
    }
}

void primitive_inst::do_runtime_in_place_concat() {
    GPU_DEBUG_GET_INSTANCE(debug_config);
    GPU_DEBUG_IF(debug_config->disable_runtime_buffer_fusing) {
        return;
    }
    if (update_shape_done_by_other) {
        return;
    }
    if (get_users().size() != 1) return;

    auto concat_inst = _network.get_primitive(get_users().front()->id());

@@ -720,6 +762,11 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
    do_runtime_in_place_concat();
    OPENVINO_ASSERT(_node != nullptr, "[GPU] Invalid primitive_inst object for dynamic shapes case: program_node can't be null");
    update_shape();

    // Check successor reorder if layouts are same
    // Need to set can_be_optimized for user reorder at predesescor because
    // if the user is can_be_optimized and output node then current nodes' output should be allocated to host.
    do_runtime_skip_reorder();

    if (_impl_params->output_layouts[0].count() == 0) {
        GPU_DEBUG_TRACE_DETAIL << id() << " : Skipping becuase output data is empty " << std::endl;
        auto ev = get_network().get_stream().create_user_event(true);

@@ -792,6 +839,8 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
    if (queue_type == QueueTypes::out_of_order || _impl->is_cpu() || (can_be_optimized() && needs_completion_event() && !is_output())) {
        dependencies.reserve(dependencies.size() + _exec_deps.size());
        for (auto& input : _exec_deps) {
            if (input->is_input() && queue_type != QueueTypes::out_of_order)
                continue;
            auto id = input->id();
            try {
                // if the requested event does not exists it means that it has not been executed, so the processing_order is
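
Condensed, do_runtime_skip_reorder() only marks a user reorder as skippable when every guard above passes. A standalone model of that decision table, with a hypothetical flattened struct in place of the real primitive_inst/program_node queries:

#include <iostream>

// Hypothetical flattened view of the properties do_runtime_skip_reorder()
// inspects; the real code reads these from primitive_inst and program_node.
struct skip_query {
    bool cur_has_fused_ops;         // current node has fused primitives
    bool cur_is_input;              // current node is a network input
    bool user_is_output;            // the reorder user is a network output
    bool user_feeds_opt_concat;     // reorder's only user is an optimized concat
    bool cur_output_on_usm_device;  // statically allocated output lives in device memory
    bool same_in_out_layout;        // reorder's runtime input layout == output layout
};

bool can_skip_user_reorder(const skip_query& q) {
    if (q.cur_has_fused_ops) return false;                 // fusion may change type/format
    if (q.cur_is_input && q.user_is_output) return false;  // direct input->output copy must stay
    if (q.user_feeds_opt_concat) return false;             // skipped reorder + in-place concat unsupported
    if (q.user_is_output && q.cur_output_on_usm_device) return false;  // output must stay host-readable
    return q.same_in_out_layout;                           // finally: data type and format unchanged
}

int main() {
    skip_query q{false, false, false, false, false, true};
    std::cout << std::boolalpha << can_skip_user_reorder(q) << "\n";  // true
}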

@@ -189,8 +189,8 @@ debug_configuration::debug_configuration()
    , disable_async_compilation(0)
    , disable_dynamic_impl(0)
    , disable_runtime_buffer_fusing(0)
    , disable_build_time_weight_reorder_for_dynamic_nodes(0)
    , disable_runtime_skip_reorder(0) {
#ifdef GPU_DEBUG_CONFIG
    get_gpu_debug_env_var("Help", help);
    get_common_debug_env_var("Verbose", verbose);

@@ -226,6 +226,7 @@ debug_configuration::debug_configuration()
    get_gpu_debug_env_var("DisableRuntimeBufferFusing", disable_runtime_buffer_fusing);
    get_gpu_debug_env_var("DisableMemoryReuse", disable_memory_reuse);
    get_gpu_debug_env_var("DisableBuildTimeWeightReorderForDynamicNodes", disable_build_time_weight_reorder_for_dynamic_nodes);
    get_gpu_debug_env_var("DisableRuntimeSkipReorder", disable_runtime_skip_reorder);
    std::string dump_iteration_str;
    get_gpu_debug_env_var("DumpIteration", dump_iteration_str);
    std::string mem_preallocation_params_str;
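
Judging by the other entries in this list, the new knob should be reachable through the plugin's debug environment, presumably as OV_GPU_DisableRuntimeSkipReorder=1 (the OV_GPU_ prefix is how these get_gpu_debug_env_var names are exposed in debug builds). Setting it forces reorder kernels to always execute, which is useful when bisecting whether this optimization caused an accuracy or memory regression.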

@@ -0,0 +1,52 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "test_utils.h"

#include <intel_gpu/primitives/input_layout.hpp>
#include <intel_gpu/primitives/reorder.hpp>
#include <intel_gpu/primitives/data.hpp>
#include <intel_gpu/primitives/fully_connected.hpp>

#include "program_wrapper.h"

#include <cmath>
#include <algorithm>
#include <numeric>  // std::iota

using namespace cldnn;
using namespace ::tests;

namespace skip_reorder_tests {
TEST(remove_redundant_reorder, skip_reorder_at_runtime) {
    auto& engine = get_test_engine();
    auto weight_mem = engine.allocate_memory({{2, 32}, data_types::f32, format::bfyx});
    std::vector<float> weight_data(weight_mem->get_layout().count());
    std::iota(weight_data.begin(), weight_data.end(), 1.0f);
    set_values(weight_mem, weight_data);

    auto input_l = layout{ov::PartialShape::dynamic(2), data_types::f32, format::bfyx};
    topology topology(input_layout("input", input_l),
                      data("weight", weight_mem),
                      fully_connected("fc", input_info("input"), {"weight"}, "", data_types::f32),
                      reorder("reorder", input_info("fc"), format::bfyx, data_types::f32)); /*output padding*/

    ExecutionConfig config = get_test_default_config(engine);
    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
    network network(engine, topology, config);

    auto reorder_inst = network.get_primitive("reorder");
    ASSERT_EQ(reorder_inst->can_be_optimized(), false);

    auto input_mem = engine.allocate_memory({{10, 32}, data_types::f32, format::bfyx});
    std::vector<float> input_data(input_mem->get_layout().count());
    std::iota(input_data.begin(), input_data.end(), 0.5f);
    set_values(input_mem, input_data);

    network.set_input_data("input", input_mem);
    network.execute();
    ASSERT_EQ(reorder_inst->can_be_optimized(), true);
    ASSERT_EQ(network.get_output_memory("reorder")->buffer_ptr(), network.get_primitive("fc")->output_memory_ptr()->buffer_ptr());
}
}  // namespace skip_reorder_tests
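
The two final assertions carry the test: after the first execute() resolves the dynamic shape, the reorder must be flagged can_be_optimized at runtime, and its output must be the very same buffer as the fully connected layer's output, i.e. the skip is zero-copy. Assuming the usual GPU unit-test binary layout, it should run with a filter along the lines of --gtest_filter=remove_redundant_reorder.skip_reorder_at_runtime (the binary name is not shown in the diff).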