[GPU] Allocate reorder & permute's output memory for bounded dynamic shape (#18793)
* Previously, reorder / permute did not allocate output memory at build time even though the shape has an upper bound * Update src/plugins/intel_gpu/src/graph/permute.cpp Co-authored-by: Sergey Shlyapnikov <Sergeishlyapnikov@gmail.com> * Fix as per review comment --------- Co-authored-by: Sergey Shlyapnikov <Sergeishlyapnikov@gmail.com>
This commit is contained in:
parent
2afcd950fa
commit
c145d8f1e9
@ -108,7 +108,8 @@ std::string permute_inst::to_string(permute_node const& node) {
|
||||
}
|
||||
|
||||
permute_inst::typed_primitive_inst(network& network, permute_node const& node) :
|
||||
parent(network, node, (!node.can_be_optimized() && node.get_output_layout().is_static()) ? true : false) {
|
||||
parent(network, node, !node.can_be_optimized()
|
||||
&& (node.get_output_layout().is_static() || node.get_output_layout().has_upper_bound())) {
|
||||
auto permute_order = argument->permute_order;
|
||||
|
||||
auto required_order_values_size = static_cast<uint32_t>(permute_order.size());
|
||||
|
@ -1147,13 +1147,12 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool,
|
||||
allocation_type type, bool reusable_across_network, bool reset = true, memory* curr_memory = nullptr) {
|
||||
OPENVINO_ASSERT(!layout.is_dynamic() || layout.has_upper_bound(), "[GPU] Can't allocate output for dynamic layout without upper bound");
|
||||
// Use layout with max tensor for dynamic shape with upper bound
|
||||
auto static_layout = cldnn::layout(layout.get_partial_shape().get_max_shape(), layout.data_type, layout.format, layout.data_padding);
|
||||
if (_node.get_program().get_config().get_property(ov::intel_gpu::enable_memory_pool)) {
|
||||
if (curr_memory != nullptr)
|
||||
pool.release_memory(curr_memory, id, net_id);
|
||||
return pool.get_memory(static_layout, id, net_id, dependencies, type, reusable_across_network, reset);
|
||||
return pool.get_memory(layout, id, net_id, dependencies, type, reusable_across_network, reset);
|
||||
}
|
||||
return pool.get_memory(static_layout, type, reset);
|
||||
return pool.get_memory(layout, type, reset);
|
||||
};
|
||||
|
||||
auto layout = impl_params.get_output_layout(idx);
|
||||
@ -1167,6 +1166,7 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool,
|
||||
return a;
|
||||
};
|
||||
|
||||
layout = cldnn::layout(layout.get_partial_shape().get_max_shape(), layout.data_type, layout.format, layout.data_padding);
|
||||
bool usm_device_allocatable = true;
|
||||
const auto& total_device_input_mem_size = std::accumulate(impl_params.input_layouts.begin(), impl_params.input_layouts.end(), (uint64_t)0, device_mem_acc);
|
||||
if (total_device_input_mem_size > _engine.get_device_info().max_global_mem_size)
|
||||
|
@ -209,9 +209,10 @@ reorder_inst::typed_primitive_inst(network& network) : parent(network) {
|
||||
_type = reorder::type_id();
|
||||
}
|
||||
|
||||
reorder_inst::typed_primitive_inst(network& network, reorder_node const& node)
|
||||
: parent(network, node, (!node.can_be_optimized() && node.get_output_layout().is_static()) ? true : false)
|
||||
, _req_reinterpr(node.requires_reinterpret()) {
|
||||
reorder_inst::typed_primitive_inst(network& network, reorder_node const& node) :
|
||||
parent(network, node, !node.can_be_optimized()
|
||||
&& (node.get_output_layout().is_static() || node.get_output_layout().has_upper_bound()))
|
||||
, _req_reinterpr(node.requires_reinterpret()) {
|
||||
if (node.can_be_optimized())
|
||||
reuse_input();
|
||||
|
||||
|
@ -0,0 +1,48 @@
|
||||
// Copyright (C) 2023 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "test_utils.h"
|
||||
|
||||
#include <intel_gpu/primitives/input_layout.hpp>
|
||||
#include <intel_gpu/primitives/reorder.hpp>
|
||||
#include <intel_gpu/primitives/data.hpp>
|
||||
|
||||
#include "program_wrapper.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <algorithm>
|
||||
|
||||
using namespace cldnn;
|
||||
using namespace ::tests;
|
||||
|
||||
namespace bounded_shape_mem_alloc_tests {
|
||||
|
||||
TEST(dyn_shape_bounded_shape_mem, reorder) {
|
||||
auto& engine = get_test_engine();
|
||||
auto input_lay = layout{ov::PartialShape{ov::Dimension(1, 10), ov::Dimension(1, 20)}, data_types::f32, format::bfyx};
|
||||
topology topology(input_layout("input", input_lay),
|
||||
reorder("reorder", input_info("input"), format::bfyx, data_types::f16));
|
||||
ExecutionConfig config = get_test_default_config(engine);
|
||||
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
|
||||
network network(engine, topology, config);
|
||||
|
||||
const auto reorder_mem = network.get_primitive("reorder")->output_memory_ptr();
|
||||
ASSERT_NE(reorder_mem, nullptr);
|
||||
ASSERT_EQ(reorder_mem->get_layout().count(), 10*20);
|
||||
}
|
||||
|
||||
TEST(dyn_shape_bounded_shape_mem, permute) {
|
||||
auto& engine = get_test_engine();
|
||||
auto input_lay = layout{ov::PartialShape{ov::Dimension(1, 10), ov::Dimension(1,15), ov::Dimension(1,5), ov::Dimension(1, 20)}, data_types::f32, format::bfyx};
|
||||
topology topology(input_layout("input", input_lay),
|
||||
permute("permute", input_info("input"), {0, 2, 3, 1}));
|
||||
ExecutionConfig config = get_test_default_config(engine);
|
||||
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
|
||||
network network(engine, topology, config);
|
||||
|
||||
const auto permute_mem = network.get_primitive("permute")->output_memory_ptr();
|
||||
ASSERT_NE(permute_mem, nullptr);
|
||||
ASSERT_EQ(permute_mem->get_layout().count(), 10*15*5*20);
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user