[GPU] Allocate reorder & permute's output memory for bounded dynamic shape (#18793)
* Previously, reorder / permute did not allocate output memory at build time even though the shape has an upper bound * Update src/plugins/intel_gpu/src/graph/permute.cpp Co-authored-by: Sergey Shlyapnikov <Sergeishlyapnikov@gmail.com> * Fix as per review comment --------- Co-authored-by: Sergey Shlyapnikov <Sergeishlyapnikov@gmail.com>
This commit is contained in:
parent
2afcd950fa
commit
c145d8f1e9
@ -108,7 +108,8 @@ std::string permute_inst::to_string(permute_node const& node) {
|
||||
}
|
||||
|
||||
permute_inst::typed_primitive_inst(network& network, permute_node const& node) :
|
||||
parent(network, node, (!node.can_be_optimized() && node.get_output_layout().is_static()) ? true : false) {
|
||||
parent(network, node, !node.can_be_optimized()
|
||||
&& (node.get_output_layout().is_static() || node.get_output_layout().has_upper_bound())) {
|
||||
auto permute_order = argument->permute_order;
|
||||
|
||||
auto required_order_values_size = static_cast<uint32_t>(permute_order.size());
|
||||
|
@ -1147,13 +1147,12 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool,
|
||||
allocation_type type, bool reusable_across_network, bool reset = true, memory* curr_memory = nullptr) {
|
||||
OPENVINO_ASSERT(!layout.is_dynamic() || layout.has_upper_bound(), "[GPU] Can't allocate output for dynamic layout without upper bound");
|
||||
// Use layout with max tensor for dynamic shape with upper bound
|
||||
auto static_layout = cldnn::layout(layout.get_partial_shape().get_max_shape(), layout.data_type, layout.format, layout.data_padding);
|
||||
if (_node.get_program().get_config().get_property(ov::intel_gpu::enable_memory_pool)) {
|
||||
if (curr_memory != nullptr)
|
||||
pool.release_memory(curr_memory, id, net_id);
|
||||
return pool.get_memory(static_layout, id, net_id, dependencies, type, reusable_across_network, reset);
|
||||
return pool.get_memory(layout, id, net_id, dependencies, type, reusable_across_network, reset);
|
||||
}
|
||||
return pool.get_memory(static_layout, type, reset);
|
||||
return pool.get_memory(layout, type, reset);
|
||||
};
|
||||
|
||||
auto layout = impl_params.get_output_layout(idx);
|
||||
@ -1167,6 +1166,7 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool,
|
||||
return a;
|
||||
};
|
||||
|
||||
layout = cldnn::layout(layout.get_partial_shape().get_max_shape(), layout.data_type, layout.format, layout.data_padding);
|
||||
bool usm_device_allocatable = true;
|
||||
const auto& total_device_input_mem_size = std::accumulate(impl_params.input_layouts.begin(), impl_params.input_layouts.end(), (uint64_t)0, device_mem_acc);
|
||||
if (total_device_input_mem_size > _engine.get_device_info().max_global_mem_size)
|
||||
|
@ -209,9 +209,10 @@ reorder_inst::typed_primitive_inst(network& network) : parent(network) {
|
||||
_type = reorder::type_id();
|
||||
}
|
||||
|
||||
reorder_inst::typed_primitive_inst(network& network, reorder_node const& node)
|
||||
: parent(network, node, (!node.can_be_optimized() && node.get_output_layout().is_static()) ? true : false)
|
||||
, _req_reinterpr(node.requires_reinterpret()) {
|
||||
reorder_inst::typed_primitive_inst(network& network, reorder_node const& node) :
|
||||
parent(network, node, !node.can_be_optimized()
|
||||
&& (node.get_output_layout().is_static() || node.get_output_layout().has_upper_bound()))
|
||||
, _req_reinterpr(node.requires_reinterpret()) {
|
||||
if (node.can_be_optimized())
|
||||
reuse_input();
|
||||
|
||||
|
@ -0,0 +1,48 @@
|
||||
// Copyright (C) 2023 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "test_utils.h"
|
||||
|
||||
#include <intel_gpu/primitives/input_layout.hpp>
|
||||
#include <intel_gpu/primitives/reorder.hpp>
|
||||
#include <intel_gpu/primitives/data.hpp>
|
||||
|
||||
#include "program_wrapper.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <algorithm>
|
||||
|
||||
using namespace cldnn;
|
||||
using namespace ::tests;
|
||||
|
||||
namespace bounded_shape_mem_alloc_tests {
|
||||
|
||||
TEST(dyn_shape_bounded_shape_mem, reorder) {
|
||||
auto& engine = get_test_engine();
|
||||
auto input_lay = layout{ov::PartialShape{ov::Dimension(1, 10), ov::Dimension(1, 20)}, data_types::f32, format::bfyx};
|
||||
topology topology(input_layout("input", input_lay),
|
||||
reorder("reorder", input_info("input"), format::bfyx, data_types::f16));
|
||||
ExecutionConfig config = get_test_default_config(engine);
|
||||
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
|
||||
network network(engine, topology, config);
|
||||
|
||||
const auto reorder_mem = network.get_primitive("reorder")->output_memory_ptr();
|
||||
ASSERT_NE(reorder_mem, nullptr);
|
||||
ASSERT_EQ(reorder_mem->get_layout().count(), 10*20);
|
||||
}
|
||||
|
||||
TEST(dyn_shape_bounded_shape_mem, permute) {
|
||||
auto& engine = get_test_engine();
|
||||
auto input_lay = layout{ov::PartialShape{ov::Dimension(1, 10), ov::Dimension(1,15), ov::Dimension(1,5), ov::Dimension(1, 20)}, data_types::f32, format::bfyx};
|
||||
topology topology(input_layout("input", input_lay),
|
||||
permute("permute", input_info("input"), {0, 2, 3, 1}));
|
||||
ExecutionConfig config = get_test_default_config(engine);
|
||||
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
|
||||
network network(engine, topology, config);
|
||||
|
||||
const auto permute_mem = network.get_primitive("permute")->output_memory_ptr();
|
||||
ASSERT_NE(permute_mem, nullptr);
|
||||
ASSERT_EQ(permute_mem->get_layout().count(), 10*15*5*20);
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user