[GPU] Optimize realloc for dynamic shape (#15169)

* Optimize realloc for dynamic shapes:
- Allocate up front at the upper bound for bounded dynamic shapes
- Reuse internal buffers

* Fix internal buffer of the NMS kernel so it can be reused
- Fix bug in the NMS quick sort

* Additional fix for internal buffer reuse

* Apply legacy dynamic batch only when the 0th dimension is dynamic with an upper bound

* Fix unit test error

* Apply the NMS fix of padding -1 across the whole buffer only when the internal buffer is reused

* Do not add a separate get_max_tensor(), because there is currently no need for that separate API.
The max tensor is only needed for memory allocation, and there is no need for a minimum tensor size for now.

* Fix allocation of internal buffers to be done for each layout
Author: Taylor Yeonbok Lee
Date: 2023-01-27 00:40:31 -08:00
Committed by: GitHub
Parent: ce4c082cb2
Commit: cd9e772802
13 changed files with 229 additions and 71 deletions


@@ -476,6 +476,14 @@ struct layout {
     bool is_dynamic() const;

+    bool has_upper_bound() const {
+        for (auto i : size) {
+            if (i.get_max_length() == -1)
+                return false;
+        }
+        return true;
+    }

     bool is_static() const;

     ov::PartialShape get_partial_shape() const;
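
has_upper_bound() is the predicate the rest of this change hangs on: a dynamic layout whose every dimension has a finite maximum can be treated as statically sized at that maximum. A minimal sketch of the intended usage, with a hypothetical helper name and the cldnn engine API that appears later in this diff:

// Hypothetical helper: allocate once at the upper bound so smaller runtime
// shapes can reuse the same memory without reallocating.
cldnn::memory::ptr preallocate_worst_case(cldnn::engine& engine, const cldnn::layout& l) {
    if (l.is_dynamic() && !l.has_upper_bound())
        return nullptr;  // unbounded: the size is unknowable until execution
    // For bounded dynamic layouts, get_tensor()/bytes_count() report the
    // per-dimension maxima (see the layout.cpp hunk below).
    return engine.allocate_memory(l, engine.get_preferred_memory_allocation_type());
}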


@@ -145,23 +145,22 @@ std::string crop_inst::to_string(crop_node const& node) {
     auto ref_in_sizes = desc->reference_input;
     const auto& offsets = desc->offsets;
     const auto in_layout = node.input().get_output_layout();
-    const auto& in_sizes = in_layout.get_tensor();

     auto node_info = node.desc_to_json();

-    // Check for borders variant of crop.
-    if (ref_in_sizes.batch[0] < 0 || ref_in_sizes.feature[0] < 0 || ref_in_sizes.spatial[0] < 0 ||
-        ref_in_sizes.spatial[1] < 0 || ref_in_sizes.spatial[2] < 0) {
-        // Ignore not supported dimensions.
-        const auto rb_sizes = ref_in_sizes.negate().sub({0, 0, 0, 0, 0});
-        const auto lt_sizes = offsets.sub({0, 0, 0, 0, 0});
-        ref_in_sizes = in_sizes - (rb_sizes + lt_sizes);
-    }

     std::stringstream primitive_description;
     json_composite crop_info;

+    if (!in_layout.is_dynamic()) {
+        const auto& in_sizes = in_layout.get_tensor();
+        // Check for borders variant of crop.
+        if (ref_in_sizes.batch[0] < 0 || ref_in_sizes.feature[0] < 0 || ref_in_sizes.spatial[0] < 0 ||
+            ref_in_sizes.spatial[1] < 0 || ref_in_sizes.spatial[2] < 0) {
+            // Ignore not supported dimensions.
+            const auto rb_sizes = ref_in_sizes.negate().sub({0, 0, 0, 0, 0});
+            const auto lt_sizes = offsets.sub({0, 0, 0, 0, 0});
+            ref_in_sizes = in_sizes - (rb_sizes + lt_sizes);
+        }
+    }

     crop_info.add("reference input size", ref_in_sizes.to_string());
     crop_info.add("offset", offsets.to_string());


@@ -144,6 +144,10 @@ public:
     params.sort_result_descending = primitive->sort_result_descending;
     params.box_encoding = primitive->center_point_box ? kernel_selector::BoxEncodingType::BOX_ENCODING_CENTER
                                                       : kernel_selector::BoxEncodingType::BOX_ENCODING_CORNER;
+    if (impl_param.get_program().get_node(primitive->id).is_dynamic()) {
+        params.reuse_internal_buffer = true;
+    }

     auto& kernel_selector = kernel_selector::non_max_suppression_kernel_selector::Instance();
     auto best_kernel = kernel_selector.get_best_kernel(params, optional_params);


@@ -276,8 +276,10 @@ protected:
     bool _is_constant = false;

     size_t max_output_layout_size = 0;
+    std::vector<size_t> max_intermediates_memory_sizes;

     std::vector<memory::ptr> allocate_outputs(kernel_impl_params* updated_params = nullptr);
+    memory::ptr allocate_internal_buffer(size_t idx);
     static std::vector<std::shared_ptr<primitive_inst>> build_exec_deps(
         std::vector<std::pair<std::shared_ptr<primitive_inst>, int32_t>> const& mem_deps);

     void convert_args(const kernel_arguments_data& args, kernel_arguments_data_idx& args_idx) const;
@@ -443,8 +445,9 @@ protected:
 private:
     bool do_allocate_memory(typed_node const& typ_node) {
-        if (typ_node.get_output_layout().is_dynamic())
-            return false;
+        if (typ_node.get_output_layout().is_dynamic() && !typ_node.get_output_layout().has_upper_bound()) {
+            return false;
+        }

         if (typ_node.template have_user_with_type<concatenation>() && typ_node.get_users().size() == 1 &&
             typ_node.get_users().front()->can_be_optimized()) {  // check if the only user is concat


@@ -781,14 +781,25 @@ void network::allocate_primitives() {
     std::sort(nodes_to_allocate.begin(),
               nodes_to_allocate.end(),
               [&po](std::shared_ptr<program_node> const& lhs, std::shared_ptr<program_node> const& rhs) {
-                  if (rhs->get_output_layout().is_dynamic() && lhs->get_output_layout().is_dynamic())
+                  auto lhs_layout = lhs->get_output_layout();
+                  auto rhs_layout = rhs->get_output_layout();
+                  if (lhs_layout.is_dynamic() && lhs_layout.has_upper_bound()) {
+                      lhs_layout.set_tensor(lhs_layout.get_tensor());
+                  }
+                  if (rhs_layout.is_dynamic() && rhs_layout.has_upper_bound()) {
+                      rhs_layout.set_tensor(rhs_layout.get_tensor());
+                  }
+                  if (rhs_layout.is_dynamic() && !rhs_layout.has_upper_bound() && lhs_layout.is_dynamic() && !lhs_layout.has_upper_bound()) {
                       return po.get_processing_number(lhs.get()) < po.get_processing_number(rhs.get());
-                  if (rhs->get_output_layout().is_dynamic())
+                  }
+                  if (rhs_layout.is_dynamic())
                       return true;
-                  if (lhs->get_output_layout().is_dynamic())
+                  if (lhs_layout.is_dynamic())
                       return false;

-                  return (lhs->get_output_layout().bytes_count() > rhs->get_output_layout().bytes_count());
+                  return (lhs_layout.bytes_count() > rhs_layout.bytes_count());
               });

     for (auto const& node : nodes_to_allocate) {
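
The new comparator treats a bounded dynamic node as statically sized at its upper bound, so it competes in the largest-first allocation ordering; only nodes with no upper bound at all fall back to processing order. A self-contained sketch of the same policy over a hypothetical Node type:

#include <algorithm>
#include <cstddef>
#include <vector>

struct Node {
    bool unbounded_dynamic;   // dynamic with no upper bound
    size_t worst_case_bytes;  // static size, or size at the upper bound
    int processing_number;
};

void sort_for_allocation(std::vector<Node>& nodes) {
    std::sort(nodes.begin(), nodes.end(), [](const Node& lhs, const Node& rhs) {
        if (lhs.unbounded_dynamic && rhs.unbounded_dynamic)
            return lhs.processing_number < rhs.processing_number;
        if (rhs.unbounded_dynamic) return true;   // sized lhs goes first
        if (lhs.unbounded_dynamic) return false;  // sized rhs goes first
        return lhs.worst_case_bytes > rhs.worst_case_bytes;  // largest first
    });
}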


@@ -261,7 +261,27 @@ void primitive_inst::realloc_if_needed() {
         max_output_layout_size = updated_params.output_layouts[0].count();
     }

     // intermediate memory allocation is required for primitives consisting of multiple kernels in dynamic case
-    allocate_internal_buffers();
+    {
+        const auto& ibuf_layouts = _impl->get_internal_buffer_layouts();
+        if (ibuf_layouts.empty())
+            return;
+        for (size_t i = 0; i < ibuf_layouts.size(); ++i) {
+            if (i < _intermediates_memory.size() && ibuf_layouts[i].bytes_count() <= max_intermediates_memory_sizes[i]) {
+                // can reuse
+                _intermediates_memory[i] = _network.get_engine().reinterpret_buffer(*_intermediates_memory[i], ibuf_layouts[i]);
+            } else {
+                if (i < _intermediates_memory.size()) {
+                    _intermediates_memory[i] = allocate_internal_buffer(i);
+                    max_intermediates_memory_sizes[i] = _intermediates_memory[i]->size();
+                } else {
+                    // i-th layout has not been allocated yet
+                    _intermediates_memory.push_back(allocate_internal_buffer(i));
+                    max_intermediates_memory_sizes.push_back(_intermediates_memory[i]->size());
+                }
+            }
+        }
+    }
 }

 void primitive_inst::update_impl() {
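
The reuse test above is a grow-only high-water-mark scheme: each buffer slot remembers the largest size it has ever been allocated with, smaller requests reinterpret the existing allocation in place, and larger ones reallocate and raise the mark. A standalone sketch of that policy, with hypothetical Buffer/BufferCache types:

#include <cstddef>
#include <vector>

struct Buffer { size_t capacity; };

struct BufferCache {
    std::vector<Buffer> buffers;
    std::vector<size_t> max_sizes;  // high-water mark per slot

    Buffer& request(size_t idx, size_t bytes) {
        if (idx < buffers.size() && bytes <= max_sizes[idx])
            return buffers[idx];  // fits: reuse without touching the allocator
        Buffer fresh{bytes};      // grow (or first use): allocate and raise the mark
        if (idx < buffers.size()) {
            buffers[idx] = fresh;
            max_sizes[idx] = bytes;
        } else {
            buffers.push_back(fresh);
            max_sizes.push_back(bytes);
        }
        return buffers[idx];
    }
};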
@@ -584,15 +604,15 @@ primitive_inst::primitive_inst(network& network, program_node const& node, bool
     }

     if (_outputs[0])
-        max_output_layout_size = _outputs[0]->get_layout().count();
+        max_output_layout_size = _outputs[0]->get_layout().get_tensor().count();
 }

-void primitive_inst::allocate_internal_buffers(void) {
+memory::ptr primitive_inst::allocate_internal_buffer(size_t idx) {
     if (_impl == nullptr || _outputs.empty() || _outputs[0] == nullptr)
-        return;
+        return nullptr;
     const auto& ibuf_layouts = _impl->get_internal_buffer_layouts();
     if (ibuf_layouts.empty())
-        return;
+        return nullptr;

     auto device_mem_acc = [&](size_t a, std::pair<std::shared_ptr<primitive_inst>, int32_t> b) {
         if (!b.first->mem_allocated()) return a;
@@ -629,18 +649,30 @@ void primitive_inst::allocate_internal_buffers(void) {
             }
         }
     }

+    // allocate intermediate memory for the updated layout of buffer
+    auto layout = ibuf_layouts[idx];
+    GPU_DEBUG_LOG << "[" << _node->id() << ": internal buf " << idx << "]" << std::endl;
+    auto alloc_type = allocation_type::unknown;
+    if (input_device_mem && (available_device_mem_size - (int64_t)layout.bytes_count() >= 0)) {
+        alloc_type = engine.get_preferred_memory_allocation_type();
+    } else {
+        alloc_type = engine.get_lockable_preferred_memory_allocation_type();
+    }
+    return engine.allocate_memory(layout, alloc_type);
+}
+
+void primitive_inst::allocate_internal_buffers(void) {
+    if (_impl == nullptr || _outputs.empty() || _outputs[0] == nullptr)
+        return;
+    const auto& ibuf_layouts = _impl->get_internal_buffer_layouts();
+    if (ibuf_layouts.empty())
+        return;
+
     // allocate intermediate memory for the updated layout of buffer
     std::vector<memory::cptr> intermediates_memory;
-    for (auto layout : ibuf_layouts) {
-        GPU_DEBUG_LOG << "[" << _node->id() << ": internal buf]" << std::endl;
-        auto alloc_type = allocation_type::unknown;
-        if (input_device_mem && (available_device_mem_size - (int64_t)layout.bytes_count() >= 0)) {
-            alloc_type = engine.get_preferred_memory_allocation_type();
-        } else {
-            alloc_type = engine.get_lockable_preferred_memory_allocation_type();
-        }
-        intermediates_memory.push_back(engine.allocate_memory(layout, alloc_type));
+    for (size_t i = 0; i < ibuf_layouts.size(); ++i) {
+        intermediates_memory.push_back(allocate_internal_buffer(i));
+        max_intermediates_memory_sizes.push_back(intermediates_memory[i]->size());
     }
     _intermediates_memory = intermediates_memory;
 }
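
allocate_internal_buffer() keeps the existing allocation-type fallback: device-local memory is preferred while the request still fits in the remaining device budget, otherwise a lockable (host-visible) allocation is used so oversubscription degrades instead of failing. The decision reduces to a small pure function, sketched here with assumed byte-count inputs:

// Sketch of the fallback above; inputs are assumptions standing in for the
// engine's accounting (available_device_mem_size, input_device_mem).
enum class alloc_type { device_local, lockable };

alloc_type choose_alloc_type(long long available_device_bytes,
                             long long request_bytes,
                             bool inputs_are_device_mem) {
    if (inputs_are_device_mem && available_device_bytes - request_bytes >= 0)
        return alloc_type::device_local;  // fast path: stays on-device
    return alloc_type::lockable;          // fallback: host-visible memory
}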
@@ -747,13 +779,16 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool,
                                              uint32_t net_id, bool is_internal, size_t idx) {
     auto get_memory_from_pool = [&](engine& _engine, const layout& layout, const primitive_id id, std::set<primitive_id> dependencies,
                                     allocation_type type, bool reusable) {
+        OPENVINO_ASSERT(!layout.is_dynamic() || layout.has_upper_bound(), "[GPU] Can't allocate output for dynamic layout without upper bound");
+        // Use layout with max tensor for dynamic shape with upper bound
+        auto static_layout = cldnn::layout(layout.data_type, layout.format, layout.get_tensor(), layout.data_padding);
         if (_node.get_program().get_config().get_property(ov::intel_gpu::enable_memory_pool))
-            return pool.get_memory(layout, id, net_id, dependencies, type, reusable);
-        return pool.get_memory(layout, type);
+            return pool.get_memory(static_layout, id, net_id, dependencies, type, reusable);
+        return pool.get_memory(static_layout, type);
     };

     auto layout = impl_params.get_output_layout(idx);
-    OPENVINO_ASSERT(layout.is_static(), "[GPU] Can't allocate output for dynamic layout");
+    OPENVINO_ASSERT(layout.is_static() || layout.has_upper_bound(), "[GPU] Can't allocate output for dynamic layout");

     auto device_mem_acc = [&](size_t a, const cldnn::layout& l) {
         // Input shape may be dynamic in some cases (shape_of). It means that output shape of node doesn't depend on input shape
         // and out memory can be allocated on program build stage.


@@ -478,14 +478,21 @@ KERNEL (non_max_suppression_ref_stage_2)(
             if (convert_float(next_candidate.score) > SCORE_THRESHOLD_VAL) {
                 --i;
                 sortedBoxList[i] = next_candidate;

-                FUNC_CALL(quickSortIterative)(sortedBoxList, i, kSortedBoxNum);
+                FUNC_CALL(quickSortIterative)(sortedBoxList, i, kSortedBoxNum - 1);
             }
         }
     }

     // Set pad value to indicate the end of selected box list.
     if (selectedBoxNum < NUM_BOXES) {
-        selectedBoxList[selectedBoxNum].batchId = -1;
+        int b = selectedBoxNum;
+#ifdef REUSE_INTERNAL_BUFFER
+        for (; b < NUM_BOXES; ++b) {
+            selectedBoxList[b].batchId = -1;
+        }
+#else
+        selectedBoxList[b].batchId = -1;
+#endif
     }
 }
 #endif /* NMS_STAGE_2 */
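
Two fixes land in this kernel. The quick sort call now passes kSortedBoxNum - 1, evidently because quickSortIterative takes an inclusive upper index, so the old call touched one element past the live range. And when the internal buffer is reused, stale boxes from a previous, larger execution can survive past the new end of the list, so a single -1 sentinel is no longer enough; every trailing slot must be invalidated. A C++ sketch of that padding logic, with hypothetical Box/finalize names:

#include <cstddef>
#include <vector>

struct Box { int batchId; };

// Caller guarantees selected_num < selected.size(), mirroring the
// `selectedBoxNum < NUM_BOXES` guard in the kernel above.
void finalize(std::vector<Box>& selected, size_t selected_num, bool reused_buffer) {
    if (reused_buffer) {
        // Stale entries from a bigger previous run may follow slot selected_num,
        // so mark every trailing slot invalid.
        for (size_t b = selected_num; b < selected.size(); ++b)
            selected[b].batchId = -1;
    } else {
        // Fresh allocation sized for this run: one sentinel terminates the list.
        selected[selected_num].batchId = -1;
    }
}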


@@ -292,6 +292,8 @@ KernelsData NonMaxSuppressionKernelRef::GetKernelsData(const Params& params, con
                                        , MakeJitConstant("LOCAL_CLASS_NUM", dispatchData.lws[1])
                                        , MakeJitConstant("LOCAL_WORK_NUM", dispatchData.lws[2])
                                        , MakeJitConstant("PARTITION_STEP", GetPartitionStep(static_cast<int>(dispatchData.lws[2])))});
+        } else if (i == 2 && orgParams.reuse_internal_buffer) {
+            cldnn_jit.AddConstant({ MakeJitConstant("REUSE_INTERNAL_BUFFER", 1)});
         }

         cldnn_jit.AddConstant(MakeJitConstant("NMS_STAGE_" + std::to_string(i), "true"));
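
This is the host half of the switch: the reuse_internal_buffer flag becomes a preprocessor define for the stage-2 kernel, which then compiles in the full-tail padding loop under #ifdef REUSE_INTERNAL_BUFFER. A simplified sketch of the general pattern, illustrative only and not the actual kernel_selector internals:

#include <string>

// Hypothetical: turn a host-side flag into an OpenCL build option. The real
// code goes through MakeJitConstant, but the effect is a -D define.
std::string build_options(bool reuse_internal_buffer) {
    std::string opts = "-cl-std=CL2.0";
    if (reuse_internal_buffer)
        opts += " -DREUSE_INTERNAL_BUFFER=1";  // enables the padding path in stage 2
    return opts;
}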


@@ -41,6 +41,7 @@ struct non_max_suppression_params : public base_params {
     bool has_second_output;
     bool has_third_output;
     bool use_multiple_outputs;
+    bool reuse_internal_buffer = false;

     uint32_t GetIndexNumSelectPerClass() const {
         uint32_t input_idx = 2;

@@ -75,51 +75,45 @@ bool Program::IsDynBatchModel(const std::shared_ptr<ov::Model>& model,
             return false;
         }
         ov::PartialShape pshape = param->get_output_partial_shape(0);
-        int dynCount = 0;
-        int64_t batch_idx = -1;
-        for (size_t i = 0; i < pshape.size(); i++) {
+        bool only_batch_dynamic = pshape.size() && pshape[0].is_dynamic();
+        for (size_t i = 1; i < pshape.size(); i++) {
             if (pshape[i].is_dynamic()) {
-                dynCount++;
-                if (batch_idx < 0) {
-                    batch_idx = i;
-                }
+                // only support 0th dimension for legacy dynamic batch
+                return false;
             }
         }
-        switch (dynCount) {
-        case 1:
-            // exactly one dynamic dim
-            {
-                int64_t max_b = pshape[batch_idx].get_max_length();
-                if (max_b > 1) {
-                    batch_dim[pname].first = batch_idx;
-                    batch_dim[pname].second = max_b;
-                    pshape[batch_idx] = 1;
-                }
-            }
-        case 0:
-            // no dynamic dims - possible legacy case
-            shapes[pname] = pshape;
-            break;
-        default:
-            break;
+        if (only_batch_dynamic) {
+            int64_t max_b = pshape[0].get_max_length();
+            if (max_b > 1) {
+                batch_dim[pname].first = 0;
+                batch_dim[pname].second = max_b;
+                pshape[0] = 1;
+            } else {
+                // unbounded dynamic shape should be handled with new dynamic shape path
+                return false;
+            }
         }
+        shapes[pname] = pshape;
     }

     if (batch_dim.empty())
         return false;

     bool dyn_shape_batch_found = false;
     // detect 1st dyn dim, mark it and continue
     auto bitr = batch_dim.begin();
-    dyn_shape_batch_found = bitr->second.first >= 0;
+    dyn_shape_batch_found = (bitr->second.first == 0);
     auto batch_val_1st = bitr->second.second;
     bitr++;
     for (; bitr != batch_dim.end(); bitr++) {
-        if (bitr->second.first >= 0) {
+        if (bitr->second.first == 0) {
             if (bitr->second.second != batch_val_1st) {
                 dyn_shape_batch_found = false;
                 break;
             } else {
                 dyn_shape_batch_found = true;
             }
+        } else {
+            return false;
         }
     }
     return dyn_shape_batch_found;
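
The rewritten check accepts the legacy dynamic-batch path only when dimension 0 is the sole dynamic dimension and has a finite upper bound greater than 1, and the surrounding loop additionally requires every parameter to agree on that bound. The per-shape test in isolation, sketched against the ov::PartialShape API under a hypothetical name:

#include <openvino/core/partial_shape.hpp>

// True only when dim 0 is dynamic with a finite upper bound > 1
// and every other dimension is static.
bool is_legacy_dyn_batch_candidate(const ov::PartialShape& pshape) {
    if (pshape.rank().is_dynamic() || pshape.size() == 0 || !pshape[0].is_dynamic())
        return false;
    for (size_t i = 1; i < pshape.size(); ++i)
        if (pshape[i].is_dynamic())
            return false;  // only the 0th dim may be dynamic
    return pshape[0].get_max_length() > 1;  // unbounded dims report -1
}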


@@ -317,10 +317,16 @@ ov::Shape layout::get_shape() const {
 }

 tensor layout::get_tensor() const {
-    if (is_dynamic())
-        throw std::runtime_error("[GPU] get_tensor() is called for dynamic shape");
+    OPENVINO_ASSERT(!is_dynamic() || has_upper_bound(), "[GPU] get_tensor() is called for dynamic shape without upper bound");
+    ov::Shape shape;
+    if (is_dynamic() && has_upper_bound()) {
+        for (auto dim : size) {
+            shape.push_back(dim.get_max_length());
+        }
+    } else {
+        shape = size.to_shape();
+    }

-    auto shape = size.to_shape();
     std::vector<tensor::value_type> dims(shape.begin(), shape.end());

     auto rank = std::max(format.dimension(), dims.size());
@@ -360,8 +366,9 @@ void layout::set_partial_shape(const ov::PartialShape& size) {
 }

 tensor layout::get_buffer_size() const {
-    if (is_dynamic())
-        throw std::runtime_error("[GPU] get_buffer_size() is called for dynamic shape");
+    if (is_dynamic() && !has_upper_bound()) {
+        throw std::runtime_error("[GPU] get_buffer_size() is called for dynamic shape");
+    }

     auto t = get_tensor();
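
With this change a bounded dynamic layout has a well-defined worst-case tensor, which is exactly what the allocator paths above rely on. A usage sketch with the layout constructor used in the new unit test below:

// A {1..8, 3, 224, 224} layout is dynamic, but its upper bound is known.
cldnn::layout l(ov::PartialShape{ov::Dimension{1, 8}, 3, 224, 224},
                cldnn::data_types::f32, cldnn::format::bfyx);
// l.is_dynamic() == true, l.has_upper_bound() == true
auto max_tensor = l.get_tensor();   // dimensions at their max lengths: 8x3x224x224
auto worst_case = l.bytes_count();  // worst-case allocation size in bytes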


@@ -128,7 +128,7 @@ allocation_type ocl_engine::detect_usm_allocation_type(const void* memory) const
 }

 memory::ptr ocl_engine::allocate_memory(const layout& layout, allocation_type type, bool reset) {
-    OPENVINO_ASSERT(!layout.is_dynamic(), "[GPU] Can't allocate memory for dynamic layout");
+    OPENVINO_ASSERT(!layout.is_dynamic() || layout.has_upper_bound(), "[GPU] Can't allocate memory for dynamic layout");

     OPENVINO_ASSERT(layout.bytes_count() <= get_device_info().max_alloc_mem_size,
                     "[GPU] Exceeded max size of memory object allocation: ",


@@ -0,0 +1,87 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "test_utils.h"
#include <intel_gpu/primitives/input_layout.hpp>
#include <intel_gpu/primitives/softmax.hpp>
#include <intel_gpu/primitives/data.hpp>
#include "softmax_inst.h"
#include "program_wrapper.h"
#include <cmath>
#include <algorithm>
using namespace cldnn;
using namespace ::tests;
namespace memory_realloc_tests {
TEST(softmax_gpu_dynamic_f32_test_upper_bound, input_same_values) {
    static const int32_t
        output_x_1 = 10, output_b_1 = 8,
        input_x_1 = 10, input_b_1 = 8,
        out_size_1 = output_x_1 * output_b_1,
        output_x_2 = 10, output_b_2 = 4,
        input_x_2 = 10, input_b_2 = 4,
        out_size_2 = output_x_2 * output_b_2;
    cldnn::engine& engine = get_test_engine();

    auto compare_out_buffer_with_expected = [&](float* out_buffer, std::vector<float>& expected_buffer, size_t size) {
        for (size_t i = 0; i < size; ++i) {
            // does output have expected values
            ASSERT_TRUE(are_equal(out_buffer[i], expected_buffer[i]))
                << "At [" << i << "] Expected : " << expected_buffer[i] << " actual : " << out_buffer[i];
        }
    };

    auto in_layout =
        layout(ov::PartialShape{ov::Dimension{1, 10}, ov::Dimension{1, 10}, ov::Dimension{1, 10}, ov::Dimension{1, 10}},
               data_types::f32,
               format::bfyx);
    network network(engine, topology(input_layout("input", in_layout), softmax("softmax", input_info("input"), 3)));

    // First run
    float out_buffer_1[out_size_1];
    std::vector<float> in_b_1(out_size_1, 1.0f);
    std::vector<float> expected_buffer_1(out_size_1, 0.1f);
    cldnn::memory::ptr input_1 = engine.allocate_memory({ data_types::f32, format::bfyx, {input_b_1, 1, input_x_1, 1} });
    set_values(input_1, in_b_1);
    network.set_input_data("input", input_1);
    auto outputs_1 = network.execute();
    auto output_mem_1 = outputs_1.begin()->second.get_memory();
    auto internal_mems_1 = network.get_primitive("softmax")->get_intermediates_memories();
    cldnn::mem_lock<float> output_ptr_1(output_mem_1, get_test_stream());
    for (uint32_t i = 0; i < out_size_1; i++) {
        out_buffer_1[i] = output_ptr_1[i];
    }
    compare_out_buffer_with_expected(out_buffer_1, expected_buffer_1, out_size_1);

    // Second run
    float out_buffer_2[out_size_2];
    std::vector<float> in_b_2(out_size_2, 2.0f);
    std::vector<float> expected_buffer_2(out_size_2, 0.1f);
    cldnn::memory::ptr input_2 = engine.allocate_memory({ data_types::f32, format::bfyx, {input_b_2, 1, input_x_2, 1} });
    set_values(input_2, in_b_2);
    network.set_input_data("input", input_2);
    auto outputs_2 = network.execute();
    auto output_mem_2 = outputs_2.begin()->second.get_memory();
    auto internal_mems_2 = network.get_primitive("softmax")->get_intermediates_memories();
    cldnn::mem_lock<float> output_ptr_2(output_mem_2, get_test_stream());
    for (uint32_t i = 0; i < out_size_2; i++) {
        out_buffer_2[i] = output_ptr_2[i];
    }
    compare_out_buffer_with_expected(out_buffer_2, expected_buffer_2, out_size_2);

    // Check output is not reallocated
    ASSERT_EQ(output_ptr_1.data(), output_ptr_2.data());
    ASSERT_EQ(internal_mems_1.size(), internal_mems_2.size());
    for (size_t i = 0; i < internal_mems_1.size(); ++i) {
        ASSERT_EQ(internal_mems_1[i]->buffer_ptr(), internal_mems_2[i]->buffer_ptr());
    }
}
} // memory_realloc_tests