[GPU] Optimize realloc for dynamic shape (#15169)
* Optimize realloc for dynamic shape with - Pre-aligned alloc for bounded dynamic shape - Reuse internal buffer * - Fix internal buffer of NMS kernel to be reused - Fixed bug in nms quick sort * Additional fix for internal buffer reuse * Fix legacy dynamic batch to be applied only for 0-th dim dynamic shape with upper bound * Fix unit test error * Apply nms fixes of padding -1 to all buffers only when internal buffer is reused * Do not add a separate get_max_tensor, because currently there is no need for that separate API. Currently max tensor is only needed for memory allocation, and there is no need for minimum tensor size for now * Fix allocation of internal buffer to be done for each layout
This commit is contained in:
committed by
GitHub
parent
ce4c082cb2
commit
cd9e772802
@@ -476,6 +476,14 @@ struct layout {
|
||||
|
||||
bool is_dynamic() const;
|
||||
|
||||
bool has_upper_bound() const {
|
||||
for (auto i : size) {
|
||||
if (i.get_max_length() == -1)
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool is_static() const;
|
||||
|
||||
ov::PartialShape get_partial_shape() const;
|
||||
|
||||
@@ -145,23 +145,22 @@ std::string crop_inst::to_string(crop_node const& node) {
|
||||
auto ref_in_sizes = desc->reference_input;
|
||||
const auto& offsets = desc->offsets;
|
||||
const auto in_layout = node.input().get_output_layout();
|
||||
const auto& in_sizes = in_layout.get_tensor();
|
||||
|
||||
auto node_info = node.desc_to_json();
|
||||
|
||||
// Check for borders variant of crop.
|
||||
if (ref_in_sizes.batch[0] < 0 || ref_in_sizes.feature[0] < 0 || ref_in_sizes.spatial[0] < 0 ||
|
||||
ref_in_sizes.spatial[1] < 0 || ref_in_sizes.spatial[2] < 0) {
|
||||
// Ignore not supported dimensions.
|
||||
const auto rb_sizes = ref_in_sizes.negate().sub({0, 0, 0, 0, 0});
|
||||
const auto lt_sizes = offsets.sub({0, 0, 0, 0, 0});
|
||||
|
||||
ref_in_sizes = in_sizes - (rb_sizes + lt_sizes);
|
||||
}
|
||||
|
||||
std::stringstream primitive_description;
|
||||
|
||||
json_composite crop_info;
|
||||
|
||||
if (!in_layout.is_dynamic()) {
|
||||
const auto& in_sizes = in_layout.get_tensor();
|
||||
|
||||
// Check for borders variant of crop.
|
||||
if (ref_in_sizes.batch[0] < 0 || ref_in_sizes.feature[0] < 0 || ref_in_sizes.spatial[0] < 0 ||
|
||||
ref_in_sizes.spatial[1] < 0 || ref_in_sizes.spatial[2] < 0) {
|
||||
// Ignore not supported dimensions.
|
||||
const auto rb_sizes = ref_in_sizes.negate().sub({0, 0, 0, 0, 0});
|
||||
const auto lt_sizes = offsets.sub({0, 0, 0, 0, 0});
|
||||
ref_in_sizes = in_sizes - (rb_sizes + lt_sizes);
|
||||
}
|
||||
}
|
||||
crop_info.add("reference input size", ref_in_sizes.to_string());
|
||||
crop_info.add("offset", offsets.to_string());
|
||||
|
||||
|
||||
@@ -144,6 +144,10 @@ public:
|
||||
params.sort_result_descending = primitive->sort_result_descending;
|
||||
params.box_encoding = primitive->center_point_box ? kernel_selector::BoxEncodingType::BOX_ENCODING_CENTER
|
||||
: kernel_selector::BoxEncodingType::BOX_ENCODING_CORNER;
|
||||
if (impl_param.get_program().get_node(primitive->id).is_dynamic()) {
|
||||
params.reuse_internal_buffer = true;
|
||||
}
|
||||
|
||||
auto& kernel_selector = kernel_selector::non_max_suppression_kernel_selector::Instance();
|
||||
auto best_kernel = kernel_selector.get_best_kernel(params, optional_params);
|
||||
|
||||
|
||||
@@ -276,8 +276,10 @@ protected:
|
||||
bool _is_constant = false;
|
||||
|
||||
size_t max_output_layout_size = 0;
|
||||
std::vector<size_t> max_intermediates_memory_sizes;
|
||||
|
||||
std::vector<memory::ptr> allocate_outputs(kernel_impl_params* updated_params = nullptr);
|
||||
memory::ptr allocate_internal_buffer(size_t idx);
|
||||
static std::vector<std::shared_ptr<primitive_inst>> build_exec_deps(
|
||||
std::vector<std::pair<std::shared_ptr<primitive_inst>, int32_t>> const& mem_deps);
|
||||
void convert_args(const kernel_arguments_data& args, kernel_arguments_data_idx& args_idx) const;
|
||||
@@ -443,8 +445,9 @@ protected:
|
||||
|
||||
private:
|
||||
bool do_allocate_memory(typed_node const& typ_node) {
|
||||
if (typ_node.get_output_layout().is_dynamic())
|
||||
return false;
|
||||
if (typ_node.get_output_layout().is_dynamic() && !typ_node.get_output_layout().has_upper_bound()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (typ_node.template have_user_with_type<concatenation>() && typ_node.get_users().size() == 1 &&
|
||||
typ_node.get_users().front()->can_be_optimized()) { // check if the only user is concat
|
||||
|
||||
@@ -781,14 +781,25 @@ void network::allocate_primitives() {
|
||||
std::sort(nodes_to_allocate.begin(),
|
||||
nodes_to_allocate.end(),
|
||||
[&po](std::shared_ptr<program_node> const& lhs, std::shared_ptr<program_node> const& rhs) {
|
||||
if (rhs->get_output_layout().is_dynamic() && lhs->get_output_layout().is_dynamic())
|
||||
auto lhs_layout = lhs->get_output_layout();
|
||||
auto rhs_layout = rhs->get_output_layout();
|
||||
if (lhs_layout.is_dynamic() && lhs_layout.has_upper_bound()) {
|
||||
lhs_layout.set_tensor(lhs_layout.get_tensor());
|
||||
}
|
||||
if (rhs_layout.is_dynamic() && rhs_layout.has_upper_bound()) {
|
||||
rhs_layout.set_tensor(rhs_layout.get_tensor());
|
||||
}
|
||||
|
||||
if (rhs_layout.is_dynamic() && !rhs_layout.has_upper_bound() && lhs_layout.is_dynamic() && !lhs_layout.has_upper_bound()) {
|
||||
return po.get_processing_number(lhs.get()) < po.get_processing_number(rhs.get());
|
||||
if (rhs->get_output_layout().is_dynamic())
|
||||
}
|
||||
|
||||
if (rhs_layout.is_dynamic())
|
||||
return true;
|
||||
if (lhs->get_output_layout().is_dynamic())
|
||||
if (lhs_layout.is_dynamic())
|
||||
return false;
|
||||
|
||||
return (lhs->get_output_layout().bytes_count() > rhs->get_output_layout().bytes_count());
|
||||
return (lhs_layout.bytes_count() > rhs_layout.bytes_count());
|
||||
});
|
||||
|
||||
for (auto const& node : nodes_to_allocate) {
|
||||
|
||||
@@ -261,7 +261,27 @@ void primitive_inst::realloc_if_needed() {
|
||||
max_output_layout_size = updated_params.output_layouts[0].count();
|
||||
}
|
||||
// intermediate memory allocation is required for primitives consisting of multiple kernels in dynamic case
|
||||
allocate_internal_buffers();
|
||||
{
|
||||
const auto& ibuf_layouts = _impl->get_internal_buffer_layouts();
|
||||
if (ibuf_layouts.empty())
|
||||
return;
|
||||
|
||||
for (size_t i = 0; i < ibuf_layouts.size(); ++i) {
|
||||
if (i < _intermediates_memory.size() && ibuf_layouts[i].bytes_count() <= max_intermediates_memory_sizes[i]) {
|
||||
// can reuse
|
||||
_intermediates_memory[i] = _network.get_engine().reinterpret_buffer(*_intermediates_memory[i], ibuf_layouts[i]);
|
||||
} else {
|
||||
if (i < _intermediates_memory.size()) {
|
||||
_intermediates_memory[i] = allocate_internal_buffer(i);
|
||||
max_intermediates_memory_sizes[i] = _intermediates_memory[i]->size();
|
||||
} else {
|
||||
// i-th layout has not been allocated yet
|
||||
_intermediates_memory.push_back(allocate_internal_buffer(i));
|
||||
max_intermediates_memory_sizes.push_back(_intermediates_memory[i]->size());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void primitive_inst::update_impl() {
|
||||
@@ -584,15 +604,15 @@ primitive_inst::primitive_inst(network& network, program_node const& node, bool
|
||||
}
|
||||
|
||||
if (_outputs[0])
|
||||
max_output_layout_size = _outputs[0]->get_layout().count();
|
||||
max_output_layout_size = _outputs[0]->get_layout().get_tensor().count();
|
||||
}
|
||||
|
||||
void primitive_inst::allocate_internal_buffers(void) {
|
||||
memory::ptr primitive_inst::allocate_internal_buffer(size_t idx) {
|
||||
if (_impl == nullptr || _outputs.empty() || _outputs[0] == nullptr)
|
||||
return;
|
||||
return nullptr;
|
||||
const auto& ibuf_layouts = _impl->get_internal_buffer_layouts();
|
||||
if (ibuf_layouts.empty())
|
||||
return;
|
||||
return nullptr;
|
||||
|
||||
auto device_mem_acc = [&](size_t a, std::pair<std::shared_ptr<primitive_inst>, int32_t> b) {
|
||||
if (!b.first->mem_allocated()) return a;
|
||||
@@ -629,18 +649,30 @@ void primitive_inst::allocate_internal_buffers(void) {
|
||||
}
|
||||
}
|
||||
}
|
||||
// allocate intermediate memory for the updated layout of buffer
|
||||
auto layout = ibuf_layouts[idx];
|
||||
GPU_DEBUG_LOG << "[" << _node->id() << ": internal buf " << idx << "]" << std::endl;
|
||||
auto alloc_type = allocation_type::unknown;
|
||||
if (input_device_mem && (available_device_mem_size - (int64_t)layout.bytes_count() >= 0)) {
|
||||
alloc_type = engine.get_preferred_memory_allocation_type();
|
||||
} else {
|
||||
alloc_type = engine.get_lockable_preferred_memory_allocation_type();
|
||||
}
|
||||
return engine.allocate_memory(layout, alloc_type);
|
||||
}
|
||||
|
||||
void primitive_inst::allocate_internal_buffers(void) {
|
||||
if (_impl == nullptr || _outputs.empty() || _outputs[0] == nullptr)
|
||||
return;
|
||||
const auto& ibuf_layouts = _impl->get_internal_buffer_layouts();
|
||||
if (ibuf_layouts.empty())
|
||||
return;
|
||||
|
||||
// allocate intermediate memory for the updated layout of buffer
|
||||
std::vector<memory::cptr> intermediates_memory;
|
||||
for (auto layout : ibuf_layouts) {
|
||||
GPU_DEBUG_LOG << "[" << _node->id() << ": internal buf]" << std::endl;
|
||||
auto alloc_type = allocation_type::unknown;
|
||||
if (input_device_mem && (available_device_mem_size - (int64_t)layout.bytes_count() >= 0)) {
|
||||
alloc_type = engine.get_preferred_memory_allocation_type();
|
||||
} else {
|
||||
alloc_type = engine.get_lockable_preferred_memory_allocation_type();
|
||||
}
|
||||
intermediates_memory.push_back(engine.allocate_memory(layout, alloc_type));
|
||||
for (size_t i = 0; i < ibuf_layouts.size(); ++i) {
|
||||
intermediates_memory.push_back(allocate_internal_buffer(i));
|
||||
max_intermediates_memory_sizes.push_back(intermediates_memory[i]->size());
|
||||
}
|
||||
_intermediates_memory = intermediates_memory;
|
||||
}
|
||||
@@ -747,13 +779,16 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool,
|
||||
uint32_t net_id, bool is_internal, size_t idx) {
|
||||
auto get_memory_from_pool = [&](engine& _engine, const layout& layout, const primitive_id id, std::set<primitive_id> dependencies,
|
||||
allocation_type type, bool reusable) {
|
||||
OPENVINO_ASSERT(!layout.is_dynamic() || layout.has_upper_bound(), "[GPU] Can't allocate output for dynamic layout without upper bound");
|
||||
// Use layout with max tensor for dynamic shape with upper bound
|
||||
auto static_layout = cldnn::layout(layout.data_type, layout.format, layout.get_tensor(), layout.data_padding);
|
||||
if (_node.get_program().get_config().get_property(ov::intel_gpu::enable_memory_pool))
|
||||
return pool.get_memory(layout, id, net_id, dependencies, type, reusable);
|
||||
return pool.get_memory(layout, type);
|
||||
return pool.get_memory(static_layout, id, net_id, dependencies, type, reusable);
|
||||
return pool.get_memory(static_layout, type);
|
||||
};
|
||||
|
||||
auto layout = impl_params.get_output_layout(idx);
|
||||
OPENVINO_ASSERT(layout.is_static(), "[GPU] Can't allocate output for dynamic layout");
|
||||
OPENVINO_ASSERT(layout.is_static() || layout.has_upper_bound(), "[GPU] Can't allocate output for dynamic layout");
|
||||
auto device_mem_acc = [&](size_t a, const cldnn::layout& l) {
|
||||
// Input shape may be dynamic in some cases (shape_of). It means that output shape of node doesn't depend on input shape
|
||||
// and out memory can be allocated on program build stage.
|
||||
|
||||
@@ -478,14 +478,21 @@ KERNEL (non_max_suppression_ref_stage_2)(
|
||||
if (convert_float(next_candidate.score) > SCORE_THRESHOLD_VAL) {
|
||||
--i;
|
||||
sortedBoxList[i] = next_candidate;
|
||||
FUNC_CALL(quickSortIterative)(sortedBoxList, i, kSortedBoxNum);
|
||||
FUNC_CALL(quickSortIterative)(sortedBoxList, i, kSortedBoxNum - 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Set pad value to indicate the end of selected box list.
|
||||
if (selectedBoxNum < NUM_BOXES) {
|
||||
selectedBoxList[selectedBoxNum].batchId = -1;
|
||||
int b = selectedBoxNum;
|
||||
#ifdef REUSE_INTERNAL_BUFFER
|
||||
for (; b < NUM_BOXES; ++b) {
|
||||
selectedBoxList[b].batchId = -1;
|
||||
}
|
||||
#else
|
||||
selectedBoxList[b].batchId = -1;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
#endif /* NMS_STAGE_2 */
|
||||
|
||||
@@ -292,6 +292,8 @@ KernelsData NonMaxSuppressionKernelRef::GetKernelsData(const Params& params, con
|
||||
, MakeJitConstant("LOCAL_CLASS_NUM", dispatchData.lws[1])
|
||||
, MakeJitConstant("LOCAL_WORK_NUM", dispatchData.lws[2])
|
||||
, MakeJitConstant("PARTITION_STEP", GetPartitionStep(static_cast<int>(dispatchData.lws[2])))});
|
||||
} else if (i == 2 && orgParams.reuse_internal_buffer) {
|
||||
cldnn_jit.AddConstant({ MakeJitConstant("REUSE_INTERNAL_BUFFER", 1)});
|
||||
}
|
||||
cldnn_jit.AddConstant(MakeJitConstant("NMS_STAGE_" + std::to_string(i), "true"));
|
||||
|
||||
|
||||
@@ -41,6 +41,7 @@ struct non_max_suppression_params : public base_params {
|
||||
bool has_second_output;
|
||||
bool has_third_output;
|
||||
bool use_multiple_outputs;
|
||||
bool reuse_internal_buffer = false;
|
||||
|
||||
uint32_t GetIndexNumSelectPerClass() const {
|
||||
uint32_t input_idx = 2;
|
||||
|
||||
@@ -75,51 +75,45 @@ bool Program::IsDynBatchModel(const std::shared_ptr<ov::Model>& model,
|
||||
return false;
|
||||
}
|
||||
ov::PartialShape pshape = param->get_output_partial_shape(0);
|
||||
int dynCount = 0;
|
||||
int64_t batch_idx = -1;
|
||||
for (size_t i = 0; i < pshape.size(); i++) {
|
||||
bool only_batch_dynamic = pshape.size() && pshape[0].is_dynamic();
|
||||
for (size_t i = 1; i < pshape.size(); i++) {
|
||||
if (pshape[i].is_dynamic()) {
|
||||
dynCount++;
|
||||
if (batch_idx < 0) {
|
||||
batch_idx = i;
|
||||
}
|
||||
// only support 0th dimension for legacy dynamic batch
|
||||
return false;
|
||||
}
|
||||
}
|
||||
switch (dynCount) {
|
||||
case 1:
|
||||
// exactly one dynamic dim
|
||||
{
|
||||
int64_t max_b = pshape[batch_idx].get_max_length();
|
||||
if (max_b > 1) {
|
||||
batch_dim[pname].first = batch_idx;
|
||||
batch_dim[pname].second = max_b;
|
||||
pshape[batch_idx] = 1;
|
||||
}
|
||||
}
|
||||
case 0:
|
||||
// no dynamic dims - possible legacy case
|
||||
shapes[pname] = pshape;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
if (only_batch_dynamic) {
|
||||
int64_t max_b = pshape[0].get_max_length();
|
||||
if (max_b > 1) {
|
||||
batch_dim[pname].first = 0;
|
||||
batch_dim[pname].second = max_b;
|
||||
pshape[0] = 1;
|
||||
} else {
|
||||
// unbounded dynamic shape should be handled with new dynamic shape path
|
||||
return false;
|
||||
}
|
||||
}
|
||||
shapes[pname] = pshape;
|
||||
}
|
||||
if (batch_dim.empty())
|
||||
return false;
|
||||
|
||||
bool dyn_shape_batch_found = false;
|
||||
// detect 1st dyn dim, mark it and continue
|
||||
auto bitr = batch_dim.begin();
|
||||
dyn_shape_batch_found = bitr->second.first >= 0;
|
||||
dyn_shape_batch_found = (bitr->second.first == 0);
|
||||
auto batch_val_1st = bitr->second.second;
|
||||
bitr++;
|
||||
for (; bitr != batch_dim.end(); bitr++) {
|
||||
if (bitr->second.first >= 0) {
|
||||
if (bitr->second.first == 0) {
|
||||
if (bitr->second.second != batch_val_1st) {
|
||||
dyn_shape_batch_found = false;
|
||||
break;
|
||||
} else {
|
||||
dyn_shape_batch_found = true;
|
||||
}
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return dyn_shape_batch_found;
|
||||
|
||||
@@ -317,10 +317,16 @@ ov::Shape layout::get_shape() const {
|
||||
}
|
||||
|
||||
tensor layout::get_tensor() const {
|
||||
if (is_dynamic())
|
||||
throw std::runtime_error("[GPU] get_tensor() is called for dynamic shape");
|
||||
OPENVINO_ASSERT(!is_dynamic() || has_upper_bound(), "[GPU] get_tensor() is called for dynamic shape without upper bound");
|
||||
ov::Shape shape;
|
||||
if (is_dynamic() && has_upper_bound()) {
|
||||
for (auto dim : size) {
|
||||
shape.push_back(dim.get_max_length());
|
||||
}
|
||||
} else {
|
||||
shape = size.to_shape();
|
||||
}
|
||||
|
||||
auto shape = size.to_shape();
|
||||
std::vector<tensor::value_type> dims(shape.begin(), shape.end());
|
||||
|
||||
auto rank = std::max(format.dimension(), dims.size());
|
||||
@@ -360,8 +366,9 @@ void layout::set_partial_shape(const ov::PartialShape& size) {
|
||||
}
|
||||
|
||||
tensor layout::get_buffer_size() const {
|
||||
if (is_dynamic())
|
||||
throw std::runtime_error("[GPU] get_buffer_size() is called for dynamic shape");
|
||||
if (is_dynamic() && !has_upper_bound()) {
|
||||
throw std::runtime_error("[GPU] get_buffer_size() is called for dynamic shape");
|
||||
}
|
||||
|
||||
auto t = get_tensor();
|
||||
|
||||
|
||||
@@ -128,7 +128,7 @@ allocation_type ocl_engine::detect_usm_allocation_type(const void* memory) const
|
||||
}
|
||||
|
||||
memory::ptr ocl_engine::allocate_memory(const layout& layout, allocation_type type, bool reset) {
|
||||
OPENVINO_ASSERT(!layout.is_dynamic(), "[GPU] Can't allocate memory for dynamic layout");
|
||||
OPENVINO_ASSERT(!layout.is_dynamic() || layout.has_upper_bound(), "[GPU] Can't allocate memory for dynamic layout");
|
||||
|
||||
OPENVINO_ASSERT(layout.bytes_count() <= get_device_info().max_alloc_mem_size,
|
||||
"[GPU] Exceeded max size of memory object allocation: ",
|
||||
|
||||
@@ -0,0 +1,87 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "test_utils.h"
|
||||
|
||||
#include <intel_gpu/primitives/input_layout.hpp>
|
||||
#include <intel_gpu/primitives/softmax.hpp>
|
||||
#include <intel_gpu/primitives/data.hpp>
|
||||
|
||||
#include "softmax_inst.h"
|
||||
|
||||
#include "program_wrapper.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <algorithm>
|
||||
|
||||
using namespace cldnn;
|
||||
using namespace ::tests;
|
||||
|
||||
namespace memory_realloc_tests {
|
||||
|
||||
TEST(softmax_gpu_dynamic_f32_test_upper_bound, input_same_values) {
|
||||
static const int32_t
|
||||
output_x_1 = 10, output_b_1 = 8,
|
||||
input_x_1 = 10, input_b_1 = 8,
|
||||
out_size_1 = output_x_1 * output_b_1,
|
||||
output_x_2 = 10, output_b_2 = 4,
|
||||
input_x_2 = 10, input_b_2 = 4,
|
||||
out_size_2 = output_x_2 * output_b_2;
|
||||
|
||||
cldnn::engine& engine = get_test_engine();
|
||||
|
||||
auto compare_out_buffer_with_expected = [&](float* out_buffer, std::vector<float>& expected_buffer, size_t size) {
|
||||
for(size_t i = 0; i < size; ++i) {
|
||||
// does output have expected values
|
||||
ASSERT_TRUE(are_equal(out_buffer[i], expected_buffer[i]))
|
||||
<< "At ["<< i << "] Expected : " << expected_buffer[i] << " actual : " << out_buffer[i];
|
||||
}
|
||||
};
|
||||
auto in_layout =
|
||||
layout(ov::PartialShape{ov::Dimension{1, 10}, ov::Dimension{1, 10}, ov::Dimension{1, 10}, ov::Dimension{1, 10}},
|
||||
data_types::f32,
|
||||
format::bfyx);
|
||||
network network(engine, topology(input_layout("input", in_layout), softmax("softmax", input_info("input"), 3)));
|
||||
|
||||
// First run
|
||||
float out_buffer_1[out_size_1];
|
||||
std::vector<float> in_b_1(out_size_1, 1.0f);
|
||||
std::vector<float> expected_buffer_1(out_size_1, 0.1f);
|
||||
cldnn::memory::ptr input_1 = engine.allocate_memory({ data_types::f32, format::bfyx, {input_b_1, 1, input_x_1, 1}});
|
||||
set_values(input_1, in_b_1);
|
||||
network.set_input_data("input", input_1);
|
||||
|
||||
auto outputs_1 = network.execute();
|
||||
auto output_mem_1 = outputs_1.begin()->second.get_memory();
|
||||
auto internal_mems_1 = network.get_primitive("softmax")->get_intermediates_memories();
|
||||
cldnn::mem_lock<float> output_ptr_1(output_mem_1, get_test_stream());
|
||||
for (uint32_t i = 0; i < out_size_1; i++) {
|
||||
out_buffer_1[i] = output_ptr_1[i];
|
||||
}
|
||||
compare_out_buffer_with_expected(out_buffer_1, expected_buffer_1, out_size_1);
|
||||
|
||||
// Second run
|
||||
float out_buffer_2[out_size_2];
|
||||
std::vector<float> in_b_2(out_size_2, 2.0f);
|
||||
std::vector<float> expected_buffer_2(out_size_2, 0.1f);
|
||||
cldnn::memory::ptr input_2 = engine.allocate_memory({ data_types::f32, format::bfyx, {input_b_2, 1, input_x_2, 1}});
|
||||
set_values(input_2, in_b_2);
|
||||
network.set_input_data("input", input_2);
|
||||
auto outputs_2 = network.execute();
|
||||
auto output_mem_2 = outputs_2.begin()->second.get_memory();
|
||||
auto internal_mems_2 = network.get_primitive("softmax")->get_intermediates_memories();
|
||||
cldnn::mem_lock<float> output_ptr_2(output_mem_2, get_test_stream());
|
||||
for (uint32_t i = 0; i < out_size_2; i++) {
|
||||
out_buffer_2[i] = output_ptr_2[i];
|
||||
}
|
||||
compare_out_buffer_with_expected(out_buffer_2, expected_buffer_2, out_size_2);
|
||||
|
||||
// Check output is not reallocated
|
||||
ASSERT_EQ(output_ptr_1.data(), output_ptr_2.data());
|
||||
ASSERT_EQ(internal_mems_1.size(), internal_mems_2.size());
|
||||
for (size_t i = 0; i < internal_mems_1.size(); ++i) {
|
||||
ASSERT_EQ(internal_mems_1[i]->buffer_ptr(), internal_mems_2[i]->buffer_ptr());
|
||||
}
|
||||
}
|
||||
} // memory_realloc_tests
|
||||
Reference in New Issue
Block a user