Disable redundant reset for internal buffer (#18447)

This commit is contained in:
Taylor Yeonbok Lee
2023-07-11 17:00:11 -07:00
committed by GitHub
parent 0927e867b0
commit 8f513002b6
3 changed files with 28 additions and 10 deletions

View File

@@ -233,7 +233,7 @@ public:
bool needs_completion_event() const { return _needs_completion_event; }
bool has_unfused_subgraph() const { return (_unfused_subgraph != nullptr); }
bool has_inner_networks() const;
void allocate_internal_buffers();
void allocate_internal_buffers(bool reset = true);
static memory::ptr allocate_output(engine& engine, memory_pool& pool, const program_node& _node, const kernel_impl_params& impl_params, uint32_t net_id,
bool is_internal, size_t idx = 0, bool reset_mem = true, bool is_output_buffer = false, memory* curr_memory = nullptr, bool runtime_alloc = false);
@@ -339,7 +339,7 @@ protected:
std::vector<size_t> max_intermediates_memory_sizes;
std::vector<memory::ptr> allocate_outputs(kernel_impl_params* updated_params = nullptr, bool reset_mem = true, bool runtime_alloc = false);
memory::ptr allocate_internal_buffer(size_t idx);
memory::ptr allocate_internal_buffer(size_t idx, bool reset = true);
static std::vector<std::shared_ptr<primitive_inst>> build_exec_deps(
std::vector<std::pair<std::shared_ptr<primitive_inst>, int32_t>> const& mem_deps);
int32_t get_index_in_deps(memory::cptr arg) const;

View File

@@ -379,6 +379,7 @@ event::ptr primitive_inst::realloc_if_needed() {
// TODO : need to handle multiple outputs
max_output_layout_size = updated_params.output_layouts[0].count();
}
_mem_allocated = true;
// Intermediate memory allocation is required for primitives consisting of multiple kernels in the dynamic case
{
if (_impl == nullptr)
@@ -392,12 +393,15 @@ event::ptr primitive_inst::realloc_if_needed() {
// can reuse
_intermediates_memory[i] = _network.get_engine().reinterpret_buffer(*_intermediates_memory[i], ibuf_layouts[i]);
} else {
// TODO: If a kernel ever requires its internal buffers to be reset,
// we'll need an additional handle for that purpose, similar to need_reset_output_memory
bool need_reset = false;
if (i < _intermediates_memory.size()) {
_intermediates_memory[i] = allocate_internal_buffer(i);
_intermediates_memory[i] = allocate_internal_buffer(i, need_reset);
max_intermediates_memory_sizes[i] = _intermediates_memory[i]->size();
} else {
// i-th layout has not been allocated yet
_intermediates_memory.push_back(allocate_internal_buffer(i));
_intermediates_memory.push_back(allocate_internal_buffer(i, need_reset));
max_intermediates_memory_sizes.push_back(_intermediates_memory[i]->size());
}
}
@@ -879,7 +883,7 @@ primitive_inst::primitive_inst(network& network, program_node const& node, bool
max_output_layout_size = _outputs[0]->get_layout().get_tensor().count();
}
memory::ptr primitive_inst::allocate_internal_buffer(size_t idx) {
memory::ptr primitive_inst::allocate_internal_buffer(size_t idx, bool reset) {
if (_impl == nullptr || _outputs.empty() || _outputs[0] == nullptr)
return nullptr;
const auto& ibuf_layouts = _impl->get_internal_buffer_layouts();
@@ -925,15 +929,20 @@ memory::ptr primitive_inst::allocate_internal_buffer(size_t idx) {
auto layout = ibuf_layouts[idx];
GPU_DEBUG_LOG << "[" << _node->id() << ": internal buf " << idx << "]" << std::endl;
auto alloc_type = allocation_type::unknown;
if (input_device_mem && (available_device_mem_size - (int64_t)layout.bytes_count() >= 0)) {
if (input_device_mem && ((int64_t) available_device_mem_size - (int64_t)layout.bytes_count() >= 0)) {
GPU_DEBUG_LOG << " input is device mem and available device mem size (" << available_device_mem_size
<< ") > requested memory (" << layout.bytes_count() << " )" << std::endl;
alloc_type = engine.get_preferred_memory_allocation_type();
} else {
GPU_DEBUG_LOG << " input is not device mem or available device mem size ("
<< available_device_mem_size << ") <= requested memory (" << layout.bytes_count() << " )" << std::endl;
alloc_type = engine.get_lockable_preferred_memory_allocation_type();
}
return engine.allocate_memory(layout, alloc_type);
GPU_DEBUG_LOG << "=> allocate to " << alloc_type << std::endl;
return engine.allocate_memory(layout, alloc_type, reset);
}
void primitive_inst::allocate_internal_buffers(void) {
void primitive_inst::allocate_internal_buffers(bool reset) {
if (_impl == nullptr || _outputs.empty() || _outputs[0] == nullptr)
return;
const auto& ibuf_layouts = _impl->get_internal_buffer_layouts();
@@ -945,7 +954,7 @@ void primitive_inst::allocate_internal_buffers(void) {
for (size_t i = 0; i < ibuf_layouts.size(); ++i) {
if (ibuf_layouts[i].get_linear_size() == 0)
continue;
intermediates_memory.push_back(allocate_internal_buffer(i));
intermediates_memory.push_back(allocate_internal_buffer(i, reset));
max_intermediates_memory_sizes.push_back(intermediates_memory[i]->size());
}
_intermediates_memory = intermediates_memory;

View File

@@ -146,7 +146,13 @@ TEST(softmax_gpu_dynamic_f32_test_upper_bound, input_same_values) {
layout(ov::PartialShape{ov::Dimension{1, 10}, ov::Dimension{1, 10}, ov::Dimension{1, 10}, ov::Dimension{1, 10}},
data_types::f32,
format::bfyx);
network network(engine, topology(input_layout("input", in_layout), softmax("softmax", input_info("input"), 3)), get_test_default_config(engine));
auto config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
network network(engine, topology(input_layout("input", in_layout),
reorder("reorder", input_info("input"), format::bfyx, data_types::f16),
softmax("softmax", input_info("reorder"), 3),
reorder("reorder2", input_info("softmax"), format::bfyx, data_types::f32)),
config);
// First run
float out_buffer_1[out_size_1];
@@ -186,6 +192,9 @@ TEST(softmax_gpu_dynamic_f32_test_upper_bound, input_same_values) {
ASSERT_EQ(internal_mems_1.size(), internal_mems_2.size());
for (size_t i = 0; i < internal_mems_1.size(); ++i) {
ASSERT_EQ(internal_mems_1[i]->buffer_ptr(), internal_mems_2[i]->buffer_ptr());
if (engine.get_device_info().supports_immad) {
ASSERT_EQ(internal_mems_1[i]->get_allocation_type(), allocation_type::usm_device);
}
}
}