Disable redundant reset for internal buffer (#18447)
This commit is contained in:
committed by
GitHub
parent
0927e867b0
commit
8f513002b6
@@ -233,7 +233,7 @@ public:
|
||||
// Returns the current value of the _needs_completion_event flag.
bool needs_completion_event() const { return _needs_completion_event; }
|
||||
// Returns true when _unfused_subgraph is set (i.e. is not nullptr).
bool has_unfused_subgraph() const { return (_unfused_subgraph != nullptr); }
|
||||
bool has_inner_networks() const;
|
||||
void allocate_internal_buffers();
|
||||
void allocate_internal_buffers(bool reset = true);
|
||||
static memory::ptr allocate_output(engine& engine, memory_pool& pool, const program_node& _node, const kernel_impl_params& impl_params, uint32_t net_id,
|
||||
bool is_internal, size_t idx = 0, bool reset_mem = true, bool is_output_buffer = false, memory* curr_memory = nullptr, bool runtime_alloc = false);
|
||||
|
||||
@@ -339,7 +339,7 @@ protected:
|
||||
std::vector<size_t> max_intermediates_memory_sizes;
|
||||
|
||||
std::vector<memory::ptr> allocate_outputs(kernel_impl_params* updated_params = nullptr, bool reset_mem = true, bool runtime_alloc = false);
|
||||
memory::ptr allocate_internal_buffer(size_t idx);
|
||||
memory::ptr allocate_internal_buffer(size_t idx, bool reset = true);
|
||||
static std::vector<std::shared_ptr<primitive_inst>> build_exec_deps(
|
||||
std::vector<std::pair<std::shared_ptr<primitive_inst>, int32_t>> const& mem_deps);
|
||||
int32_t get_index_in_deps(memory::cptr arg) const;
|
||||
|
||||
@@ -379,6 +379,7 @@ event::ptr primitive_inst::realloc_if_needed() {
|
||||
// TODO : need to handle multiple outputs
|
||||
max_output_layout_size = updated_params.output_layouts[0].count();
|
||||
}
|
||||
_mem_allocated = true;
|
||||
// Intermediate memory allocation is required for primitives that consist of multiple kernels in the dynamic case
|
||||
{
|
||||
if (_impl == nullptr)
|
||||
@@ -392,12 +393,15 @@ event::ptr primitive_inst::realloc_if_needed() {
|
||||
// can reuse
|
||||
_intermediates_memory[i] = _network.get_engine().reinterpret_buffer(*_intermediates_memory[i], ibuf_layouts[i]);
|
||||
} else {
|
||||
// TODO: If a kernel ever requires its internal buffer to be reset,
|
||||
// we will need an additional flag for that purpose, similar to need_reset_output_memory
|
||||
bool need_reset = false;
|
||||
if (i < _intermediates_memory.size()) {
|
||||
_intermediates_memory[i] = allocate_internal_buffer(i);
|
||||
_intermediates_memory[i] = allocate_internal_buffer(i, need_reset);
|
||||
max_intermediates_memory_sizes[i] = _intermediates_memory[i]->size();
|
||||
} else {
|
||||
// i-th layout has not been allocated yet
|
||||
_intermediates_memory.push_back(allocate_internal_buffer(i));
|
||||
_intermediates_memory.push_back(allocate_internal_buffer(i, need_reset));
|
||||
max_intermediates_memory_sizes.push_back(_intermediates_memory[i]->size());
|
||||
}
|
||||
}
|
||||
@@ -879,7 +883,7 @@ primitive_inst::primitive_inst(network& network, program_node const& node, bool
|
||||
max_output_layout_size = _outputs[0]->get_layout().get_tensor().count();
|
||||
}
|
||||
|
||||
memory::ptr primitive_inst::allocate_internal_buffer(size_t idx) {
|
||||
memory::ptr primitive_inst::allocate_internal_buffer(size_t idx, bool reset) {
|
||||
if (_impl == nullptr || _outputs.empty() || _outputs[0] == nullptr)
|
||||
return nullptr;
|
||||
const auto& ibuf_layouts = _impl->get_internal_buffer_layouts();
|
||||
@@ -925,15 +929,20 @@ memory::ptr primitive_inst::allocate_internal_buffer(size_t idx) {
|
||||
auto layout = ibuf_layouts[idx];
|
||||
GPU_DEBUG_LOG << "[" << _node->id() << ": internal buf " << idx << "]" << std::endl;
|
||||
auto alloc_type = allocation_type::unknown;
|
||||
if (input_device_mem && (available_device_mem_size - (int64_t)layout.bytes_count() >= 0)) {
|
||||
if (input_device_mem && ((int64_t) available_device_mem_size - (int64_t)layout.bytes_count() >= 0)) {
|
||||
GPU_DEBUG_LOG << " input is device mem and available device mem size (" << available_device_mem_size
|
||||
<< ") > requested memory (" << layout.bytes_count() << " )" << std::endl;
|
||||
alloc_type = engine.get_preferred_memory_allocation_type();
|
||||
} else {
|
||||
GPU_DEBUG_LOG << " input is not device mem or available device mem size ("
|
||||
<< available_device_mem_size << ") <= requested memory (" << layout.bytes_count() << " )" << std::endl;
|
||||
alloc_type = engine.get_lockable_preferred_memory_allocation_type();
|
||||
}
|
||||
return engine.allocate_memory(layout, alloc_type);
|
||||
GPU_DEBUG_LOG << "=> allocate to " << alloc_type << std::endl;
|
||||
return engine.allocate_memory(layout, alloc_type, reset);
|
||||
}
|
||||
|
||||
void primitive_inst::allocate_internal_buffers(void) {
|
||||
void primitive_inst::allocate_internal_buffers(bool reset) {
|
||||
if (_impl == nullptr || _outputs.empty() || _outputs[0] == nullptr)
|
||||
return;
|
||||
const auto& ibuf_layouts = _impl->get_internal_buffer_layouts();
|
||||
@@ -945,7 +954,7 @@ void primitive_inst::allocate_internal_buffers(void) {
|
||||
for (size_t i = 0; i < ibuf_layouts.size(); ++i) {
|
||||
if (ibuf_layouts[i].get_linear_size() == 0)
|
||||
continue;
|
||||
intermediates_memory.push_back(allocate_internal_buffer(i));
|
||||
intermediates_memory.push_back(allocate_internal_buffer(i, reset));
|
||||
max_intermediates_memory_sizes.push_back(intermediates_memory[i]->size());
|
||||
}
|
||||
_intermediates_memory = intermediates_memory;
|
||||
|
||||
@@ -146,7 +146,13 @@ TEST(softmax_gpu_dynamic_f32_test_upper_bound, input_same_values) {
|
||||
layout(ov::PartialShape{ov::Dimension{1, 10}, ov::Dimension{1, 10}, ov::Dimension{1, 10}, ov::Dimension{1, 10}},
|
||||
data_types::f32,
|
||||
format::bfyx);
|
||||
network network(engine, topology(input_layout("input", in_layout), softmax("softmax", input_info("input"), 3)), get_test_default_config(engine));
|
||||
auto config = get_test_default_config(engine);
|
||||
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
|
||||
network network(engine, topology(input_layout("input", in_layout),
|
||||
reorder("reorder", input_info("input"), format::bfyx, data_types::f16),
|
||||
softmax("softmax", input_info("reorder"), 3),
|
||||
reorder("reorder2", input_info("softmax"), format::bfyx, data_types::f32)),
|
||||
config);
|
||||
|
||||
// First run
|
||||
float out_buffer_1[out_size_1];
|
||||
@@ -186,6 +192,9 @@ TEST(softmax_gpu_dynamic_f32_test_upper_bound, input_same_values) {
|
||||
ASSERT_EQ(internal_mems_1.size(), internal_mems_2.size());
|
||||
for (size_t i = 0; i < internal_mems_1.size(); ++i) {
|
||||
ASSERT_EQ(internal_mems_1[i]->buffer_ptr(), internal_mems_2[i]->buffer_ptr());
|
||||
if (engine.get_device_info().supports_immad) {
|
||||
ASSERT_EQ(internal_mems_1[i]->get_allocation_type(), allocation_type::usm_device);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user