Disable redundant reset for internal buffer (#18447)

2023-07-11 17:00:11 -07:00
parent 0927e867b0
commit 8f513002b6
3 changed files with 28 additions and 10 deletions
--- a/src/plugins/intel_gpu/src/graph/include/primitive_inst.h
+++ b/src/plugins/intel_gpu/src/graph/include/primitive_inst.h
@@ -233,7 +233,7 @@ public:
    bool needs_completion_event() const { return _needs_completion_event; }
    bool has_unfused_subgraph() const { return (_unfused_subgraph != nullptr); }
    bool has_inner_networks() const;
-    void allocate_internal_buffers();
+    void allocate_internal_buffers(bool reset = true);
    static memory::ptr allocate_output(engine& engine, memory_pool& pool, const program_node& _node, const kernel_impl_params& impl_params, uint32_t net_id,
            bool is_internal, size_t idx = 0, bool reset_mem = true, bool is_output_buffer = false, memory* curr_memory = nullptr, bool runtime_alloc = false);

@@ -339,7 +339,7 @@ protected:
    std::vector<size_t> max_intermediates_memory_sizes;

    std::vector<memory::ptr> allocate_outputs(kernel_impl_params* updated_params = nullptr, bool reset_mem = true, bool runtime_alloc = false);
-    memory::ptr allocate_internal_buffer(size_t idx);
+    memory::ptr allocate_internal_buffer(size_t idx, bool reset = true);
    static std::vector<std::shared_ptr<primitive_inst>> build_exec_deps(
        std::vector<std::pair<std::shared_ptr<primitive_inst>, int32_t>> const& mem_deps);
    int32_t get_index_in_deps(memory::cptr arg) const;
--- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
+++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
@@ -379,6 +379,7 @@ event::ptr primitive_inst::realloc_if_needed() {
        // TODO : need to handle multiple outputs
        max_output_layout_size = updated_params.output_layouts[0].count();
    }
+    _mem_allocated = true;
    // intermediate memory allocation is required for primitives consisting of multiple kernels in dynamic case
    {
        if (_impl == nullptr)
@@ -392,12 +393,15 @@ event::ptr primitive_inst::realloc_if_needed() {
                // can reuse
                _intermediates_memory[i] = _network.get_engine().reinterpret_buffer(*_intermediates_memory[i], ibuf_layouts[i]);
            } else {
+                // TODO: If a future kernel requires its internal buffer to be reset,
+                // we'll need additional handling for that purpose, similar to need_reset_output_memory.
+                bool need_reset = false;
                if (i < _intermediates_memory.size()) {
-                    _intermediates_memory[i] = allocate_internal_buffer(i);
+                    _intermediates_memory[i] = allocate_internal_buffer(i, need_reset);
                    max_intermediates_memory_sizes[i] = _intermediates_memory[i]->size();
                } else {
                    // i-th layout has not been allocated yet
-                    _intermediates_memory.push_back(allocate_internal_buffer(i));
+                    _intermediates_memory.push_back(allocate_internal_buffer(i, need_reset));
                    max_intermediates_memory_sizes.push_back(_intermediates_memory[i]->size());
                }
            }
@@ -879,7 +883,7 @@ primitive_inst::primitive_inst(network& network, program_node const& node, bool
        max_output_layout_size = _outputs[0]->get_layout().get_tensor().count();
 }

-memory::ptr primitive_inst::allocate_internal_buffer(size_t idx) {
+memory::ptr primitive_inst::allocate_internal_buffer(size_t idx, bool reset) {
    if (_impl == nullptr || _outputs.empty() || _outputs[0] == nullptr)
        return nullptr;
    const auto& ibuf_layouts = _impl->get_internal_buffer_layouts();
@@ -925,15 +929,20 @@ memory::ptr primitive_inst::allocate_internal_buffer(size_t idx) {
    auto layout = ibuf_layouts[idx];
    GPU_DEBUG_LOG << "[" << _node->id() << ": internal buf " << idx << "]" << std::endl;
    auto alloc_type = allocation_type::unknown;
-    if (input_device_mem && (available_device_mem_size - (int64_t)layout.bytes_count() >= 0)) {
+    if (input_device_mem && ((int64_t) available_device_mem_size - (int64_t)layout.bytes_count() >= 0)) {
+        GPU_DEBUG_LOG << " input is device mem and available device mem size (" << available_device_mem_size
+                      << ") > requested memory (" << layout.bytes_count() << " )" << std::endl;
        alloc_type = engine.get_preferred_memory_allocation_type();
    } else {
+        GPU_DEBUG_LOG << " input is not device mem or available device mem size ("
+                      << available_device_mem_size << ") <= requested memory (" << layout.bytes_count() << " )" << std::endl;
        alloc_type = engine.get_lockable_preferred_memory_allocation_type();
    }
-    return engine.allocate_memory(layout, alloc_type);
+    GPU_DEBUG_LOG << "=> allocate to " << alloc_type << std::endl;
+    return engine.allocate_memory(layout, alloc_type, reset);
 }

-void primitive_inst::allocate_internal_buffers(void) {
+void primitive_inst::allocate_internal_buffers(bool reset) {
    if (_impl == nullptr || _outputs.empty() || _outputs[0] == nullptr)
        return;
    const auto& ibuf_layouts = _impl->get_internal_buffer_layouts();
@@ -945,7 +954,7 @@ void primitive_inst::allocate_internal_buffers(void) {
    for (size_t i = 0; i < ibuf_layouts.size(); ++i) {
        if (ibuf_layouts[i].get_linear_size() == 0)
            continue;
-        intermediates_memory.push_back(allocate_internal_buffer(i));
+        intermediates_memory.push_back(allocate_internal_buffer(i, reset));
        max_intermediates_memory_sizes.push_back(intermediates_memory[i]->size());
    }
    _intermediates_memory = intermediates_memory;
--- a/src/plugins/intel_gpu/tests/unit/dynamic_execution/memory_realloc_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/dynamic_execution/memory_realloc_test.cpp
@@ -146,7 +146,13 @@ TEST(softmax_gpu_dynamic_f32_test_upper_bound, input_same_values) {
        layout(ov::PartialShape{ov::Dimension{1, 10}, ov::Dimension{1, 10}, ov::Dimension{1, 10}, ov::Dimension{1, 10}},
               data_types::f32,
               format::bfyx);
-    network network(engine, topology(input_layout("input", in_layout), softmax("softmax", input_info("input"), 3)), get_test_default_config(engine));
+    auto config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+    network network(engine, topology(input_layout("input", in_layout),
+                                     reorder("reorder", input_info("input"), format::bfyx, data_types::f16),
+                                     softmax("softmax", input_info("reorder"), 3),
+                                     reorder("reorder2", input_info("softmax"), format::bfyx, data_types::f32)),
+                                     config);

    // First run
    float out_buffer_1[out_size_1];
@@ -186,6 +192,9 @@ TEST(softmax_gpu_dynamic_f32_test_upper_bound, input_same_values) {
    ASSERT_EQ(internal_mems_1.size(), internal_mems_2.size());
    for (size_t i = 0; i < internal_mems_1.size(); ++i) {
        ASSERT_EQ(internal_mems_1[i]->buffer_ptr(), internal_mems_2[i]->buffer_ptr());
+        if (engine.get_device_info().supports_immad) {
+            ASSERT_EQ(internal_mems_1[i]->get_allocation_type(), allocation_type::usm_device);
+        }
    }
 }