[GPU] Release unused internal memory from pool (#18917)
* Do not reuse internal memory for dynamic shapes because of the current inefficiency in the pool
* Added a new debug config to dump the runtime memory pool
* Apply DisableMemoryReuse for all usages
* Resolved a performance issue with memory reuse from the pool: previously, the original internal-buffer record was not released when new memory was allocated for that buffer. Releasing the old record first keeps the number of memory pool records from growing, so pool retrieval is no longer inefficient.
* Added a test
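The heart of the change is the release-before-reallocate pattern from the fourth bullet: when a primitive has to allocate a new internal buffer (e.g. because a dynamic shape grew), the pool record still pointing at the old buffer is released first, so the pool does not accumulate one stale record per re-allocation. Below is a self-contained toy model of that bookkeeping, not the actual cldnn memory_pool implementation; the type names and the reuse policy are simplified for illustration.

```cpp
// Toy model of the bookkeeping issue (NOT the cldnn memory_pool API; types,
// names and the reuse policy are simplified). Without the release step, every
// re-allocation for a grown shape leaves one more stale record in the pool.
#include <cstddef>
#include <cstdio>
#include <map>
#include <memory>

struct buffer { std::size_t bytes; };

class toy_pool {
    std::multimap<std::size_t, std::shared_ptr<buffer>> records_;  // size -> buffer
public:
    std::shared_ptr<buffer> get(std::size_t bytes) {
        // Reuse the first record that is large enough, otherwise allocate a new one.
        auto it = records_.lower_bound(bytes);
        if (it != records_.end())
            return it->second;
        auto buf = std::make_shared<buffer>(buffer{bytes});
        records_.emplace(bytes, buf);
        return buf;
    }
    void release(const std::shared_ptr<buffer>& buf) {
        // Drop the record pointing at this buffer, analogous to calling
        // memory_pool::release_memory before requesting the new allocation.
        for (auto it = records_.begin(); it != records_.end(); ++it) {
            if (it->second == buf) {
                records_.erase(it);
                return;
            }
        }
    }
    std::size_t size() const { return records_.size(); }
};

int main() {
    toy_pool pool;
    std::shared_ptr<buffer> current;
    for (std::size_t bytes : {100, 200, 400}) {  // dynamic shape keeps growing
        if (current)
            pool.release(current);  // the fix: release the old record first
        current = pool.get(bytes);
    }
    // With the release step the pool holds 1 record; without it, 3.
    std::printf("records in pool: %zu\n", pool.size());
    return 0;
}
```

In the plugin itself this corresponds to the curr_memory argument threaded into the new static get_memory_from_pool() helper (see the primitive_inst.cpp hunks below), which calls pool.release_memory() on the previously held buffer before requesting a new one.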
commit db8c29ee6f (parent 0e13b99a3d)
@@ -114,6 +114,7 @@ public:
     int dump_layers_limit_batch;          // Limit the size of batch to dump
     int dump_layers_raw;                  // Dump raw data.
     int dump_layers_binary;               // Dump binary data.
+    int dump_runtime_memory_pool;         // Dump memory pool status at each iteration
     int base_batch_for_memory_estimation; // Base batch size to be used in memory estimation
     std::vector<std::string> after_proc;  // Start inference after the listed processes
     int serialize_compile;                // Serialize creating primitives and compiling kernels
@@ -126,6 +126,12 @@ public:
                            allocation_type type);
     void clear_pool_for_network(uint32_t network_id);
     void release_memory(memory* memory, const primitive_id& id, uint32_t network_id);
+
+    size_t get_non_padded_pool_size() {
+        return _non_padded_pool.size();
+    }
+
+    void dump(uint32_t id);
 };

 } // namespace cldnn
@@ -236,7 +236,7 @@ public:
     static memory::ptr allocate_output(engine& engine, memory_pool& pool, const program_node& _node, const kernel_impl_params& impl_params, uint32_t net_id,
                                        bool is_internal, size_t idx = 0, bool reset_mem = true, bool is_output_buffer = false, memory* curr_memory = nullptr, bool runtime_alloc = false);

-    std::vector<memory::cptr> get_intermediates_memories() const { return _intermediates_memory; }
+    std::vector<memory::ptr> get_intermediates_memories() const { return _intermediates_memory; }

     virtual void save(cldnn::BinaryOutputBuffer& ob) const;
     virtual void load(cldnn::BinaryInputBuffer& ib);
@@ -307,7 +307,7 @@ protected:
     // depending on reshape_node.is_in_place())
     std::vector<memory::ptr> _outputs;

-    std::vector<memory::cptr> _intermediates_memory;
+    std::vector<memory::ptr> _intermediates_memory;

     mutable LruCache<layout, memory::ptr, layout::Hasher> _reordered_weights_cache;

@@ -1172,10 +1172,21 @@ std::map<primitive_id, network_output> network::execute(const std::vector<event:

 void network::execute_impl(const std::vector<event::ptr>& events) {
     OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "NetworkImpl::Execute");
+    int64_t curr_iter = -1;
+    GPU_DEBUG_GET_INSTANCE(debug_config);
+#ifdef GPU_DEBUG_CONFIG
+    curr_iter = iteration++;
+#endif
+
     // Wait for previous execution completion
     reset_execution(false);
-    GPU_DEBUG_TRACE << "----------------------------------------------" << std::endl;
-    GPU_DEBUG_TRACE << "Start network execution" << std::endl;
+    GPU_DEBUG_IF(debug_config->dump_runtime_memory_pool > 0) {
+        GPU_DEBUG_COUT << "----------------------------------------------" << std::endl;
+        GPU_DEBUG_COUT << "Start network execution (net_id : " << get_id() << ", iter :" << curr_iter << ")" << std::endl;
+    } else {
+        GPU_DEBUG_TRACE << "----------------------------------------------" << std::endl;
+        GPU_DEBUG_TRACE << "Start network execution (net_id : " << get_id() << ", iter :" << curr_iter << ")" << std::endl;
+    }

     std::vector<memory::ptr> in_out_mem;
     auto is_surface_lock_check_needed = [&](const shared_mem_type& shared_mem_type) {
@@ -1211,7 +1222,6 @@ void network::execute_impl(const std::vector<event::ptr>& events) {
     auto surf_lock = surfaces_lock::create(get_engine().type(), in_out_mem, get_stream());

     set_arguments();
-    GPU_DEBUG_GET_INSTANCE(debug_config);
     GPU_DEBUG_IF(debug_config->list_layers == 1) {
         for (auto& inst : _exec_order) {
             GPU_DEBUG_COUT << inst->id() << std::endl;
@@ -1225,12 +1235,6 @@ void network::execute_impl(const std::vector<event::ptr>& events) {
         }
         if (!is_internal()) exit(0);
     }
-    int64_t curr_iter = -1;
-#ifdef GPU_DEBUG_CONFIG
-    GPU_DEBUG_IF(!debug_config->dump_iteration.empty()) {
-        curr_iter = iteration++;
-    }
-#endif
     auto get_iteration_prefix = [](int64_t iter) {
         if (iter < 0)
             return std::string("");
@@ -1435,6 +1439,10 @@ void network::execute_impl(const std::vector<event::ptr>& events) {
     // provide proper event to execution. Flushing pipeline should prevent this kind of issues.
     // In scenarios with a big number of very small networks it can provide performance drop.
     get_stream().flush();
+
+    GPU_DEBUG_IF(debug_config->dump_runtime_memory_pool > 0) {
+        get_memory_pool().dump(get_id());
+    }
 }

 std::vector<primitive_id> network::get_input_ids() const {
@@ -133,6 +133,26 @@ bool is_any_user_cpu(const std::list<const program_node*>& users) {
     return false;
 }

+static memory::ptr get_memory_from_pool(engine& _engine,
+                                        uint32_t net_id,
+                                        memory_pool& pool,
+                                        const program_node& _node,
+                                        const layout& layout,
+                                        allocation_type type,
+                                        bool reusable_across_network,
+                                        bool reset = true,
+                                        memory* curr_memory = nullptr) {
+    OPENVINO_ASSERT(!layout.is_dynamic() || layout.has_upper_bound(),
+                    "[GPU] Can't allocate output for dynamic layout without upper bound");
+    // Use layout with max tensor for dynamic shape with upper bound
+    if (_node.get_program().get_config().get_property(ov::intel_gpu::enable_memory_pool)) {
+        if (curr_memory != nullptr)
+            pool.release_memory(curr_memory, _node.id(), net_id);
+        return pool.get_memory(layout, _node.id(), net_id, _node.get_memory_dependencies(), type, reusable_across_network, reset);
+    }
+    return pool.get_memory(layout, type, reset);
+}
+
 std::shared_ptr<kernel_impl_params> primitive_impl::get_weights_reorder_kernel_params() const {
     if (!need_weights_reorder())
         return nullptr;
@@ -1002,9 +1022,19 @@ memory::ptr primitive_inst::allocate_internal_buffer(size_t idx, bool reset) {
     GPU_DEBUG_LOG << "=> allocate to " << alloc_type << std::endl;

     // Reuse intermediate buffer like output buffer.
-    auto ret_mem = _network.get_memory_pool().get_memory(layout, _node->id(), _network.get_id(), _node->get_memory_dependencies(), alloc_type, true, reset);
+    bool reuse_internal_buf = true;
+    auto ret_mem =
+        get_memory_from_pool(_network.get_engine(),
+                             _network.get_id(),
+                             _network.get_memory_pool(),
+                             *_node,
+                             layout,
+                             alloc_type,
+                             reuse_internal_buf,
+                             reset,
+                             _intermediates_memory.size() > idx ? _intermediates_memory[idx].get() : nullptr);
     GPU_DEBUG_LOG << " [" << _network.get_id() << ":" << _node->id() << ": internal buf " << idx << "] " << alloc_type
-                   << " " << ret_mem->buffer_ptr() << std::endl;
+                  << " " << ret_mem->buffer_ptr() << std::endl;
     return ret_mem;
 }
@@ -1016,7 +1046,7 @@ void primitive_inst::allocate_internal_buffers(bool reset) {
         return;

     // allocate intermediate memory for the updated layout of buffer
-    std::vector<memory::cptr> intermediates_memory;
+    std::vector<memory::ptr> intermediates_memory;
     for (size_t i = 0; i < ibuf_layouts.size(); ++i) {
         if (ibuf_layouts[i].get_linear_size() == 0)
             continue;
@@ -1150,18 +1180,6 @@ static bool user_requesting_mem_reuse_false(const program_node& node) {

 memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool, const program_node& _node, const kernel_impl_params& impl_params,
                                             uint32_t net_id, bool is_internal, size_t idx, bool reset, bool is_output_buffer, memory* curr_memory, bool runtime_alloc) {
-    auto get_memory_from_pool = [&](engine& _engine, const layout& layout, const primitive_id id, std::set<primitive_id> dependencies,
-                                    allocation_type type, bool reusable_across_network, bool reset = true, memory* curr_memory = nullptr) {
-        OPENVINO_ASSERT(!layout.is_dynamic() || layout.has_upper_bound(), "[GPU] Can't allocate output for dynamic layout without upper bound");
-        // Use layout with max tensor for dynamic shape with upper bound
-        if (_node.get_program().get_config().get_property(ov::intel_gpu::enable_memory_pool)) {
-            if (curr_memory != nullptr)
-                pool.release_memory(curr_memory, id, net_id);
-            return pool.get_memory(layout, id, net_id, dependencies, type, reusable_across_network, reset);
-        }
-        return pool.get_memory(layout, type, reset);
-    };
-
     auto layout = impl_params.get_output_layout(idx);
     OPENVINO_ASSERT(layout.is_static() || layout.has_upper_bound(), "[GPU] Can't allocate output for dynamic layout");
     auto device_mem_acc = [&](size_t a, const cldnn::layout& l) {
@@ -1187,10 +1205,6 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool,
     if (_node.is_in_shape_of_subgraph())
         reusable_across_network = false;

-    GPU_DEBUG_GET_INSTANCE(debug_config);
-    GPU_DEBUG_IF(debug_config->disable_memory_reuse) {
-        reusable_across_network = false;
-    }
     // For outputs, cpu prim we want to have lockable alloc type
     // Also if the successor of a node is an cpu, then memory needs to be lockable.
     bool is_cpu = _node.get_selected_impl() ? _node.get_selected_impl()->is_cpu() : false;
@@ -1213,9 +1227,10 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool,
             _engine.supports_allocation(allocation_type::usm_device))
             alloc_type = allocation_type::usm_device;
         return get_memory_from_pool(_engine,
+                                    net_id,
+                                    pool,
+                                    _node,
                                     layout,
-                                    _node.id(),
-                                    _node.get_memory_dependencies(),
                                     alloc_type,
                                     false,
                                     reset,
@@ -1231,9 +1246,10 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool,
         return _engine.allocate_memory(layout, alloc_type, reset);
     } else {
         return get_memory_from_pool(_engine,
+                                    net_id,
+                                    pool,
+                                    _node,
                                     layout,
-                                    _node.id(),
-                                    _node.get_memory_dependencies(),
                                     alloc_type,
                                     reusable_across_network,
                                     reset,
@@ -182,6 +182,7 @@ debug_configuration::debug_configuration()
     , dump_layers_limit_batch(std::numeric_limits<int>::max())
     , dump_layers_raw(0)
     , dump_layers_binary(0)
+    , dump_runtime_memory_pool(0)
     , base_batch_for_memory_estimation(-1)
     , serialize_compile(0)
     , max_kernels_per_batch(0)
@@ -210,6 +211,7 @@ debug_configuration::debug_configuration()
     get_gpu_debug_env_var("DisableOnednnOptPostOps", disable_onednn_opt_post_ops);
     get_gpu_debug_env_var("DumpProfilingData", dump_profiling_data);
     get_gpu_debug_env_var("DryRunPath", dry_run_path);
+    get_gpu_debug_env_var("DumpRuntimeMemoryPool", dump_runtime_memory_pool);
     get_gpu_debug_env_var("BaseBatchForMemEstimation", base_batch_for_memory_estimation);
     std::string dump_layers_str;
     get_gpu_debug_env_var("DumpLayers", dump_layers_str);
@@ -215,7 +215,7 @@ std::string layout::to_short_string() const {
     dump_shape(s, size);

     if (data_padding.get_dynamic_pad_dims() != tensor(0)) {
-        s << ":dyn_pad_dims" << data_padding.get_dynamic_pad_dims().to_string();
+        s << ":dyn_pad_dims";
     } else {
         if (data_padding)
             s << ":pad";
@@ -223,7 +223,12 @@ memory::ptr memory_pool::get_memory(const layout& layout,
                                     allocation_type type,
                                     bool reusable_across_network,
                                     bool reset) {
-    if (reusable_across_network) {
+    bool do_reuse = reusable_across_network;
+    GPU_DEBUG_GET_INSTANCE(debug_config);
+    GPU_DEBUG_IF(debug_config->disable_memory_reuse) {
+        do_reuse = false;
+    }
+    if (do_reuse) {
         // reusable within the same network
         if (!layout.format.is_image() && layout.data_padding == padding{{0, 0, 0, 0}, 0}) {
             // non-padded buffers
@@ -298,4 +303,25 @@ void memory_pool::clear_pool_for_network(uint32_t network_id) {

 memory_pool::memory_pool(engine& engine) : _engine(&engine) { }

+void memory_pool::dump(uint32_t net_id) {
+    GPU_DEBUG_COUT << "Dump memory pool of network " << net_id << std::endl;
+    GPU_DEBUG_COUT << "========== non-padded pool ( " << _non_padded_pool.size() << " records) ==========" << std::endl;
+    for (auto mem : _non_padded_pool) {
+        GPU_DEBUG_COUT << mem.second._memory->buffer_ptr() << " (size: " << mem.first << ", type: " << mem.second._type
+                       << ")'s users: " << std::endl;
+        for (auto user : mem.second._users) {
+            GPU_DEBUG_COUT << "   -- " << user._id << std::endl;
+        }
+    }
+    GPU_DEBUG_COUT << "========== padded pool (" << _padded_pool.size() << " records) ==========" << std::endl;
+    for (auto mem : _padded_pool) {
+        GPU_DEBUG_COUT << " layout: " << mem.first.to_short_string() << std::endl;
+        for (auto record : mem.second) {
+            GPU_DEBUG_COUT << "    " << record._memory->buffer_ptr() << ", type: " << record._type << ", users : " << std::endl;
+            for (auto user : record._users) {
+                GPU_DEBUG_COUT << "     --- " << user._id << std::endl;
+            }
+        }
+    }
+}
 } // namespace cldnn
@@ -131,7 +131,10 @@ TEST(softmax_gpu_dynamic_f32_test_upper_bound, input_same_values) {
               out_size_1 = output_x_1 * output_b_1,
               output_x_2 = 10, output_b_2 = 4,
               input_x_2 = 10, input_b_2 = 4,
-              out_size_2 = output_x_2 * output_b_2;
+              out_size_2 = output_x_2 * output_b_2,
+              output_x_3 = 10, output_b_3 = 16,
+              input_x_3 = 10, input_b_3 = 16,
+              out_size_3 = output_x_3 * output_b_3;

     cldnn::engine& engine = get_test_engine();

@@ -196,6 +199,29 @@ TEST(softmax_gpu_dynamic_f32_test_upper_bound, input_same_values) {
             ASSERT_EQ(internal_mems_1[i]->get_allocation_type(), allocation_type::usm_device);
         }
     }
+    // Third run
+    float out_buffer_3[out_size_3];
+    std::vector<float> in_b_3(out_size_3, 2.0f);
+    std::vector<float> expected_buffer_3(out_size_3, 0.1f);
+    cldnn::memory::ptr input_3 = engine.allocate_memory({ data_types::f32, format::bfyx, {input_b_3, 1, input_x_3, 1}});
+    set_values(input_3, in_b_3);
+    network.set_input_data("input", input_3);
+    auto outputs_3 = network.execute();
+    auto output_mem_3 = outputs_3.begin()->second.get_memory();
+    cldnn::mem_lock<float> output_ptr_3(output_mem_3, get_test_stream());
+    for (uint32_t i = 0; i < out_size_3; i++) {
+        out_buffer_3[i] = output_ptr_3[i];
+    }
+    compare_out_buffer_with_expected(out_buffer_3, expected_buffer_3, out_size_3);
+    auto internal_mems_3 = network.get_primitive("softmax")->get_intermediates_memories();
+    for (size_t i = 0; i < internal_mems_3.size(); ++i) {
+        if (engine.get_device_info().supports_immad) {
+            ASSERT_EQ(internal_mems_3[i]->get_allocation_type(), allocation_type::usm_device);
+        }
+    }
+    auto& pool = network.get_memory_pool();
+    // check if previously allocated internal buffer is released
+    ASSERT_EQ(pool.get_non_padded_pool_size(), 3);
 }

 TEST(dyn_shape_mem_test, igpu_shape_infer_dep_mem_type) {