From c33856b31f76f22dd7c8cad5c75967a371ca700b Mon Sep 17 00:00:00 2001 From: Andrew Kwangwoong Park Date: Thu, 9 Sep 2021 13:19:07 +0900 Subject: [PATCH] [GPU] Improve memory usage management to distinguish allocation type (#7318) Signed-off-by: Andrew Kwangwoong Park --- .../clDNN/api/cldnn/runtime/engine.hpp | 21 +++++---- .../thirdparty/clDNN/runtime/engine.cpp | 47 +++++++++++++++---- .../thirdparty/clDNN/runtime/memory.cpp | 30 ++++++------ .../clDNN/tests/test_cases/memory_test.cpp | 2 +- 4 files changed, 65 insertions(+), 35 deletions(-) diff --git a/inference-engine/thirdparty/clDNN/api/cldnn/runtime/engine.hpp b/inference-engine/thirdparty/clDNN/api/cldnn/runtime/engine.hpp index 8aa53a14fe2..fb79a20a785 100644 --- a/inference-engine/thirdparty/clDNN/api/cldnn/runtime/engine.hpp +++ b/inference-engine/thirdparty/clDNN/api/cldnn/runtime/engine.hpp @@ -96,17 +96,20 @@ public: /// Returns user context handle which was used to create the engine virtual void* get_user_context() const = 0; - /// Returns the maximum amount of GPU memory that engine allocated in current process + /// Returns the total maximum amount of GPU memory allocated by engine in current process for all allocation types uint64_t get_max_used_device_memory() const; - /// Returns the amount of GPU memory currently used by the engine - uint64_t get_used_device_memory() const; + /// Returns the maximum amount of GPU memory allocated by engine in current process for the specified allocation @p type + uint64_t get_max_used_device_memory(allocation_type type) const; - /// Adds @p bytes count to currently used memory size - void add_memory_used(uint64_t bytes); + /// Returns the amount of GPU memory of the specified allocation @p type that is currently used by the engine + uint64_t get_used_device_memory(allocation_type type) const; - /// Subtracts @p bytes count from currently used memory size - void subtract_memory_used(uint64_t bytes); + /// Adds @p bytes count to currently used memory size of the 
specified allocation @p type + void add_memory_used(uint64_t bytes, allocation_type type); + + /// Subtracts @p bytes count from currently used memory size of the specified allocation @p type + void subtract_memory_used(uint64_t bytes, allocation_type type); /// Returns true if USM is enabled in engine config and device/driver supports required features bool use_unified_shared_memory() const; @@ -142,8 +145,8 @@ protected: const device::ptr _device; engine_configuration _configuration; - std::atomic<uint64_t> memory_usage = {0}; - std::atomic<uint64_t> peak_memory_usage = {0}; + std::map<allocation_type, std::atomic<uint64_t>> memory_usage_map; + std::map<allocation_type, std::atomic<uint64_t>> peak_memory_usage_map; }; } // namespace cldnn diff --git a/inference-engine/thirdparty/clDNN/runtime/engine.cpp b/inference-engine/thirdparty/clDNN/runtime/engine.cpp index 976e7bae595..3738ec2ae9d 100644 --- a/inference-engine/thirdparty/clDNN/runtime/engine.cpp +++ b/inference-engine/thirdparty/clDNN/runtime/engine.cpp @@ -120,22 +120,51 @@ memory_ptr engine::share_surface(const layout& layout, shared_surface surf, uint #endif // _WIN32 uint64_t engine::get_max_used_device_memory() const { - return peak_memory_usage.load(); + uint64_t total_peak_memory_usage {0}; + for (auto const& m : peak_memory_usage_map) { + total_peak_memory_usage += m.second.load(); + } + return total_peak_memory_usage; } -uint64_t engine::get_used_device_memory() const { - return memory_usage.load(); +uint64_t engine::get_max_used_device_memory(allocation_type type) const { + uint64_t peak_memory_usage {0}; + auto iter = peak_memory_usage_map.find(type); + if (iter != peak_memory_usage_map.end()) { + peak_memory_usage = iter->second.load(); + } + return peak_memory_usage; } -void engine::add_memory_used(size_t bytes) { - memory_usage += bytes; - if (memory_usage > peak_memory_usage) { - peak_memory_usage = memory_usage.load(); +uint64_t engine::get_used_device_memory(allocation_type type) const { + uint64_t memory_usage {0}; + auto iter = memory_usage_map.find(type); + if (iter != 
memory_usage_map.end()) { + memory_usage = iter->second.load(); + } + return memory_usage; +} + +void engine::add_memory_used(size_t bytes, allocation_type type) { + if (!memory_usage_map.count(type) && !peak_memory_usage_map.count(type)) { + static std::mutex m; + std::lock_guard<std::mutex> guard(m); + memory_usage_map[type] = 0; + peak_memory_usage_map[type] = 0; + } + memory_usage_map[type] += bytes; + if (memory_usage_map[type] > peak_memory_usage_map[type]) { + peak_memory_usage_map[type] = memory_usage_map[type].load(); } } -void engine::subtract_memory_used(size_t bytes) { - memory_usage -= bytes; +void engine::subtract_memory_used(size_t bytes, allocation_type type) { + auto iter = memory_usage_map.find(type); + if (iter != memory_usage_map.end()) { + memory_usage_map[type] -= bytes; + } else { + throw std::runtime_error("Attempt to free unallocated memory"); + } } std::shared_ptr<cldnn::engine> engine::create(engine_types engine_type, diff --git a/inference-engine/thirdparty/clDNN/runtime/memory.cpp b/inference-engine/thirdparty/clDNN/runtime/memory.cpp index 80a6ee980ed..9a22d3a2ae9 100644 --- a/inference-engine/thirdparty/clDNN/runtime/memory.cpp +++ b/inference-engine/thirdparty/clDNN/runtime/memory.cpp @@ -20,27 +20,25 @@ namespace cldnn { memory::memory(engine* engine, const layout& layout, allocation_type type, bool reused) : _engine(engine), _layout(layout), _bytes_count(_layout.bytes_count()), _type(type), _reused(reused) { if (!_reused && _engine) { - _engine->add_memory_used(_bytes_count); - } - - GPU_DEBUG_GET_INSTANCE(debug_config); - GPU_DEBUG_IF(debug_config->verbose >= 1) { - GPU_DEBUG_COUT << "Allocate " << _bytes_count << " bytes of " << type << " allocation type" - << " (current=" << _engine->get_used_device_memory() << ";" - << " max=" << _engine->get_max_used_device_memory() << ")" << std::endl; + _engine->add_memory_used(_bytes_count, type); + GPU_DEBUG_GET_INSTANCE(debug_config); + GPU_DEBUG_IF(debug_config->verbose >= 1) { + GPU_DEBUG_COUT << "Allocate " << 
_bytes_count << " bytes of " << type << " allocation type" + << " (current=" << _engine->get_used_device_memory(type) << ";" + << " max=" << _engine->get_max_used_device_memory(type) << ")" << std::endl; + } } } memory::~memory() { if (!_reused && _engine) { - _engine->subtract_memory_used(_bytes_count); - } - - GPU_DEBUG_GET_INSTANCE(debug_config); - GPU_DEBUG_IF(debug_config->verbose >= 1) { - GPU_DEBUG_COUT << "Free " << _bytes_count << " bytes" - << " (current=" << _engine->get_used_device_memory() << ";" - << " max=" << _engine->get_max_used_device_memory() << ")" << std::endl; + _engine->subtract_memory_used(_bytes_count, _type); + GPU_DEBUG_GET_INSTANCE(debug_config); + GPU_DEBUG_IF(debug_config->verbose >= 1) { + GPU_DEBUG_COUT << "Free " << _bytes_count << " bytes of " << _type << " allocation type" + << " (current=" << _engine->get_used_device_memory(_type) << ";" + << " max=" << _engine->get_max_used_device_memory(_type) << ")" << std::endl; + } } } diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/memory_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/memory_test.cpp index 4582f2ad063..e5e8bd01f09 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/memory_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/memory_test.cpp @@ -403,7 +403,7 @@ TEST(memory_pool, shared_mem_pool_diff_batches) { network network_second(*engine, topo, bo); network_second.set_input_data("input", input_1); auto outputs_second = network_second.execute(); - EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t)3928); + EXPECT_EQ(engine->get_max_used_device_memory(), (uint64_t)4328); } TEST(memory_pool, shared_dep_two_output) {