From 05dc0c8cf74d975e18099524664b082ac24fad9b Mon Sep 17 00:00:00 2001
From: Taylor Yeonbok Lee <taylor.lee@intel.com>
Date: Fri, 30 Apr 2021 00:27:05 +0900
Subject: [PATCH] [IE CLDNN] WA for memory increase problem of parallel build
 for OCL (#5389)

On Linux, without malloc_trim, some freed memory is not returned to the system.
The current hypothesis is that a large allocation made for compilation is not completely freed, though most of it is.
This does not happen on Windows.
So, added malloc_trim for the Linux build until we figure out a better solution.
---
 .../clDNN/src/gpu/kernels_cache.cpp           | 35 ++++++++++++++-----
 1 file changed, 27 insertions(+), 8 deletions(-)
diff --git a/inference-engine/thirdparty/clDNN/src/gpu/kernels_cache.cpp b/inference-engine/thirdparty/clDNN/src/gpu/kernels_cache.cpp
index 7631afc0ccb..1451d68de5f 100644
--- a/inference-engine/thirdparty/clDNN/src/gpu/kernels_cache.cpp
+++ b/inference-engine/thirdparty/clDNN/src/gpu/kernels_cache.cpp
@@ -24,6 +24,9 @@
 #include <queue>
 #include <condition_variable>
 #endif
+#if defined(__unix__) && !defined(__ANDROID__)
+#include <malloc.h>
+#endif
 
 #ifndef ENABLE_UNICODE_PATH_SUPPORT
 # ifdef _WIN32
@@ -237,14 +240,6 @@ void kernels_cache::get_program_source(const kernels_code& kernels_source_code,
 }
 
 kernels_cache::kernels_cache(gpu_toolkit& context, uint32_t prog_id) : _context(context), _prog_id(prog_id) {
-#if (CLDNN_THREADING == CLDNN_THREADING_TBB)
-    int n_threads = _context.get_configuration().n_threads;
-    arena = std::unique_ptr<tbb::task_arena>(new tbb::task_arena());
-    arena->initialize(n_threads);
-#elif(CLDNN_THREADING == CLDNN_THREADING_THREADPOOL)
-    int n_threads = _context.get_configuration().n_threads;
-    pool = std::unique_ptr<thread_pool>(new thread_pool(n_threads));
-#endif
 }
 
 kernels_cache::kernel_id kernels_cache::set_kernel_source(
@@ -406,6 +401,14 @@ void kernels_cache::build_all() {
         std::lock_guard<std::mutex> lock(_context.get_cache_mutex());
         get_program_source(_kernels_code, &batches);
         _one_time_kernels.clear();
+#if (CLDNN_THREADING == CLDNN_THREADING_TBB)
+        int n_threads = _context.get_configuration().n_threads;
+        arena = std::unique_ptr<tbb::task_arena>(new tbb::task_arena());
+        arena->initialize(n_threads);
+#elif(CLDNN_THREADING == CLDNN_THREADING_THREADPOOL)
+        int n_threads = _context.get_configuration().n_threads;
+        pool = std::unique_ptr<thread_pool>(new thread_pool(n_threads));
+#endif
     }
 
 #if (CLDNN_THREADING == CLDNN_THREADING_TBB)
@@ -435,6 +438,22 @@ void kernels_cache::build_all() {
         std::lock_guard<std::mutex> lock(_context.get_cache_mutex());
         _kernels_code.clear();
         _pending_compilation = false;
+#if (CLDNN_THREADING == CLDNN_THREADING_TBB)
+        arena.reset();
+#if defined(__unix__) && !defined(__ANDROID__)
+    //  NOTE: On Linux, without malloc_trim, part of the memory used by compilation is not returned to the system even though it has been freed.
+    //  (It is at least 500 MB when we perform parallel compilation)
+    //  It is observed that freeing the memory manually with malloc_trim saves significant amount of the memory.
+    //  Also, this is not happening in Windows.
+    //  So, added malloc_trim for linux build until we figure out a better solution.
+        malloc_trim(0);
+#endif
+#elif(CLDNN_THREADING == CLDNN_THREADING_THREADPOOL)
+        pool.reset();
+#if defined(__unix__) && !defined(__ANDROID__)
+        malloc_trim(0);
+#endif
+#endif
     }
 }