From 8a1b63ec51c4f2162c3bf528fb7006613dbea6b1 Mon Sep 17 00:00:00 2001 From: Vladislav Volkov Date: Wed, 1 Dec 2021 12:56:29 +0300 Subject: [PATCH] Memory leaks in tbbbind and onednn were fixed (#8825) --- inference-engine/thirdparty/CMakeLists.txt | 6 +- .../threading/ie_parallel_custom_arena.cpp | 178 ++++++++++-------- 2 files changed, 108 insertions(+), 76 deletions(-) diff --git a/inference-engine/thirdparty/CMakeLists.txt b/inference-engine/thirdparty/CMakeLists.txt index 72ddf521d6f..8eade257b15 100644 --- a/inference-engine/thirdparty/CMakeLists.txt +++ b/inference-engine/thirdparty/CMakeLists.txt @@ -29,7 +29,11 @@ endif() function(ie_add_mkldnn) set(DNNL_ENABLE_JIT_PROFILING ${BUILD_SHARED_LIBS} CACHE BOOL "" FORCE) - set(DNNL_ENABLE_ITT_TASKS ${BUILD_SHARED_LIBS} CACHE BOOL "" FORCE) + if(BUILD_SHARED_LIBS AND ENABLE_PROFILING_ITT) + set(DNNL_ENABLE_ITT_TASKS ON CACHE BOOL "" FORCE) + else() + set(DNNL_ENABLE_ITT_TASKS OFF CACHE BOOL "" FORCE) + endif() set(DNNL_ENABLE_CONCURRENT_EXEC ON CACHE BOOL "" FORCE) set(DNNL_ENABLE_PRIMITIVE_CACHE OFF CACHE BOOL "" FORCE) ## TODO: try it later set(DNNL_ENABLE_MAX_CPU_ISA ON CACHE BOOL "" FORCE) diff --git a/src/inference/src/threading/ie_parallel_custom_arena.cpp b/src/inference/src/threading/ie_parallel_custom_arena.cpp index 6e05539d401..a987351bad2 100644 --- a/src/inference/src/threading/ie_parallel_custom_arena.cpp +++ b/src/inference/src/threading/ie_parallel_custom_arena.cpp @@ -28,6 +28,7 @@ void __TBB_internal_initialize_system_topology(std::size_t groups_num, int*& numa_indexes_list, int& core_types_count, int*& core_types_indexes_list); +void __TBB_internal_destroy_system_topology(); binding_handler* __TBB_internal_allocate_binding_handler(int number_of_slots, int numa_id, int core_type_id, @@ -38,25 +39,6 @@ void __TBB_internal_restore_affinity(binding_handler* handler_ptr, int slot_num) int __TBB_internal_get_default_concurrency(int numa_id, int core_type_id, int max_threads_per_core); } -static int 
get_processors_group_num() {
-# if defined(_WIN32) || defined(_WIN64)
-    SYSTEM_INFO si;
-    GetNativeSystemInfo(&si);
-
-    DWORD_PTR pam, sam, m = 1;
-    GetProcessAffinityMask(GetCurrentProcess(), &pam, &sam);
-    int nproc = 0;
-    for (std::size_t i = 0; i < sizeof(DWORD_PTR) * CHAR_BIT; ++i, m <<= 1) {
-        if (pam & m)
-            ++nproc;
-    }
-    if (nproc == static_cast<int>(si.dwNumberOfProcessors)) {
-        return GetActiveProcessorGroupCount();
-    }
-# endif
-    return 1;
-}
-
 static bool is_binding_environment_valid() {
 # if defined(_WIN32) && !defined(_WIN64)
     static bool result = [] {
@@ -73,37 +55,116 @@ static bool is_binding_environment_valid() {
 # endif /* _WIN32 && !_WIN64 */
 }
 
-static int numa_nodes_count = 0;
-static int* numa_nodes_indexes = nullptr;
+# endif
 
-static int core_types_count = 0;
-static int* core_types_indexes = nullptr;
-
-static void initialize_system_topology() {
-    static std::once_flag is_topology_initialized;
-
-    std::call_once(is_topology_initialized, [&] {
+class TBBbindSystemTopology {
+    TBBbindSystemTopology() {
+# if USE_TBBBIND_2_5
         if (is_binding_environment_valid()) {
             __TBB_internal_initialize_system_topology(get_processors_group_num(),
                                                       numa_nodes_count,
                                                       numa_nodes_indexes,
                                                       core_types_count,
                                                       core_types_indexes);
-        } else {
-            static int dummy_index = task_arena::automatic;
-
-            numa_nodes_count = 1;
-            numa_nodes_indexes = &dummy_index;
-
-            core_types_count = 1;
-            core_types_indexes = &dummy_index;
         }
-    });
+# endif
+    }
+
+public:
+    ~TBBbindSystemTopology() {
+# if USE_TBBBIND_2_5
+        if (is_binding_environment_valid()) {
+            __TBB_internal_destroy_system_topology();
+        }
+# endif
+    }
+
+    std::vector<int> numa_nodes() const {
+# if USE_TBBBIND_2_5
+        std::vector<int> node_indexes(numa_nodes_count);
+        std::memcpy(node_indexes.data(), numa_nodes_indexes, numa_nodes_count * sizeof(int));
+        return node_indexes;
+# elif TBB_NUMA_SUPPORT_PRESENT
+        return tbb::info::numa_nodes();
+# else
+        return {tbb::task_arena::automatic};
+# endif
+    }
+
+    std::vector<int> core_types() const {
+# if 
USE_TBBBIND_2_5
+        std::vector<int> core_type_indexes(core_types_count);
+        std::memcpy(core_type_indexes.data(), core_types_indexes, core_types_count * sizeof(int));
+        return core_type_indexes;
+# elif TBB_HYBRID_CPUS_SUPPORT_PRESENT
+        return tbb::info::core_types();
+# else
+        return {tbb::task_arena::automatic};
+# endif
+    }
+
+    int default_concurrency(task_arena::constraints c) const {
+        if (c.max_concurrency > 0) {
+            return c.max_concurrency;
+        }
+# if USE_TBBBIND_2_5
+        if (is_binding_environment_valid()) {
+            return __TBB_internal_get_default_concurrency(c.numa_id, c.core_type, c.max_threads_per_core);
+        }
+        return tbb::this_task_arena::max_concurrency();
+# elif TBB_HYBRID_CPUS_SUPPORT_PRESENT
+        return tbb::info::default_concurrency(convert_constraints(c));
+# elif TBB_NUMA_SUPPORT_PRESENT
+        return tbb::info::default_concurrency(c.numa_id);
+# else
+        return tbb::this_task_arena::max_concurrency();
+# endif
+    }
+
+    friend const TBBbindSystemTopology& system_topology();
+
+private:
+    int get_processors_group_num() const {
+# if defined(_WIN32) || defined(_WIN64)
+        SYSTEM_INFO si;
+        GetNativeSystemInfo(&si);
+
+        DWORD_PTR pam, sam, m = 1;
+        GetProcessAffinityMask(GetCurrentProcess(), &pam, &sam);
+        int nproc = 0;
+        for (std::size_t i = 0; i < sizeof(DWORD_PTR) * CHAR_BIT; ++i, m <<= 1) {
+            if (pam & m)
+                ++nproc;
+        }
+        if (nproc == static_cast<int>(si.dwNumberOfProcessors)) {
+            return GetActiveProcessorGroupCount();
+        }
+# endif
+        return 1;
+    }
+
+private:
+# if USE_TBBBIND_2_5
+    int dummy_index = task_arena::automatic;
+
+    int numa_nodes_count = 1;
+    int* numa_nodes_indexes = &dummy_index;
+
+    int core_types_count = 1;
+    int* core_types_indexes = &dummy_index;
+# endif
+};
+
+const TBBbindSystemTopology& system_topology() {
+    static TBBbindSystemTopology topology;
+    return topology;
+}
+
+# if USE_TBBBIND_2_5
+
 binding_observer::binding_observer(tbb::task_arena& ta, int num_slots, const constraints& c)
     : task_scheduler_observer(ta) {
-    detail::initialize_system_topology();
+    
detail::system_topology();
     my_binding_handler =
         detail::__TBB_internal_allocate_binding_handler(num_slots, c.numa_id, c.core_type, c.max_threads_per_core);
 }
@@ -219,52 +280,19 @@ int task_arena::max_concurrency() {
 namespace info {
 std::vector<int> numa_nodes() {
-# if USE_TBBBIND_2_5
-    detail::initialize_system_topology();
-    std::vector<int> node_indexes(detail::numa_nodes_count);
-    std::memcpy(node_indexes.data(), detail::numa_nodes_indexes, detail::numa_nodes_count * sizeof(int));
-    return node_indexes;
-# elif TBB_NUMA_SUPPORT_PRESENT
-    return tbb::info::numa_nodes();
-# else
-    return {tbb::task_arena::automatic};
-# endif
+    return detail::system_topology().numa_nodes();
 }
 
 std::vector<int> core_types() {
-# if USE_TBBBIND_2_5
-    detail::initialize_system_topology();
-    std::vector<int> core_type_indexes(detail::core_types_count);
-    std::memcpy(core_type_indexes.data(), detail::core_types_indexes, detail::core_types_count * sizeof(int));
-    return core_type_indexes;
-# elif TBB_HYBRID_CPUS_SUPPORT_PRESENT
-    return tbb::info::core_types();
-# else
-    return {tbb::task_arena::automatic};
-# endif
+    return detail::system_topology().core_types();
 }
 
 int default_concurrency(task_arena::constraints c) {
-    if (c.max_concurrency > 0) {
-        return c.max_concurrency;
-    }
-# if USE_TBBBIND_2_5
-    if (detail::is_binding_environment_valid()) {
-        detail::initialize_system_topology();
-        return detail::__TBB_internal_get_default_concurrency(c.numa_id, c.core_type, c.max_threads_per_core);
-    }
-    return tbb::this_task_arena::max_concurrency();
-# elif TBB_HYBRID_CPUS_SUPPORT_PRESENT
-    return tbb::info::default_concurrency(convert_constraints(c));
-# elif TBB_NUMA_SUPPORT_PRESENT
-    return tbb::info::default_concurrency(c.numa_id);
-# else
-    return tbb::this_task_arena::max_concurrency();
-# endif
+    return detail::system_topology().default_concurrency(c);
 }
 
 int default_concurrency(numa_node_id id) {
-    return default_concurrency(task_arena::constraints{}.set_numa_id(id));
+    return 
detail::system_topology().default_concurrency(task_arena::constraints{}.set_numa_id(id)); } } // namespace info