From 8a1b63ec51c4f2162c3bf528fb7006613dbea6b1 Mon Sep 17 00:00:00 2001 From: Vladislav Volkov Date: Wed, 1 Dec 2021 12:56:29 +0300 Subject: [PATCH] Memory leaks in tbbbind and onednn were fixed (#8825) --- inference-engine/thirdparty/CMakeLists.txt | 6 +- .../threading/ie_parallel_custom_arena.cpp | 178 ++++++++++-------- 2 files changed, 108 insertions(+), 76 deletions(-) diff --git a/inference-engine/thirdparty/CMakeLists.txt b/inference-engine/thirdparty/CMakeLists.txt index 72ddf521d6f..8eade257b15 100644 --- a/inference-engine/thirdparty/CMakeLists.txt +++ b/inference-engine/thirdparty/CMakeLists.txt @@ -29,7 +29,11 @@ endif() function(ie_add_mkldnn) set(DNNL_ENABLE_JIT_PROFILING ${BUILD_SHARED_LIBS} CACHE BOOL "" FORCE) - set(DNNL_ENABLE_ITT_TASKS ${BUILD_SHARED_LIBS} CACHE BOOL "" FORCE) + if(BUILD_SHARED_LIBS AND ENABLE_PROFILING_ITT) + set(DNNL_ENABLE_ITT_TASKS ON CACHE BOOL "" FORCE) + else() + set(DNNL_ENABLE_ITT_TASKS OFF CACHE BOOL "" FORCE) + endif() set(DNNL_ENABLE_CONCURRENT_EXEC ON CACHE BOOL "" FORCE) set(DNNL_ENABLE_PRIMITIVE_CACHE OFF CACHE BOOL "" FORCE) ## TODO: try it later set(DNNL_ENABLE_MAX_CPU_ISA ON CACHE BOOL "" FORCE) diff --git a/src/inference/src/threading/ie_parallel_custom_arena.cpp b/src/inference/src/threading/ie_parallel_custom_arena.cpp index 6e05539d401..a987351bad2 100644 --- a/src/inference/src/threading/ie_parallel_custom_arena.cpp +++ b/src/inference/src/threading/ie_parallel_custom_arena.cpp @@ -28,6 +28,7 @@ void __TBB_internal_initialize_system_topology(std::size_t groups_num, int*& numa_indexes_list, int& core_types_count, int*& core_types_indexes_list); +void __TBB_internal_destroy_system_topology(); binding_handler* __TBB_internal_allocate_binding_handler(int number_of_slots, int numa_id, int core_type_id, @@ -38,25 +39,6 @@ void __TBB_internal_restore_affinity(binding_handler* handler_ptr, int slot_num) int __TBB_internal_get_default_concurrency(int numa_id, int core_type_id, int max_threads_per_core); } -static int 
get_processors_group_num() {
-# if defined(_WIN32) || defined(_WIN64)
-    SYSTEM_INFO si;
-    GetNativeSystemInfo(&si);
-
-    DWORD_PTR pam, sam, m = 1;
-    GetProcessAffinityMask(GetCurrentProcess(), &pam, &sam);
-    int nproc = 0;
-    for (std::size_t i = 0; i < sizeof(DWORD_PTR) * CHAR_BIT; ++i, m <<= 1) {
-        if (pam & m)
-            ++nproc;
-    }
-    if (nproc == static_cast<int>(si.dwNumberOfProcessors)) {
-        return GetActiveProcessorGroupCount();
-    }
-# endif
-    return 1;
-}
-
 static bool is_binding_environment_valid() {
 # if defined(_WIN32) && !defined(_WIN64)
     static bool result = [] {
@@ -73,37 +55,116 @@ static bool is_binding_environment_valid() {
 # endif /* _WIN32 && !_WIN64 */
 }
 
-static int numa_nodes_count = 0;
-static int* numa_nodes_indexes = nullptr;
+# endif
 
-static int core_types_count = 0;
-static int* core_types_indexes = nullptr;
-
-static void initialize_system_topology() {
-    static std::once_flag is_topology_initialized;
-
-    std::call_once(is_topology_initialized, [&] {
+class TBBbindSystemTopology {
+    TBBbindSystemTopology() {
+# if USE_TBBBIND_2_5
         if (is_binding_environment_valid()) {
             __TBB_internal_initialize_system_topology(get_processors_group_num(),
                                                       numa_nodes_count,
                                                       numa_nodes_indexes,
                                                       core_types_count,
                                                       core_types_indexes);
-        } else {
-            static int dummy_index = task_arena::automatic;
-
-            numa_nodes_count = 1;
-            numa_nodes_indexes = &dummy_index;
-
-            core_types_count = 1;
-            core_types_indexes = &dummy_index;
         }
-    });
+# endif
+    }
+
+public:
+    ~TBBbindSystemTopology() {
+# if USE_TBBBIND_2_5
+        if (is_binding_environment_valid()) {
+            __TBB_internal_destroy_system_topology();
+        }
+# endif
+    }
+
+    std::vector<int> numa_nodes() const {
+# if USE_TBBBIND_2_5
+        std::vector<int> node_indexes(numa_nodes_count);
+        std::memcpy(node_indexes.data(), numa_nodes_indexes, numa_nodes_count * sizeof(int));
+        return node_indexes;
+# elif TBB_NUMA_SUPPORT_PRESENT
+        return tbb::info::numa_nodes();
+# else
+        return {tbb::task_arena::automatic};
+# endif
+    }
+
+    std::vector<int> core_types() const {
+# if 
USE_TBBBIND_2_5
+        std::vector<int> core_type_indexes(core_types_count);
+        std::memcpy(core_type_indexes.data(), core_types_indexes, core_types_count * sizeof(int));
+        return core_type_indexes;
+# elif TBB_HYBRID_CPUS_SUPPORT_PRESENT
+        return tbb::info::core_types();
+# else
+        return {tbb::task_arena::automatic};
+# endif
+    }
+
+    int default_concurrency(task_arena::constraints c) const {
+        if (c.max_concurrency > 0) {
+            return c.max_concurrency;
+        }
+# if USE_TBBBIND_2_5
+        if (is_binding_environment_valid()) {
+            return __TBB_internal_get_default_concurrency(c.numa_id, c.core_type, c.max_threads_per_core);
+        }
+        return tbb::this_task_arena::max_concurrency();
+# elif TBB_HYBRID_CPUS_SUPPORT_PRESENT
+        return tbb::info::default_concurrency(convert_constraints(c));
+# elif TBB_NUMA_SUPPORT_PRESENT
+        return tbb::info::default_concurrency(c.numa_id);
+# else
+        return tbb::this_task_arena::max_concurrency();
+# endif
+    }
+
+    friend const TBBbindSystemTopology& system_topology();
+
+private:
+    int get_processors_group_num() const {
+# if defined(_WIN32) || defined(_WIN64)
+        SYSTEM_INFO si;
+        GetNativeSystemInfo(&si);
+
+        DWORD_PTR pam, sam, m = 1;
+        GetProcessAffinityMask(GetCurrentProcess(), &pam, &sam);
+        int nproc = 0;
+        for (std::size_t i = 0; i < sizeof(DWORD_PTR) * CHAR_BIT; ++i, m <<= 1) {
+            if (pam & m)
+                ++nproc;
+        }
+        if (nproc == static_cast<int>(si.dwNumberOfProcessors)) {
+            return GetActiveProcessorGroupCount();
+        }
+# endif
+        return 1;
+    }
+
+private:
+# if USE_TBBBIND_2_5
+    int dummy_index = task_arena::automatic;
+
+    int numa_nodes_count = 1;
+    int* numa_nodes_indexes = &dummy_index;
+
+    int core_types_count = 1;
+    int* core_types_indexes = &dummy_index;
+# endif
+};
+
+const TBBbindSystemTopology& system_topology() {
+    static TBBbindSystemTopology topology;
+    return topology;
+}
+
+# if USE_TBBBIND_2_5
+
 binding_observer::binding_observer(tbb::task_arena& ta, int num_slots, const constraints& c)
     : task_scheduler_observer(ta) {
-    detail::initialize_system_topology();
+    
detail::system_topology();
     my_binding_handler =
         detail::__TBB_internal_allocate_binding_handler(num_slots, c.numa_id, c.core_type, c.max_threads_per_core);
 }
@@ -219,52 +280,19 @@ int task_arena::max_concurrency() {
 namespace info {
 std::vector<int> numa_nodes() {
-# if USE_TBBBIND_2_5
-    detail::initialize_system_topology();
-    std::vector<int> node_indexes(detail::numa_nodes_count);
-    std::memcpy(node_indexes.data(), detail::numa_nodes_indexes, detail::numa_nodes_count * sizeof(int));
-    return node_indexes;
-# elif TBB_NUMA_SUPPORT_PRESENT
-    return tbb::info::numa_nodes();
-# else
-    return {tbb::task_arena::automatic};
-# endif
+    return detail::system_topology().numa_nodes();
 }
 
 std::vector<int> core_types() {
-# if USE_TBBBIND_2_5
-    detail::initialize_system_topology();
-    std::vector<int> core_type_indexes(detail::core_types_count);
-    std::memcpy(core_type_indexes.data(), detail::core_types_indexes, detail::core_types_count * sizeof(int));
-    return core_type_indexes;
-# elif TBB_HYBRID_CPUS_SUPPORT_PRESENT
-    return tbb::info::core_types();
-# else
-    return {tbb::task_arena::automatic};
-# endif
+    return detail::system_topology().core_types();
 }
 
 int default_concurrency(task_arena::constraints c) {
-    if (c.max_concurrency > 0) {
-        return c.max_concurrency;
-    }
-# if USE_TBBBIND_2_5
-    if (detail::is_binding_environment_valid()) {
-        detail::initialize_system_topology();
-        return detail::__TBB_internal_get_default_concurrency(c.numa_id, c.core_type, c.max_threads_per_core);
-    }
-    return tbb::this_task_arena::max_concurrency();
-# elif TBB_HYBRID_CPUS_SUPPORT_PRESENT
-    return tbb::info::default_concurrency(convert_constraints(c));
-# elif TBB_NUMA_SUPPORT_PRESENT
-    return tbb::info::default_concurrency(c.numa_id);
-# else
-    return tbb::this_task_arena::max_concurrency();
-# endif
+    return detail::system_topology().default_concurrency(c);
 }
 
 int default_concurrency(numa_node_id id) {
-    return default_concurrency(task_arena::constraints{}.set_numa_id(id));
+    return 
detail::system_topology().default_concurrency(task_arena::constraints{}.set_numa_id(id)); } } // namespace info