diff --git a/cmake/templates/openvino.pc.in b/cmake/templates/openvino.pc.in index 45ccf0490b5..e0715362c3e 100644 --- a/cmake/templates/openvino.pc.in +++ b/cmake/templates/openvino.pc.in @@ -17,6 +17,6 @@ Description: OpenVINO™ Toolkit URL: https://docs.openvino.ai/latest/index.html Version: @OpenVINO_VERSION@ Conflicts: openvino < @OpenVINO_VERSION@ -Cflags: -I${includedir_old} -I${includedir_new} +Cflags: -I${includedir_old} -I${includedir_new} @PKGCONFIG_OpenVINO_DEFINITIONS@ Libs: -L${libdir} @PKGCONFIG_OpenVINO_FRONTENDS@ -lopenvino_c -lopenvino @PKGCONFIG_OpenVINO_PRIVATE_DEPS@ Libs.private: -ldl -lm -lpthread -lrt diff --git a/src/cmake/ie_parallel.cmake b/src/cmake/ie_parallel.cmake index 81e691880ce..bbfc7240f92 100644 --- a/src/cmake/ie_parallel.cmake +++ b/src/cmake/ie_parallel.cmake @@ -224,10 +224,12 @@ function(set_ie_threading_interface_for TARGET_NAME) endfunction() set(IE_THREAD_DEFINE "IE_THREAD_SEQ") + set(OV_THREAD_DEFINE "OV_THREAD_SEQ") if (THREADING STREQUAL "TBB" OR THREADING STREQUAL "TBB_AUTO") if (TBB_FOUND) set(IE_THREAD_DEFINE "IE_THREAD_TBB") + set(OV_THREAD_DEFINE "OV_THREAD_TBB") ie_target_link_libraries(${TARGET_NAME} ${LINK_TYPE} TBB::tbb) target_compile_definitions(${TARGET_NAME} ${COMPILE_DEF_TYPE} TBB_PREVIEW_WAITING_FOR_WORKERS=1) else () @@ -273,6 +275,7 @@ function(set_ie_threading_interface_for TARGET_NAME) set(THREADING "SEQ" PARENT_SCOPE) else () set(IE_THREAD_DEFINE "IE_THREAD_OMP") + set(OV_THREAD_DEFINE "OV_THREAD_OMP") if (WIN32) target_compile_options(${TARGET_NAME} ${LINK_TYPE} ${OpenMP_CXX_FLAGS} /openmp) @@ -300,7 +303,8 @@ function(set_ie_threading_interface_for TARGET_NAME) endif () endif () - target_compile_definitions(${TARGET_NAME} ${LINK_TYPE} -DIE_THREAD=${IE_THREAD_DEFINE}) + target_compile_definitions(${TARGET_NAME} ${COMPILE_DEF_TYPE} -DIE_THREAD=${IE_THREAD_DEFINE}) + target_compile_definitions(${TARGET_NAME} ${COMPILE_DEF_TYPE} -DOV_THREAD=${OV_THREAD_DEFINE}) if (NOT THREADING STREQUAL "SEQ") 
find_package(Threads REQUIRED) diff --git a/src/cmake/openvino.cmake b/src/cmake/openvino.cmake index 6f68dc4192e..3e3d702d716 100644 --- a/src/cmake/openvino.cmake +++ b/src/cmake/openvino.cmake @@ -195,6 +195,12 @@ install(FILES "${CMAKE_BINARY_DIR}/share/OpenVINOConfig.cmake" # Generate and install openvino.pc pkg-config file if(ENABLE_PKGCONFIG_GEN) + # fill in PKGCONFIG_OpenVINO_DEFINITIONS + get_target_property(openvino_defs openvino INTERFACE_COMPILE_DEFINITIONS) + foreach(openvino_def IN LISTS openvino_defs) + set(PKGCONFIG_OpenVINO_DEFINITIONS "${PKGCONFIG_OpenVINO_DEFINITIONS} -D${openvino_def}") + endforeach() + # fill in PKGCONFIG_OpenVINO_FRONTENDS get_target_property(PKGCONFIG_OpenVINO_FRONTENDS_LIST ov_frontends MANUALLY_ADDED_DEPENDENCIES) if(ENABLE_OV_IR_FRONTEND) diff --git a/src/core/include/openvino/core/parallel.hpp b/src/core/include/openvino/core/parallel.hpp new file mode 100644 index 00000000000..eefe5c641a0 --- /dev/null +++ b/src/core/include/openvino/core/parallel.hpp @@ -0,0 +1,705 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +/** + * @brief Contains declarations and definitions for sequential and multi-threading implementations. + * + * Multi-threading support is implemented in two variants: using the Threading Building Blocks (TBB) library or + * OpenMP*. To select an implementation, define OV_THREAD to the corresponding identifier: OV_THREAD_TBB, + * OV_THREAD_TBB_AUTO, OV_THREAD_OMP or OV_THREAD_SEQ.
+ * + * @file parallel.hpp + */ + +#pragma once + +#include +#include + +#define OV_THREAD_TBB 0 +#define OV_THREAD_OMP 1 +#define OV_THREAD_SEQ 2 +#define OV_THREAD_TBB_AUTO 3 + +#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO) +# ifndef NOMINMAX +# define NOMINMAX +# endif +# ifndef TBB_PREVIEW_LOCAL_OBSERVER +# define TBB_PREVIEW_LOCAL_OBSERVER 1 +# endif +# ifndef TBB_PREVIEW_WAITING_FOR_WORKERS +# define TBB_PREVIEW_WAITING_FOR_WORKERS 1 +# endif +# ifndef TBB_PREVIEW_NUMA_SUPPORT +# define TBB_PREVIEW_NUMA_SUPPORT 1 +# endif +# ifndef TBB_PREVIEW_TASK_ARENA_CONSTRAINTS_EXTENSION +# define TBB_PREVIEW_TASK_ARENA_CONSTRAINTS_EXTENSION 1 +# endif + +# include "tbb/blocked_range.h" +# include "tbb/blocked_range2d.h" +# include "tbb/blocked_range3d.h" +# include "tbb/parallel_for.h" +# include "tbb/parallel_reduce.h" +# include "tbb/parallel_sort.h" +# include "tbb/task_arena.h" +# include "tbb/task_scheduler_observer.h" + +inline int parallel_get_max_threads() { + return tbb::this_task_arena::max_concurrency(); +} +inline int parallel_get_num_threads() { + return parallel_get_max_threads(); +} +inline int parallel_get_thread_num() { + return tbb::this_task_arena::current_thread_index(); +} +inline void parallel_set_num_threads(int) { + return; +} +inline int parallel_get_env_threads() { + return 0; +} +# if OV_THREAD == OV_THREAD_TBB +# define PARTITIONING , tbb::static_partitioner() + +// TBB versions older than 2018u1 have no static_partitioner argument for +// tbb::parallel_deterministic_reduce, so fall back to the non-deterministic version.
+# if (TBB_INTERFACE_VERSION >= 10001) +# define _TBB_REDUCE_FUNC tbb::parallel_deterministic_reduce +# else +# define _TBB_REDUCE_FUNC tbb::parallel_reduce +# endif + +# else +# define PARTITIONING +# endif +#elif OV_THREAD == OV_THREAD_OMP +# include + +# include +# include +# include + +/* MSVC still supports omp 2.0 only */ +# if defined(_MSC_VER) && !defined(__INTEL_COMPILER) +# define collapse(x) +# endif // defined(_MSC_VER) && !defined(__INTEL_COMPILER) +inline int parallel_get_max_threads() { + return omp_get_max_threads(); +} +inline int parallel_get_num_threads() { + return omp_get_num_threads(); +} +inline int parallel_get_thread_num() { + return omp_get_thread_num(); +} +inline void parallel_set_num_threads(int n) { + omp_set_num_threads(n); +} +inline int parallel_get_env_threads() { + int env_cores = 0; + if (getenv("OMP_NUM_THREADS") != nullptr) { + try { + env_cores = std::stoi(getenv("OMP_NUM_THREADS")); + } catch (const std::exception&) { + env_cores = 0; + } + } + return env_cores; +} + +#elif OV_THREAD == OV_THREAD_SEQ +# include +inline int parallel_get_env_threads() { + return 1; +} +inline int parallel_get_max_threads() { + return 1; +} +inline int parallel_get_num_threads() { + return 1; +} +inline int parallel_get_thread_num() { + return 0; +} +inline void parallel_set_num_threads(int) { + return; +} +#endif + +namespace ov { + +template +void parallel_nt(int nthr, const F& func) { +#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO) + if (nthr == 0) + nthr = parallel_get_max_threads(); + if (nthr == 1) { + func(0, 1); + return; + } + + tbb::parallel_for(0, nthr, [&](int ithr) { + func(ithr, nthr); + }); +#elif OV_THREAD == OV_THREAD_OMP + if (nthr == 1) { + func(0, 1); + return; + } + +# pragma omp parallel num_threads(nthr) + func(parallel_get_thread_num(), parallel_get_num_threads()); +#elif OV_THREAD == OV_THREAD_SEQ + func(0, 1); +#endif +} + +template +void parallel_nt_static(int nthr, const F& func) { +#if OV_THREAD 
== OV_THREAD_SEQ + const bool serial = true; +#else + const bool serial = false; +#endif + + if (serial || nthr == 1) { + func(0, 1); + return; + } + + if (nthr == 0) + nthr = parallel_get_max_threads(); +#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO) + tbb::parallel_for( + 0, + nthr, + [&](int ithr) { + func(ithr, nthr); + }, + tbb::static_partitioner{}); + +#elif OV_THREAD == OV_THREAD_OMP + +# pragma omp parallel num_threads(nthr) + { func(parallel_get_thread_num(), parallel_get_num_threads()); } +#endif +} + +template +void parallel_sort(I begin, I end, const F& comparator) { +#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO) + tbb::parallel_sort(begin, end, comparator); +#elif OV_THREAD == OV_THREAD_OMP + // TODO: propose OpenMP version + std::sort(begin, end, comparator); +#elif OV_THREAD == OV_THREAD_SEQ + std::sort(begin, end, comparator); +#endif +} + +template +R parallel_sum(const T0& D0, const R& input, const F& func) { +#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO) + return _TBB_REDUCE_FUNC( + tbb::blocked_range(0, D0), + input, + [&](const tbb::blocked_range& r, R init) -> R { + R sum = init; + for (T0 dim1 = r.begin(); dim1 < r.end(); ++dim1) + sum += func(dim1); + return sum; + }, + [](R x, R y) -> R { + return x + y; + } PARTITIONING); +#else + R sum = input; + +# ifdef _MSC_VER + using T0_IT = typename std::make_signed::type; +# else + using T0_IT = T0; +# endif + +# if OV_THREAD == OV_THREAD_OMP +# pragma omp parallel for reduction(+ : sum) schedule(static) +# endif + for (T0_IT dim1 = 0; dim1 < static_cast(D0); dim1++) { + sum += static_cast(func(dim1)); + } + return sum; +#endif +} + +template +R parallel_sum2d(const T0& D0, const T1& D1, const R& input, const F& func) { +#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO) + return _TBB_REDUCE_FUNC( + tbb::blocked_range2d(0, D0, 0, D1), + input, + [&](const tbb::blocked_range2d& r, R init) -> R { + R sum = init; + for 
(T0 dim2 = r.rows().begin(); dim2 < r.rows().end(); dim2++) { + for (T1 dim1 = r.cols().begin(); dim1 < r.cols().end(); dim1++) { + sum += func(dim2, dim1); + } + } + return sum; + }, + [](R x, R y) -> R { + return x + y; + } PARTITIONING); +#else + R sum = input; + +# ifdef _MSC_VER + using T0_IT = typename std::make_signed::type; + using T1_IT = typename std::make_signed::type; +# else + using T0_IT = T0; + using T1_IT = T1; +# endif + +# if OV_THREAD == OV_THREAD_OMP +# pragma omp parallel for collapse(2) reduction(+ : sum) schedule(static) +# endif + for (T0_IT dim2 = 0; dim2 < D0; dim2++) { + for (T1_IT dim1 = 0; dim1 < D1; dim1++) { + sum += func(dim2, dim1); + } + } + return sum; +#endif +} +template +R parallel_sum3d(const T0& D0, const T1& D1, const T2& D2, const R& input, const F& func) { +#if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO) + return _TBB_REDUCE_FUNC( + tbb::blocked_range3d(0, D0, 0, D1, 0, D2), + input, + [&](const tbb::blocked_range3d& r, R init) -> R { + R sum = init; + for (T0 dim1 = r.pages().begin(); dim1 < r.pages().end(); dim1++) { + for (T1 dim2 = r.rows().begin(); dim2 < r.rows().end(); dim2++) { + for (T2 dim3 = r.cols().begin(); dim3 < r.cols().end(); dim3++) { + sum += func(dim1, dim2, dim3); + } + } + } + return sum; + }, + [](R x, R y) -> R { + return x + y; + } PARTITIONING); +#else + R sum = input; + +# ifdef _MSC_VER + using T0_IT = typename std::make_signed::type; + using T1_IT = typename std::make_signed::type; + using T2_IT = typename std::make_signed::type; +# else + using T0_IT = T0; + using T1_IT = T1; + using T2_IT = T2; +# endif + +# if OV_THREAD == OV_THREAD_OMP +# pragma omp parallel for collapse(3) reduction(+ : sum) schedule(static) +# endif + for (T0_IT dim1 = 0; dim1 < static_cast(D0); dim1++) { + for (T1_IT dim2 = 0; dim2 < static_cast(D1); dim2++) { + for (T2_IT dim3 = 0; dim3 < static_cast(D2); dim3++) { + sum += func(dim1, dim2, dim3); + } + } + } + return sum; +#endif +} + +template 
+inline T parallel_it_init(T start) { + return start; +} +template +inline T parallel_it_init(T start, Q& x, const R& X, Args&&... tuple) { + start = parallel_it_init(start, static_cast(tuple)...); + x = start % X; + return start / X; +} + +inline bool parallel_it_step() { + return true; +} +template +inline bool parallel_it_step(Q& x, const R& X, Args&&... tuple) { + if (parallel_it_step(static_cast(tuple)...)) { + if (++x - X == 0) { + x = 0; + return true; + } + } + return false; +} + +template +inline void splitter(const T& n, const Q& team, const Q& tid, T& n_start, T& n_end) { + if (team <= 1 || n == 0) { + n_start = 0; + n_end = n; + } else { + T n1 = (n + (T)team - 1) / (T)team; + T n2 = n1 - 1; + T T1 = n - n2 * (T)team; + n_end = (T)tid < T1 ? n1 : n2; + n_start = (T)tid <= T1 ? tid * n1 : T1 * n1 + ((T)tid - T1) * n2; + } + + n_end += n_start; +} + +namespace helpers { +template +struct NumOfLambdaArgs : public NumOfLambdaArgs {}; + +template +struct NumOfLambdaArgs { + constexpr static int value = sizeof...(Args); +}; + +template ::value> +typename std::enable_if::type call_with_args(const ACT& body, + size_t g_id, + size_t iwork, + T... arg) { + body(g_id, iwork, arg...); +} + +template ::value> +typename std::enable_if::type call_with_args(const ACT& body, + size_t g_id, + size_t iwork, + T... arg) { + body(g_id, arg...); +} + +template ::value> +typename std::enable_if::type call_with_args(const ACT& body, + size_t g_id, + size_t iwork, + T... 
arg) { + body(arg...); +} +} // namespace helpers + +template +void for_1d(const int& ithr, const int& nthr, const T0& D0, const F& func) { + T0 d0{0}, end{0}; + splitter(D0, nthr, ithr, d0, end); + for (; d0 < end; ++d0) + helpers::call_with_args(func, ithr, d0, d0); +} + +template +void parallel_for(const T0& D0, const F& func) { +#if OV_THREAD == OV_THREAD_TBB + auto work_amount = static_cast(D0); + int nthr = parallel_get_max_threads(); + if (static_cast(nthr) > work_amount) + nthr = static_cast(work_amount); + if (nthr == 1) { + for_1d(0, 1, D0, func); + } else { + tbb::parallel_for( + 0, + nthr, + [&](int ithr) { + for_1d(ithr, nthr, D0, func); + }, + tbb::static_partitioner()); + } +#elif OV_THREAD == OV_THREAD_TBB_AUTO + const int nthr = parallel_get_max_threads(); + tbb::parallel_for(0, nthr, [&](int ithr) { + for_1d(ithr, nthr, D0, func); + }); +#elif OV_THREAD == OV_THREAD_OMP +# pragma omp parallel + for_1d(parallel_get_thread_num(), parallel_get_num_threads(), D0, func); +#elif OV_THREAD == OV_THREAD_SEQ + for_1d(0, 1, D0, func); +#endif +} + +template +void for_2d(const int& ithr, const int& nthr, const T0& D0, const T1& D1, const F& func) { + const size_t work_amount = (size_t)D0 * D1; + if (work_amount == 0) + return; + size_t start{0}, end{0}; + splitter(work_amount, nthr, ithr, start, end); + + T0 d0{0}; + T1 d1{0}; + parallel_it_init(start, d0, D0, d1, D1); + for (size_t iwork = start; iwork < end; ++iwork) { + helpers::call_with_args(func, ithr, iwork, d0, d1); + parallel_it_step(d0, D0, d1, D1); + } +} + +template +void parallel_for2d(const T0& D0, const T1& D1, const F& func) { +#if OV_THREAD == OV_THREAD_TBB + auto work_amount = static_cast(D0 * D1); + int nthr = parallel_get_max_threads(); + if (static_cast(nthr) > work_amount) + nthr = static_cast(work_amount); + if (nthr == 1) { + for_2d(0, 1, D0, D1, func); + } else { + tbb::parallel_for( + 0, + nthr, + [&](int ithr) { + for_2d(ithr, nthr, D0, D1, func); + }, + tbb::static_partitioner()); 
+ } +#elif OV_THREAD == OV_THREAD_TBB_AUTO + const int nthr = parallel_get_max_threads(); + tbb::parallel_for(0, nthr, [&](int ithr) { + for_2d(ithr, nthr, D0, D1, func); + }); +#elif OV_THREAD == OV_THREAD_OMP +# pragma omp parallel + for_2d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, func); +#elif OV_THREAD == OV_THREAD_SEQ + for_2d(0, 1, D0, D1, func); +#endif +} + +template +void for_3d(const int& ithr, const int& nthr, const T0& D0, const T1& D1, const T2& D2, const F& func) { + const size_t work_amount = (size_t)D0 * D1 * D2; + if (work_amount == 0) + return; + size_t start{0}, end{0}; + splitter(work_amount, nthr, ithr, start, end); + + T0 d0{0}; + T1 d1{0}; + T2 d2{0}; + parallel_it_init(start, d0, D0, d1, D1, d2, D2); + for (size_t iwork = start; iwork < end; ++iwork) { + helpers::call_with_args(func, ithr, iwork, d0, d1, d2); + parallel_it_step(d0, D0, d1, D1, d2, D2); + } +} + +template +void parallel_for3d(const T0& D0, const T1& D1, const T2& D2, const F& func) { +#if OV_THREAD == OV_THREAD_TBB + auto work_amount = static_cast(D0 * D1 * D2); + int nthr = parallel_get_max_threads(); + if (static_cast(nthr) > work_amount) + nthr = static_cast(work_amount); + if (nthr == 1) { + for_3d(0, 1, D0, D1, D2, func); + } else { + tbb::parallel_for( + 0, + nthr, + [&](int ithr) { + for_3d(ithr, nthr, D0, D1, D2, func); + }, + tbb::static_partitioner()); + } +#elif OV_THREAD == OV_THREAD_TBB_AUTO + const int nthr = parallel_get_max_threads(); + tbb::parallel_for(0, nthr, [&](int ithr) { + for_3d(ithr, nthr, D0, D1, D2, func); + }); +#elif OV_THREAD == OV_THREAD_OMP +# pragma omp parallel + for_3d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, func); +#elif OV_THREAD == OV_THREAD_SEQ + for_3d(0, 1, D0, D1, D2, func); +#endif +} + +template +void for_4d(const int& ithr, const int& nthr, const T0& D0, const T1& D1, const T2& D2, const T3& D3, const F& func) { + const size_t work_amount = (size_t)D0 * D1 * D2 * D3; + if 
(work_amount == 0) + return; + size_t start{0}, end{0}; + splitter(work_amount, nthr, ithr, start, end); + + T0 d0{0}; + T1 d1{0}; + T2 d2{0}; + T3 d3{0}; + parallel_it_init(start, d0, D0, d1, D1, d2, D2, d3, D3); + for (size_t iwork = start; iwork < end; ++iwork) { + helpers::call_with_args(func, ithr, iwork, d0, d1, d2, d3); + parallel_it_step(d0, D0, d1, D1, d2, D2, d3, D3); + } +} + +template +void parallel_for4d(const T0& D0, const T1& D1, const T2& D2, const T3& D3, const F& func) { +#if OV_THREAD == OV_THREAD_TBB + auto work_amount = static_cast(D0 * D1 * D2 * D3); + int nthr = parallel_get_max_threads(); + if (static_cast(nthr) > work_amount) + nthr = static_cast(work_amount); + if (nthr == 1) { + for_4d(0, 1, D0, D1, D2, D3, func); + } else { + tbb::parallel_for( + 0, + nthr, + [&](int ithr) { + for_4d(ithr, nthr, D0, D1, D2, D3, func); + }, + tbb::static_partitioner()); + } +#elif OV_THREAD == OV_THREAD_TBB_AUTO + const int nthr = parallel_get_max_threads(); + tbb::parallel_for(0, nthr, [&](int ithr) { + for_4d(ithr, nthr, D0, D1, D2, D3, func); + }); +#elif OV_THREAD == OV_THREAD_OMP +# pragma omp parallel + for_4d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, D3, func); +#elif OV_THREAD == OV_THREAD_SEQ + for_4d(0, 1, D0, D1, D2, D3, func); +#endif +} + +template +void for_5d(const int& ithr, + const int& nthr, + const T0& D0, + const T1& D1, + const T2& D2, + const T3& D3, + const T4& D4, + const F& func) { + const size_t work_amount = (size_t)D0 * D1 * D2 * D3 * D4; + if (work_amount == 0) + return; + size_t start{0}, end{0}; + splitter(work_amount, nthr, ithr, start, end); + + T0 d0{0}; + T1 d1{0}; + T2 d2{0}; + T3 d3{0}; + T4 d4{0}; + parallel_it_init(start, d0, D0, d1, D1, d2, D2, d3, D3, d4, D4); + for (size_t iwork = start; iwork < end; ++iwork) { + helpers::call_with_args(func, ithr, iwork, d0, d1, d2, d3, d4); + parallel_it_step(d0, D0, d1, D1, d2, D2, d3, D3, d4, D4); + } +} + +template +void parallel_for5d(const T0& D0, 
const T1& D1, const T2& D2, const T3& D3, const T4& D4, const F& func) { +#if OV_THREAD == OV_THREAD_TBB + auto work_amount = static_cast(D0 * D1 * D2 * D3 * D4); + int nthr = parallel_get_max_threads(); + if (static_cast(nthr) > work_amount) + nthr = static_cast(work_amount); + if (nthr == 1) { + for_5d(0, 1, D0, D1, D2, D3, D4, func); + } else { + tbb::parallel_for( + 0, + nthr, + [&](int ithr) { + for_5d(ithr, nthr, D0, D1, D2, D3, D4, func); + }, + tbb::static_partitioner()); + } +#elif OV_THREAD == OV_THREAD_TBB_AUTO + const int nthr = parallel_get_max_threads(); + tbb::parallel_for(0, nthr, [&](int ithr) { + for_5d(ithr, nthr, D0, D1, D2, D3, D4, func); + }); +#elif OV_THREAD == OV_THREAD_OMP +# pragma omp parallel + for_5d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, D3, D4, func); +#elif OV_THREAD == OV_THREAD_SEQ + for_5d(0, 1, D0, D1, D2, D3, D4, func); +#endif +} + +template +void for_6d(const int& ithr, + const int& nthr, + const T0& D0, + const T1& D1, + const T2& D2, + const T3& D3, + const T4& D4, + const T5& D5, + const F& func) { + const size_t work_amount = (size_t)D0 * D1 * D2 * D3 * D4 * D5; + if (work_amount == 0) + return; + size_t start{0}, end{0}; + splitter(work_amount, nthr, ithr, start, end); + + T0 d0{0}; + T1 d1{0}; + T2 d2{0}; + T3 d3{0}; + T4 d4{0}; + T5 d5{0}; + parallel_it_init(start, d0, D0, d1, D1, d2, D2, d3, D3, d4, D4, d5, D5); + for (size_t iwork = start; iwork < end; ++iwork) { + helpers::call_with_args(func, ithr, iwork, d0, d1, d2, d3, d4, d5); + parallel_it_step(d0, D0, d1, D1, d2, D2, d3, D3, d4, D4, d5, D5); + } +} + +template +void parallel_for6d(const T0& D0, const T1& D1, const T2& D2, const T3& D3, const T4& D4, const T5& D5, const F& func) { +#if OV_THREAD == OV_THREAD_TBB + auto work_amount = static_cast(D0 * D1 * D2 * D3 * D4 * D5); + int nthr = parallel_get_max_threads(); + if (static_cast(nthr) > work_amount) + nthr = static_cast(work_amount); + if (nthr == 1) { + for_6d(0, 1, D0, D1, D2, 
D3, D4, D5, func); + } else { + tbb::parallel_for( + 0, + nthr, + [&](int ithr) { + for_6d(ithr, nthr, D0, D1, D2, D3, D4, D5, func); + }, + tbb::static_partitioner()); + } +#elif OV_THREAD == OV_THREAD_TBB_AUTO + const int nthr = parallel_get_max_threads(); + tbb::parallel_for(0, nthr, [&](int ithr) { + for_6d(ithr, nthr, D0, D1, D2, D3, D4, D5, func); + }); +#elif OV_THREAD == OV_THREAD_OMP +# pragma omp parallel + for_6d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, D3, D4, D5, func); +#elif OV_THREAD == OV_THREAD_SEQ + for_6d(0, 1, D0, D1, D2, D3, D4, D5, func); +#endif +} + +} // namespace ov diff --git a/src/inference/include/ie/ie_parallel.hpp b/src/inference/include/ie/ie_parallel.hpp index db27cec9630..de80952ecc9 100644 --- a/src/inference/include/ie/ie_parallel.hpp +++ b/src/inference/include/ie/ie_parallel.hpp @@ -14,692 +14,35 @@ #pragma once -#include -#include +#include "openvino/core/parallel.hpp" -#define IE_THREAD_TBB 0 -#define IE_THREAD_OMP 1 -#define IE_THREAD_SEQ 2 -#define IE_THREAD_TBB_AUTO 3 - -#if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO) -# ifndef NOMINMAX -# define NOMINMAX -# endif -# ifndef TBB_PREVIEW_LOCAL_OBSERVER -# define TBB_PREVIEW_LOCAL_OBSERVER 1 -# endif -# ifndef TBB_PREVIEW_WAITING_FOR_WORKERS -# define TBB_PREVIEW_WAITING_FOR_WORKERS 1 -# endif -# ifndef TBB_PREVIEW_NUMA_SUPPORT -# define TBB_PREVIEW_NUMA_SUPPORT 1 -# endif -# ifndef TBB_PREVIEW_TASK_ARENA_CONSTRAINTS_EXTENSION -# define TBB_PREVIEW_TASK_ARENA_CONSTRAINTS_EXTENSION 1 -# endif - -# include "tbb/blocked_range.h" -# include "tbb/blocked_range2d.h" -# include "tbb/blocked_range3d.h" -# include "tbb/parallel_for.h" -# include "tbb/parallel_reduce.h" -# include "tbb/parallel_sort.h" -# include "tbb/task_arena.h" -# include "tbb/task_scheduler_observer.h" - -inline int parallel_get_max_threads() { - return tbb::this_task_arena::max_concurrency(); -} -inline int parallel_get_num_threads() { - return 
parallel_get_max_threads(); -} -inline int parallel_get_thread_num() { - return tbb::this_task_arena::current_thread_index(); -} -inline void parallel_set_num_threads(int) { - return; -} -inline int parallel_get_env_threads() { - return 0; -} -# if IE_THREAD == IE_THREAD_TBB -# define PARTITIONING , tbb::static_partitioner() - -// The TBB version less than 2018u1 has no static_partitioner argument for -// tbb::parallel_deterministic_reduce. So will fallback to non deterministic version. -# if (TBB_INTERFACE_VERSION >= 10001) -# define _TBB_REDUCE_FUNC tbb::parallel_deterministic_reduce -# else -# define _TBB_REDUCE_FUNC tbb::parallel_reduce -# endif - -# else -# define PARTITIONING -# endif -#elif IE_THREAD == IE_THREAD_OMP -# include - -# include -# include -# include - -/* MSVC still supports omp 2.0 only */ -# if defined(_MSC_VER) && !defined(__INTEL_COMPILER) -# define collapse(x) -# endif // defined(_MSC_VER) && !defined(__INTEL_COMPILER) -inline int parallel_get_max_threads() { - return omp_get_max_threads(); -} -inline int parallel_get_num_threads() { - return omp_get_num_threads(); -} -inline int parallel_get_thread_num() { - return omp_get_thread_num(); -} -inline void parallel_set_num_threads(int n) { - omp_set_num_threads(n); -} -inline int parallel_get_env_threads() { - int env_cores = 0; - if (getenv("OMP_NUM_THREADS") != nullptr) { - try { - env_cores = std::stoi(getenv("OMP_NUM_THREADS")); - } catch (const std::exception&) { - env_cores = 0; - } - } - return env_cores; -} - -#elif IE_THREAD == IE_THREAD_SEQ -# include -inline int parallel_get_env_threads() { - return 1; -} -inline int parallel_get_max_threads() { - return 1; -} -inline int parallel_get_num_threads() { - return 1; -} -inline int parallel_get_thread_num() { - return 0; -} -inline void parallel_set_num_threads(int) { - return; -} -#endif +#define IE_THREAD_TBB OV_THREAD_TBB +#define IE_THREAD_OMP OV_THREAD_OMP +#define IE_THREAD_SEQ OV_THREAD_SEQ +#define IE_THREAD_TBB_AUTO 
OV_THREAD_TBB_AUTO namespace InferenceEngine { -template -void parallel_nt(int nthr, const F& func) { -#if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO) - if (nthr == 0) - nthr = parallel_get_max_threads(); - if (nthr == 1) { - func(0, 1); - return; - } - - tbb::parallel_for(0, nthr, [&](int ithr) { - func(ithr, nthr); - }); -#elif IE_THREAD == IE_THREAD_OMP - if (nthr == 1) { - func(0, 1); - return; - } - -# pragma omp parallel num_threads(nthr) - func(parallel_get_thread_num(), parallel_get_num_threads()); -#elif IE_THREAD == IE_THREAD_SEQ - func(0, 1); -#endif -} - -template -void parallel_nt_static(int nthr, const F& func) { -#if IE_THREAD == IE_THREAD_SEQ - const bool serial = true; -#else - const bool serial = false; -#endif - - if (serial || nthr == 1) { - func(0, 1); - return; - } - - if (nthr == 0) - nthr = parallel_get_max_threads(); -#if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO) - tbb::parallel_for( - 0, - nthr, - [&](int ithr) { - func(ithr, nthr); - }, - tbb::static_partitioner{}); - -#elif IE_THREAD == IE_THREAD_OMP - -# pragma omp parallel num_threads(nthr) - { func(parallel_get_thread_num(), parallel_get_num_threads()); } -#endif -} - -template -void parallel_sort(I begin, I end, const F& comparator) { -#if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO) - tbb::parallel_sort(begin, end, comparator); -#elif IE_THREAD == IE_THREAD_OMP - // TODO: propose OpenMP version - std::sort(begin, end, comparator); -#elif IE_THREAD == IE_THREAD_SEQ - std::sort(begin, end, comparator); -#endif -} - -template -R parallel_sum(const T0& D0, const R& input, const F& func) { -#if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO) - return _TBB_REDUCE_FUNC( - tbb::blocked_range(0, D0), - input, - [&](const tbb::blocked_range& r, R init) -> R { - R sum = init; - for (T0 dim1 = r.begin(); dim1 < r.end(); ++dim1) - sum += func(dim1); - return sum; - }, - [](R x, R y) -> R { - return x + y; - } 
PARTITIONING); -#else - R sum = input; - -# ifdef _MSC_VER - using T0_IT = typename std::make_signed::type; -# else - using T0_IT = T0; -# endif - -# if IE_THREAD == IE_THREAD_OMP -# pragma omp parallel for reduction(+ : sum) schedule(static) -# endif - for (T0_IT dim1 = 0; dim1 < static_cast(D0); dim1++) { - sum += static_cast(func(dim1)); - } - return sum; -#endif -} - -template -R parallel_sum2d(const T0& D0, const T1& D1, const R& input, const F& func) { -#if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO) - return _TBB_REDUCE_FUNC( - tbb::blocked_range2d(0, D0, 0, D1), - input, - [&](const tbb::blocked_range2d& r, R init) -> R { - R sum = init; - for (T0 dim2 = r.rows().begin(); dim2 < r.rows().end(); dim2++) { - for (T1 dim1 = r.cols().begin(); dim1 < r.cols().end(); dim1++) { - sum += func(dim2, dim1); - } - } - return sum; - }, - [](R x, R y) -> R { - return x + y; - } PARTITIONING); -#else - R sum = input; - -# ifdef _MSC_VER - using T0_IT = typename std::make_signed::type; - using T1_IT = typename std::make_signed::type; -# else - using T0_IT = T0; - using T1_IT = T1; -# endif - -# if IE_THREAD == IE_THREAD_OMP -# pragma omp parallel for collapse(2) reduction(+ : sum) schedule(static) -# endif - for (T0_IT dim2 = 0; dim2 < D0; dim2++) { - for (T1_IT dim1 = 0; dim1 < D1; dim1++) { - sum += func(dim2, dim1); - } - } - return sum; -#endif -} -template -R parallel_sum3d(const T0& D0, const T1& D1, const T2& D2, const R& input, const F& func) { -#if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO) - return _TBB_REDUCE_FUNC( - tbb::blocked_range3d(0, D0, 0, D1, 0, D2), - input, - [&](const tbb::blocked_range3d& r, R init) -> R { - R sum = init; - for (T0 dim1 = r.pages().begin(); dim1 < r.pages().end(); dim1++) { - for (T1 dim2 = r.rows().begin(); dim2 < r.rows().end(); dim2++) { - for (T2 dim3 = r.cols().begin(); dim3 < r.cols().end(); dim3++) { - sum += func(dim1, dim2, dim3); - } - } - } - return sum; - }, - [](R x, R y) -> R { 
- return x + y; - } PARTITIONING); -#else - R sum = input; - -# ifdef _MSC_VER - using T0_IT = typename std::make_signed::type; - using T1_IT = typename std::make_signed::type; - using T2_IT = typename std::make_signed::type; -# else - using T0_IT = T0; - using T1_IT = T1; - using T2_IT = T2; -# endif - -# if IE_THREAD == IE_THREAD_OMP -# pragma omp parallel for collapse(3) reduction(+ : sum) schedule(static) -# endif - for (T0_IT dim1 = 0; dim1 < static_cast(D0); dim1++) { - for (T1_IT dim2 = 0; dim2 < static_cast(D1); dim2++) { - for (T2_IT dim3 = 0; dim3 < static_cast(D2); dim3++) { - sum += func(dim1, dim2, dim3); - } - } - } - return sum; -#endif -} - -template -inline T parallel_it_init(T start) { - return start; -} -template -inline T parallel_it_init(T start, Q& x, const R& X, Args&&... tuple) { - start = parallel_it_init(start, static_cast(tuple)...); - x = start % X; - return start / X; -} - -inline bool parallel_it_step() { - return true; -} -template -inline bool parallel_it_step(Q& x, const R& X, Args&&... tuple) { - if (parallel_it_step(static_cast(tuple)...)) { - if (++x - X == 0) { - x = 0; - return true; - } - } - return false; -} - -template -inline void splitter(const T& n, const Q& team, const Q& tid, T& n_start, T& n_end) { - if (team <= 1 || n == 0) { - n_start = 0; - n_end = n; - } else { - T n1 = (n + (T)team - 1) / (T)team; - T n2 = n1 - 1; - T T1 = n - n2 * (T)team; - n_end = (T)tid < T1 ? n1 : n2; - n_start = (T)tid <= T1 ? tid * n1 : T1 * n1 + ((T)tid - T1) * n2; - } - - n_end += n_start; -} - -namespace details { -template -struct num_of_lambda_args : public num_of_lambda_args {}; - -template -struct num_of_lambda_args { - constexpr static int value = sizeof...(Args); -}; - -template ::value> -typename std::enable_if::type call_with_args(const ACT& body, - size_t g_id, - size_t iwork, - T... 
arg) { - body(g_id, iwork, arg...); -} - -template ::value> -typename std::enable_if::type call_with_args(const ACT& body, - size_t g_id, - size_t iwork, - T... arg) { - body(g_id, arg...); -} - -template ::value> -typename std::enable_if::type call_with_args(const ACT& body, - size_t g_id, - size_t iwork, - T... arg) { - body(arg...); -} -} // namespace details - -template -void for_1d(const int& ithr, const int& nthr, const T0& D0, const F& func) { - T0 d0{0}, end{0}; - splitter(D0, nthr, ithr, d0, end); - for (; d0 < end; ++d0) - details::call_with_args(func, ithr, d0, d0); -} - -template -void parallel_for(const T0& D0, const F& func) { -#if IE_THREAD == IE_THREAD_TBB - auto work_amount = static_cast(D0); - int nthr = parallel_get_max_threads(); - if (static_cast(nthr) > work_amount) - nthr = static_cast(work_amount); - if (nthr == 1) { - for_1d(0, 1, D0, func); - } else { - tbb::parallel_for( - 0, - nthr, - [&](int ithr) { - for_1d(ithr, nthr, D0, func); - }, - tbb::static_partitioner()); - } -#elif IE_THREAD == IE_THREAD_TBB_AUTO - const int nthr = parallel_get_max_threads(); - tbb::parallel_for(0, nthr, [&](int ithr) { - for_1d(ithr, nthr, D0, func); - }); -#elif IE_THREAD == IE_THREAD_OMP -# pragma omp parallel - for_1d(parallel_get_thread_num(), parallel_get_num_threads(), D0, func); -#elif IE_THREAD == IE_THREAD_SEQ - for_1d(0, 1, D0, func); -#endif -} - -template -void for_2d(const int& ithr, const int& nthr, const T0& D0, const T1& D1, const F& func) { - const size_t work_amount = (size_t)D0 * D1; - if (work_amount == 0) - return; - size_t start{0}, end{0}; - splitter(work_amount, nthr, ithr, start, end); - - T0 d0{0}; - T1 d1{0}; - parallel_it_init(start, d0, D0, d1, D1); - for (size_t iwork = start; iwork < end; ++iwork) { - details::call_with_args(func, ithr, iwork, d0, d1); - parallel_it_step(d0, D0, d1, D1); - } -} - -template -void parallel_for2d(const T0& D0, const T1& D1, const F& func) { -#if IE_THREAD == IE_THREAD_TBB - auto work_amount = 
static_cast(D0 * D1); - int nthr = parallel_get_max_threads(); - if (static_cast(nthr) > work_amount) - nthr = static_cast(work_amount); - if (nthr == 1) { - for_2d(0, 1, D0, D1, func); - } else { - tbb::parallel_for( - 0, - nthr, - [&](int ithr) { - for_2d(ithr, nthr, D0, D1, func); - }, - tbb::static_partitioner()); - } -#elif IE_THREAD == IE_THREAD_TBB_AUTO - const int nthr = parallel_get_max_threads(); - tbb::parallel_for(0, nthr, [&](int ithr) { - for_2d(ithr, nthr, D0, D1, func); - }); -#elif IE_THREAD == IE_THREAD_OMP -# pragma omp parallel - for_2d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, func); -#elif IE_THREAD == IE_THREAD_SEQ - for_2d(0, 1, D0, D1, func); -#endif -} - -template -void for_3d(const int& ithr, const int& nthr, const T0& D0, const T1& D1, const T2& D2, const F& func) { - const size_t work_amount = (size_t)D0 * D1 * D2; - if (work_amount == 0) - return; - size_t start{0}, end{0}; - splitter(work_amount, nthr, ithr, start, end); - - T0 d0{0}; - T1 d1{0}; - T2 d2{0}; - parallel_it_init(start, d0, D0, d1, D1, d2, D2); - for (size_t iwork = start; iwork < end; ++iwork) { - details::call_with_args(func, ithr, iwork, d0, d1, d2); - parallel_it_step(d0, D0, d1, D1, d2, D2); - } -} - -template -void parallel_for3d(const T0& D0, const T1& D1, const T2& D2, const F& func) { -#if IE_THREAD == IE_THREAD_TBB - auto work_amount = static_cast(D0 * D1 * D2); - int nthr = parallel_get_max_threads(); - if (static_cast(nthr) > work_amount) - nthr = static_cast(work_amount); - if (nthr == 1) { - for_3d(0, 1, D0, D1, D2, func); - } else { - tbb::parallel_for( - 0, - nthr, - [&](int ithr) { - for_3d(ithr, nthr, D0, D1, D2, func); - }, - tbb::static_partitioner()); - } -#elif IE_THREAD == IE_THREAD_TBB_AUTO - const int nthr = parallel_get_max_threads(); - tbb::parallel_for(0, nthr, [&](int ithr) { - for_3d(ithr, nthr, D0, D1, D2, func); - }); -#elif IE_THREAD == IE_THREAD_OMP -# pragma omp parallel - for_3d(parallel_get_thread_num(), 
parallel_get_num_threads(), D0, D1, D2, func); -#elif IE_THREAD == IE_THREAD_SEQ - for_3d(0, 1, D0, D1, D2, func); -#endif -} - -template -void for_4d(const int& ithr, const int& nthr, const T0& D0, const T1& D1, const T2& D2, const T3& D3, const F& func) { - const size_t work_amount = (size_t)D0 * D1 * D2 * D3; - if (work_amount == 0) - return; - size_t start{0}, end{0}; - splitter(work_amount, nthr, ithr, start, end); - - T0 d0{0}; - T1 d1{0}; - T2 d2{0}; - T3 d3{0}; - parallel_it_init(start, d0, D0, d1, D1, d2, D2, d3, D3); - for (size_t iwork = start; iwork < end; ++iwork) { - details::call_with_args(func, ithr, iwork, d0, d1, d2, d3); - parallel_it_step(d0, D0, d1, D1, d2, D2, d3, D3); - } -} - -template -void parallel_for4d(const T0& D0, const T1& D1, const T2& D2, const T3& D3, const F& func) { -#if IE_THREAD == IE_THREAD_TBB - auto work_amount = static_cast(D0 * D1 * D2 * D3); - int nthr = parallel_get_max_threads(); - if (static_cast(nthr) > work_amount) - nthr = static_cast(work_amount); - if (nthr == 1) { - for_4d(0, 1, D0, D1, D2, D3, func); - } else { - tbb::parallel_for( - 0, - nthr, - [&](int ithr) { - for_4d(ithr, nthr, D0, D1, D2, D3, func); - }, - tbb::static_partitioner()); - } -#elif IE_THREAD == IE_THREAD_TBB_AUTO - const int nthr = parallel_get_max_threads(); - tbb::parallel_for(0, nthr, [&](int ithr) { - for_4d(ithr, nthr, D0, D1, D2, D3, func); - }); -#elif IE_THREAD == IE_THREAD_OMP -# pragma omp parallel - for_4d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, D3, func); -#elif IE_THREAD == IE_THREAD_SEQ - for_4d(0, 1, D0, D1, D2, D3, func); -#endif -} - -template -void for_5d(const int& ithr, - const int& nthr, - const T0& D0, - const T1& D1, - const T2& D2, - const T3& D3, - const T4& D4, - const F& func) { - const size_t work_amount = (size_t)D0 * D1 * D2 * D3 * D4; - if (work_amount == 0) - return; - size_t start{0}, end{0}; - splitter(work_amount, nthr, ithr, start, end); - - T0 d0{0}; - T1 d1{0}; - T2 d2{0}; - T3 
d3{0}; - T4 d4{0}; - parallel_it_init(start, d0, D0, d1, D1, d2, D2, d3, D3, d4, D4); - for (size_t iwork = start; iwork < end; ++iwork) { - details::call_with_args(func, ithr, iwork, d0, d1, d2, d3, d4); - parallel_it_step(d0, D0, d1, D1, d2, D2, d3, D3, d4, D4); - } -} - -template -void parallel_for5d(const T0& D0, const T1& D1, const T2& D2, const T3& D3, const T4& D4, const F& func) { -#if IE_THREAD == IE_THREAD_TBB - auto work_amount = static_cast(D0 * D1 * D2 * D3 * D4); - int nthr = parallel_get_max_threads(); - if (static_cast(nthr) > work_amount) - nthr = static_cast(work_amount); - if (nthr == 1) { - for_5d(0, 1, D0, D1, D2, D3, D4, func); - } else { - tbb::parallel_for( - 0, - nthr, - [&](int ithr) { - for_5d(ithr, nthr, D0, D1, D2, D3, D4, func); - }, - tbb::static_partitioner()); - } -#elif IE_THREAD == IE_THREAD_TBB_AUTO - const int nthr = parallel_get_max_threads(); - tbb::parallel_for(0, nthr, [&](int ithr) { - for_5d(ithr, nthr, D0, D1, D2, D3, D4, func); - }); -#elif IE_THREAD == IE_THREAD_OMP -# pragma omp parallel - for_5d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, D3, D4, func); -#elif IE_THREAD == IE_THREAD_SEQ - for_5d(0, 1, D0, D1, D2, D3, D4, func); -#endif -} - -template -void for_6d(const int& ithr, - const int& nthr, - const T0& D0, - const T1& D1, - const T2& D2, - const T3& D3, - const T4& D4, - const T5& D5, - const F& func) { - const size_t work_amount = (size_t)D0 * D1 * D2 * D3 * D4 * D5; - if (work_amount == 0) - return; - size_t start{0}, end{0}; - splitter(work_amount, nthr, ithr, start, end); - - T0 d0{0}; - T1 d1{0}; - T2 d2{0}; - T3 d3{0}; - T4 d4{0}; - T5 d5{0}; - parallel_it_init(start, d0, D0, d1, D1, d2, D2, d3, D3, d4, D4, d5, D5); - for (size_t iwork = start; iwork < end; ++iwork) { - details::call_with_args(func, ithr, iwork, d0, d1, d2, d3, d4, d5); - parallel_it_step(d0, D0, d1, D1, d2, D2, d3, D3, d4, D4, d5, D5); - } -} - -template -void parallel_for6d(const T0& D0, const T1& D1, const T2& 
D2, const T3& D3, const T4& D4, const T5& D5, const F& func) { -#if IE_THREAD == IE_THREAD_TBB - auto work_amount = static_cast(D0 * D1 * D2 * D3 * D4 * D5); - int nthr = parallel_get_max_threads(); - if (static_cast(nthr) > work_amount) - nthr = static_cast(work_amount); - if (nthr == 1) { - for_6d(0, 1, D0, D1, D2, D3, D4, D5, func); - } else { - tbb::parallel_for( - 0, - nthr, - [&](int ithr) { - for_6d(ithr, nthr, D0, D1, D2, D3, D4, D5, func); - }, - tbb::static_partitioner()); - } -#elif IE_THREAD == IE_THREAD_TBB_AUTO - const int nthr = parallel_get_max_threads(); - tbb::parallel_for(0, nthr, [&](int ithr) { - for_6d(ithr, nthr, D0, D1, D2, D3, D4, D5, func); - }); -#elif IE_THREAD == IE_THREAD_OMP -# pragma omp parallel - for_6d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, D3, D4, D5, func); -#elif IE_THREAD == IE_THREAD_SEQ - for_6d(0, 1, D0, D1, D2, D3, D4, D5, func); -#endif -} +using ov::for_1d; +using ov::for_2d; +using ov::for_3d; +using ov::for_4d; +using ov::for_5d; +using ov::for_6d; +using ov::parallel_for; +using ov::parallel_for2d; +using ov::parallel_for3d; +using ov::parallel_for4d; +using ov::parallel_for5d; +using ov::parallel_for6d; +using ov::parallel_it_init; +using ov::parallel_it_step; +using ov::parallel_nt; +using ov::parallel_nt_static; +using ov::parallel_sort; +using ov::parallel_sum; +using ov::parallel_sum2d; +using ov::parallel_sum3d; +using ov::splitter; } // namespace InferenceEngine