From 6e5a4714ac59212110e2525c73b754ffeeed3bb2 Mon Sep 17 00:00:00 2001 From: Alexander Peskov Date: Mon, 26 Oct 2020 21:01:38 +0300 Subject: [PATCH] Make MVN behaviour deterministic (#2458) * Change default parallel reduce alg into deterministic way Signed-off-by: Alexander Peskov * Introduce ie::parallel_for semantic with group_id parameter Signed-off-by: Alexander Peskov * [CPU] Make MVN behaviour deterministic Signed-off-by: Alexander Peskov --- inference-engine/include/ie_parallel.hpp | 41 +++++++++++++++---- .../mkldnn_plugin/nodes/mkldnn_mvn_node.cpp | 6 +-- 2 files changed, 35 insertions(+), 12 deletions(-) diff --git a/inference-engine/include/ie_parallel.hpp b/inference-engine/include/ie_parallel.hpp index 667b9b86626..67286f3e165 100644 --- a/inference-engine/include/ie_parallel.hpp +++ b/inference-engine/include/ie_parallel.hpp @@ -15,6 +15,7 @@ #pragma once #include +#include #define IE_THREAD_TBB 0 #define IE_THREAD_OMP 1 @@ -185,7 +186,7 @@ void parallel_sort(I begin, I end, const F& comparator) { template R parallel_sum(const T0& D0, const R& input, const F& func) { #if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO) - return tbb::parallel_reduce( + return tbb::parallel_deterministic_reduce( tbb::blocked_range(0, D0), input, [&](const tbb::blocked_range& r, R init) -> R { R sum = init; @@ -217,7 +218,7 @@ R parallel_sum(const T0& D0, const R& input, const F& func) { template R parallel_sum2d(const T0& D0, const T1& D1, const R& input, const F& func) { #if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO) - return tbb::parallel_reduce( + return tbb::parallel_deterministic_reduce( tbb::blocked_range2d(0, D0, 0, D1), input, [&](const tbb::blocked_range2d& r, R init) -> R { R sum = init; @@ -256,7 +257,7 @@ R parallel_sum2d(const T0& D0, const T1& D1, const R& input, const F& func) { template R parallel_sum3d(const T0& D0, const T1& D1, const T2& D2, const R& input, const F& func) { #if (IE_THREAD == IE_THREAD_TBB || 
IE_THREAD == IE_THREAD_TBB_AUTO) - return tbb::parallel_reduce( + return tbb::parallel_deterministic_reduce( tbb::blocked_range3d(0, D0, 0, D1, 0, D2), input, [&](const tbb::blocked_range3d& r, R init) -> R { R sum = init; @@ -338,11 +339,35 @@ inline void splitter(const T& n, const Q& team, const Q& tid, T& n_start, T& n_e n_end += n_start; } +namespace details { + template + struct num_of_lambda_args : public num_of_lambda_args { + }; + + template + struct num_of_lambda_args { + constexpr static int value = sizeof...(Args); + }; + + template::value> + typename std::enable_if::type + call_with_args(ACT body, size_t g_id, T ...arg) { + body(g_id, arg...); + } + + template::value> + typename std::enable_if::type + call_with_args(ACT body, size_t g_id, T ...arg) { + body(arg...); + } +} // namespace details + template void for_1d(const int& ithr, const int& nthr, const T0& D0, const F& func) { T0 d0 {0}, end {0}; splitter(D0, nthr, ithr, d0, end); - for (; d0 < end; ++d0) func(d0); + for (; d0 < end; ++d0) + details::call_with_args(func, ithr, d0); } template @@ -385,7 +410,7 @@ void for_2d(const int& ithr, const int& nthr, const T0& D0, const T1& D1, const T1 d1 {0}; parallel_it_init(start, d0, D0, d1, D1); for (size_t iwork = start; iwork < end; ++iwork) { - func(d0, d1); + details::call_with_args(func, ithr, d0, d1); parallel_it_step(d0, D0, d1, D1); } } @@ -431,7 +456,7 @@ void for_3d(const int& ithr, const int& nthr, const T0& D0, const T1& D1, const T2 d2 {0}; parallel_it_init(start, d0, D0, d1, D1, d2, D2); for (size_t iwork = start; iwork < end; ++iwork) { - func(d0, d1, d2); + details::call_with_args(func, ithr, d0, d1, d2); parallel_it_step(d0, D0, d1, D1, d2, D2); } } @@ -478,7 +503,7 @@ void for_4d(const int& ithr, const int& nthr, const T0& D0, const T1& D1, const T3 d3 {0}; parallel_it_init(start, d0, D0, d1, D1, d2, D2, d3, D3); for (size_t iwork = start; iwork < end; ++iwork) { - func(d0, d1, d2, d3); + details::call_with_args(func, ithr, d0, d1, d2, 
d3); parallel_it_step(d0, D0, d1, D1, d2, D2, d3, D3); } } @@ -527,7 +552,7 @@ void for_5d(const int& ithr, const int& nthr, const T0& D0, const T1& D1, const T4 d4 {0}; parallel_it_init(start, d0, D0, d1, D1, d2, D2, d3, D3, d4, D4); for (size_t iwork = start; iwork < end; ++iwork) { - func(d0, d1, d2, d3, d4); + details::call_with_args(func, ithr, d0, d1, d2, d3, d4); parallel_it_step(d0, D0, d1, D1, d2, D2, d3, D3, d4, D4); } } diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_mvn_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_mvn_node.cpp index 0605e71a103..a5199a8b3db 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_mvn_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_mvn_node.cpp @@ -1125,11 +1125,10 @@ void MKLDNNMVNNode::mvn_blk(const in_data_t* src_data, out_data_t* dst_data, con for (int i = 0; i < mean_buffer.size(); i++) mean_buffer[i] = 0.f; - parallel_for2d(D, H, [&](size_t d, size_t h) { + parallel_for2d(D, H, [&](size_t thr_idx, size_t d, size_t h) { for (size_t cb = 0; cb < CB; cb++) { size_t src_off = is_nhwc ? ccb + d * H * W * C + h * W * C + cb * blk_size : ccb + d * H * W * blk_size + h * W * blk_size + cb * D * H * W * blk_size; - auto thr_idx = mkldnn_get_thread_num(); auto mean_buffer_ptr = &mean_buffer[blk_size * cb + aux_buffer_size * thr_idx]; auto arg = jit_mvn_call_args(); @@ -1179,11 +1178,10 @@ void MKLDNNMVNNode::mvn_blk(const in_data_t* src_data, out_data_t* dst_data, con for (int i = 0; i < variance_buffer.size(); i++) variance_buffer[i] = 0.f; - parallel_for2d(D, H, [&](size_t d, size_t h) { + parallel_for2d(D, H, [&](size_t thr_idx, size_t d, size_t h) { for (size_t cb = 0; cb < CB; cb++) { size_t src_off = is_nhwc ? 
ccb + d * H * W * C + h * W * C + cb * blk_size : ccb + d * H * W * blk_size + h * W * blk_size + cb * D * H * W * blk_size; - auto thr_idx = mkldnn_get_thread_num(); auto mean_buffer_ptr = &mean_buffer[blk_size * cb]; auto variance_buffer_ptr = &variance_buffer[blk_size * cb + aux_buffer_size * thr_idx];