From 1e6447c26a6fae172dc3efdf55c3eb0362a8ca7f Mon Sep 17 00:00:00 2001
From: Chen Xu
Date: Wed, 31 Aug 2022 18:48:45 +0800
Subject: [PATCH] [CPU] Reduce node improve performance for nspc layout
 (#12638)

---
 src/plugins/intel_cpu/src/nodes/reduce.cpp | 82 ++++++++++++++++++----
 src/plugins/intel_cpu/src/nodes/reduce.h   |  8 ++-
 2 files changed, 74 insertions(+), 16 deletions(-)

diff --git a/src/plugins/intel_cpu/src/nodes/reduce.cpp b/src/plugins/intel_cpu/src/nodes/reduce.cpp
index 3c06e1cf401..5a2090328fc 100644
--- a/src/plugins/intel_cpu/src/nodes/reduce.cpp
+++ b/src/plugins/intel_cpu/src/nodes/reduce.cpp
@@ -1753,6 +1753,7 @@ Reduce::Reduce(const std::shared_ptr<ngraph::Node>& op, const dnnl::engine& eng,
             IE_THROW() << errorPrefix << " second tensor is not constant!";
         raw_axes = reduceConst->cast_vector<int>();
     }
+    vec_reduceDH_prc.clear();
     setJITBeyond5D();
 } else {
     IE_THROW(NotImplemented) << errorMessage;
@@ -1810,6 +1811,9 @@ void Reduce::initSupportedPrimitiveDescriptors() {
         }
     }
 
+    support_split = algorithm != Algorithm::ReduceL2 && algorithm != Algorithm::ReduceLogSumExp &&
+                    algorithm != Algorithm::ReduceSumSquare && input_prec == output_prec;
+
     src_data_size = input_prec.size();
     dst_data_size = output_prec.size();
 
@@ -1967,6 +1971,12 @@ void Reduce::createPrimitive() {
 
     compile_post_kernel = true;
 
+    if (mayiuse(cpu::x64::avx512_core)) {
+        blk_size = 16;
+    } else {
+        blk_size = 8;
+    }
+
     if (inputShapesDefined()) {
         if (needPrepareParams())
             prepareParams();
@@ -1975,13 +1985,10 @@ void Reduce::createPrimitive() {
 
     if (mayiuse(cpu::x64::avx512_core)) {
         reduce_kernel.reset(new jit_uni_reduce_kernel_f32<cpu::x64::avx512_core>(jcp));
-        blk_size = 16;
     } else if (mayiuse(cpu::x64::avx2)) {
         reduce_kernel.reset(new jit_uni_reduce_kernel_f32<cpu::x64::avx2>(jcp));
-        blk_size = 8;
     } else if (mayiuse(cpu::x64::sse41)) {
         reduce_kernel.reset(new jit_uni_reduce_kernel_f32<cpu::x64::sse41>(jcp));
-        blk_size = 8;
     }
     if (reduce_kernel)
         reduce_kernel->create_ker();
@@ -2138,17 +2145,46 @@ void Reduce::reduce_PLN(const uint8_t *in_ptr, uint8_t *out_ptr) {
                                           IW - tail_start, 0, IH);
             });
         } else if (!ReduceC && ReduceD && ReduceH && !ReduceW) {
-            parallel_for(IC, [&](size_t ic) {
-                size_t oc = ic; GET_PTR_NC_PLN;
-                parallel_for(IW / blk_size, [&](size_t ibw){
-                    size_t obw = ibw;
-                    reduce_kernel_process(in_ptr_nc + ibw * blk_size * src_data_size, out_ptr_nc + obw * blk_size * dst_data_size,
-                                          blk_size, 0, ID * IH);
-                });
-                size_t tail_start = IW / blk_size * blk_size;
-                reduce_kernel_process(in_ptr_nc + tail_start * src_data_size, out_ptr_nc + tail_start * dst_data_size,
-                                      IW - tail_start, 0, ID * IH);
-            });
+            size_t IWB = IW / blk_size;
+            if (ReduceDH_opt) {
+                // reduce parallelly in D dimension
+                // step1: !ReduceD && ReduceH && !ReduceW
+                uint8_t *prc_ptr_n = &vec_reduceDH_prc[0];
+                init_dst_data(prc_ptr_n, prc_size);
+                parallel_for2d(ID, IWB, [&](size_t id, size_t iwb){
+                    size_t pd = id, pwb = iwb;
+                    reduce_kernel_process(in_ptr_n + (id * IH * IW + iwb * blk_size) * src_data_size,
+                                          prc_ptr_n + (pd * PW + pwb * blk_size) * prc_data_size, blk_size, 0, IH);
+                });
+                // step2: ReduceD
+                reduce_stride = PW;
+                parallel_for(IWB, [&](size_t iwb){
+                    size_t pwb = iwb, owb = iwb;
+                    reduce_kernel_process(prc_ptr_n + pwb * blk_size * prc_data_size,
+                                          out_ptr_n + owb * blk_size * dst_data_size, blk_size, 0, ID);
+                });
+                // reduce tail
+                reduce_stride = IW;
+                size_t tail_start = IWB * blk_size;
+                parallel_for(IW - tail_start, [&](size_t i_tail) {
+                    reduce_kernel_process(in_ptr_n + (tail_start + i_tail) * src_data_size, out_ptr_n + (tail_start + i_tail) * dst_data_size,
+                                          1, 0, ID * IH);
+                });
+            } else {
+                parallel_for(IC, [&](size_t ic) {
+                    size_t oc = ic; GET_PTR_NC_PLN;
+                    parallel_for(IWB, [&](size_t iwb){
+                        size_t owb = iwb;
+                        reduce_kernel_process(in_ptr_nc + iwb * blk_size * src_data_size, out_ptr_nc + owb * blk_size * dst_data_size,
+                                              blk_size, 0, ID * IH);
+                    });
+                    size_t tail_start = IWB * blk_size;
+                    parallel_for(IW - tail_start, [&](size_t i_tail) {
+                        reduce_kernel_process(in_ptr_nc + (tail_start + i_tail) * src_data_size, out_ptr_nc + (tail_start + i_tail) * dst_data_size,
+                                              1, 0, ID * IH);
+                    });
+                });
+            }
         } else if (ReduceC && ReduceD && ReduceH && !ReduceW) {
             parallel_for(IW / blk_size, [&](size_t ibw){
                 size_t obw = ibw;
@@ -2207,8 +2243,7 @@ void Reduce::reduce_BLK(const uint8_t *in_ptr, uint8_t *out_ptr) {
             reduce_kernel_process(in_ptr_ncd, out_ptr_ncd, IH * IW * blk_size);
         });
     } else if (ReduceC && ReduceD && ReduceH && ReduceW) {
-        if (input_prec != output_prec || getAlgorithm() == Algorithm::ReduceL2 ||
-            algorithm == Algorithm::ReduceLogSumExp || algorithm == Algorithm::ReduceSumSquare) {
+        if (!support_split) {
             reduce_kernel_process(in_ptr_n, out_ptr_n, ICB * ID * IH * IW * blk_size);
         } else {
             // reduce parallelly
@@ -2608,6 +2643,20 @@ inline void Reduce::create_working_memory() {
     dst_size = desc.get_size();
 }
 
+inline void Reduce::create_DH_working_memory() {
+    ReduceDH_opt = layout == ReduceLayoutType::reduce_nspc && !isDynamicNode() && support_split &&
+                   !ReduceC && ReduceD && ReduceH && !ReduceW && IC == 1 && ID > 1;
+    if (ReduceDH_opt) {
+        PD = ID;
+        PW = IW / blk_size * blk_size;
+        prc_data_size = src_data_size;
+        prc_size = PD * PW * src_data_size;
+        if (prc_size > vec_reduceDH_prc.size()) {
+            vec_reduceDH_prc.resize(prc_size);
+        }
+    }
+}
+
 inline void Reduce::calc_process_dst_dims(std::vector<int> &reduce_axes, const SizeVector &dst_dims) {
     std::set<size_t> axes;
     SizeVector out_dims;
@@ -2691,6 +2740,9 @@ inline void Reduce::set_reduce_dim_flags() {
     ReduceH = IH != OH && OH == 1;
     ReduceW = IW != OW && OW == 1;
 
+    // must be done before the following dimension change
+    create_DH_working_memory();
+
     // suit for parallel
     if (ReduceH && IW == 1) {
         ReduceW = true;
diff --git a/src/plugins/intel_cpu/src/nodes/reduce.h b/src/plugins/intel_cpu/src/nodes/reduce.h
index f7460346561..4c2bd777f4d 100644
--- a/src/plugins/intel_cpu/src/nodes/reduce.h
+++ b/src/plugins/intel_cpu/src/nodes/reduce.h
@@ -114,6 +114,7 @@ private:
     inline void reduce_kernel_post_process(uint8_t *out_ptr);
     inline void init_dst_data(uint8_t *out_ptr, size_t dst_size);
     inline void create_working_memory();
+    inline void create_DH_working_memory();
     inline void calc_process_dst_dims(std::vector<int> &reduce_axes, const InferenceEngine::SizeVector &dst_dim);
     inline void set_reduce_dim_flags();
     inline void reduce_ref(const float *in_ptr, float *out_ptr);
@@ -128,6 +129,7 @@ private:
 
     size_t blk_size;
     size_t dst_size;
+    size_t prc_size;
     static const size_t REDUCE_DATA = 0;
     static const size_t REDUCE_INDEXES = 1;
     bool jit_beyond_5D = false;
@@ -135,10 +137,13 @@ private:
     bool keep_dims = true;
    bool is_hybrid_layout = false;
     bool compile_post_kernel = true;
+    bool support_split = false;
+    bool ReduceDH_opt = false;
     bool ReduceN, ReduceC, ReduceD, ReduceH, ReduceW;
     size_t IB, IC, ID, IH, IW;
     size_t OB, OC, OD, OH, OW;
-    size_t src_data_size, dst_data_size;
+    size_t PD, PW;
+    size_t src_data_size, dst_data_size, prc_data_size;
     size_t reduce_stride;
     ReduceLayoutType layout;
     InferenceEngine::Precision input_prec, output_prec;
@@ -154,6 +159,7 @@ private:
     std::vector<const void*> postOpsDataPtrs;
 
     std::shared_ptr<dnnl::memory> prc_mem;
+    std::vector<uint8_t> vec_reduceDH_prc;
 
     std::shared_ptr<jit_uni_reduce_kernel> reduce_kernel;
     std::shared_ptr<jit_uni_reduce_post_kernel> reduce_post_kernel;
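
Note on the core idea of ReduceDH_opt, for readers of this patch: the optimization targets the nspc planar case where only D and H are reduced, C == 1, and W is kept. The pre-existing code parallelized only over IC and the blocks of W, so with IC == 1 each task serially reduced ID * IH input rows. The patch splits the work into two stages through an intermediate buffer (vec_reduceDH_prc): stage 1 reduces H independently for every (d, w-block) pair, exposing ID * IWB-way parallelism, and stage 2 reduces the remaining D dimension. The snippet below is a minimal standalone sketch of that decomposition, assuming float data and plain summation; the names reduce_dh_naive, reduce_dh_split, and prc are illustrative and not identifiers from the patch.

#include <cstddef>
#include <cstdio>
#include <vector>

// Single-pass baseline: out[w] = sum over d, h of in[d][h][w],
// input flattened as [D][H][W] with C == 1 (nspc, channels last).
void reduce_dh_naive(const float* in, float* out, size_t D, size_t H, size_t W) {
    for (size_t w = 0; w < W; ++w)
        out[w] = 0.0f;
    for (size_t d = 0; d < D; ++d)
        for (size_t h = 0; h < H; ++h)
            for (size_t w = 0; w < W; ++w)
                out[w] += in[(d * H + h) * W + w];
}

// Two-stage variant mirroring the patch: stage 1 reduces H independently for
// each d (the loop that can run in parallel over D), stage 2 reduces the D
// rows of the intermediate buffer into the final output.
void reduce_dh_split(const float* in, float* out, size_t D, size_t H, size_t W) {
    std::vector<float> prc(D * W, 0.0f);  // per-d partials, like vec_reduceDH_prc
    // stage 1: !ReduceD && ReduceH && !ReduceW -- each d is independent
    for (size_t d = 0; d < D; ++d)        // parallel_for2d(ID, IWB, ...) in the patch
        for (size_t h = 0; h < H; ++h)
            for (size_t w = 0; w < W; ++w)
                prc[d * W + w] += in[(d * H + h) * W + w];
    // stage 2: ReduceD -- consecutive d rows are W apart (reduce_stride = PW)
    for (size_t w = 0; w < W; ++w)
        out[w] = 0.0f;
    for (size_t d = 0; d < D; ++d)
        for (size_t w = 0; w < W; ++w)
            out[w] += prc[d * W + w];
}

int main() {
    const size_t D = 3, H = 4, W = 5;
    std::vector<float> in(D * H * W);
    for (size_t i = 0; i < in.size(); ++i)
        in[i] = static_cast<float>(i % 7);
    std::vector<float> a(W), b(W);
    reduce_dh_naive(in.data(), a.data(), D, H, W);
    reduce_dh_split(in.data(), b.data(), D, H, W);
    for (size_t w = 0; w < W; ++w)  // both variants print identical sums
        std::printf("w=%zu naive=%g split=%g\n", w, a[w], b[w]);
    return 0;
}

Summation stands in for any reduction the new support_split flag admits: splitting is only valid when partial results can be fed back through the same kernel, which is why ReduceL2, ReduceLogSumExp, ReduceSumSquare, and mixed input/output precisions are excluded. In the patch itself, stage 1 is the parallel_for2d(ID, IWB, ...) loop, stage 2 is the parallel_for(IWB, ...) loop with reduce_stride = PW, and the tail of W past the last full blk_size block keeps the single-pass path.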