diff --git a/src/plugins/intel_cpu/src/nodes/reduce.cpp b/src/plugins/intel_cpu/src/nodes/reduce.cpp index 3284499497d..1164227643c 100644 --- a/src/plugins/intel_cpu/src/nodes/reduce.cpp +++ b/src/plugins/intel_cpu/src/nodes/reduce.cpp @@ -1823,8 +1823,7 @@ void Reduce::initSupportedPrimitiveDescriptors() { precision_change = input_prec != output_prec; support_split = algorithm != Algorithm::ReduceL2 && algorithm != Algorithm::ReduceLogSumExp && - algorithm != Algorithm::ReduceSumSquare && - (!precision_change || (input_prec == Precision::BF16 && output_prec == Precision::FP32)); + algorithm != Algorithm::ReduceSumSquare; src_data_size = input_prec.size(); dst_data_size = output_prec.size(); @@ -2527,18 +2526,35 @@ inline void Reduce::reduce_kernel_process(const uint8_t *in_p, uint8_t *out_p, s inline void Reduce::reduce_kernel_post_process(uint8_t *out_ptr) { const size_t integerDivisor = IB * IC * ID * IH * IW / (OB * OC * OD * OH * OW); const float divisor = static_cast(integerDivisor); - if (layout == ReduceLayoutType::reduce_ncsp || layout == ReduceLayoutType::reduce_nspc) { + if (layout == ReduceLayoutType::reduce_ncsp) { parallel_for2d(OB, OC, [&](size_t ob, size_t oc) { uint8_t *out_p = out_ptr + (ob * OC + oc) * OD * OH * OW * dst_data_size; auto arg = jit_reduce_post_call_args(); arg.dst = static_cast(out_p); - arg.oc_off = layout == ReduceLayoutType::reduce_nspc ? 0 : oc * sizeof(float); - arg.channel_size = layout == ReduceLayoutType::reduce_nspc ? OW : OC; // OW is related to nspc-ncsp dimension reinterpret + arg.oc_off = oc * sizeof(float); + arg.channel_size = OC; arg.work_amount = OD * OH * OW; arg.divisor = &divisor; arg.post_op_data = static_cast(postOpsDataPtrs.data()); (*reduce_post_kernel)(&arg); }); + } else if (layout == ReduceLayoutType::reduce_nspc) { + size_t num_threads = static_cast(parallel_get_max_threads()); + size_t OP = OB * OC >= num_threads ? OB * OC : OB * OC * OD; + if (OP < num_threads && OW > blk_size) + OP *= OH; + size_t work_amount = OB * OC * OD * OH * OW / OP; + parallel_for(OP, [&](size_t op) { + uint8_t *out_p = out_ptr + op * work_amount * dst_data_size; + auto arg = jit_reduce_post_call_args(); + arg.dst = static_cast(out_p); + arg.oc_off = 0; + arg.channel_size = OW; // OW is related to nspc-ncsp dimension reinterpret + arg.work_amount = work_amount; + arg.divisor = &divisor; + arg.post_op_data = static_cast(postOpsDataPtrs.data()); + (*reduce_post_kernel)(&arg); + }); } else { size_t OCB = div_up(OC, blk_size); parallel_for2d(OB, OCB, [&](size_t ob, size_t ocb) { diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/reduce_ops.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/reduce_ops.cpp index c751387b5b3..93a45ff898e 100644 --- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/reduce_ops.cpp +++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/reduce_ops.cpp @@ -304,6 +304,11 @@ std::vector> inputShapes_SmallChannel = { {{{{1, 5}, 3, {1, 5}, {1, 10}}, {{2, 3, 2, 2}, {2, 3, 2, 9}}}}, }; +std::vector> inputShapes_SingleBatch = { + {{{}, {{1, 19, 2, 9}}}}, + {{{{1, 5}, 19, {1, 5}, {1, 10}}, {{1, 19, 2, 2}, {1, 19, 2, 9}}}}, +}; + std::vector cpuParams_4D = { #if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64) CPUSpecificParams({nChw16c}, {nChw16c}, {}, {}), @@ -464,6 +469,19 @@ const auto params_NHWC_SmallChannel = testing::Combine( testing::ValuesIn(filterCPUSpecificParams(cpuParams_NHWC_4D)), testing::Values(emptyFusingSpec)); +const auto params_SingleBatch = testing::Combine( + testing::Combine( + testing::ValuesIn(axes), + testing::Values(CommonTestUtils::OpType::VECTOR), + testing::Values(true), + testing::ValuesIn(reductionTypes), + testing::ValuesIn(inpOutPrc), + testing::Values(ElementType::undefined), + testing::Values(ElementType::undefined), + testing::ValuesIn(inputShapes_SingleBatch)), + testing::ValuesIn(filterCPUSpecificParams(cpuParams_NHWC_4D)), + testing::Values(emptyFusingSpec)); + INSTANTIATE_TEST_SUITE_P( smoke_Reduce_OneAxis_CPU, ReduceCPULayerTest, @@ -516,12 +534,19 @@ INSTANTIATE_TEST_SUITE_P( ); INSTANTIATE_TEST_SUITE_P( - smoke_Reducea_NHWC_SmallChannel_CPU, + smoke_Reduce_NHWC_SmallChannel_CPU, ReduceCPULayerTest, params_NHWC_SmallChannel, ReduceCPULayerTest::getTestCaseName ); +INSTANTIATE_TEST_SUITE_P( + smoke_Reduce_SingleBatch_CPU, + ReduceCPULayerTest, + params_SingleBatch, + ReduceCPULayerTest::getTestCaseName +); + /* ================================ 1.2 No fusion - Logical ================================ */ const auto params_OneAxis_Logical = testing::Combine( testing::Combine(