[CPU] Improve parallelism of the Reduce node (#17615)

2023-06-07 13:11:18 +08:00 · 2023-06-07 13:11:18 +08:00 · 3a1326fb58
commit 3a1326fb58
parent 84bd391369
2 changed files with 47 additions and 6 deletions
--- a/src/plugins/intel_cpu/src/nodes/reduce.cpp
+++ b/src/plugins/intel_cpu/src/nodes/reduce.cpp
@ -1823,8 +1823,7 @@ void Reduce::initSupportedPrimitiveDescriptors() {

    precision_change = input_prec != output_prec;
    support_split = algorithm != Algorithm::ReduceL2 && algorithm != Algorithm::ReduceLogSumExp &&
-                    algorithm != Algorithm::ReduceSumSquare &&
-                    (!precision_change || (input_prec == Precision::BF16 && output_prec == Precision::FP32));
+                    algorithm != Algorithm::ReduceSumSquare;

    src_data_size = input_prec.size();
    dst_data_size = output_prec.size();
@ -2527,18 +2526,35 @@ inline void Reduce::reduce_kernel_process(const uint8_t *in_p, uint8_t *out_p, s
 inline void Reduce::reduce_kernel_post_process(uint8_t *out_ptr) {
    const size_t integerDivisor = IB * IC * ID * IH * IW / (OB * OC * OD * OH * OW);
    const float divisor = static_cast<float>(integerDivisor);
-    if (layout == ReduceLayoutType::reduce_ncsp || layout == ReduceLayoutType::reduce_nspc) {
+    if (layout == ReduceLayoutType::reduce_ncsp) {
        parallel_for2d(OB, OC, [&](size_t ob, size_t oc) {
            uint8_t *out_p = out_ptr + (ob * OC + oc) * OD * OH * OW * dst_data_size;
            auto arg = jit_reduce_post_call_args();
            arg.dst = static_cast<void *>(out_p);
-            arg.oc_off = layout == ReduceLayoutType::reduce_nspc ? 0 : oc * sizeof(float);
-            arg.channel_size = layout == ReduceLayoutType::reduce_nspc ? OW : OC; // OW is related to nspc-ncsp dimension reinterpret
+            arg.oc_off = oc * sizeof(float);
+            arg.channel_size = OC;
            arg.work_amount = OD * OH * OW;
            arg.divisor = &divisor;
            arg.post_op_data = static_cast<const void **>(postOpsDataPtrs.data());
            (*reduce_post_kernel)(&arg);
        });
+    } else if (layout == ReduceLayoutType::reduce_nspc) {
+        size_t num_threads = static_cast<size_t>(parallel_get_max_threads());
+        size_t OP = OB * OC >= num_threads ? OB * OC : OB * OC * OD;
+        if (OP < num_threads && OW > blk_size)
+            OP *= OH;
+        size_t work_amount = OB * OC * OD * OH * OW / OP;
+        parallel_for(OP, [&](size_t op) {
+            uint8_t *out_p = out_ptr + op * work_amount * dst_data_size;
+            auto arg = jit_reduce_post_call_args();
+            arg.dst = static_cast<void *>(out_p);
+            arg.oc_off = 0;
+            arg.channel_size = OW; // OW is used here because of the nspc-to-ncsp dimension reinterpretation
+            arg.work_amount = work_amount;
+            arg.divisor = &divisor;
+            arg.post_op_data = static_cast<const void **>(postOpsDataPtrs.data());
+            (*reduce_post_kernel)(&arg);
+        });
    } else {
        size_t OCB = div_up(OC, blk_size);
        parallel_for2d(OB, OCB, [&](size_t ob, size_t ocb) {
--- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/reduce_ops.cpp
+++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/reduce_ops.cpp
@ -304,6 +304,11 @@ std::vector<std::vector<ov::test::InputShape>> inputShapes_SmallChannel = {
    {{{{1, 5}, 3, {1, 5}, {1, 10}}, {{2, 3, 2, 2}, {2, 3, 2, 9}}}},
 };

+std::vector<std::vector<ov::test::InputShape>> inputShapes_SingleBatch = {
+    {{{}, {{1, 19, 2, 9}}}},
+    {{{{1, 5}, 19, {1, 5}, {1, 10}}, {{1, 19, 2, 2}, {1, 19, 2, 9}}}},
+};
+
 std::vector<CPUSpecificParams> cpuParams_4D = {
 #if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
        CPUSpecificParams({nChw16c}, {nChw16c}, {}, {}),
@ -464,6 +469,19 @@ const auto params_NHWC_SmallChannel = testing::Combine(
        testing::ValuesIn(filterCPUSpecificParams(cpuParams_NHWC_4D)),
        testing::Values(emptyFusingSpec));

+const auto params_SingleBatch = testing::Combine(
+        testing::Combine(
+                testing::ValuesIn(axes),
+                testing::Values(CommonTestUtils::OpType::VECTOR),
+                testing::Values(true),
+                testing::ValuesIn(reductionTypes),
+                testing::ValuesIn(inpOutPrc),
+                testing::Values(ElementType::undefined),
+                testing::Values(ElementType::undefined),
+                testing::ValuesIn(inputShapes_SingleBatch)),
+        testing::ValuesIn(filterCPUSpecificParams(cpuParams_NHWC_4D)),
+        testing::Values(emptyFusingSpec));
+
 INSTANTIATE_TEST_SUITE_P(
        smoke_Reduce_OneAxis_CPU,
        ReduceCPULayerTest,
@ -516,12 +534,19 @@ INSTANTIATE_TEST_SUITE_P(
 );

 INSTANTIATE_TEST_SUITE_P(
-        smoke_Reducea_NHWC_SmallChannel_CPU,
+        smoke_Reduce_NHWC_SmallChannel_CPU,
        ReduceCPULayerTest,
        params_NHWC_SmallChannel,
        ReduceCPULayerTest::getTestCaseName
 );

+INSTANTIATE_TEST_SUITE_P(
+        smoke_Reduce_SingleBatch_CPU,
+        ReduceCPULayerTest,
+        params_SingleBatch,
+        ReduceCPULayerTest::getTestCaseName
+);
+
 /* ================================ 1.2 No fusion - Logical ================================ */
 const auto params_OneAxis_Logical = testing::Combine(
        testing::Combine(