[CPU] Reduce node improve parallelism (#17615)

This commit is contained in:
Chen Xu 2023-06-07 13:11:18 +08:00 committed by GitHub
parent 84bd391369
commit 3a1326fb58
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 47 additions and 6 deletions

View File

@ -1823,8 +1823,7 @@ void Reduce::initSupportedPrimitiveDescriptors() {
precision_change = input_prec != output_prec;
support_split = algorithm != Algorithm::ReduceL2 && algorithm != Algorithm::ReduceLogSumExp &&
algorithm != Algorithm::ReduceSumSquare &&
(!precision_change || (input_prec == Precision::BF16 && output_prec == Precision::FP32));
algorithm != Algorithm::ReduceSumSquare;
src_data_size = input_prec.size();
dst_data_size = output_prec.size();
@ -2527,18 +2526,35 @@ inline void Reduce::reduce_kernel_process(const uint8_t *in_p, uint8_t *out_p, s
inline void Reduce::reduce_kernel_post_process(uint8_t *out_ptr) {
const size_t integerDivisor = IB * IC * ID * IH * IW / (OB * OC * OD * OH * OW);
const float divisor = static_cast<float>(integerDivisor);
if (layout == ReduceLayoutType::reduce_ncsp || layout == ReduceLayoutType::reduce_nspc) {
if (layout == ReduceLayoutType::reduce_ncsp) {
parallel_for2d(OB, OC, [&](size_t ob, size_t oc) {
uint8_t *out_p = out_ptr + (ob * OC + oc) * OD * OH * OW * dst_data_size;
auto arg = jit_reduce_post_call_args();
arg.dst = static_cast<void *>(out_p);
arg.oc_off = layout == ReduceLayoutType::reduce_nspc ? 0 : oc * sizeof(float);
arg.channel_size = layout == ReduceLayoutType::reduce_nspc ? OW : OC; // OW is related to nspc-ncsp dimension reinterpret
arg.oc_off = oc * sizeof(float);
arg.channel_size = OC;
arg.work_amount = OD * OH * OW;
arg.divisor = &divisor;
arg.post_op_data = static_cast<const void **>(postOpsDataPtrs.data());
(*reduce_post_kernel)(&arg);
});
} else if (layout == ReduceLayoutType::reduce_nspc) {
size_t num_threads = static_cast<size_t>(parallel_get_max_threads());
size_t OP = OB * OC >= num_threads ? OB * OC : OB * OC * OD;
if (OP < num_threads && OW > blk_size)
OP *= OH;
size_t work_amount = OB * OC * OD * OH * OW / OP;
parallel_for(OP, [&](size_t op) {
uint8_t *out_p = out_ptr + op * work_amount * dst_data_size;
auto arg = jit_reduce_post_call_args();
arg.dst = static_cast<void *>(out_p);
arg.oc_off = 0;
arg.channel_size = OW; // OW is related to nspc-ncsp dimension reinterpret
arg.work_amount = work_amount;
arg.divisor = &divisor;
arg.post_op_data = static_cast<const void **>(postOpsDataPtrs.data());
(*reduce_post_kernel)(&arg);
});
} else {
size_t OCB = div_up(OC, blk_size);
parallel_for2d(OB, OCB, [&](size_t ob, size_t ocb) {

View File

@ -304,6 +304,11 @@ std::vector<std::vector<ov::test::InputShape>> inputShapes_SmallChannel = {
{{{{1, 5}, 3, {1, 5}, {1, 10}}, {{2, 3, 2, 2}, {2, 3, 2, 9}}}},
};
// Input shapes for the single-batch Reduce tests: every concrete shape listed
// below has a leading (batch) dimension of 1 and 19 in the second dimension.
std::vector<std::vector<ov::test::InputShape>> inputShapes_SingleBatch = {
// Empty first element with one concrete shape — presumably a static-shape case;
// confirm against the ov::test::InputShape definition.
{{{}, {{1, 19, 2, 9}}}},
// Ranged first element ({1, 5}, 19, {1, 5}, {1, 10}) followed by two concrete
// shapes — presumably a dynamic-shape case exercised with both targets; TODO confirm.
{{{{1, 5}, 19, {1, 5}, {1, 10}}, {{1, 19, 2, 2}, {1, 19, 2, 9}}}},
};
std::vector<CPUSpecificParams> cpuParams_4D = {
#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
CPUSpecificParams({nChw16c}, {nChw16c}, {}, {}),
@ -464,6 +469,19 @@ const auto params_NHWC_SmallChannel = testing::Combine(
testing::ValuesIn(filterCPUSpecificParams(cpuParams_NHWC_4D)),
testing::Values(emptyFusingSpec));
// Parameter generator for the single-batch Reduce test suite.
// The inner Combine crosses the shared `axes`, `reductionTypes` and `inpOutPrc`
// lists with `inputShapes_SingleBatch`; the outer Combine pairs that with the
// filtered NHWC 4D CPU params and `emptyFusingSpec`.
// NOTE(review): the meaning of each positional argument comes from the test's
// parameter tuple, which is declared outside this view — confirm before reordering.
const auto params_SingleBatch = testing::Combine(
testing::Combine(
testing::ValuesIn(axes),
testing::Values(CommonTestUtils::OpType::VECTOR),
// Single fixed boolean value — presumably the keep-dims flag; TODO confirm.
testing::Values(true),
testing::ValuesIn(reductionTypes),
testing::ValuesIn(inpOutPrc),
// Two ElementType::undefined slots — presumably input/output precision overrides left unset; TODO confirm.
testing::Values(ElementType::undefined),
testing::Values(ElementType::undefined),
testing::ValuesIn(inputShapes_SingleBatch)),
// Only the NHWC 4D CPU configurations that survive filterCPUSpecificParams are used.
testing::ValuesIn(filterCPUSpecificParams(cpuParams_NHWC_4D)),
testing::Values(emptyFusingSpec));
INSTANTIATE_TEST_SUITE_P(
smoke_Reduce_OneAxis_CPU,
ReduceCPULayerTest,
@ -516,12 +534,19 @@ INSTANTIATE_TEST_SUITE_P(
);
INSTANTIATE_TEST_SUITE_P(
smoke_Reducea_NHWC_SmallChannel_CPU,
smoke_Reduce_NHWC_SmallChannel_CPU,
ReduceCPULayerTest,
params_NHWC_SmallChannel,
ReduceCPULayerTest::getTestCaseName
);
// Instantiates ReduceCPULayerTest over params_SingleBatch under the
// smoke_Reduce_SingleBatch_CPU prefix; test names are produced by
// ReduceCPULayerTest::getTestCaseName.
INSTANTIATE_TEST_SUITE_P(
smoke_Reduce_SingleBatch_CPU,
ReduceCPULayerTest,
params_SingleBatch,
ReduceCPULayerTest::getTestCaseName
);
/* ================================ 1.2 No fusion - Logical ================================ */
const auto params_OneAxis_Logical = testing::Combine(
testing::Combine(