[CPU] Reduce node improve parallelism (#17615)
This commit is contained in:
parent
84bd391369
commit
3a1326fb58
@ -1823,8 +1823,7 @@ void Reduce::initSupportedPrimitiveDescriptors() {
|
||||
|
||||
precision_change = input_prec != output_prec;
|
||||
support_split = algorithm != Algorithm::ReduceL2 && algorithm != Algorithm::ReduceLogSumExp &&
|
||||
algorithm != Algorithm::ReduceSumSquare &&
|
||||
(!precision_change || (input_prec == Precision::BF16 && output_prec == Precision::FP32));
|
||||
algorithm != Algorithm::ReduceSumSquare;
|
||||
|
||||
src_data_size = input_prec.size();
|
||||
dst_data_size = output_prec.size();
|
||||
@ -2527,18 +2526,35 @@ inline void Reduce::reduce_kernel_process(const uint8_t *in_p, uint8_t *out_p, s
|
||||
inline void Reduce::reduce_kernel_post_process(uint8_t *out_ptr) {
|
||||
const size_t integerDivisor = IB * IC * ID * IH * IW / (OB * OC * OD * OH * OW);
|
||||
const float divisor = static_cast<float>(integerDivisor);
|
||||
if (layout == ReduceLayoutType::reduce_ncsp || layout == ReduceLayoutType::reduce_nspc) {
|
||||
if (layout == ReduceLayoutType::reduce_ncsp) {
|
||||
parallel_for2d(OB, OC, [&](size_t ob, size_t oc) {
|
||||
uint8_t *out_p = out_ptr + (ob * OC + oc) * OD * OH * OW * dst_data_size;
|
||||
auto arg = jit_reduce_post_call_args();
|
||||
arg.dst = static_cast<void *>(out_p);
|
||||
arg.oc_off = layout == ReduceLayoutType::reduce_nspc ? 0 : oc * sizeof(float);
|
||||
arg.channel_size = layout == ReduceLayoutType::reduce_nspc ? OW : OC; // OW is related to nspc-ncsp dimension reinterpret
|
||||
arg.oc_off = oc * sizeof(float);
|
||||
arg.channel_size = OC;
|
||||
arg.work_amount = OD * OH * OW;
|
||||
arg.divisor = &divisor;
|
||||
arg.post_op_data = static_cast<const void **>(postOpsDataPtrs.data());
|
||||
(*reduce_post_kernel)(&arg);
|
||||
});
|
||||
} else if (layout == ReduceLayoutType::reduce_nspc) {
|
||||
size_t num_threads = static_cast<size_t>(parallel_get_max_threads());
|
||||
size_t OP = OB * OC >= num_threads ? OB * OC : OB * OC * OD;
|
||||
if (OP < num_threads && OW > blk_size)
|
||||
OP *= OH;
|
||||
size_t work_amount = OB * OC * OD * OH * OW / OP;
|
||||
parallel_for(OP, [&](size_t op) {
|
||||
uint8_t *out_p = out_ptr + op * work_amount * dst_data_size;
|
||||
auto arg = jit_reduce_post_call_args();
|
||||
arg.dst = static_cast<void *>(out_p);
|
||||
arg.oc_off = 0;
|
||||
arg.channel_size = OW; // OW is related to nspc-ncsp dimension reinterpret
|
||||
arg.work_amount = work_amount;
|
||||
arg.divisor = &divisor;
|
||||
arg.post_op_data = static_cast<const void **>(postOpsDataPtrs.data());
|
||||
(*reduce_post_kernel)(&arg);
|
||||
});
|
||||
} else {
|
||||
size_t OCB = div_up(OC, blk_size);
|
||||
parallel_for2d(OB, OCB, [&](size_t ob, size_t ocb) {
|
||||
|
@ -304,6 +304,11 @@ std::vector<std::vector<ov::test::InputShape>> inputShapes_SmallChannel = {
|
||||
{{{{1, 5}, 3, {1, 5}, {1, 10}}, {{2, 3, 2, 2}, {2, 3, 2, 9}}}},
|
||||
};
|
||||
|
||||
// Input-shape sets for the single-batch Reduce test cases: every concrete shape
// listed here has a leading dimension of 1 and 19 in the second dimension.
// NOTE(review): each entry is presumably (dynamic shape bounds, list of concrete
// target shapes) per ov::test::InputShape — confirm against that type.
std::vector<std::vector<ov::test::InputShape>> inputShapes_SingleBatch = {
|
||||
// Empty first element with a single concrete shape {1, 19, 2, 9}.
{{{}, {{1, 19, 2, 9}}}},
|
||||
// Ranged first/third/fourth dimensions with two concrete shapes.
{{{{1, 5}, 19, {1, 5}, {1, 10}}, {{1, 19, 2, 2}, {1, 19, 2, 9}}}},
|
||||
};
|
||||
|
||||
std::vector<CPUSpecificParams> cpuParams_4D = {
|
||||
#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
|
||||
CPUSpecificParams({nChw16c}, {nChw16c}, {}, {}),
|
||||
@ -464,6 +469,19 @@ const auto params_NHWC_SmallChannel = testing::Combine(
|
||||
testing::ValuesIn(filterCPUSpecificParams(cpuParams_NHWC_4D)),
|
||||
testing::Values(emptyFusingSpec));
|
||||
|
||||
// Parameter generator for the single-batch Reduce tests. The inner Combine
// crosses every value of `axes`, `reductionTypes` and `inpOutPrc` with the
// shapes in `inputShapes_SingleBatch`; the outer Combine pairs that with the
// filtered `cpuParams_NHWC_4D` list and an empty fusing spec.
const auto params_SingleBatch = testing::Combine(
|
||||
testing::Combine(
|
||||
testing::ValuesIn(axes),
|
||||
testing::Values(CommonTestUtils::OpType::VECTOR),
|
||||
// NOTE(review): presumably the keep-dims flag — confirm against the test's parameter tuple.
testing::Values(true),
|
||||
testing::ValuesIn(reductionTypes),
|
||||
testing::ValuesIn(inpOutPrc),
|
||||
testing::Values(ElementType::undefined),
|
||||
testing::Values(ElementType::undefined),
|
||||
testing::ValuesIn(inputShapes_SingleBatch)),
|
||||
testing::ValuesIn(filterCPUSpecificParams(cpuParams_NHWC_4D)),
|
||||
testing::Values(emptyFusingSpec));
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
smoke_Reduce_OneAxis_CPU,
|
||||
ReduceCPULayerTest,
|
||||
@ -516,12 +534,19 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
);
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
smoke_Reducea_NHWC_SmallChannel_CPU,
|
||||
smoke_Reduce_NHWC_SmallChannel_CPU,
|
||||
ReduceCPULayerTest,
|
||||
params_NHWC_SmallChannel,
|
||||
ReduceCPULayerTest::getTestCaseName
|
||||
);
|
||||
|
||||
// Instantiates ReduceCPULayerTest with the params_SingleBatch generator under
// the prefix smoke_Reduce_SingleBatch_CPU; test names are produced by
// ReduceCPULayerTest::getTestCaseName.
INSTANTIATE_TEST_SUITE_P(
|
||||
smoke_Reduce_SingleBatch_CPU,
|
||||
ReduceCPULayerTest,
|
||||
params_SingleBatch,
|
||||
ReduceCPULayerTest::getTestCaseName
|
||||
);
|
||||
|
||||
/* ================================ 1.2 No fusion - Logical ================================ */
|
||||
const auto params_OneAxis_Logical = testing::Combine(
|
||||
testing::Combine(
|
||||
|
Loading…
Reference in New Issue
Block a user