[CPU] Reduce node improve parallelism (#17615)
This commit is contained in:
parent
84bd391369
commit
3a1326fb58
@ -1823,8 +1823,7 @@ void Reduce::initSupportedPrimitiveDescriptors() {
|
|||||||
|
|
||||||
precision_change = input_prec != output_prec;
|
precision_change = input_prec != output_prec;
|
||||||
support_split = algorithm != Algorithm::ReduceL2 && algorithm != Algorithm::ReduceLogSumExp &&
|
support_split = algorithm != Algorithm::ReduceL2 && algorithm != Algorithm::ReduceLogSumExp &&
|
||||||
algorithm != Algorithm::ReduceSumSquare &&
|
algorithm != Algorithm::ReduceSumSquare;
|
||||||
(!precision_change || (input_prec == Precision::BF16 && output_prec == Precision::FP32));
|
|
||||||
|
|
||||||
src_data_size = input_prec.size();
|
src_data_size = input_prec.size();
|
||||||
dst_data_size = output_prec.size();
|
dst_data_size = output_prec.size();
|
||||||
@ -2527,18 +2526,35 @@ inline void Reduce::reduce_kernel_process(const uint8_t *in_p, uint8_t *out_p, s
|
|||||||
inline void Reduce::reduce_kernel_post_process(uint8_t *out_ptr) {
|
inline void Reduce::reduce_kernel_post_process(uint8_t *out_ptr) {
|
||||||
const size_t integerDivisor = IB * IC * ID * IH * IW / (OB * OC * OD * OH * OW);
|
const size_t integerDivisor = IB * IC * ID * IH * IW / (OB * OC * OD * OH * OW);
|
||||||
const float divisor = static_cast<float>(integerDivisor);
|
const float divisor = static_cast<float>(integerDivisor);
|
||||||
if (layout == ReduceLayoutType::reduce_ncsp || layout == ReduceLayoutType::reduce_nspc) {
|
if (layout == ReduceLayoutType::reduce_ncsp) {
|
||||||
parallel_for2d(OB, OC, [&](size_t ob, size_t oc) {
|
parallel_for2d(OB, OC, [&](size_t ob, size_t oc) {
|
||||||
uint8_t *out_p = out_ptr + (ob * OC + oc) * OD * OH * OW * dst_data_size;
|
uint8_t *out_p = out_ptr + (ob * OC + oc) * OD * OH * OW * dst_data_size;
|
||||||
auto arg = jit_reduce_post_call_args();
|
auto arg = jit_reduce_post_call_args();
|
||||||
arg.dst = static_cast<void *>(out_p);
|
arg.dst = static_cast<void *>(out_p);
|
||||||
arg.oc_off = layout == ReduceLayoutType::reduce_nspc ? 0 : oc * sizeof(float);
|
arg.oc_off = oc * sizeof(float);
|
||||||
arg.channel_size = layout == ReduceLayoutType::reduce_nspc ? OW : OC; // OW is related to nspc-ncsp dimension reinterpret
|
arg.channel_size = OC;
|
||||||
arg.work_amount = OD * OH * OW;
|
arg.work_amount = OD * OH * OW;
|
||||||
arg.divisor = &divisor;
|
arg.divisor = &divisor;
|
||||||
arg.post_op_data = static_cast<const void **>(postOpsDataPtrs.data());
|
arg.post_op_data = static_cast<const void **>(postOpsDataPtrs.data());
|
||||||
(*reduce_post_kernel)(&arg);
|
(*reduce_post_kernel)(&arg);
|
||||||
});
|
});
|
||||||
|
} else if (layout == ReduceLayoutType::reduce_nspc) {
|
||||||
|
size_t num_threads = static_cast<size_t>(parallel_get_max_threads());
|
||||||
|
size_t OP = OB * OC >= num_threads ? OB * OC : OB * OC * OD;
|
||||||
|
if (OP < num_threads && OW > blk_size)
|
||||||
|
OP *= OH;
|
||||||
|
size_t work_amount = OB * OC * OD * OH * OW / OP;
|
||||||
|
parallel_for(OP, [&](size_t op) {
|
||||||
|
uint8_t *out_p = out_ptr + op * work_amount * dst_data_size;
|
||||||
|
auto arg = jit_reduce_post_call_args();
|
||||||
|
arg.dst = static_cast<void *>(out_p);
|
||||||
|
arg.oc_off = 0;
|
||||||
|
arg.channel_size = OW; // OW is related to nspc-ncsp dimension reinterpret
|
||||||
|
arg.work_amount = work_amount;
|
||||||
|
arg.divisor = &divisor;
|
||||||
|
arg.post_op_data = static_cast<const void **>(postOpsDataPtrs.data());
|
||||||
|
(*reduce_post_kernel)(&arg);
|
||||||
|
});
|
||||||
} else {
|
} else {
|
||||||
size_t OCB = div_up(OC, blk_size);
|
size_t OCB = div_up(OC, blk_size);
|
||||||
parallel_for2d(OB, OCB, [&](size_t ob, size_t ocb) {
|
parallel_for2d(OB, OCB, [&](size_t ob, size_t ocb) {
|
||||||
|
@ -304,6 +304,11 @@ std::vector<std::vector<ov::test::InputShape>> inputShapes_SmallChannel = {
|
|||||||
{{{{1, 5}, 3, {1, 5}, {1, 10}}, {{2, 3, 2, 2}, {2, 3, 2, 9}}}},
|
{{{{1, 5}, 3, {1, 5}, {1, 10}}, {{2, 3, 2, 2}, {2, 3, 2, 9}}}},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
std::vector<std::vector<ov::test::InputShape>> inputShapes_SingleBatch = {
|
||||||
|
{{{}, {{1, 19, 2, 9}}}},
|
||||||
|
{{{{1, 5}, 19, {1, 5}, {1, 10}}, {{1, 19, 2, 2}, {1, 19, 2, 9}}}},
|
||||||
|
};
|
||||||
|
|
||||||
std::vector<CPUSpecificParams> cpuParams_4D = {
|
std::vector<CPUSpecificParams> cpuParams_4D = {
|
||||||
#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
|
#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
|
||||||
CPUSpecificParams({nChw16c}, {nChw16c}, {}, {}),
|
CPUSpecificParams({nChw16c}, {nChw16c}, {}, {}),
|
||||||
@ -464,6 +469,19 @@ const auto params_NHWC_SmallChannel = testing::Combine(
|
|||||||
testing::ValuesIn(filterCPUSpecificParams(cpuParams_NHWC_4D)),
|
testing::ValuesIn(filterCPUSpecificParams(cpuParams_NHWC_4D)),
|
||||||
testing::Values(emptyFusingSpec));
|
testing::Values(emptyFusingSpec));
|
||||||
|
|
||||||
|
const auto params_SingleBatch = testing::Combine(
|
||||||
|
testing::Combine(
|
||||||
|
testing::ValuesIn(axes),
|
||||||
|
testing::Values(CommonTestUtils::OpType::VECTOR),
|
||||||
|
testing::Values(true),
|
||||||
|
testing::ValuesIn(reductionTypes),
|
||||||
|
testing::ValuesIn(inpOutPrc),
|
||||||
|
testing::Values(ElementType::undefined),
|
||||||
|
testing::Values(ElementType::undefined),
|
||||||
|
testing::ValuesIn(inputShapes_SingleBatch)),
|
||||||
|
testing::ValuesIn(filterCPUSpecificParams(cpuParams_NHWC_4D)),
|
||||||
|
testing::Values(emptyFusingSpec));
|
||||||
|
|
||||||
INSTANTIATE_TEST_SUITE_P(
|
INSTANTIATE_TEST_SUITE_P(
|
||||||
smoke_Reduce_OneAxis_CPU,
|
smoke_Reduce_OneAxis_CPU,
|
||||||
ReduceCPULayerTest,
|
ReduceCPULayerTest,
|
||||||
@ -516,12 +534,19 @@ INSTANTIATE_TEST_SUITE_P(
|
|||||||
);
|
);
|
||||||
|
|
||||||
INSTANTIATE_TEST_SUITE_P(
|
INSTANTIATE_TEST_SUITE_P(
|
||||||
smoke_Reducea_NHWC_SmallChannel_CPU,
|
smoke_Reduce_NHWC_SmallChannel_CPU,
|
||||||
ReduceCPULayerTest,
|
ReduceCPULayerTest,
|
||||||
params_NHWC_SmallChannel,
|
params_NHWC_SmallChannel,
|
||||||
ReduceCPULayerTest::getTestCaseName
|
ReduceCPULayerTest::getTestCaseName
|
||||||
);
|
);
|
||||||
|
|
||||||
|
INSTANTIATE_TEST_SUITE_P(
|
||||||
|
smoke_Reduce_SingleBatch_CPU,
|
||||||
|
ReduceCPULayerTest,
|
||||||
|
params_SingleBatch,
|
||||||
|
ReduceCPULayerTest::getTestCaseName
|
||||||
|
);
|
||||||
|
|
||||||
/* ================================ 1.2 No fusion - Logical ================================ */
|
/* ================================ 1.2 No fusion - Logical ================================ */
|
||||||
const auto params_OneAxis_Logical = testing::Combine(
|
const auto params_OneAxis_Logical = testing::Combine(
|
||||||
testing::Combine(
|
testing::Combine(
|
||||||
|
Loading…
Reference in New Issue
Block a user