[GPU] BugFix reduce_b_fs_yx_fsv16 kernel (#17477)
+ Fixed an invalid calculation when reducing an un-aligned feature axis for b_fs_yx_fsv16. + Some reduce modes are not invariant when 0 values outside the valid range are read. + Added the jit macro ZERO_INVARIANT_REDUCTION. + Enabled blocked unit-tests on dGPU (see PR #15873). Signed-off-by: Min, Byungil <byungil.min@intel.com>
This commit is contained in:
@@ -156,7 +156,7 @@ KERNEL(reduce_fsv16)(
|
||||
const uint xy = (uint)get_global_id(1) * READ_OFFSET;
|
||||
const uint x = xy % ALIGN(COMMON_OUTPUT_SIZE_X, READ_OFFSET);
|
||||
const uint y = xy / ALIGN(COMMON_OUTPUT_SIZE_X, READ_OFFSET);
|
||||
#endif
|
||||
#endif // !IS_REDUCE_XY
|
||||
const uint bf = (uint)get_global_id(2) * SIMD;
|
||||
const uint b = bf / ALIGN(COMMON_OUTPUT_FEATURE_NUM, SIMD);
|
||||
const uint f = bf % ALIGN(COMMON_OUTPUT_FEATURE_NUM, SIMD);
|
||||
@@ -252,7 +252,7 @@ uint offset = batch_out * input_batch_pitch + ((feature_out + FSV - 1) / FSV) *
|
||||
for (uint yi = y_out; yi < y_max_val; ++yi) {
|
||||
for (uint xi = x_out; xi < x_max_val; ++xi) {
|
||||
INPUT_VEC input = (INPUT_VEC)(INPUT_INIT_VAL);
|
||||
#if REDUCE_FEATURE && (INPUT0_FEATURE_NUM % FSV != 0)
|
||||
#if REDUCE_FEATURE && (INPUT0_FEATURE_NUM % FSV != 0) && !ZERO_INVARIANT_REDUCTION
|
||||
if (fi + FSV <= INPUT0_FEATURE_NUM)
|
||||
input = BLOCK_READ(data, offset);
|
||||
else
|
||||
@@ -269,7 +269,7 @@ uint offset = batch_out * input_batch_pitch + ((feature_out + FSV - 1) / FSV) *
|
||||
#if INPUT0_SIZE_X % READ_OFFSET != 0
|
||||
for (uint xi = x_leftover_start; xi < x_leftover_end; ++xi) {
|
||||
INPUT0_TYPE leftovers = INIT_VAL;
|
||||
#if REDUCE_FEATURE && (INPUT0_FEATURE_NUM % FSV != 0)
|
||||
#if REDUCE_FEATURE && (INPUT0_FEATURE_NUM % FSV != 0) && !ZERO_INVARIANT_REDUCTION
|
||||
if (fi + FSV <= INPUT0_FEATURE_NUM)
|
||||
leftovers = DT_INPUT_BLOCK_READ(data, offset);
|
||||
else
|
||||
@@ -330,15 +330,15 @@ uint offset = batch_out * input_batch_pitch + ((feature_out + FSV - 1) / FSV) *
|
||||
if (get_sub_group_local_id() == 0)
|
||||
output[out_idx] = final_result;
|
||||
#endif
|
||||
#else
|
||||
#else // !REDUCE_X
|
||||
ACCUMULATOR_VEC acc = (ACCUMULATOR_VEC)(INIT_VAL);
|
||||
for (uint bi = batch_out; bi < batch_max_val; ++bi) {
|
||||
for (uint fi = feature_out; fi < feature_max_val; fi += FSV) {
|
||||
|
||||
for (uint yi = y_out; yi < y_max_val; ++yi) {
|
||||
for (uint xi = x_out; xi < x_max_val; ++xi) {
|
||||
#if HANDLE_FEATURE_REMAINDER
|
||||
INPUT_VEC input = (INPUT_VEC)(INPUT_INIT_VAL);
|
||||
#if REDUCE_FEATURE && (INPUT0_FEATURE_NUM % FSV != 0)
|
||||
#if REDUCE_FEATURE && (INPUT0_FEATURE_NUM % FSV != 0) && !ZERO_INVARIANT_REDUCTION
|
||||
INPUT_VEC input = (INPUT_VEC)(INPUT_INIT_VAL);
|
||||
if (fi + FSV <= INPUT0_FEATURE_NUM)
|
||||
input = BLOCK_READ(data, offset);
|
||||
else
|
||||
@@ -346,11 +346,9 @@ uint offset = batch_out * input_batch_pitch + ((feature_out + FSV - 1) / FSV) *
|
||||
for (int i = 0; i < READ_OFFSET; ++i)
|
||||
input[i] = data[offset + get_sub_group_local_id() + i * get_max_sub_group_size()];
|
||||
#else
|
||||
input = BLOCK_READ(data, offset);
|
||||
INPUT_VEC input = BLOCK_READ(data, offset);
|
||||
#endif
|
||||
#else
|
||||
INPUT_VEC input = BLOCK_READ(data, offset);
|
||||
#endif
|
||||
|
||||
unroll_for (int i = 0; i < READ_OFFSET; ++i)
|
||||
acc[i] = FUNC_CALL(apply_reduce)(acc[i], input[i]);
|
||||
offset += input_x_pitch;
|
||||
@@ -410,7 +408,7 @@ uint offset = batch_out * input_batch_pitch + ((feature_out + FSV - 1) / FSV) *
|
||||
#endif
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#endif // !REDUCE_X
|
||||
}
|
||||
|
||||
#undef SIMD
|
||||
|
||||
@@ -12,6 +12,7 @@ namespace kernel_selector {
|
||||
|
||||
static const size_t SIMD = 16;
|
||||
static const size_t XY_OPT_F_LIMITS = 96;
|
||||
static const size_t AXIS_F = 1;
|
||||
static const size_t AXIS_Y = 2;
|
||||
static const size_t AXIS_X = 3;
|
||||
using NDims = std::vector<kernel_selector::Tensor::Dim>;
|
||||
@@ -78,6 +79,15 @@ static bool can_opt_reduce_xy(const reduce_params& params) {
|
||||
input_dims[1].v <= XY_OPT_F_LIMITS;
|
||||
}
|
||||
|
||||
static bool reducing_unaligned_f_axis(const reduce_params& params) {
|
||||
if (count(params.reduceAxes.begin(), params.reduceAxes.end(), AXIS_F) > 0) {
|
||||
if (params.inputs[0].Feature().v % 16 != 0)
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
ParamsKey ReduceKernel_b_fs_yx_fsv16::GetSupportedKey() const {
|
||||
ParamsKey k;
|
||||
k.EnableInputDataType(Datatype::F16);
|
||||
@@ -216,13 +226,14 @@ JitConstants ReduceKernel_b_fs_yx_fsv16::GetJitConstants(const reduce_params& pa
|
||||
}
|
||||
}
|
||||
|
||||
// MIN/MAX modes must handle the feature remainder when the reduce axes include the feature axis
|
||||
if (params.reduceMode == ReduceMode::MIN || params.reduceMode == ReduceMode::MAX) {
|
||||
if (count(params.reduceAxes.begin(), params.reduceAxes.end(), 1) > 0) {
|
||||
if (params.inputs[0].Feature().v % 16 != 0) {
|
||||
jit.AddConstant(MakeJitConstant("HANDLE_FEATURE_REMAINDER", 1));
|
||||
}
|
||||
}
|
||||
// Some reduce modes are affected by out-of-range 0 values (e.g. min, max, prod); the modes listed below are invariant to extra zeros
|
||||
bool zero_invariant_mode = params.reduceMode == ReduceMode::L1 || params.reduceMode == ReduceMode::L2 ||
|
||||
params.reduceMode == ReduceMode::LOG_SUM || params.reduceMode == ReduceMode::LOG_SUM_EXP ||
|
||||
params.reduceMode == ReduceMode::MEAN || params.reduceMode == ReduceMode::OR ||
|
||||
params.reduceMode == ReduceMode::SUM || params.reduceMode == ReduceMode::SUM_SQUARE;
|
||||
|
||||
if (zero_invariant_mode && reducing_unaligned_f_axis(params)) {
|
||||
jit.AddConstant(MakeJitConstant("ZERO_INVARIANT_REDUCTION", 1));
|
||||
}
|
||||
|
||||
return jit;
|
||||
@@ -232,6 +243,8 @@ KernelsData ReduceKernel_b_fs_yx_fsv16::GetKernelsData(const Params& params, con
|
||||
KernelsData kds = GetCommonKernelsData(params, options);
|
||||
const reduce_params& orgParams = static_cast<const reduce_params&>(params);
|
||||
|
||||
// To get a performance gain when reducing an un-aligned f axis,
|
||||
// the reduce kernel relies on 0 values outside the valid range in the inner block, which requires disabling memory re-use
|
||||
if (orgParams.inputs[0].Feature().v % 16 != 0) {
|
||||
kds[0].can_reuse_memory = false;
|
||||
}
|
||||
|
||||
@@ -34,8 +34,10 @@ struct reduce_test_params {
|
||||
|
||||
class ReduceFusingTest : public ::BaseFusingTest<reduce_test_params> {
|
||||
public:
|
||||
void execute(reduce_test_params& p, bool is_dynamic = false) {
|
||||
auto input_prim = get_mem(get_input_layout(p));
|
||||
// When the input generator fills values for blocked formats, it also writes random values outside of the shape.
|
||||
// To avoid this generator issue, use a proper planar format given by 'default_format' and add a reorder.
|
||||
void execute(reduce_test_params& p, bool is_dynamic = false, bool use_planar_input = false) {
|
||||
auto input_prim = get_mem(get_input_layout(p, use_planar_input));
|
||||
|
||||
cfg_not_fused.set_property(ov::intel_gpu::allow_new_shape_infer(is_dynamic));
|
||||
cfg_fused.set_property(ov::intel_gpu::allow_new_shape_infer(is_dynamic));
|
||||
@@ -63,8 +65,11 @@ public:
|
||||
return layout{ ov::PartialShape::dynamic(p.in_shape.size()), p.data_type, p.input_format };
|
||||
}
|
||||
|
||||
layout get_input_layout(reduce_test_params& p) {
|
||||
return layout{ p.in_shape, p.data_type, p.input_format };
|
||||
layout get_input_layout(reduce_test_params& p, bool use_planar_input = false) {
|
||||
if (use_planar_input)
|
||||
return layout{ p.in_shape, p.data_type, format::get_default_format(p.input_format)};
|
||||
else
|
||||
return layout{ p.in_shape, p.data_type, p.input_format };
|
||||
}
|
||||
|
||||
layout get_output_layout(reduce_test_params& p) {
|
||||
@@ -113,19 +118,17 @@ public:
|
||||
|
||||
class reduce_eltwise_activation_quantize : public ReduceFusingTest {};
|
||||
TEST_P(reduce_eltwise_activation_quantize, basic) {
|
||||
// TODO: Fix me, refer PR(#15873)
|
||||
if (engine.get_device_info().supports_immad)
|
||||
return;
|
||||
auto p = GetParam();
|
||||
update_out_shape(p);
|
||||
create_topologies(
|
||||
input_layout("input", get_input_layout(p)),
|
||||
input_layout("input", get_input_layout(p, true)),
|
||||
reorder("input_reorder", input_info("input"), p.input_format, p.data_type),
|
||||
data("in_lo", get_mem(get_single_element_layout(p), min_random, 0)),
|
||||
data("in_hi", get_mem(get_single_element_layout(p), 1, max_random)),
|
||||
data("out_lo", get_mem(get_single_element_layout(p), -128)),
|
||||
data("out_hi", get_mem(get_single_element_layout(p), 127)),
|
||||
data("eltwise_data", get_mem(get_output_layout(p))),
|
||||
reduce("reduce", input_info("input"), p.reduce_mode, p.reduce_axes, p.keep_dims),
|
||||
reduce("reduce", input_info("input_reorder"), p.reduce_mode, p.reduce_axes, p.keep_dims),
|
||||
eltwise("eltwise", { input_info("reduce"), input_info("eltwise_data") }, eltwise_mode::sum, p.default_type),
|
||||
activation("activation", input_info("eltwise"), activation_func::relu),
|
||||
quantize("quantize", input_info("activation"), input_info("in_lo"), input_info("in_hi"),
|
||||
@@ -134,20 +137,22 @@ TEST_P(reduce_eltwise_activation_quantize, basic) {
|
||||
);
|
||||
|
||||
tolerance = 1.f;
|
||||
execute(p);
|
||||
// Use a planar input format; 'input_reorder' converts it to 'input_format'
|
||||
execute(p, false, true);
|
||||
}
|
||||
|
||||
TEST_P(reduce_eltwise_activation_quantize, per_channel) {
|
||||
auto p = GetParam();
|
||||
update_out_shape(p);
|
||||
create_topologies(
|
||||
input_layout("input", get_input_layout(p)),
|
||||
input_layout("input", get_input_layout(p, true)),
|
||||
reorder("input_reorder", input_info("input"), p.input_format, p.data_type),
|
||||
data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
|
||||
data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)),
|
||||
data("out_lo", get_mem(get_single_element_layout(p), -128)),
|
||||
data("out_hi", get_mem(get_single_element_layout(p), 127)),
|
||||
data("eltwise_data", get_mem(get_output_layout(p))),
|
||||
reduce("reduce", input_info("input"), p.reduce_mode, p.reduce_axes, p.keep_dims),
|
||||
reduce("reduce", input_info("input_reorder"), p.reduce_mode, p.reduce_axes, p.keep_dims),
|
||||
eltwise("eltwise", { input_info("reduce"), input_info("eltwise_data") }, eltwise_mode::sum, p.default_type),
|
||||
activation("activation", input_info("eltwise"), activation_func::relu),
|
||||
quantize("quantize", input_info("activation"), input_info("in_lo"), input_info("in_hi"), input_info("out_lo"), input_info("out_hi"), 256, data_types::i8),
|
||||
@@ -155,7 +160,8 @@ TEST_P(reduce_eltwise_activation_quantize, per_channel) {
|
||||
);
|
||||
|
||||
tolerance = 1.f;
|
||||
execute(p);
|
||||
// Use a planar input format; 'input_reorder' converts it to 'input_format'
|
||||
execute(p, false, true);
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(fusings_gpu, reduce_eltwise_activation_quantize, ::testing::ValuesIn(std::vector<reduce_test_params>{
|
||||
@@ -215,7 +221,6 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, reduce_eltwise_activation_quantize, ::test
|
||||
reduce_test_params{ CASE_REDUCE_U8_1, 2, 5, reduce_mode::max, { 2, 1, 0 }, true, "reduce_ref" },
|
||||
reduce_test_params{ CASE_REDUCE_U8_2, 2, 5, reduce_mode::sum, { 4, 3, 0 }, true, "reduce_ref" },
|
||||
reduce_test_params{ CASE_REDUCE_U8_1, 2, 5, reduce_mode::min, { 3, 2, 1 }, true, "reduce_ref" },
|
||||
reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::sum, { 1, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
|
||||
reduce_test_params{ CASE_REDUCE_U8_4, 2, 5, reduce_mode::mean, { 1, 3 }, true, "reduce_gpu_b_fs_yx_fsv16" },
|
||||
reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::max, { 2, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
|
||||
reduce_test_params{ CASE_REDUCE_U8_4, 2, 5, reduce_mode::sum, { 3, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
|
||||
@@ -228,6 +233,17 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, reduce_eltwise_activation_quantize, ::test
|
||||
reduce_test_params{ CASE_REDUCE_U8_4, 2, 5, reduce_mode::mean, { 3 }, true, "reduce_gpu_b_fs_yx_fsv16" }
|
||||
}));
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(fusings_gpu_bf_axis, reduce_eltwise_activation_quantize, ::testing::ValuesIn(std::vector<reduce_test_params>{
|
||||
reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::sum, { 1, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
|
||||
reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::max, { 1, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
|
||||
reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::prod, { 1, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
|
||||
reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::mean, { 1, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
|
||||
reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::sum_square, { 1, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
|
||||
reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::l1, { 1, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
|
||||
reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::l2, { 1, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
|
||||
reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::log_sum, { 1, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" }
|
||||
}));
|
||||
|
||||
class reduce_scale_activation : public ReduceFusingTest {};
|
||||
TEST_P(reduce_scale_activation, basic) {
|
||||
auto p = GetParam();
|
||||
|
||||
Reference in New Issue
Block a user