[GPU] Fix reduce fs_b_yx_fsv16 bug for MIN and MAX mode (#15060)
This commit is contained in:
parent
0da339a7f2
commit
5bcfdf15df
@ -1821,14 +1821,14 @@ format layout_optimizer::get_preferred_format(program_node& node) {
|
||||
} else if (node.is_type<reduce>()) {
|
||||
auto& reduce_node = node.as<reduce>();
|
||||
auto input_layout = reduce_node.input().get_output_layout();
|
||||
// TODO: Under the current implementation, dynamic shape doesn't support blocked formats. Will support in the future.
|
||||
if (!use_onednn_impls && input_layout.is_dynamic()) {
|
||||
if (input_layout.format.dimension() == 6)
|
||||
if (input_layout.format.dimension() == 6) {
|
||||
expected = format::bfwzyx;
|
||||
else if (input_layout.format.dimension() == 5)
|
||||
} else if (input_layout.format.dimension() == 5) {
|
||||
expected = format::bfzyx;
|
||||
else if (input_layout.format.dimension() == 4)
|
||||
expected = format::bfyx;
|
||||
} else if (input_layout.format.dimension() == 4) {
|
||||
expected = format::any;
|
||||
}
|
||||
}
|
||||
} else if (node.is_type<arg_max_min>()) {
|
||||
// Set default format for issue 92967/98750
|
||||
|
@ -336,7 +336,21 @@ uint offset = batch_out * input_batch_pitch + ((feature_out + FSV - 1) / FSV) *
|
||||
for (uint fi = feature_out; fi < feature_max_val; fi += FSV) {
|
||||
for (uint yi = y_out; yi < y_max_val; ++yi) {
|
||||
for (uint xi = x_out; xi < x_max_val; ++xi) {
|
||||
#if HANDLE_FEATURE_REMAINDER
|
||||
INPUT_VEC input = (INPUT_VEC)(INPUT_INIT_VAL);
|
||||
#if REDUCE_FEATURE && (INPUT0_FEATURE_NUM % FSV != 0)
|
||||
if (fi + FSV <= INPUT0_FEATURE_NUM)
|
||||
input = BLOCK_READ(data, offset);
|
||||
else
|
||||
if (fi + get_sub_group_local_id() < INPUT0_FEATURE_NUM)
|
||||
for (int i = 0; i < READ_OFFSET; ++i)
|
||||
input[i] = data[offset + get_sub_group_local_id() + i * get_max_sub_group_size()];
|
||||
#else
|
||||
input = BLOCK_READ(data, offset);
|
||||
#endif
|
||||
#else
|
||||
INPUT_VEC input = BLOCK_READ(data, offset);
|
||||
#endif
|
||||
unroll_for (int i = 0; i < READ_OFFSET; ++i)
|
||||
acc[i] = FUNC_CALL(apply_reduce)(acc[i], input[i]);
|
||||
offset += input_x_pitch;
|
||||
|
@ -216,6 +216,15 @@ JitConstants ReduceKernel_b_fs_yx_fsv16::GetJitConstants(const reduce_params& pa
|
||||
}
|
||||
}
|
||||
|
||||
// MIN/MAX mode should handle feature remainder in case reduce axes includes feature
|
||||
if (params.reduceMode == ReduceMode::MIN || params.reduceMode == ReduceMode::MAX) {
|
||||
if (count(params.reduceAxes.begin(), params.reduceAxes.end(), 1) > 0) {
|
||||
if (params.inputs[0].Feature().v % 16 != 0) {
|
||||
jit.AddConstant(MakeJitConstant("HANDLE_FEATURE_REMAINDER", 1));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return jit;
|
||||
}
|
||||
|
||||
|
@ -1679,6 +1679,116 @@ TEST(reduce_gpu, dynamic) {
|
||||
}
|
||||
}
|
||||
|
||||
TEST(reduce_gpu, b_fs_yx_fsv16_min_dynamic) {
    // MIN-reduce over the feature axis with 17 features (not a multiple of 16),
    // so the b_fs_yx_fsv16 kernel has to take its feature-remainder path.
    auto& engine = get_test_engine();

    auto input = engine.allocate_memory({data_types::f32, format::bfyx, {1, 17, 1, 2}});
    // Two spatial positions per feature; column 0 ramps 1..9..1, column 1 ramps -1..-9..-1.
    set_values(input, {
        1.0f, -1.0f, 2.0f, -2.0f, 3.0f, -3.0f, 4.0f, -4.0f,
        5.0f, -5.0f, 6.0f, -6.0f, 7.0f, -7.0f, 8.0f, -8.0f,
        9.0f, -9.0f, 8.0f, -8.0f, 7.0f, -7.0f, 6.0f, -6.0f,
        5.0f, -5.0f, 4.0f, -4.0f, 3.0f, -3.0f, 2.0f, -2.0f,
        1.0f, -1.0f
    });

    const auto dyn_layout = layout(ov::PartialShape::dynamic(4), data_types::f32, format::bfyx);
    const auto blocked_layout = layout({1, 17, 1, 2}, data_types::f32, format::b_fs_yx_fsv16);

    topology topo;
    topo.add(input_layout("input", dyn_layout));
    topo.add(reorder("reorder", input_info("input"), blocked_layout));
    topo.add(reduce("reduce", input_info("reorder"), reduce_mode::min, {1}, 0));

    ExecutionConfig config;
    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));

    network net(engine, topo, config);
    net.set_input_data("input", input);

    auto outputs = net.execute();
    ASSERT_EQ(outputs.size(), size_t(1));
    ASSERT_EQ(outputs.begin()->first, "reduce");

    auto output = outputs.at("reduce").get_memory();
    cldnn::mem_lock<float> output_ptr(output, get_test_stream());

    // Expected per-position minimum across all 17 feature values.
    const std::vector<float> expected = {1.0f, -9.0f};
    for (size_t i = 0; i < expected.size(); ++i) {
        ASSERT_TRUE(are_equal(expected[i], output_ptr[i]));
    }
}
|
||||
|
||||
TEST(reduce_gpu, b_fs_yx_fsv16_max_dynamic) {
    // MAX-reduce over the feature axis with 17 features (not a multiple of 16),
    // so the b_fs_yx_fsv16 kernel has to take its feature-remainder path.
    auto& engine = get_test_engine();

    auto input = engine.allocate_memory({data_types::f32, format::bfyx, {1, 17, 1, 2}});
    // Two spatial positions per feature; column 0 ramps 1..9..1, column 1 ramps -1..-9..-1.
    set_values(input, {
        1.0f, -1.0f, 2.0f, -2.0f, 3.0f, -3.0f, 4.0f, -4.0f,
        5.0f, -5.0f, 6.0f, -6.0f, 7.0f, -7.0f, 8.0f, -8.0f,
        9.0f, -9.0f, 8.0f, -8.0f, 7.0f, -7.0f, 6.0f, -6.0f,
        5.0f, -5.0f, 4.0f, -4.0f, 3.0f, -3.0f, 2.0f, -2.0f,
        1.0f, -1.0f
    });

    const auto dyn_layout = layout(ov::PartialShape::dynamic(4), data_types::f32, format::bfyx);
    const auto blocked_layout = layout({1, 17, 1, 2}, data_types::f32, format::b_fs_yx_fsv16);

    topology topo;
    topo.add(input_layout("input", dyn_layout));
    topo.add(reorder("reorder", input_info("input"), blocked_layout));
    topo.add(reduce("reduce", input_info("reorder"), reduce_mode::max, {1}, 0));

    ExecutionConfig config;
    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));

    network net(engine, topo, config);
    net.set_input_data("input", input);

    auto outputs = net.execute();
    ASSERT_EQ(outputs.size(), size_t(1));
    ASSERT_EQ(outputs.begin()->first, "reduce");

    auto output = outputs.at("reduce").get_memory();
    cldnn::mem_lock<float> output_ptr(output, get_test_stream());

    // Expected per-position maximum across all 17 feature values.
    const std::vector<float> expected = {9.0f, -1.0f};
    for (size_t i = 0; i < expected.size(); ++i) {
        ASSERT_TRUE(are_equal(expected[i], output_ptr[i]));
    }
}
|
||||
|
||||
template <data_types InputT, data_types OutputT>
|
||||
class ReduceXYWithBigTensorTestBase : public ::testing::TestWithParam<TestParamType_general_reduce_gpu> {
|
||||
protected:
|
||||
|
@ -139,7 +139,7 @@ const ReduceInput dyn1d = {
|
||||
|
||||
const ReduceInput dyn2d = {
|
||||
{
|
||||
{ {-1, -1}, {{4, 5}, {5, 6}} }
|
||||
{ {-1, -1}, {{100, 3}, {5, 6}} }
|
||||
},
|
||||
{1}
|
||||
};
|
||||
|
Loading…
Reference in New Issue
Block a user