[GPU] Fix reduce b_fs_yx_fsv16 bug for MIN and MAX mode (#15060)

Kelvin Choi 2023-02-01 09:02:42 +09:00 committed by GitHub
parent 0da339a7f2
commit 5bcfdf15df
5 changed files with 139 additions and 6 deletions

View File

@@ -1821,14 +1821,14 @@ format layout_optimizer::get_preferred_format(program_node& node) {
     } else if (node.is_type<reduce>()) {
         auto& reduce_node = node.as<reduce>();
         auto input_layout = reduce_node.input().get_output_layout();
         // TODO: Under the current implementation, dynamic shape doesn't support blocked format. Will support in future.
         if (!use_onednn_impls && input_layout.is_dynamic()) {
-            if (input_layout.format.dimension() == 6)
+            if (input_layout.format.dimension() == 6) {
                 expected = format::bfwzyx;
-            else if (input_layout.format.dimension() == 5)
+            } else if (input_layout.format.dimension() == 5) {
                 expected = format::bfzyx;
-            else if (input_layout.format.dimension() == 4)
-                expected = format::bfyx;
+            } else if (input_layout.format.dimension() == 4) {
+                expected = format::any;
+            }
         }
     } else if (node.is_type<arg_max_min>()) {
         // Set default format for issue 92967/98750

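Read as a whole, this hunk makes dynamic-shape reduce fall back to a plain format chosen by rank, and for 4-D inputs it now leaves the format unconstrained (format::any) instead of forcing bfyx, so a blocked b_fs_yx_fsv16 layout can reach the kernel fixed below. A minimal standalone sketch of that selection, with hypothetical names (not part of the patch):

    enum class fmt { any, bfyx, bfzyx, bfwzyx };

    // Mirrors the rank-based fallback above: plain formats for 5-D/6-D
    // dynamic reduces, no constraint for 4-D so a blocked layout stays legal.
    fmt preferred_dynamic_reduce_format(int rank) {
        switch (rank) {
            case 6:  return fmt::bfwzyx;
            case 5:  return fmt::bfzyx;
            case 4:  return fmt::any;  // previously forced to bfyx
            default: return fmt::any;  // other ranks: keep the caller's default
        }
    }
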
View File

@@ -336,7 +336,21 @@ uint offset = batch_out * input_batch_pitch + ((feature_out + FSV - 1) / FSV) *
     for (uint fi = feature_out; fi < feature_max_val; fi += FSV) {
         for (uint yi = y_out; yi < y_max_val; ++yi) {
             for (uint xi = x_out; xi < x_max_val; ++xi) {
+#if HANDLE_FEATURE_REMAINDER
+                INPUT_VEC input = (INPUT_VEC)(INPUT_INIT_VAL);
+#if REDUCE_FEATURE && (INPUT0_FEATURE_NUM % FSV != 0)
+                if (fi + FSV <= INPUT0_FEATURE_NUM)
+                    input = BLOCK_READ(data, offset);
+                else
+                    if (fi + get_sub_group_local_id() < INPUT0_FEATURE_NUM)
+                        for (int i = 0; i < READ_OFFSET; ++i)
+                            input[i] = data[offset + get_sub_group_local_id() + i * get_max_sub_group_size()];
+#else
+                input = BLOCK_READ(data, offset);
+#endif
+#else
                 INPUT_VEC input = BLOCK_READ(data, offset);
+#endif
                 unroll_for (int i = 0; i < READ_OFFSET; ++i)
                     acc[i] = FUNC_CALL(apply_reduce)(acc[i], input[i]);
                 offset += input_x_pitch;

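The bug this guards against: with, say, 17 features the second 16-wide feature slice holds one real value and 15 lanes of layout padding, and an unconditional BLOCK_READ feeds that padding (zeros) into the accumulator. Zero is the identity for SUM but not for MIN/MAX, so the new path starts from INPUT_INIT_VAL (the reduce identity) and reads the remainder lanes one by one under a bounds check. A scalar C++ sketch of the same masking, assuming FSV = 16 (hypothetical names, not the kernel code):

    #include <algorithm>
    #include <cstddef>
    #include <limits>
    #include <vector>

    constexpr std::size_t FSV = 16;

    // MIN-reduce one 16-wide feature slice starting at feature fi.
    float reduce_min_feature_block(const std::vector<float>& data,
                                   std::size_t fi, std::size_t feature_num) {
        float acc = std::numeric_limits<float>::infinity();  // INPUT_INIT_VAL for MIN
        for (std::size_t lane = 0; lane < FSV; ++lane) {
            if (fi + lane < feature_num)  // mask the remainder lanes
                acc = std::min(acc, data[fi + lane]);
            // else: the lane is padding; folding its zero into acc would
            // wrongly clamp an all-positive column's minimum to 0.
        }
        return acc;
    }
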
View File

@@ -216,6 +216,15 @@ JitConstants ReduceKernel_b_fs_yx_fsv16::GetJitConstants(const reduce_params& pa
         }
     }
+
+    // MIN/MAX mode should handle the feature remainder in case the reduce axes include feature
+    if (params.reduceMode == ReduceMode::MIN || params.reduceMode == ReduceMode::MAX) {
+        if (count(params.reduceAxes.begin(), params.reduceAxes.end(), 1) > 0) {
+            if (params.inputs[0].Feature().v % 16 != 0) {
+                jit.AddConstant(MakeJitConstant("HANDLE_FEATURE_REMAINDER", 1));
+            }
+        }
+    }
     return jit;
 }

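So HANDLE_FEATURE_REMAINDER is emitted only when all three conditions line up: MIN or MAX mode, the feature axis (index 1) among the reduce axes, and a feature count that is not a multiple of the 16-wide slice; the tests below use 17 features, and 17 % 16 == 1 takes the remainder path. A free-standing sketch of the predicate (hypothetical helper, not the kernel-selector API):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Hypothetical standalone version of the check above.
    bool needs_feature_remainder(bool is_min_or_max,
                                 const std::vector<int>& reduce_axes,
                                 std::size_t feature_num) {
        const bool reduces_feature =
            std::count(reduce_axes.begin(), reduce_axes.end(), 1) > 0;
        return is_min_or_max && reduces_feature && (feature_num % 16 != 0);
    }

    // needs_feature_remainder(true, {1}, 17) -> true  (emit the define)
    // needs_feature_remainder(true, {1}, 32) -> false (block reads suffice)
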
View File

@@ -1679,6 +1679,116 @@ TEST(reduce_gpu, dynamic) {
     }
 }
+
+TEST(reduce_gpu, b_fs_yx_fsv16_min_dynamic) {
+    auto& engine = get_test_engine();
+
+    auto input = engine.allocate_memory({data_types::f32, format::bfyx, {1, 17, 1, 2}});
+    set_values(input, {
+        1.0f, -1.0f,
+        2.0f, -2.0f,
+        3.0f, -3.0f,
+        4.0f, -4.0f,
+        5.0f, -5.0f,
+        6.0f, -6.0f,
+        7.0f, -7.0f,
+        8.0f, -8.0f,
+        9.0f, -9.0f,
+        8.0f, -8.0f,
+        7.0f, -7.0f,
+        6.0f, -6.0f,
+        5.0f, -5.0f,
+        4.0f, -4.0f,
+        3.0f, -3.0f,
+        2.0f, -2.0f,
+        1.0f, -1.0f
+    });
+
+    topology topology;
+    auto in_layout = layout(ov::PartialShape::dynamic(4), data_types::f32, format::bfyx);
+    const auto used_layout = layout({1, 17, 1, 2}, data_types::f32, format::b_fs_yx_fsv16);
+    topology.add(input_layout("input", in_layout));
+    topology.add(reorder("reorder", input_info("input"), used_layout));
+    topology.add(reduce("reduce", input_info("reorder"), reduce_mode::min, {1}, 0));
+
+    ExecutionConfig config;
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+    network network(engine, topology, config);
+    network.set_input_data("input", input);
+
+    auto outputs = network.execute();
+    ASSERT_EQ(outputs.size(), size_t(1));
+    ASSERT_EQ(outputs.begin()->first, "reduce");
+
+    auto output = outputs.at("reduce").get_memory();
+    std::vector<float> ref_data = {1.0f, -9.0f};
+    cldnn::mem_lock<float> output_ptr(output, get_test_stream());
+    for (size_t i = 0; i < ref_data.size(); ++i) {
+        ASSERT_TRUE(are_equal(ref_data[i], output_ptr[i]));
+    }
+}
+
+TEST(reduce_gpu, b_fs_yx_fsv16_max_dynamic) {
+    auto& engine = get_test_engine();
+
+    auto input = engine.allocate_memory({data_types::f32, format::bfyx, {1, 17, 1, 2}});
+    set_values(input, {
+        1.0f, -1.0f,
+        2.0f, -2.0f,
+        3.0f, -3.0f,
+        4.0f, -4.0f,
+        5.0f, -5.0f,
+        6.0f, -6.0f,
+        7.0f, -7.0f,
+        8.0f, -8.0f,
+        9.0f, -9.0f,
+        8.0f, -8.0f,
+        7.0f, -7.0f,
+        6.0f, -6.0f,
+        5.0f, -5.0f,
+        4.0f, -4.0f,
+        3.0f, -3.0f,
+        2.0f, -2.0f,
+        1.0f, -1.0f
+    });
+
+    topology topology;
+    auto in_layout = layout(ov::PartialShape::dynamic(4), data_types::f32, format::bfyx);
+    const auto used_layout = layout({1, 17, 1, 2}, data_types::f32, format::b_fs_yx_fsv16);
+    topology.add(input_layout("input", in_layout));
+    topology.add(reorder("reorder", input_info("input"), used_layout));
+    topology.add(reduce("reduce", input_info("reorder"), reduce_mode::max, {1}, 0));
+
+    ExecutionConfig config;
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+    network network(engine, topology, config);
+    network.set_input_data("input", input);
+
+    auto outputs = network.execute();
+    ASSERT_EQ(outputs.size(), size_t(1));
+    ASSERT_EQ(outputs.begin()->first, "reduce");
+
+    auto output = outputs.at("reduce").get_memory();
+    std::vector<float> ref_data = {9.0f, -1.0f};
+    cldnn::mem_lock<float> output_ptr(output, get_test_stream());
+    for (size_t i = 0; i < ref_data.size(); ++i) {
+        ASSERT_TRUE(are_equal(ref_data[i], output_ptr[i]));
+    }
+}
+
 template <data_types InputT, data_types OutputT>
 class ReduceXYWithBigTensorTestBase : public ::testing::TestWithParam<TestParamType_general_reduce_gpu> {
 protected:

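Both tests reduce a 1x17x1x2 tensor whose first column runs 1..9..1 and whose second column is its negation, so per x position the 17 features reduce to {1, -9} for MIN and {9, -1} for MAX; 17 % 16 == 1 leaves exactly one real feature in the second fsv16 slice, precisely the case the old kernel polluted with padding. A plain-C++ reference for the MIN expectation (a sketch, not the cldnn test API):

    #include <algorithm>
    #include <cassert>
    #include <cstddef>
    #include <limits>
    #include <vector>

    // Reference MIN over the feature axis of a [1, F, 1, X] tensor stored
    // feature-by-feature with X values per row, as in set_values() above.
    std::vector<float> reference_min_over_features(const std::vector<float>& data,
                                                   std::size_t features, std::size_t x) {
        std::vector<float> out(x, std::numeric_limits<float>::infinity());
        for (std::size_t f = 0; f < features; ++f)
            for (std::size_t xi = 0; xi < x; ++xi)
                out[xi] = std::min(out[xi], data[f * x + xi]);
        return out;
    }

    int main() {
        std::vector<float> data;
        for (float v : {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f,
                        8.f, 7.f, 6.f, 5.f, 4.f, 3.f, 2.f, 1.f}) {
            data.push_back(v);   // first column
            data.push_back(-v);  // second column
        }
        auto out = reference_min_over_features(data, 17, 2);
        assert(out[0] == 1.0f && out[1] == -9.0f);  // ref_data of the MIN test
        return 0;
    }
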
View File

@@ -139,7 +139,7 @@ const ReduceInput dyn1d = {
 const ReduceInput dyn2d = {
     {
-        { {-1, -1}, {{4, 5}, {5, 6}} }
+        { {-1, -1}, {{100, 3}, {5, 6}} }
     },
     {1}
 };