[GPU] Fix reduce fs_b_yx_fsv16 bug for MIN and MAX mode (#15060)
This commit is contained in:
parent
0da339a7f2
commit
5bcfdf15df
@ -1821,14 +1821,14 @@ format layout_optimizer::get_preferred_format(program_node& node) {
|
||||
} else if (node.is_type<reduce>()) {
|
||||
auto& reduce_node = node.as<reduce>();
|
||||
auto input_layout = reduce_node.input().get_output_layout();
|
||||
// TODO: Under the current implementation, dynamic shape doesn't support blocked formats. Will support in the future.
|
||||
if (!use_onednn_impls && input_layout.is_dynamic()) {
|
||||
if (input_layout.format.dimension() == 6)
|
||||
if (input_layout.format.dimension() == 6) {
|
||||
expected = format::bfwzyx;
|
||||
else if (input_layout.format.dimension() == 5)
|
||||
} else if (input_layout.format.dimension() == 5) {
|
||||
expected = format::bfzyx;
|
||||
else if (input_layout.format.dimension() == 4)
|
||||
expected = format::bfyx;
|
||||
} else if (input_layout.format.dimension() == 4) {
|
||||
expected = format::any;
|
||||
}
|
||||
}
|
||||
} else if (node.is_type<arg_max_min>()) {
|
||||
// Set default format for issue 92967/98750
|
||||
|
@ -336,7 +336,21 @@ uint offset = batch_out * input_batch_pitch + ((feature_out + FSV - 1) / FSV) *
|
||||
for (uint fi = feature_out; fi < feature_max_val; fi += FSV) {
|
||||
for (uint yi = y_out; yi < y_max_val; ++yi) {
|
||||
for (uint xi = x_out; xi < x_max_val; ++xi) {
|
||||
#if HANDLE_FEATURE_REMAINDER
|
||||
INPUT_VEC input = (INPUT_VEC)(INPUT_INIT_VAL);
|
||||
#if REDUCE_FEATURE && (INPUT0_FEATURE_NUM % FSV != 0)
|
||||
if (fi + FSV <= INPUT0_FEATURE_NUM)
|
||||
input = BLOCK_READ(data, offset);
|
||||
else
|
||||
if (fi + get_sub_group_local_id() < INPUT0_FEATURE_NUM)
|
||||
for (int i = 0; i < READ_OFFSET; ++i)
|
||||
input[i] = data[offset + get_sub_group_local_id() + i * get_max_sub_group_size()];
|
||||
#else
|
||||
input = BLOCK_READ(data, offset);
|
||||
#endif
|
||||
#else
|
||||
INPUT_VEC input = BLOCK_READ(data, offset);
|
||||
#endif
|
||||
unroll_for (int i = 0; i < READ_OFFSET; ++i)
|
||||
acc[i] = FUNC_CALL(apply_reduce)(acc[i], input[i]);
|
||||
offset += input_x_pitch;
|
||||
|
@ -216,6 +216,15 @@ JitConstants ReduceKernel_b_fs_yx_fsv16::GetJitConstants(const reduce_params& pa
|
||||
}
|
||||
}
|
||||
|
||||
// MIN/MAX mode should handle feature remainder in case reduce axes includes feature
|
||||
if (params.reduceMode == ReduceMode::MIN || params.reduceMode == ReduceMode::MAX) {
|
||||
if (count(params.reduceAxes.begin(), params.reduceAxes.end(), 1) > 0) {
|
||||
if (params.inputs[0].Feature().v % 16 != 0) {
|
||||
jit.AddConstant(MakeJitConstant("HANDLE_FEATURE_REMAINDER", 1));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return jit;
|
||||
}
|
||||
|
||||
|
@ -1679,6 +1679,116 @@ TEST(reduce_gpu, dynamic) {
|
||||
}
|
||||
}
|
||||
|
||||
TEST(reduce_gpu, b_fs_yx_fsv16_min_dynamic) {
    // MIN-reduce over the feature axis with 17 features (not a multiple of 16),
    // so the b_fs_yx_fsv16 kernel has to take its feature-remainder path.
    auto& engine = get_test_engine();

    auto input = engine.allocate_memory({data_types::f32, format::bfyx, {1, 17, 1, 2}});
    // Two spatial positions per feature; column 0 ramps 1..9..1, column 1 ramps -1..-9..-1.
    set_values(input, {
        1.0f, -1.0f, 2.0f, -2.0f, 3.0f, -3.0f, 4.0f, -4.0f,
        5.0f, -5.0f, 6.0f, -6.0f, 7.0f, -7.0f, 8.0f, -8.0f,
        9.0f, -9.0f, 8.0f, -8.0f, 7.0f, -7.0f, 6.0f, -6.0f,
        5.0f, -5.0f, 4.0f, -4.0f, 3.0f, -3.0f, 2.0f, -2.0f,
        1.0f, -1.0f
    });

    const auto dyn_layout = layout(ov::PartialShape::dynamic(4), data_types::f32, format::bfyx);
    const auto blocked_layout = layout({1, 17, 1, 2}, data_types::f32, format::b_fs_yx_fsv16);

    topology topo;
    topo.add(input_layout("input", dyn_layout));
    topo.add(reorder("reorder", input_info("input"), blocked_layout));
    topo.add(reduce("reduce", input_info("reorder"), reduce_mode::min, {1}, 0));

    ExecutionConfig config;
    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));

    network net(engine, topo, config);
    net.set_input_data("input", input);

    auto outputs = net.execute();
    ASSERT_EQ(outputs.size(), size_t(1));
    ASSERT_EQ(outputs.begin()->first, "reduce");

    auto output = outputs.at("reduce").get_memory();
    cldnn::mem_lock<float> output_ptr(output, get_test_stream());

    // Expected per-position minimum across all 17 feature values.
    const std::vector<float> expected = {1.0f, -9.0f};
    for (size_t i = 0; i < expected.size(); ++i) {
        ASSERT_TRUE(are_equal(expected[i], output_ptr[i]));
    }
}
|
||||
|
||||
TEST(reduce_gpu, b_fs_yx_fsv16_max_dynamic) {
    // MAX-reduce over the feature axis with 17 features (not a multiple of 16),
    // so the b_fs_yx_fsv16 kernel has to take its feature-remainder path.
    auto& engine = get_test_engine();

    auto input = engine.allocate_memory({data_types::f32, format::bfyx, {1, 17, 1, 2}});
    // Two spatial positions per feature; column 0 ramps 1..9..1, column 1 ramps -1..-9..-1.
    set_values(input, {
        1.0f, -1.0f, 2.0f, -2.0f, 3.0f, -3.0f, 4.0f, -4.0f,
        5.0f, -5.0f, 6.0f, -6.0f, 7.0f, -7.0f, 8.0f, -8.0f,
        9.0f, -9.0f, 8.0f, -8.0f, 7.0f, -7.0f, 6.0f, -6.0f,
        5.0f, -5.0f, 4.0f, -4.0f, 3.0f, -3.0f, 2.0f, -2.0f,
        1.0f, -1.0f
    });

    const auto dyn_layout = layout(ov::PartialShape::dynamic(4), data_types::f32, format::bfyx);
    const auto blocked_layout = layout({1, 17, 1, 2}, data_types::f32, format::b_fs_yx_fsv16);

    topology topo;
    topo.add(input_layout("input", dyn_layout));
    topo.add(reorder("reorder", input_info("input"), blocked_layout));
    topo.add(reduce("reduce", input_info("reorder"), reduce_mode::max, {1}, 0));

    ExecutionConfig config;
    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));

    network net(engine, topo, config);
    net.set_input_data("input", input);

    auto outputs = net.execute();
    ASSERT_EQ(outputs.size(), size_t(1));
    ASSERT_EQ(outputs.begin()->first, "reduce");

    auto output = outputs.at("reduce").get_memory();
    cldnn::mem_lock<float> output_ptr(output, get_test_stream());

    // Expected per-position maximum across all 17 feature values.
    const std::vector<float> expected = {9.0f, -1.0f};
    for (size_t i = 0; i < expected.size(); ++i) {
        ASSERT_TRUE(are_equal(expected[i], output_ptr[i]));
    }
}
|
||||
|
||||
template <data_types InputT, data_types OutputT>
|
||||
class ReduceXYWithBigTensorTestBase : public ::testing::TestWithParam<TestParamType_general_reduce_gpu> {
|
||||
protected:
|
||||
|
@ -139,7 +139,7 @@ const ReduceInput dyn1d = {
|
||||
|
||||
const ReduceInput dyn2d = {
|
||||
{
|
||||
{ {-1, -1}, {{4, 5}, {5, 6}} }
|
||||
{ {-1, -1}, {{100, 3}, {5, 6}} }
|
||||
},
|
||||
{1}
|
||||
};
|
||||
|
Loading…
Reference in New Issue
Block a user