[GPU] BugFix reduce_b_fs_yx_fsv16 kernel (#17477)
+ Fixed an invalid calculation when reducing an un-aligned feature axis for b_fs_yx_fsv16. + Some reduce modes are not invariant when 0 values outside the valid range are read. + Added the jit macro ZERO_INVARIANT_REDUCTION. + Enabled blocked unit-tests on dGPU (see PR #15873). Signed-off-by: Min, Byungil <byungil.min@intel.com>
This commit is contained in:
@@ -156,7 +156,7 @@ KERNEL(reduce_fsv16)(
|
||||
const uint xy = (uint)get_global_id(1) * READ_OFFSET;
|
||||
const uint x = xy % ALIGN(COMMON_OUTPUT_SIZE_X, READ_OFFSET);
|
||||
const uint y = xy / ALIGN(COMMON_OUTPUT_SIZE_X, READ_OFFSET);
|
||||
#endif
|
||||
#endif // !IS_REDUCE_XY
|
||||
const uint bf = (uint)get_global_id(2) * SIMD;
|
||||
const uint b = bf / ALIGN(COMMON_OUTPUT_FEATURE_NUM, SIMD);
|
||||
const uint f = bf % ALIGN(COMMON_OUTPUT_FEATURE_NUM, SIMD);
|
||||
@@ -252,7 +252,7 @@ uint offset = batch_out * input_batch_pitch + ((feature_out + FSV - 1) / FSV) *
|
||||
for (uint yi = y_out; yi < y_max_val; ++yi) {
|
||||
for (uint xi = x_out; xi < x_max_val; ++xi) {
|
||||
INPUT_VEC input = (INPUT_VEC)(INPUT_INIT_VAL);
|
||||
#if REDUCE_FEATURE && (INPUT0_FEATURE_NUM % FSV != 0)
|
||||
#if REDUCE_FEATURE && (INPUT0_FEATURE_NUM % FSV != 0) && !ZERO_INVARIANT_REDUCTION
|
||||
if (fi + FSV <= INPUT0_FEATURE_NUM)
|
||||
input = BLOCK_READ(data, offset);
|
||||
else
|
||||
@@ -269,7 +269,7 @@ uint offset = batch_out * input_batch_pitch + ((feature_out + FSV - 1) / FSV) *
|
||||
#if INPUT0_SIZE_X % READ_OFFSET != 0
|
||||
for (uint xi = x_leftover_start; xi < x_leftover_end; ++xi) {
|
||||
INPUT0_TYPE leftovers = INIT_VAL;
|
||||
#if REDUCE_FEATURE && (INPUT0_FEATURE_NUM % FSV != 0)
|
||||
#if REDUCE_FEATURE && (INPUT0_FEATURE_NUM % FSV != 0) && !ZERO_INVARIANT_REDUCTION
|
||||
if (fi + FSV <= INPUT0_FEATURE_NUM)
|
||||
leftovers = DT_INPUT_BLOCK_READ(data, offset);
|
||||
else
|
||||
@@ -330,15 +330,15 @@ uint offset = batch_out * input_batch_pitch + ((feature_out + FSV - 1) / FSV) *
|
||||
if (get_sub_group_local_id() == 0)
|
||||
output[out_idx] = final_result;
|
||||
#endif
|
||||
#else
|
||||
#else // !REDUCE_X
|
||||
ACCUMULATOR_VEC acc = (ACCUMULATOR_VEC)(INIT_VAL);
|
||||
for (uint bi = batch_out; bi < batch_max_val; ++bi) {
|
||||
for (uint fi = feature_out; fi < feature_max_val; fi += FSV) {
|
||||
|
||||
for (uint yi = y_out; yi < y_max_val; ++yi) {
|
||||
for (uint xi = x_out; xi < x_max_val; ++xi) {
|
||||
#if HANDLE_FEATURE_REMAINDER
|
||||
INPUT_VEC input = (INPUT_VEC)(INPUT_INIT_VAL);
|
||||
#if REDUCE_FEATURE && (INPUT0_FEATURE_NUM % FSV != 0)
|
||||
#if REDUCE_FEATURE && (INPUT0_FEATURE_NUM % FSV != 0) && !ZERO_INVARIANT_REDUCTION
|
||||
INPUT_VEC input = (INPUT_VEC)(INPUT_INIT_VAL);
|
||||
if (fi + FSV <= INPUT0_FEATURE_NUM)
|
||||
input = BLOCK_READ(data, offset);
|
||||
else
|
||||
@@ -346,11 +346,9 @@ uint offset = batch_out * input_batch_pitch + ((feature_out + FSV - 1) / FSV) *
|
||||
for (int i = 0; i < READ_OFFSET; ++i)
|
||||
input[i] = data[offset + get_sub_group_local_id() + i * get_max_sub_group_size()];
|
||||
#else
|
||||
input = BLOCK_READ(data, offset);
|
||||
INPUT_VEC input = BLOCK_READ(data, offset);
|
||||
#endif
|
||||
#else
|
||||
INPUT_VEC input = BLOCK_READ(data, offset);
|
||||
#endif
|
||||
|
||||
unroll_for (int i = 0; i < READ_OFFSET; ++i)
|
||||
acc[i] = FUNC_CALL(apply_reduce)(acc[i], input[i]);
|
||||
offset += input_x_pitch;
|
||||
@@ -410,7 +408,7 @@ uint offset = batch_out * input_batch_pitch + ((feature_out + FSV - 1) / FSV) *
|
||||
#endif
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#endif // !REDUCE_X
|
||||
}
|
||||
|
||||
#undef SIMD
|
||||
|
||||
@@ -12,6 +12,7 @@ namespace kernel_selector {
|
||||
|
||||
static const size_t SIMD = 16;
|
||||
static const size_t XY_OPT_F_LIMITS = 96;
|
||||
static const size_t AXIS_F = 1;
|
||||
static const size_t AXIS_Y = 2;
|
||||
static const size_t AXIS_X = 3;
|
||||
using NDims = std::vector<kernel_selector::Tensor::Dim>;
|
||||
@@ -78,6 +79,15 @@ static bool can_opt_reduce_xy(const reduce_params& params) {
|
||||
input_dims[1].v <= XY_OPT_F_LIMITS;
|
||||
}
|
||||
|
||||
static bool reducing_unaligned_f_axis(const reduce_params& params) {
|
||||
if (count(params.reduceAxes.begin(), params.reduceAxes.end(), AXIS_F) > 0) {
|
||||
if (params.inputs[0].Feature().v % 16 != 0)
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
ParamsKey ReduceKernel_b_fs_yx_fsv16::GetSupportedKey() const {
|
||||
ParamsKey k;
|
||||
k.EnableInputDataType(Datatype::F16);
|
||||
@@ -216,13 +226,14 @@ JitConstants ReduceKernel_b_fs_yx_fsv16::GetJitConstants(const reduce_params& pa
|
||||
}
|
||||
}
|
||||
|
||||
// MIN/MAX modes must handle the feature remainder when the reduce axes include the feature axis
|
||||
if (params.reduceMode == ReduceMode::MIN || params.reduceMode == ReduceMode::MAX) {
|
||||
if (count(params.reduceAxes.begin(), params.reduceAxes.end(), 1) > 0) {
|
||||
if (params.inputs[0].Feature().v % 16 != 0) {
|
||||
jit.AddConstant(MakeJitConstant("HANDLE_FEATURE_REMAINDER", 1));
|
||||
}
|
||||
}
|
||||
// Some reduce modes are affected by out-of-range 0 values (e.g. min, max, prod); the modes listed below are invariant to extra zeros
|
||||
bool zero_invariant_mode = params.reduceMode == ReduceMode::L1 || params.reduceMode == ReduceMode::L2 ||
|
||||
params.reduceMode == ReduceMode::LOG_SUM || params.reduceMode == ReduceMode::LOG_SUM_EXP ||
|
||||
params.reduceMode == ReduceMode::MEAN || params.reduceMode == ReduceMode::OR ||
|
||||
params.reduceMode == ReduceMode::SUM || params.reduceMode == ReduceMode::SUM_SQUARE;
|
||||
|
||||
if (zero_invariant_mode && reducing_unaligned_f_axis(params)) {
|
||||
jit.AddConstant(MakeJitConstant("ZERO_INVARIANT_REDUCTION", 1));
|
||||
}
|
||||
|
||||
return jit;
|
||||
@@ -232,6 +243,8 @@ KernelsData ReduceKernel_b_fs_yx_fsv16::GetKernelsData(const Params& params, con
|
||||
KernelsData kds = GetCommonKernelsData(params, options);
|
||||
const reduce_params& orgParams = static_cast<const reduce_params&>(params);
|
||||
|
||||
// To get a performance gain when reducing an un-aligned f axis,
|
||||
// the reduce kernel relies on 0 values outside the valid range in the inner block, which requires disabling memory re-use
|
||||
if (orgParams.inputs[0].Feature().v % 16 != 0) {
|
||||
kds[0].can_reuse_memory = false;
|
||||
}
|
||||
|
||||
@@ -34,8 +34,10 @@ struct reduce_test_params {
|
||||
|
||||
class ReduceFusingTest : public ::BaseFusingTest<reduce_test_params> {
|
||||
public:
|
||||
void execute(reduce_test_params& p, bool is_dynamic = false) {
|
||||
auto input_prim = get_mem(get_input_layout(p));
|
||||
// When the input generator fills values for blocked formats, it also writes random values outside of the shape.
|
||||
// To avoid this generator issue, use a proper planar format given by 'default_format' and add a reorder.
|
||||
void execute(reduce_test_params& p, bool is_dynamic = false, bool use_planar_input = false) {
|
||||
auto input_prim = get_mem(get_input_layout(p, use_planar_input));
|
||||
|
||||
cfg_not_fused.set_property(ov::intel_gpu::allow_new_shape_infer(is_dynamic));
|
||||
cfg_fused.set_property(ov::intel_gpu::allow_new_shape_infer(is_dynamic));
|
||||
@@ -63,8 +65,11 @@ public:
|
||||
return layout{ ov::PartialShape::dynamic(p.in_shape.size()), p.data_type, p.input_format };
|
||||
}
|
||||
|
||||
layout get_input_layout(reduce_test_params& p) {
|
||||
return layout{ p.in_shape, p.data_type, p.input_format };
|
||||
layout get_input_layout(reduce_test_params& p, bool use_planar_input = false) {
|
||||
if (use_planar_input)
|
||||
return layout{ p.in_shape, p.data_type, format::get_default_format(p.input_format)};
|
||||
else
|
||||
return layout{ p.in_shape, p.data_type, p.input_format };
|
||||
}
|
||||
|
||||
layout get_output_layout(reduce_test_params& p) {
|
||||
@@ -113,19 +118,17 @@ public:
|
||||
|
||||
class reduce_eltwise_activation_quantize : public ReduceFusingTest {};
|
||||
TEST_P(reduce_eltwise_activation_quantize, basic) {
|
||||
// TODO: Fix me, refer PR(#15873)
|
||||
if (engine.get_device_info().supports_immad)
|
||||
return;
|
||||
auto p = GetParam();
|
||||
update_out_shape(p);
|
||||
create_topologies(
|
||||
input_layout("input", get_input_layout(p)),
|
||||
input_layout("input", get_input_layout(p, true)),
|
||||
reorder("input_reorder", input_info("input"), p.input_format, p.data_type),
|
||||
data("in_lo", get_mem(get_single_element_layout(p), min_random, 0)),
|
||||
data("in_hi", get_mem(get_single_element_layout(p), 1, max_random)),
|
||||
data("out_lo", get_mem(get_single_element_layout(p), -128)),
|
||||
data("out_hi", get_mem(get_single_element_layout(p), 127)),
|
||||
data("eltwise_data", get_mem(get_output_layout(p))),
|
||||
reduce("reduce", input_info("input"), p.reduce_mode, p.reduce_axes, p.keep_dims),
|
||||
reduce("reduce", input_info("input_reorder"), p.reduce_mode, p.reduce_axes, p.keep_dims),
|
||||
eltwise("eltwise", { input_info("reduce"), input_info("eltwise_data") }, eltwise_mode::sum, p.default_type),
|
||||
activation("activation", input_info("eltwise"), activation_func::relu),
|
||||
quantize("quantize", input_info("activation"), input_info("in_lo"), input_info("in_hi"),
|
||||
@@ -134,20 +137,22 @@ TEST_P(reduce_eltwise_activation_quantize, basic) {
|
||||
);
|
||||
|
||||
tolerance = 1.f;
|
||||
execute(p);
|
||||
// Use a planar input format; 'input_reorder' converts it to 'input_format'
|
||||
execute(p, false, true);
|
||||
}
|
||||
|
||||
TEST_P(reduce_eltwise_activation_quantize, per_channel) {
|
||||
auto p = GetParam();
|
||||
update_out_shape(p);
|
||||
create_topologies(
|
||||
input_layout("input", get_input_layout(p)),
|
||||
input_layout("input", get_input_layout(p, true)),
|
||||
reorder("input_reorder", input_info("input"), p.input_format, p.data_type),
|
||||
data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
|
||||
data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)),
|
||||
data("out_lo", get_mem(get_single_element_layout(p), -128)),
|
||||
data("out_hi", get_mem(get_single_element_layout(p), 127)),
|
||||
data("eltwise_data", get_mem(get_output_layout(p))),
|
||||
reduce("reduce", input_info("input"), p.reduce_mode, p.reduce_axes, p.keep_dims),
|
||||
reduce("reduce", input_info("input_reorder"), p.reduce_mode, p.reduce_axes, p.keep_dims),
|
||||
eltwise("eltwise", { input_info("reduce"), input_info("eltwise_data") }, eltwise_mode::sum, p.default_type),
|
||||
activation("activation", input_info("eltwise"), activation_func::relu),
|
||||
quantize("quantize", input_info("activation"), input_info("in_lo"), input_info("in_hi"), input_info("out_lo"), input_info("out_hi"), 256, data_types::i8),
|
||||
@@ -155,7 +160,8 @@ TEST_P(reduce_eltwise_activation_quantize, per_channel) {
|
||||
);
|
||||
|
||||
tolerance = 1.f;
|
||||
execute(p);
|
||||
// Use a planar input format; 'input_reorder' converts it to 'input_format'
|
||||
execute(p, false, true);
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(fusings_gpu, reduce_eltwise_activation_quantize, ::testing::ValuesIn(std::vector<reduce_test_params>{
|
||||
@@ -215,7 +221,6 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, reduce_eltwise_activation_quantize, ::test
|
||||
reduce_test_params{ CASE_REDUCE_U8_1, 2, 5, reduce_mode::max, { 2, 1, 0 }, true, "reduce_ref" },
|
||||
reduce_test_params{ CASE_REDUCE_U8_2, 2, 5, reduce_mode::sum, { 4, 3, 0 }, true, "reduce_ref" },
|
||||
reduce_test_params{ CASE_REDUCE_U8_1, 2, 5, reduce_mode::min, { 3, 2, 1 }, true, "reduce_ref" },
|
||||
reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::sum, { 1, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
|
||||
reduce_test_params{ CASE_REDUCE_U8_4, 2, 5, reduce_mode::mean, { 1, 3 }, true, "reduce_gpu_b_fs_yx_fsv16" },
|
||||
reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::max, { 2, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
|
||||
reduce_test_params{ CASE_REDUCE_U8_4, 2, 5, reduce_mode::sum, { 3, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
|
||||
@@ -228,6 +233,17 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, reduce_eltwise_activation_quantize, ::test
|
||||
reduce_test_params{ CASE_REDUCE_U8_4, 2, 5, reduce_mode::mean, { 3 }, true, "reduce_gpu_b_fs_yx_fsv16" }
|
||||
}));
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(fusings_gpu_bf_axis, reduce_eltwise_activation_quantize, ::testing::ValuesIn(std::vector<reduce_test_params>{
|
||||
reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::sum, { 1, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
|
||||
reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::max, { 1, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
|
||||
reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::prod, { 1, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
|
||||
reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::mean, { 1, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
|
||||
reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::sum_square, { 1, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
|
||||
reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::l1, { 1, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
|
||||
reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::l2, { 1, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" },
|
||||
reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::log_sum, { 1, 0 }, true, "reduce_gpu_b_fs_yx_fsv16" }
|
||||
}));
|
||||
|
||||
class reduce_scale_activation : public ReduceFusingTest {};
|
||||
TEST_P(reduce_scale_activation, basic) {
|
||||
auto p = GetParam();
|
||||
|
||||
Reference in New Issue
Block a user