[GPU] Remove remainder handling for reads in the reduce kernel to improve performance (#9359)

Signed-off-by: Kelvin Choi <kelvin.choi@intel.com>
This commit is contained in:
Kelvin Choi 2021-12-29 22:25:34 +09:00 committed by GitHub
parent 39a1b98799
commit 4505f5d7e2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -341,17 +341,7 @@ uint offset = batch_out * input_batch_pitch + ((feature_out + FSV - 1) / FSV) *
for (uint fi = feature_out; fi < feature_max_val; fi += FSV) {
for (uint yi = y_out; yi < y_max_val; ++yi) {
for (uint xi = x_out; xi < x_max_val; ++xi) {
INPUT_VEC input = (INPUT_VEC)(INPUT_INIT_VAL);
#if REDUCE_FEATURE && (INPUT0_FEATURE_NUM % FSV != 0)
if (fi + FSV <= INPUT0_FEATURE_NUM)
input = BLOCK_READ(data, offset);
else
if (fi + get_sub_group_local_id() < INPUT0_FEATURE_NUM)
for (int i = 0; i < READ_OFFSET; ++i)
input[i] = data[offset + get_sub_group_local_id() + i * get_max_sub_group_size()];
#else
input = BLOCK_READ(data, offset);
#endif
INPUT_VEC input = BLOCK_READ(data, offset);
unroll_for (int i = 0; i < READ_OFFSET; ++i)
acc[i] = FUNC_CALL(apply_reduce)(acc[i], input[i]);
offset += input_x_pitch;