[GPU] Add input feature leftovers processing for fully_connected_gpu_bs_f_bsv16_af8_vload kernel (#19650)

This commit is contained in:
Sergey Shlyapnikov 2023-09-07 13:20:11 +04:00 committed by GitHub
parent 4124851d2b
commit 4eb9c57424
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 31 additions and 2 deletions

View File

@ -31,6 +31,8 @@
}
// Sub-group width; also used below as the required work-group size in dimension 0.
#define SUB_GROUP_SIZE 16
// Granularity, in input features, to which the input feature count is aligned.
#define INPUT_FEATURE_ALIGNMENT 8
// INPUT0_ELEMENTS_COUNT passed through ALIGN with INPUT_FEATURE_ALIGNMENT.
// NOTE(review): ALIGN is defined elsewhere; presumably it rounds up to the next
// multiple of the alignment -- confirm.
#define ALIGNED_INPUT0_ELEMENTS_COUNT ALIGN(INPUT0_ELEMENTS_COUNT, INPUT_FEATURE_ALIGNMENT)
__attribute__((reqd_work_group_size(SUB_GROUP_SIZE, 1, 1)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
@ -59,9 +61,9 @@ KERNEL (fully_connected_gpu_bs_f_bsv16_af8_vload)(
MAKE_VECTOR_TYPE(UNIT_TYPE, 16) blockC00 = UNIT_VAL_ZERO;
uint weight_offset = id_in_sub_group + SUB_GROUP_SIZE * group_id * INPUT0_ELEMENTS_COUNT;
uint weight_offset = id_in_sub_group + SUB_GROUP_SIZE * group_id * ALIGNED_INPUT0_ELEMENTS_COUNT;
uint input_idx = id_in_sub_group + batch_group_id * BATCHES_PER_WORK_ITEM * INPUT0_ELEMENTS_COUNT;
uint input_idx = id_in_sub_group + batch_group_id * BATCHES_PER_WORK_ITEM * ALIGNED_INPUT0_ELEMENTS_COUNT;
for(uint h = 0; h < INPUT0_ELEMENTS_COUNT / 8; h++)
{
// read input data in blocks ( 16 batch * 8 x )
@ -74,6 +76,19 @@ KERNEL (fully_connected_gpu_bs_f_bsv16_af8_vload)(
input_idx += 128; // 128 = 16 x 8 - because of input format which have blocks of 128 elements
}
#if ALIGNED_INPUT0_ELEMENTS_COUNT != INPUT0_ELEMENTS_COUNT
{
// Processing of leftover input features
// This scope is compiled in only when the input feature count differs from its
// aligned value, i.e. when some features remain after the main loop above.
// The input block starts as zero and only its first
// (INPUT0_ELEMENTS_COUNT % INPUT_FEATURE_ALIGNMENT) components are loaded below;
// the remaining components keep the zero value.
MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockA00 = UNIT_VAL_ZERO;
for (uint idx = 0; idx < INPUT0_ELEMENTS_COUNT % INPUT_FEATURE_ALIGNMENT; idx++) {
// Consecutive leftover features are read SUB_GROUP_SIZE elements apart.
blockA00[idx] = input[input_idx + idx * SUB_GROUP_SIZE];
}
// The weights are read as a full 8-component block.
// NOTE(review): assumes the weights buffer is padded up to the aligned feature
// count so this read stays in bounds -- confirm against the weights layout.
MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockB00 = ALIGNED_BLOCK_READ8(weight, weight_offset);
// Presumably accumulates the product of blockA00 and blockB00 into blockC00;
// the macro is defined outside this section -- confirm.
MULTIPLY_BLOCKS_16x8(blockC00, blockA00, blockB00)
}
#endif
#if BIAS_TERM
blockC00 += bias[neuronIdx];
#endif // #if BIAS_TERM

View File

@ -1079,6 +1079,20 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Values(false))
);
// Instantiates fully_connected_random_test_f16 with the kernel name
// "fully_connected_gpu_bs_f_bsv16_af8_vload" passed as a parameter.
// The shared_dims values start with 3 and 17, neither of which is a multiple of 8.
// NOTE(review): presumably the first shared_dims component is the input feature
// count, chosen to exercise the kernel's leftover-feature path -- confirm against
// the fixture's parameter order, which is not visible here.
INSTANTIATE_TEST_SUITE_P(
smoke_fully_connected_gpu_bs_f_bsv16_af8_vload,
fully_connected_random_test_f16,
::testing::Combine(
::testing::Values(16),
::testing::Values(shared_dims{3, 1, 1},
shared_dims{17, 1, 1}),
::testing::Values(3, 32),
::testing::Values(format::bfyx),
::testing::Values(format::any),
::testing::Values("fully_connected_gpu_bs_f_bsv16_af8_vload"),
::testing::Values(false))
);
INSTANTIATE_TEST_SUITE_P(
smoke,
fully_connected_random_test_f16,