[GPU] Add input feature leftovers processing for fully_connected_gpu_bs_f_bsv16_af8_vload kernel (#19650)
parent 4124851d2b
commit 4eb9c57424
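The main per-feature loop in this kernel only walks full 8-element blocks (INPUT0_ELEMENTS_COUNT / 8 iterations), so input feature counts that are not a multiple of 8 leave the trailing features unprocessed. The change below rounds the per-batch weight/input strides up to INPUT_FEATURE_ALIGNMENT, adds a dedicated pass for the remaining INPUT0_ELEMENTS_COUNT % 8 features, and adds a test instantiation that covers non-aligned feature counts.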
@@ -31,6 +31,8 @@
 }

 #define SUB_GROUP_SIZE 16
+#define INPUT_FEATURE_ALIGNMENT 8
+#define ALIGNED_INPUT0_ELEMENTS_COUNT ALIGN(INPUT0_ELEMENTS_COUNT, INPUT_FEATURE_ALIGNMENT)

 __attribute__((reqd_work_group_size(SUB_GROUP_SIZE, 1, 1)))
 REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
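ALIGN() comes from the shared kernel helpers and, judging by how it is used below, rounds INPUT0_ELEMENTS_COUNT up to the next multiple of INPUT_FEATURE_ALIGNMENT. A minimal host-side sketch of that arithmetic (a hypothetical helper for illustration, not part of this patch):

    // Round n up to the next multiple of a, e.g. align_up(17, 8) == 24.
    inline unsigned align_up(unsigned n, unsigned a) {
        return ((n + a - 1) / a) * a;
    }

So with 17 input features ALIGNED_INPUT0_ELEMENTS_COUNT becomes 24, and the `#if ALIGNED_INPUT0_ELEMENTS_COUNT != INPUT0_ELEMENTS_COUNT` guard added further down enables the leftover path only when such padding is actually present.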
@@ -59,9 +61,9 @@ KERNEL (fully_connected_gpu_bs_f_bsv16_af8_vload)(

     MAKE_VECTOR_TYPE(UNIT_TYPE, 16) blockC00 = UNIT_VAL_ZERO;

-    uint weight_offset = id_in_sub_group + SUB_GROUP_SIZE * group_id * INPUT0_ELEMENTS_COUNT;
+    uint weight_offset = id_in_sub_group + SUB_GROUP_SIZE * group_id * ALIGNED_INPUT0_ELEMENTS_COUNT;

-    uint input_idx = id_in_sub_group + batch_group_id * BATCHES_PER_WORK_ITEM * INPUT0_ELEMENTS_COUNT;
+    uint input_idx = id_in_sub_group + batch_group_id * BATCHES_PER_WORK_ITEM * ALIGNED_INPUT0_ELEMENTS_COUNT;

     for(uint h = 0; h < INPUT0_ELEMENTS_COUNT / 8; h++)
     {
         // read input data in blocks ( 16 batch * 8 x )
@@ -74,6 +76,19 @@ KERNEL (fully_connected_gpu_bs_f_bsv16_af8_vload)(
         input_idx += 128; // 128 = 16 x 8 - because of input format which have blocks of 128 elements
     }

+#if ALIGNED_INPUT0_ELEMENTS_COUNT != INPUT0_ELEMENTS_COUNT
+    {
+        // Processing of leftover input features
+        MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockA00 = UNIT_VAL_ZERO;
+        for (uint idx = 0; idx < INPUT0_ELEMENTS_COUNT % INPUT_FEATURE_ALIGNMENT; idx++) {
+            blockA00[idx] = input[input_idx + idx * SUB_GROUP_SIZE];
+        }
+
+        MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockB00 = ALIGNED_BLOCK_READ8(weight, weight_offset);
+        MULTIPLY_BLOCKS_16x8(blockC00, blockA00, blockB00)
+    }
+#endif
+
 #if BIAS_TERM
     blockC00 += bias[neuronIdx];
 #endif // #if BIAS_TERM
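The new branch zero-initializes blockA00, copies only the INPUT0_ELEMENTS_COUNT % INPUT_FEATURE_ALIGNMENT real features into it, and reuses the same 16x8 multiply, so the lanes corresponding to padding contribute nothing to the accumulator. A minimal scalar model of that split, assuming a weight buffer already padded to the aligned length (hypothetical host-side code, not OpenVINO's):

    #include <cstddef>
    #include <vector>

    // Scalar model of the kernel's feature loop: full 8-wide blocks first,
    // then a zero-padded tail for the leftover features. Assumes w_padded has
    // at least align_up(x.size(), 8) elements, mirroring ALIGNED_INPUT0_ELEMENTS_COUNT.
    float dot_with_leftovers(const std::vector<float>& x, const std::vector<float>& w_padded) {
        const std::size_t n = x.size();      // plays the role of INPUT0_ELEMENTS_COUNT
        const std::size_t block = 8;         // plays the role of INPUT_FEATURE_ALIGNMENT
        float acc = 0.0f;

        std::size_t i = 0;
        for (; i + block <= n; i += block)   // main loop: h < INPUT0_ELEMENTS_COUNT / 8
            for (std::size_t j = 0; j < block; ++j)
                acc += x[i + j] * w_padded[i + j];

        if (n % block != 0) {                // leftover path, like the new #if branch
            float tail[8] = {};              // zero-init, like blockA00 = UNIT_VAL_ZERO
            for (std::size_t j = 0; j < n % block; ++j)
                tail[j] = x[i + j];          // copy only the real leftover features
            for (std::size_t j = 0; j < block; ++j)
                acc += tail[j] * w_padded[i + j];  // zeroed lanes cancel padded weights
        }
        return acc;
    }

With 17 features the main loop covers indices 0..15 and the tail covers index 16; with 3 features the main loop does not run at all and the tail covers indices 0..2.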
@@ -1079,6 +1079,20 @@ INSTANTIATE_TEST_SUITE_P(
         ::testing::Values(false))
 );

+INSTANTIATE_TEST_SUITE_P(
+    smoke_fully_connected_gpu_bs_f_bsv16_af8_vload,
+    fully_connected_random_test_f16,
+    ::testing::Combine(
+        ::testing::Values(16),
+        ::testing::Values(shared_dims{3, 1, 1},
+                          shared_dims{17, 1, 1}),
+        ::testing::Values(3, 32),
+        ::testing::Values(format::bfyx),
+        ::testing::Values(format::any),
+        ::testing::Values("fully_connected_gpu_bs_f_bsv16_af8_vload"),
+        ::testing::Values(false))
+);
+
 INSTANTIATE_TEST_SUITE_P(
     smoke,
     fully_connected_random_test_f16,