[GPU] Add input feature leftovers processing for fully_connected_gpu_bs_f_bsv16_af8_vload kernel (#19650)
parent 4124851d2b
commit 4eb9c57424
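The main per-feature loop in this kernel only walks full 8-element blocks (INPUT0_ELEMENTS_COUNT / 8 iterations), so input feature counts that are not a multiple of 8 leave the trailing features unprocessed. The change below rounds the per-batch weight/input strides up to INPUT_FEATURE_ALIGNMENT, adds a dedicated pass for the remaining INPUT0_ELEMENTS_COUNT % 8 features, and adds a test instantiation that covers non-aligned feature counts.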
@@ -31,6 +31,8 @@
 }

 #define SUB_GROUP_SIZE 16
+#define INPUT_FEATURE_ALIGNMENT 8
+#define ALIGNED_INPUT0_ELEMENTS_COUNT ALIGN(INPUT0_ELEMENTS_COUNT, INPUT_FEATURE_ALIGNMENT)

 __attribute__((reqd_work_group_size(SUB_GROUP_SIZE, 1, 1)))
 REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
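ALIGN() comes from the shared kernel helpers and, judging by how it is used below, rounds INPUT0_ELEMENTS_COUNT up to the next multiple of INPUT_FEATURE_ALIGNMENT. A minimal host-side sketch of that arithmetic (a hypothetical helper for illustration, not part of this patch):

    // Round n up to the next multiple of a, e.g. align_up(17, 8) == 24.
    inline unsigned align_up(unsigned n, unsigned a) {
        return ((n + a - 1) / a) * a;
    }

So with 17 input features ALIGNED_INPUT0_ELEMENTS_COUNT becomes 24, and the `#if ALIGNED_INPUT0_ELEMENTS_COUNT != INPUT0_ELEMENTS_COUNT` guard added further down enables the leftover path only when such padding is actually present.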
@@ -59,9 +61,9 @@ KERNEL (fully_connected_gpu_bs_f_bsv16_af8_vload)(

     MAKE_VECTOR_TYPE(UNIT_TYPE, 16) blockC00 = UNIT_VAL_ZERO;

-    uint weight_offset = id_in_sub_group + SUB_GROUP_SIZE * group_id * INPUT0_ELEMENTS_COUNT;
+    uint weight_offset = id_in_sub_group + SUB_GROUP_SIZE * group_id * ALIGNED_INPUT0_ELEMENTS_COUNT;

-    uint input_idx = id_in_sub_group + batch_group_id * BATCHES_PER_WORK_ITEM * INPUT0_ELEMENTS_COUNT;
+    uint input_idx = id_in_sub_group + batch_group_id * BATCHES_PER_WORK_ITEM * ALIGNED_INPUT0_ELEMENTS_COUNT;

     for(uint h = 0; h < INPUT0_ELEMENTS_COUNT / 8; h++)
     {
         // read input data in blocks ( 16 batch * 8 x )
@@ -74,6 +76,19 @@ KERNEL (fully_connected_gpu_bs_f_bsv16_af8_vload)(
         input_idx += 128; // 128 = 16 x 8 - because of input format which have blocks of 128 elements
     }

+#if ALIGNED_INPUT0_ELEMENTS_COUNT != INPUT0_ELEMENTS_COUNT
+    {
+        // Processing of leftover input features
+        MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockA00 = UNIT_VAL_ZERO;
+        for (uint idx = 0; idx < INPUT0_ELEMENTS_COUNT % INPUT_FEATURE_ALIGNMENT; idx++) {
+            blockA00[idx] = input[input_idx + idx * SUB_GROUP_SIZE];
+        }
+
+        MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockB00 = ALIGNED_BLOCK_READ8(weight, weight_offset);
+        MULTIPLY_BLOCKS_16x8(blockC00, blockA00, blockB00)
+    }
+#endif
+
 #if BIAS_TERM
     blockC00 += bias[neuronIdx];
 #endif // #if BIAS_TERM
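The new branch zero-initializes blockA00, copies only the INPUT0_ELEMENTS_COUNT % INPUT_FEATURE_ALIGNMENT real features into it, and reuses the same 16x8 multiply, so the lanes corresponding to padding contribute nothing to the accumulator. A minimal scalar model of that split, assuming a weight buffer already padded to the aligned length (hypothetical host-side code, not OpenVINO's):

    #include <cstddef>
    #include <vector>

    // Scalar model of the kernel's feature loop: full 8-wide blocks first,
    // then a zero-padded tail for the leftover features. Assumes w_padded has
    // at least align_up(x.size(), 8) elements, mirroring ALIGNED_INPUT0_ELEMENTS_COUNT.
    float dot_with_leftovers(const std::vector<float>& x, const std::vector<float>& w_padded) {
        const std::size_t n = x.size();      // plays the role of INPUT0_ELEMENTS_COUNT
        const std::size_t block = 8;         // plays the role of INPUT_FEATURE_ALIGNMENT
        float acc = 0.0f;

        std::size_t i = 0;
        for (; i + block <= n; i += block)   // main loop: h < INPUT0_ELEMENTS_COUNT / 8
            for (std::size_t j = 0; j < block; ++j)
                acc += x[i + j] * w_padded[i + j];

        if (n % block != 0) {                // leftover path, like the new #if branch
            float tail[8] = {};              // zero-init, like blockA00 = UNIT_VAL_ZERO
            for (std::size_t j = 0; j < n % block; ++j)
                tail[j] = x[i + j];          // copy only the real leftover features
            for (std::size_t j = 0; j < block; ++j)
                acc += tail[j] * w_padded[i + j];  // zeroed lanes cancel padded weights
        }
        return acc;
    }

With 17 features the main loop covers indices 0..15 and the tail covers index 16; with 3 features the main loop does not run at all and the tail covers indices 0..2.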
@@ -1079,6 +1079,20 @@ INSTANTIATE_TEST_SUITE_P(
         ::testing::Values(false))
 );

+INSTANTIATE_TEST_SUITE_P(
+    smoke_fully_connected_gpu_bs_f_bsv16_af8_vload,
+    fully_connected_random_test_f16,
+    ::testing::Combine(
+        ::testing::Values(16),
+        ::testing::Values(shared_dims{3, 1, 1},
+                          shared_dims{17, 1, 1}),
+        ::testing::Values(3, 32),
+        ::testing::Values(format::bfyx),
+        ::testing::Values(format::any),
+        ::testing::Values("fully_connected_gpu_bs_f_bsv16_af8_vload"),
+        ::testing::Values(false))
+);
+
 INSTANTIATE_TEST_SUITE_P(
     smoke,
     fully_connected_random_test_f16,