From 2b8249fc9f899dea2270b2dbfda6840ac1151990 Mon Sep 17 00:00:00 2001 From: Konrad Dobros Date: Mon, 31 Aug 2020 08:52:47 +0200 Subject: [PATCH] [IE CLDNN] Improve performance of fc block fp16 implementation (#1993) The main purpose of this change is to fix the unexpected behaviour of the fully_connected_gpu_fb_io_block_fp16 implementation, which shows a severe performance drop when no bias is used. Additionally, the assembly for the case with bias is improved. --- .../cl_kernels/fully_connected_gpu_fb_io_block_fp16.cl | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_fb_io_block_fp16.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_fb_io_block_fp16.cl index cbf266101bc..5257d6a7f72 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_fb_io_block_fp16.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_fb_io_block_fp16.cl @@ -214,8 +214,9 @@ KERNEL (fully_connected_gpu_xb_xb_block_fp16)( CHUNK_TYPE acc[UNITS_PER_SG_READ] = {}; // Iterate over yxf linear plane (both filters/weights and input). - for (uint input_offset = input_base, filter_offset = filter_base; input_offset < input_byte_size; input_offset += input_sg_reads_distance) - { + uint input_offset = input_base; + uint filter_offset = filter_base; + do { CHUNK_TYPE input_val = INPUT0_READ(input, input_offset + sg_elem_offset); // Iterate over filters needed to process input read by sub-group. @@ -233,7 +234,9 @@ KERNEL (fully_connected_gpu_xb_xb_block_fp16)( acc[acc_pos] = AS_CHUNK(fma(AS_UNITS(input_val), SG_UNIT_SELECT(filter_val, acc_pos), AS_UNITS(acc[acc_pos]))); } } - } + + input_offset += input_sg_reads_distance; + } while (input_offset < input_byte_size); // WRITE OUTPUT // BATCH = 32x? (HF) / 16x? (F)