diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_b_fs_zyx_fsv16_imad.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_b_fs_zyx_fsv16_imad.cl
index 3a98477ffe9..fae9af851aa 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_b_fs_zyx_fsv16_imad.cl
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_b_fs_zyx_fsv16_imad.cl
@@ -66,23 +66,23 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
     out_f = out_f - (out_f / ALIGN(FILTER_OFM_NUM, SIMD)) * (SIMD - (FILTER_OFM_NUM % SIMD));
 #endif
 
-    const int input_x = out_x * STRIDE_SIZE_X - PADDING_SIZE_X;
-    const int input_y = out_y * STRIDE_SIZE_Y - PADDING_SIZE_Y;
-    const int input_z = out_z * STRIDE_SIZE_Z - PADDING_SIZE_Z;
+    const int input_x = out_x * STRIDE_SIZE_X - INPUT0_PAD_BEFORE_SIZE_X;
+    const int input_y = out_y * STRIDE_SIZE_Y - INPUT0_PAD_BEFORE_SIZE_Y;
+    const int input_z = out_z * STRIDE_SIZE_Z - INPUT0_PAD_BEFORE_SIZE_Z;
 
 #if FEATURE_SLM_SPLIT == 1
-    const uint k_start = 0;
+    const uint in_f_start = 0;
 #else
-    const uint k_start = get_sub_group_id() * FSV;
+    const uint in_f_start = get_sub_group_id() * FSV;
 #endif
 
-    uint filter_idx = GET_FILTER_G_OS_IS_ZYX_OSV16_ISV16_INDEX(FILTER, g, out_f_g, k_start, 0, 0, 0);
+    uint filter_idx = GET_FILTER_G_OS_IS_ZYX_OSV16_ISV16_INDEX(FILTER, g, out_f_g, in_f_start, 0, 0, 0);
     const uint filter_idx_diff = (ALIGN(FILTER_IFM_NUM, FSV) * FILTER_SIZE_X * FILTER_SIZE_Y * FILTER_SIZE_Z * FSV);
 
 #if INPUT0_DIMS == 4
-    uint input_start_idx = INPUT0_GET_INDEX(out_b, g * FILTER_IFM_NUM + k_start, input_y, input_x);
+    uint input_start_idx = INPUT0_GET_INDEX(out_b, g * FILTER_IFM_NUM + in_f_start, input_y, input_x);
 #else
-    uint input_start_idx = INPUT0_GET_INDEX(out_b, g * FILTER_IFM_NUM + k_start, input_z, input_y, input_x);
+    uint input_start_idx = INPUT0_GET_INDEX(out_b, g * FILTER_IFM_NUM + in_f_start, input_z, input_y, input_x);
 #endif
 
     ACCUMULATOR_TYPE dotProd[OFM_BLOCKS_PER_SIMD][OUT_BLOCK_DEPTH][OUT_BLOCK_HEIGHT][OUT_BLOCK_WIDTH] = { };
@@ -110,10 +110,8 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
            if (ixb != CEIL_DIV(IN_BLOCK_WIDTH, SIMD) - 1) {
 #if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
                if (in_f_offset == 0) {
-                   input_val[izb][iyb][ixb] = as_uint4(vload16(0, conv_input + input_idx + get_sub_group_local_id() * FSV));
-               #else
-                   input_val[izb][iyb][ixb] = vload4(0, (__global uint *)(conv_input + input_idx + get_sub_group_local_id() * FSV));
 #endif
+                   input_val[izb][iyb][ixb] = vload4(0, (__global uint *)(conv_input + input_idx + get_sub_group_local_id() * FSV));
 #if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
                } else {
                    INPUT0_TYPE* input_int8_arr = (INPUT0_TYPE*) &input_val[izb][iyb][ixb];
@@ -122,11 +120,11 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
                        if (v + in_f_offset < FSV) {
                            input_int8_arr[v] = conv_input[input_idx + get_sub_group_local_id() * FSV + v];
                        } else {
-                           input_int8_arr[v] = conv_input[input_idx + get_sub_group_local_id() * FSV + v +
-                                                          ((INPUT0_SIZE_X + 2*PADDING_SIZE_X) *
-                                                           (INPUT0_SIZE_Y + 2*PADDING_SIZE_Y) *
-                                                           (INPUT0_SIZE_Z + 2*PADDING_SIZE_Z) - 1) *
-                                                          FSV];
+                           const uint addr = input_idx + get_sub_group_local_id() * FSV + v +
+                                             ((INPUT0_SIZE_X + INPUT0_PAD_BEFORE_SIZE_X + INPUT0_PAD_AFTER_SIZE_X) *
+                                              (INPUT0_SIZE_Y + INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y) *
+                                              (INPUT0_SIZE_Z + INPUT0_PAD_BEFORE_SIZE_Z + INPUT0_PAD_AFTER_SIZE_Z) - 1) * FSV;
+                           input_int8_arr[v] = conv_input[addr];
                        }
                    }
                }
@@ -134,10 +132,8 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
            } else {
 #if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
                if (in_f_offset == 0) {
-                   input_val[izb][iyb][ixb] = as_uint4(vload16(0, conv_input + input_idx + tmp * FSV));
-               #else
-                   input_val[izb][iyb][ixb] = vload4(0, (__global uint*)(conv_input + input_idx + tmp * FSV));
 #endif
+                   input_val[izb][iyb][ixb] = vload4(0, (__global uint*)(conv_input + input_idx + tmp * FSV));
 #if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
                } else {
                    INPUT0_TYPE* input_int8_arr = (INPUT0_TYPE*) &input_val[izb][iyb][ixb];
@@ -146,11 +142,11 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
                        if (v + in_f_offset < FSV) {
                            input_int8_arr[v] = conv_input[input_idx + tmp * FSV + v];
                        } else {
-                           input_int8_arr[v] = conv_input[input_idx + tmp * FSV + v +
-                                                          ((INPUT0_SIZE_X + 2*PADDING_SIZE_X) *
-                                                           (INPUT0_SIZE_Y + 2*PADDING_SIZE_Y) *
-                                                           (INPUT0_SIZE_Z + 2*PADDING_SIZE_Z) - 1) *
-                                                          FSV];
+                           const uint addr = input_idx + tmp * FSV + v +
+                                             ((INPUT0_SIZE_X + INPUT0_PAD_BEFORE_SIZE_X + INPUT0_PAD_AFTER_SIZE_X) *
+                                              (INPUT0_SIZE_Y + INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y) *
+                                              (INPUT0_SIZE_Z + INPUT0_PAD_BEFORE_SIZE_Z + INPUT0_PAD_AFTER_SIZE_Z) - 1) * FSV;
+                           input_int8_arr[v] = conv_input[addr];
                        }
                    }
                }
@@ -183,7 +179,6 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
            for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
                __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
                for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ow++) {
-                    const uint ow_offset = ow + OUT_BLOCK_WIDTH;
                    const uint z_block_idx = od * STRIDE_SIZE_Z + fzu * DILATION_SIZE_Z;
                    const uint y_block_idx = oh * STRIDE_SIZE_Y + fyu * DILATION_SIZE_Y;
                    const uint x_block_idx = ow * STRIDE_SIZE_X + fx * DILATION_SIZE_X;
diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp
index dc6f8edacb9..9cad067300e 100644
--- a/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp
+++ b/inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp
@@ -7206,15 +7206,6 @@ INSTANTIATE_TEST_CASE_P(convolution_grouped_fsv4_fsv16,
                        // Input X size, Input Y size, Input Z size, Input features, Output features,
                        // Kernel size X, Kernel size Y, Kernel size Z, Groups number, Stride, Batch,
                        // Input data format, Implementation name
-                        // Format: b_fs_yx_fsv16
-                        TestParamType_grouped_convolution_gpu(12, 12, 1, 96, 96, 3, 3, 1, 32, 1, 1, format::b_fs_yx_fsv16, ""),
-                        TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 16, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""),
-                        TestParamType_grouped_convolution_gpu(7, 7, 1, 8, 4, 3, 3, 1, 4, 1, 1, format::b_fs_yx_fsv16, ""),
-                        TestParamType_grouped_convolution_gpu(5, 5, 1, 34, 12, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""),
-                        TestParamType_grouped_convolution_gpu(8, 8, 1, 34, 24, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""),
-                        TestParamType_grouped_convolution_gpu(2, 2, 1, 12, 12, 3, 3, 1, 4, 1, 1, format::b_fs_yx_fsv16, ""),
-                        TestParamType_grouped_convolution_gpu(3, 3, 1, 8, 8, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""),
-                        TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 2, 4, format::b_fs_yx_fsv16, ""),
 
                        // Format: b_fs_yx_fsv4
                        TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 17, 3, 3, 1, 1, 1, 1, format::b_fs_yx_fsv4, ""),
@@ -7227,6 +7218,14 @@ INSTANTIATE_TEST_CASE_P(convolution_grouped_fsv4_fsv16,
                        TestParamType_grouped_convolution_gpu(6, 6, 1, 8, 26, 3, 3, 1, 2, 4, 1, format::b_fs_yx_fsv4, ""),
 
                        // Format: b_fs_yx_fsv16
+                        TestParamType_grouped_convolution_gpu(12, 12, 1, 96, 96, 3, 3, 1, 32, 1, 1, format::b_fs_yx_fsv16, ""),
+                        TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 16, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""),
+                        TestParamType_grouped_convolution_gpu(7, 7, 1, 8, 4, 3, 3, 1, 4, 1, 1, format::b_fs_yx_fsv16, ""),
+                        TestParamType_grouped_convolution_gpu(5, 5, 1, 34, 12, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""),
+                        TestParamType_grouped_convolution_gpu(8, 8, 1, 34, 24, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""),
+                        TestParamType_grouped_convolution_gpu(2, 2, 1, 12, 12, 3, 3, 1, 4, 1, 1, format::b_fs_yx_fsv16, ""),
+                        TestParamType_grouped_convolution_gpu(3, 3, 1, 8, 8, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""),
+                        TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 2, 4, format::b_fs_yx_fsv16, ""),
                        TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 17, 3, 3, 1, 1, 1, 1, format::b_fs_yx_fsv16, ""),
                        TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 16, 3, 3, 1, 4, 1, 1, format::b_fs_yx_fsv16, ""),
                        TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 1, 4, format::b_fs_yx_fsv16, ""),
@@ -7249,6 +7248,8 @@ INSTANTIATE_TEST_CASE_P(convolution_grouped_fsv4_fsv16,
                        TestParamType_grouped_convolution_gpu(8, 6, 4, 32, 64, 2, 2, 2, 2, 1, 1, format::b_fs_zyx_fsv16, ""),
                        TestParamType_grouped_convolution_gpu(33, 6, 4, 16, 32, 4, 3, 2, 2, 1, 1, format::b_fs_zyx_fsv16, ""),
                        TestParamType_grouped_convolution_gpu(33, 1, 1, 30, 62, 1, 1, 1, 2, 1, 1, format::b_fs_zyx_fsv16, ""),
+                        TestParamType_grouped_convolution_gpu(80, 1, 1, 48, 96, 33, 1, 1, 2, 8, 1, format::b_fs_zyx_fsv16, ""),
+                        TestParamType_grouped_convolution_gpu(2, 1, 1, 18, 32, 3, 1, 1, 2, 2, 1, format::b_fs_zyx_fsv16, ""),
                        TestParamType_grouped_convolution_gpu(3, 1, 5, 196, 252, 3, 1, 3, 4, 1, 1, format::b_fs_zyx_fsv16, ""),
                        TestParamType_grouped_convolution_gpu(4, 1, 6, 256, 256, 2, 1, 2, 4, 1, 1, format::b_fs_zyx_fsv16, ""),
                        TestParamType_grouped_convolution_gpu(4, 1, 6, 256, 512, 2, 1, 3, 16, 1, 1, format::b_fs_zyx_fsv16, ""),