[IE CLDNN] Fix accuracy bug in fsv16 imad conv + other minor fixes (#2876)

Authored by Jedrzej Hajduczenia on 2020-10-29 07:33:05 +01:00, committed by GitHub
parent 4d84d7ed1c
commit c95d8e242d
2 changed files with 30 additions and 34 deletions


@@ -66,23 +66,23 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
out_f = out_f - (out_f / ALIGN(FILTER_OFM_NUM, SIMD)) * (SIMD - (FILTER_OFM_NUM % SIMD));
#endif
-const int input_x = out_x * STRIDE_SIZE_X - PADDING_SIZE_X;
-const int input_y = out_y * STRIDE_SIZE_Y - PADDING_SIZE_Y;
-const int input_z = out_z * STRIDE_SIZE_Z - PADDING_SIZE_Z;
+const int input_x = out_x * STRIDE_SIZE_X - INPUT0_PAD_BEFORE_SIZE_X;
+const int input_y = out_y * STRIDE_SIZE_Y - INPUT0_PAD_BEFORE_SIZE_Y;
+const int input_z = out_z * STRIDE_SIZE_Z - INPUT0_PAD_BEFORE_SIZE_Z;
#if FEATURE_SLM_SPLIT == 1
-const uint k_start = 0;
+const uint in_f_start = 0;
#else
-const uint k_start = get_sub_group_id() * FSV;
+const uint in_f_start = get_sub_group_id() * FSV;
#endif
-uint filter_idx = GET_FILTER_G_OS_IS_ZYX_OSV16_ISV16_INDEX(FILTER, g, out_f_g, k_start, 0, 0, 0);
+uint filter_idx = GET_FILTER_G_OS_IS_ZYX_OSV16_ISV16_INDEX(FILTER, g, out_f_g, in_f_start, 0, 0, 0);
const uint filter_idx_diff = (ALIGN(FILTER_IFM_NUM, FSV) * FILTER_SIZE_X * FILTER_SIZE_Y * FILTER_SIZE_Z * FSV);
#if INPUT0_DIMS == 4
-uint input_start_idx = INPUT0_GET_INDEX(out_b, g * FILTER_IFM_NUM + k_start, input_y, input_x);
+uint input_start_idx = INPUT0_GET_INDEX(out_b, g * FILTER_IFM_NUM + in_f_start, input_y, input_x);
#else
-uint input_start_idx = INPUT0_GET_INDEX(out_b, g * FILTER_IFM_NUM + k_start, input_z, input_y, input_x);
+uint input_start_idx = INPUT0_GET_INDEX(out_b, g * FILTER_IFM_NUM + in_f_start, input_z, input_y, input_x);
#endif
ACCUMULATOR_TYPE dotProd[OFM_BLOCKS_PER_SIMD][OUT_BLOCK_DEPTH][OUT_BLOCK_HEIGHT][OUT_BLOCK_WIDTH] = { };
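
The hunk above replaces the symmetric PADDING_SIZE_* macros with the tensor's pad-before sizes when mapping an output coordinate to the start of its input window, which matters once the input padding is asymmetric. Below is a minimal host-side C++ sketch of that mapping; it is illustrative only and not part of the patch, and the names input_origin, stride and pad_before stand in for the kernel's STRIDE_SIZE_* and INPUT0_PAD_BEFORE_SIZE_* macros.

```cpp
#include <cstdio>

// Start of the input window read for a given output coordinate.
// Only the padding added *before* the data shifts the origin, so a
// single symmetric padding value is wrong when before != after.
int input_origin(int out_coord, int stride, int pad_before) {
    return out_coord * stride - pad_before;
}

int main() {
    // Example: stride 1, pad_before 1 (e.g. asymmetric 1/0 padding):
    // output x = 0 starts reading at input x = -1, inside the before-pad region.
    std::printf("%d\n", input_origin(0, 1, 1));  // prints -1
    return 0;
}
```
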
@@ -110,10 +110,8 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
if (ixb != CEIL_DIV(IN_BLOCK_WIDTH, SIMD) - 1) {
#if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
if (in_f_offset == 0) {
-input_val[izb][iyb][ixb] = as_uint4(vload16(0, conv_input + input_idx + get_sub_group_local_id() * FSV));
-#else
-input_val[izb][iyb][ixb] = vload4(0, (__global uint *)(conv_input + input_idx + get_sub_group_local_id() * FSV));
#endif
+input_val[izb][iyb][ixb] = vload4(0, (__global uint *)(conv_input + input_idx + get_sub_group_local_id() * FSV));
#if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
} else {
INPUT0_TYPE* input_int8_arr = (INPUT0_TYPE*) &input_val[izb][iyb][ixb];
@@ -122,11 +120,11 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
if (v + in_f_offset < FSV) {
input_int8_arr[v] = conv_input[input_idx + get_sub_group_local_id() * FSV + v];
} else {
-input_int8_arr[v] = conv_input[input_idx + get_sub_group_local_id() * FSV + v +
-((INPUT0_SIZE_X + 2*PADDING_SIZE_X) *
-(INPUT0_SIZE_Y + 2*PADDING_SIZE_Y) *
-(INPUT0_SIZE_Z + 2*PADDING_SIZE_Z) - 1) *
-FSV];
+const uint addr = input_idx + get_sub_group_local_id() * FSV + v +
+((INPUT0_SIZE_X + INPUT0_PAD_BEFORE_SIZE_X + INPUT0_PAD_AFTER_SIZE_X) *
+(INPUT0_SIZE_Y + INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y) *
+(INPUT0_SIZE_Z + INPUT0_PAD_BEFORE_SIZE_Z + INPUT0_PAD_AFTER_SIZE_Z) - 1) * FSV;
+input_int8_arr[v] = conv_input[addr];
}
}
}
@@ -134,10 +132,8 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
} else {
#if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
if (in_f_offset == 0) {
-input_val[izb][iyb][ixb] = as_uint4(vload16(0, conv_input + input_idx + tmp * FSV));
-#else
-input_val[izb][iyb][ixb] = vload4(0, (__global uint*)(conv_input + input_idx + tmp * FSV));
#endif
+input_val[izb][iyb][ixb] = vload4(0, (__global uint*)(conv_input + input_idx + tmp * FSV));
#if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
} else {
INPUT0_TYPE* input_int8_arr = (INPUT0_TYPE*) &input_val[izb][iyb][ixb];
@@ -146,11 +142,11 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
if (v + in_f_offset < FSV) {
input_int8_arr[v] = conv_input[input_idx + tmp * FSV + v];
} else {
-input_int8_arr[v] = conv_input[input_idx + tmp * FSV + v +
-((INPUT0_SIZE_X + 2*PADDING_SIZE_X) *
-(INPUT0_SIZE_Y + 2*PADDING_SIZE_Y) *
-(INPUT0_SIZE_Z + 2*PADDING_SIZE_Z) - 1) *
-FSV];
+const uint addr = input_idx + tmp * FSV + v +
+((INPUT0_SIZE_X + INPUT0_PAD_BEFORE_SIZE_X + INPUT0_PAD_AFTER_SIZE_X) *
+(INPUT0_SIZE_Y + INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y) *
+(INPUT0_SIZE_Z + INPUT0_PAD_BEFORE_SIZE_Z + INPUT0_PAD_AFTER_SIZE_Z) - 1) * FSV;
+input_int8_arr[v] = conv_input[addr];
}
}
}
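
The two hunks above fix the spill-over load used when a grouped convolution's input feature count is not a multiple of FSV: the jump to the next 16-feature slice is now computed from the real padded extent of each spatial axis (size + pad_before + pad_after) rather than from size + 2*PADDING_SIZE, which assumed symmetric padding. A small C++ sketch of that slice-pitch computation follows; it is an illustration under this assumption about the b_fs_zyx_fsv16 layout, and feature_slice_pitch is a hypothetical helper, not a function from the kernel.

```cpp
#include <cstdio>

constexpr unsigned FSV = 16;  // features per slice in b_fs_zyx_fsv16

// Elements between the same spatial location in two consecutive
// 16-feature slices: the padded spatial volume times FSV, where each
// padded extent is size + pad_before + pad_after (the two pads may differ).
unsigned feature_slice_pitch(unsigned sx, unsigned pbx, unsigned pax,
                             unsigned sy, unsigned pby, unsigned pay,
                             unsigned sz, unsigned pbz, unsigned paz) {
    const unsigned padded_x = sx + pbx + pax;
    const unsigned padded_y = sy + pby + pay;
    const unsigned padded_z = sz + pbz + paz;
    return padded_x * padded_y * padded_z * FSV;
}

int main() {
    // With asymmetric padding (before = 1, after = 0) the old
    // "size + 2 * padding" form over-counts each axis by one element,
    // so the wrap-around read lands on the wrong address.
    std::printf("%u\n", feature_slice_pitch(5, 1, 0, 5, 1, 0, 1, 0, 0));  // 6*6*1*16 = 576
    return 0;
}
```
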
@@ -183,7 +179,6 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
__attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ow++) {
-const uint ow_offset = ow + OUT_BLOCK_WIDTH;
const uint z_block_idx = od * STRIDE_SIZE_Z + fzu * DILATION_SIZE_Z;
const uint y_block_idx = oh * STRIDE_SIZE_Y + fyu * DILATION_SIZE_Y;
const uint x_block_idx = ow * STRIDE_SIZE_X + fx * DILATION_SIZE_X;


@@ -7206,15 +7206,6 @@ INSTANTIATE_TEST_CASE_P(convolution_grouped_fsv4_fsv16,
// Input X size, Input Y size, Input Z size, Input features, Output features,
// Kernel size X, Kernel size Y, Kernel size Z, Groups number, Stride, Batch,
// Input data format, Implementation name
-// Format: b_fs_yx_fsv16
-TestParamType_grouped_convolution_gpu(12, 12, 1, 96, 96, 3, 3, 1, 32, 1, 1, format::b_fs_yx_fsv16, ""),
-TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 16, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""),
-TestParamType_grouped_convolution_gpu(7, 7, 1, 8, 4, 3, 3, 1, 4, 1, 1, format::b_fs_yx_fsv16, ""),
-TestParamType_grouped_convolution_gpu(5, 5, 1, 34, 12, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""),
-TestParamType_grouped_convolution_gpu(8, 8, 1, 34, 24, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""),
-TestParamType_grouped_convolution_gpu(2, 2, 1, 12, 12, 3, 3, 1, 4, 1, 1, format::b_fs_yx_fsv16, ""),
-TestParamType_grouped_convolution_gpu(3, 3, 1, 8, 8, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""),
-TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 2, 4, format::b_fs_yx_fsv16, ""),
// Format: b_fs_yx_fsv4
TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 17, 3, 3, 1, 1, 1, 1, format::b_fs_yx_fsv4, ""),
@@ -7227,6 +7218,14 @@ INSTANTIATE_TEST_CASE_P(convolution_grouped_fsv4_fsv16,
TestParamType_grouped_convolution_gpu(6, 6, 1, 8, 26, 3, 3, 1, 2, 4, 1, format::b_fs_yx_fsv4, ""),
// Format: b_fs_yx_fsv16
+TestParamType_grouped_convolution_gpu(12, 12, 1, 96, 96, 3, 3, 1, 32, 1, 1, format::b_fs_yx_fsv16, ""),
+TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 16, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""),
+TestParamType_grouped_convolution_gpu(7, 7, 1, 8, 4, 3, 3, 1, 4, 1, 1, format::b_fs_yx_fsv16, ""),
+TestParamType_grouped_convolution_gpu(5, 5, 1, 34, 12, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""),
+TestParamType_grouped_convolution_gpu(8, 8, 1, 34, 24, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""),
+TestParamType_grouped_convolution_gpu(2, 2, 1, 12, 12, 3, 3, 1, 4, 1, 1, format::b_fs_yx_fsv16, ""),
+TestParamType_grouped_convolution_gpu(3, 3, 1, 8, 8, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""),
+TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 2, 4, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 17, 3, 3, 1, 1, 1, 1, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 16, 3, 3, 1, 4, 1, 1, format::b_fs_yx_fsv16, ""),
TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 1, 4, format::b_fs_yx_fsv16, ""),
@@ -7249,6 +7248,8 @@ INSTANTIATE_TEST_CASE_P(convolution_grouped_fsv4_fsv16,
TestParamType_grouped_convolution_gpu(8, 6, 4, 32, 64, 2, 2, 2, 2, 1, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(33, 6, 4, 16, 32, 4, 3, 2, 2, 1, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(33, 1, 1, 30, 62, 1, 1, 1, 2, 1, 1, format::b_fs_zyx_fsv16, ""),
+TestParamType_grouped_convolution_gpu(80, 1, 1, 48, 96, 33, 1, 1, 2, 8, 1, format::b_fs_zyx_fsv16, ""),
+TestParamType_grouped_convolution_gpu(2, 1, 1, 18, 32, 3, 1, 1, 2, 2, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(3, 1, 5, 196, 252, 3, 1, 3, 4, 1, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(4, 1, 6, 256, 256, 2, 1, 2, 4, 1, 1, format::b_fs_zyx_fsv16, ""),
TestParamType_grouped_convolution_gpu(4, 1, 6, 256, 512, 2, 1, 3, 16, 1, 1, format::b_fs_zyx_fsv16, ""),