[IE CLDNN] Fix accuracy bug in fsv16 imad conv + other minor fixes (#2876)
commit c95d8e242d
parent 4d84d7ed1c
@@ -66,23 +66,23 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
     out_f = out_f - (out_f / ALIGN(FILTER_OFM_NUM, SIMD)) * (SIMD - (FILTER_OFM_NUM % SIMD));
 #endif

-    const int input_x = out_x * STRIDE_SIZE_X - PADDING_SIZE_X;
-    const int input_y = out_y * STRIDE_SIZE_Y - PADDING_SIZE_Y;
-    const int input_z = out_z * STRIDE_SIZE_Z - PADDING_SIZE_Z;
+    const int input_x = out_x * STRIDE_SIZE_X - INPUT0_PAD_BEFORE_SIZE_X;
+    const int input_y = out_y * STRIDE_SIZE_Y - INPUT0_PAD_BEFORE_SIZE_Y;
+    const int input_z = out_z * STRIDE_SIZE_Z - INPUT0_PAD_BEFORE_SIZE_Z;

 #if FEATURE_SLM_SPLIT == 1
-    const uint k_start = 0;
+    const uint in_f_start = 0;
 #else
-    const uint k_start = get_sub_group_id() * FSV;
+    const uint in_f_start = get_sub_group_id() * FSV;
 #endif

-    uint filter_idx = GET_FILTER_G_OS_IS_ZYX_OSV16_ISV16_INDEX(FILTER, g, out_f_g, k_start, 0, 0, 0);
+    uint filter_idx = GET_FILTER_G_OS_IS_ZYX_OSV16_ISV16_INDEX(FILTER, g, out_f_g, in_f_start, 0, 0, 0);
     const uint filter_idx_diff = (ALIGN(FILTER_IFM_NUM, FSV) * FILTER_SIZE_X * FILTER_SIZE_Y * FILTER_SIZE_Z * FSV);

 #if INPUT0_DIMS == 4
-    uint input_start_idx = INPUT0_GET_INDEX(out_b, g * FILTER_IFM_NUM + k_start, input_y, input_x);
+    uint input_start_idx = INPUT0_GET_INDEX(out_b, g * FILTER_IFM_NUM + in_f_start, input_y, input_x);
 #else
-    uint input_start_idx = INPUT0_GET_INDEX(out_b, g * FILTER_IFM_NUM + k_start, input_z, input_y, input_x);
+    uint input_start_idx = INPUT0_GET_INDEX(out_b, g * FILTER_IFM_NUM + in_f_start, input_z, input_y, input_x);
 #endif

     ACCUMULATOR_TYPE dotProd[OFM_BLOCKS_PER_SIMD][OUT_BLOCK_DEPTH][OUT_BLOCK_HEIGHT][OUT_BLOCK_WIDTH] = { };
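This hunk switches the output-to-input coordinate mapping from the symmetric PADDING_SIZE_* macros to the tensor's actual before-padding (and renames k_start to in_f_start, since it indexes input features). With pad_before == pad_after the two forms agree, which would explain why the mismatch only shows up for asymmetrically padded inputs. A minimal host-side sketch of the index mapping, assuming the standard convolution definition (the names here are illustrative, not the kernel's):

```cpp
#include <cassert>

// Illustration only: mapping an output coordinate to the first input
// coordinate read by the convolution window. The offset must subtract the
// "before" padding specifically; a single symmetric padding value is only
// equivalent when pad_before == pad_after.
int first_input_coord(int out_coord, int stride, int pad_before) {
    return out_coord * stride - pad_before;
}

int main() {
    const int stride = 2, pad_before = 1;  // example: pad_before = 1, pad_after = 2
    assert(first_input_coord(0, stride, pad_before) == -1);  // starts inside the left pad
    assert(first_input_coord(3, stride, pad_before) == 5);
    return 0;
}
```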
@@ -110,10 +110,8 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
     if (ixb != CEIL_DIV(IN_BLOCK_WIDTH, SIMD) - 1) {
 #if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
         if (in_f_offset == 0) {
-            input_val[izb][iyb][ixb] = as_uint4(vload16(0, conv_input + input_idx + get_sub_group_local_id() * FSV));
-#else
-            input_val[izb][iyb][ixb] = vload4(0, (__global uint *)(conv_input + input_idx + get_sub_group_local_id() * FSV));
 #endif
+        input_val[izb][iyb][ixb] = vload4(0, (__global uint *)(conv_input + input_idx + get_sub_group_local_id() * FSV));
 #if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
         } else {
             INPUT0_TYPE* input_int8_arr = (INPUT0_TYPE*) &input_val[izb][iyb][ixb];
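This hunk (and the matching one further down) collapses the duplicated load into a single vload4 through a uint pointer for the aligned case, keeping the byte-wise fallback only for grouped convolutions whose per-group input feature count (FILTER_IFM_NUM) is not a multiple of FSV. Both load forms fetch the same 16 consecutive bytes; a small host-side sketch (plain C++, illustrative only, little-endian assumed) of why reading four packed 32-bit words and reading 16 bytes are interchangeable for the IMAD accumulation:

```cpp
#include <cstdint>
#include <cstring>
#include <cassert>

// Illustration only (host-side C++, not the kernel): 16 consecutive int8
// activations can be read either byte-by-byte or as four packed 32-bit words.
// On a little-endian device both views carry the same bit pattern, which is
// what the IMAD dot product consumes.
int main() {
    int8_t bytes[16];
    for (int i = 0; i < 16; ++i) bytes[i] = static_cast<int8_t>(i - 8);

    // View A: reinterpret the 16 bytes as four 32-bit words (vload4 analogue).
    uint32_t words[4];
    std::memcpy(words, bytes, sizeof(words));

    // View B: pack the same bytes manually, little-endian
    // (analogue of as_uint4(vload16(...)) in the removed branch).
    for (int w = 0; w < 4; ++w) {
        uint32_t packed = 0;
        for (int b = 0; b < 4; ++b)
            packed |= static_cast<uint32_t>(static_cast<uint8_t>(bytes[w * 4 + b])) << (8 * b);
        assert(packed == words[w]);
    }
    return 0;
}
```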
@@ -122,11 +120,11 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
                 if (v + in_f_offset < FSV) {
                     input_int8_arr[v] = conv_input[input_idx + get_sub_group_local_id() * FSV + v];
                 } else {
-                    input_int8_arr[v] = conv_input[input_idx + get_sub_group_local_id() * FSV + v +
-                                                   ((INPUT0_SIZE_X + 2*PADDING_SIZE_X) *
-                                                    (INPUT0_SIZE_Y + 2*PADDING_SIZE_Y) *
-                                                    (INPUT0_SIZE_Z + 2*PADDING_SIZE_Z) - 1) *
-                                                   FSV];
+                    const uint addr = input_idx + get_sub_group_local_id() * FSV + v +
+                                      ((INPUT0_SIZE_X + INPUT0_PAD_BEFORE_SIZE_X + INPUT0_PAD_AFTER_SIZE_X) *
+                                       (INPUT0_SIZE_Y + INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y) *
+                                       (INPUT0_SIZE_Z + INPUT0_PAD_BEFORE_SIZE_Z + INPUT0_PAD_AFTER_SIZE_Z) - 1) * FSV;
+                    input_int8_arr[v] = conv_input[addr];
                 }
             }
         }
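When a group's input features straddle the 16-feature slice, the tail bytes live one feature slice further along the padded buffer, so the jump added to the address must be the full padded-plane size times FSV. The old expression doubled a single symmetric padding value, which diverges from pad_before + pad_after whenever the padding is asymmetric; this is presumably part of the accuracy issue the commit addresses. A rough sketch of the arithmetic, with made-up sizes:

```cpp
#include <cstdio>

// Illustration only, with made-up sizes: the jump to the next 16-feature
// slice must use the padded plane size, i.e. size + pad_before + pad_after
// in every spatial dimension. With asymmetric padding this differs from the
// old 2 * PADDING_SIZE form, which lands on the wrong wrap-around address.
int main() {
    const int size_x = 10, pad_before_x = 0, pad_after_x = 1;  // asymmetric X padding
    const int size_y = 10, size_z = 1, fsv = 16;               // no Y/Z padding in this example

    const int pitch_actual    = (size_x + pad_before_x + pad_after_x) * size_y * size_z * fsv;
    const int pitch_symmetric = (size_x + 2 * pad_before_x)           * size_y * size_z * fsv;

    std::printf("padded pitch = %d, symmetric guess = %d\n", pitch_actual, pitch_symmetric);  // 1760 vs 1600
    return 0;
}
```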
@@ -134,10 +132,8 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
     } else {
 #if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
         if (in_f_offset == 0) {
-            input_val[izb][iyb][ixb] = as_uint4(vload16(0, conv_input + input_idx + tmp * FSV));
-#else
-            input_val[izb][iyb][ixb] = vload4(0, (__global uint*)(conv_input + input_idx + tmp * FSV));
 #endif
+        input_val[izb][iyb][ixb] = vload4(0, (__global uint*)(conv_input + input_idx + tmp * FSV));
 #if ((FILTER_GROUPS_NUM > 1) && (FILTER_IFM_NUM % FSV != 0))
         } else {
             INPUT0_TYPE* input_int8_arr = (INPUT0_TYPE*) &input_val[izb][iyb][ixb];
@@ -146,11 +142,11 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
                 if (v + in_f_offset < FSV) {
                     input_int8_arr[v] = conv_input[input_idx + tmp * FSV + v];
                 } else {
-                    input_int8_arr[v] = conv_input[input_idx + tmp * FSV + v +
-                                                   ((INPUT0_SIZE_X + 2*PADDING_SIZE_X) *
-                                                    (INPUT0_SIZE_Y + 2*PADDING_SIZE_Y) *
-                                                    (INPUT0_SIZE_Z + 2*PADDING_SIZE_Z) - 1) *
-                                                   FSV];
+                    const uint addr = input_idx + tmp * FSV + v +
+                                      ((INPUT0_SIZE_X + INPUT0_PAD_BEFORE_SIZE_X + INPUT0_PAD_AFTER_SIZE_X) *
+                                       (INPUT0_SIZE_Y + INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y) *
+                                       (INPUT0_SIZE_Z + INPUT0_PAD_BEFORE_SIZE_Z + INPUT0_PAD_AFTER_SIZE_Z) - 1) * FSV;
+                    input_int8_arr[v] = conv_input[addr];
                 }
             }
         }
@@ -183,7 +179,6 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
     for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
         __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
         for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ow++) {
-            const uint ow_offset = ow + OUT_BLOCK_WIDTH;
             const uint z_block_idx = od * STRIDE_SIZE_Z + fzu * DILATION_SIZE_Z;
             const uint y_block_idx = oh * STRIDE_SIZE_Y + fyu * DILATION_SIZE_Y;
             const uint x_block_idx = ow * STRIDE_SIZE_X + fx * DILATION_SIZE_X;
@@ -7206,15 +7206,6 @@ INSTANTIATE_TEST_CASE_P(convolution_grouped_fsv4_fsv16,
         // Input X size, Input Y size, Input Z size, Input features, Output features,
         // Kernel size X, Kernel size Y, Kernel size Z, Groups number, Stride, Batch,
         // Input data format, Implementation name
-        // Format: b_fs_yx_fsv16
-        TestParamType_grouped_convolution_gpu(12, 12, 1, 96, 96, 3, 3, 1, 32, 1, 1, format::b_fs_yx_fsv16, ""),
-        TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 16, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""),
-        TestParamType_grouped_convolution_gpu(7, 7, 1, 8, 4, 3, 3, 1, 4, 1, 1, format::b_fs_yx_fsv16, ""),
-        TestParamType_grouped_convolution_gpu(5, 5, 1, 34, 12, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""),
-        TestParamType_grouped_convolution_gpu(8, 8, 1, 34, 24, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""),
-        TestParamType_grouped_convolution_gpu(2, 2, 1, 12, 12, 3, 3, 1, 4, 1, 1, format::b_fs_yx_fsv16, ""),
-        TestParamType_grouped_convolution_gpu(3, 3, 1, 8, 8, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""),
-        TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 2, 4, format::b_fs_yx_fsv16, ""),

         // Format: b_fs_yx_fsv4
         TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 17, 3, 3, 1, 1, 1, 1, format::b_fs_yx_fsv4, ""),
@@ -7227,6 +7218,14 @@ INSTANTIATE_TEST_CASE_P(convolution_grouped_fsv4_fsv16,
         TestParamType_grouped_convolution_gpu(6, 6, 1, 8, 26, 3, 3, 1, 2, 4, 1, format::b_fs_yx_fsv4, ""),

         // Format: b_fs_yx_fsv16
+        TestParamType_grouped_convolution_gpu(12, 12, 1, 96, 96, 3, 3, 1, 32, 1, 1, format::b_fs_yx_fsv16, ""),
+        TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 16, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""),
+        TestParamType_grouped_convolution_gpu(7, 7, 1, 8, 4, 3, 3, 1, 4, 1, 1, format::b_fs_yx_fsv16, ""),
+        TestParamType_grouped_convolution_gpu(5, 5, 1, 34, 12, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""),
+        TestParamType_grouped_convolution_gpu(8, 8, 1, 34, 24, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""),
+        TestParamType_grouped_convolution_gpu(2, 2, 1, 12, 12, 3, 3, 1, 4, 1, 1, format::b_fs_yx_fsv16, ""),
+        TestParamType_grouped_convolution_gpu(3, 3, 1, 8, 8, 3, 3, 1, 2, 1, 1, format::b_fs_yx_fsv16, ""),
+        TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 2, 4, format::b_fs_yx_fsv16, ""),
         TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 17, 3, 3, 1, 1, 1, 1, format::b_fs_yx_fsv16, ""),
         TestParamType_grouped_convolution_gpu(4, 4, 1, 16, 16, 3, 3, 1, 4, 1, 1, format::b_fs_yx_fsv16, ""),
         TestParamType_grouped_convolution_gpu(4, 4, 1, 8, 4, 2, 2, 1, 2, 1, 4, format::b_fs_yx_fsv16, ""),
@@ -7249,6 +7248,8 @@ INSTANTIATE_TEST_CASE_P(convolution_grouped_fsv4_fsv16,
         TestParamType_grouped_convolution_gpu(8, 6, 4, 32, 64, 2, 2, 2, 2, 1, 1, format::b_fs_zyx_fsv16, ""),
        TestParamType_grouped_convolution_gpu(33, 6, 4, 16, 32, 4, 3, 2, 2, 1, 1, format::b_fs_zyx_fsv16, ""),
         TestParamType_grouped_convolution_gpu(33, 1, 1, 30, 62, 1, 1, 1, 2, 1, 1, format::b_fs_zyx_fsv16, ""),
+        TestParamType_grouped_convolution_gpu(80, 1, 1, 48, 96, 33, 1, 1, 2, 8, 1, format::b_fs_zyx_fsv16, ""),
+        TestParamType_grouped_convolution_gpu(2, 1, 1, 18, 32, 3, 1, 1, 2, 2, 1, format::b_fs_zyx_fsv16, ""),
         TestParamType_grouped_convolution_gpu(3, 1, 5, 196, 252, 3, 1, 3, 4, 1, 1, format::b_fs_zyx_fsv16, ""),
         TestParamType_grouped_convolution_gpu(4, 1, 6, 256, 256, 2, 1, 2, 4, 1, 1, format::b_fs_zyx_fsv16, ""),
         TestParamType_grouped_convolution_gpu(4, 1, 6, 256, 512, 2, 1, 3, 16, 1, 1, format::b_fs_zyx_fsv16, ""),
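For reference, the comment block at the head of the parameter list above documents the tuple layout. A small illustrative decoding of one of the b_fs_zyx_fsv16 cases listed here, using a hypothetical struct rather than the real test fixture:

```cpp
#include <cassert>

// Illustration only (hypothetical struct, not the real test fixture): the
// tuple fields follow the order documented at the top of the parameter list:
// input X/Y/Z size, input features, output features, kernel X/Y/Z size,
// groups, stride, batch, input format, implementation name.
struct GroupedConvCase {
    int in_x, in_y, in_z, ifm, ofm;
    int k_x, k_y, k_z, groups, stride, batch;
};

int main() {
    // Decoding one of the b_fs_zyx_fsv16 cases above:
    // TestParamType_grouped_convolution_gpu(2, 1, 1, 18, 32, 3, 1, 1, 2, 2, 1, format::b_fs_zyx_fsv16, "")
    const GroupedConvCase c{2, 1, 1, 18, 32, 3, 1, 1, 2, 2, 1};

    // 18 input features over 2 groups -> 9 features per group, which is not a
    // multiple of FSV = 16, so this case exercises the grouped, misaligned-IFM
    // path touched by the kernel fix.
    assert(c.ifm % c.groups == 0 && (c.ifm / c.groups) % 16 != 0);
    return 0;
}
```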