[IE CLDNN] Improve kernel selection for b_fs_yx_fsv16 layout and optimize Convolution kernels (#730)

This commit is contained in:
Sergey Shlyapnikov 2020-06-03 13:42:15 +03:00 committed by GitHub
parent b457553593
commit 20ef9a9423
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 28 additions and 31 deletions

View File

@@ -78,6 +78,7 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_b_fs_yx_fsv16_1x1::SetDefault(
     auto autoTune = GetAutoTuneOptions(params, autoTuneIndex);
     kd.cldnnStyle.blockWidth = autoTune.blockWidth;
+    const auto& input = params.inputs[0];
     const auto& out = params.output;
     auto x = out.X().v;
     auto y = out.Y().v;
@@ -92,11 +93,16 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_b_fs_yx_fsv16_1x1::SetDefault(
     kd.lws1 = sub_group_size;
     kd.lws2 = 1;

+    auto bBlockSizeX = x % autoTune.blockWidth == 0;
+    auto bBlockSizeXY = out.X().pad.Total() + out.Y().pad.Total() == 0;
+    auto bInputPad = input.X().pad.Total() + input.Y().pad.Total() != 0;
     if (b == 1) {
-        if (x <= 8)
+        if ((bBlockSizeX || bBlockSizeXY) && !bInputPad) {
             kd.efficiency = FORCE_PRIORITY_1;
-        else
-            kd.efficiency = FORCE_PRIORITY_2;
+        } else {
+            kd.efficiency = FORCE_PRIORITY_3;
+        }
     } else {
         kd.efficiency = FORCE_PRIORITY_7;
     }

View File

@@ -1,4 +1,4 @@
-// Copyright (c) 2016-2019 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -161,9 +161,14 @@ KERNEL(convolution_bfyx_f16)(
     vec_t dst = INPUT0_VAL_ZERO;
 #endif // BIAS_TERM
-#ifndef MULTIPLE_GROUPS_INPUT_PRELOAD
-    for (uint g = group; g < group + groups_per_sub_group; g++) {
+#if MULTIPLE_GROUPS_INPUT_PRELOAD
+    const uint in_split_offset = f_block * input_fs_pitch;
+    const uint g = lid / (FEATURE_SLICE_SIZE / groups_per_sub_group);
+    const uint ofm_in_group = lid % (FEATURE_SLICE_SIZE / groups_per_sub_group);
+    const uint grouped_filter_offset = (group + g) * FILTER_GROUPS_PITCH;
+#else
 #if GROUPED
+    for (uint g = group; g < group + groups_per_sub_group; g++) {
         const uint in_split_offset = g * input_fs_pitch * (FILTER_IFM_NUM / FEATURE_SLICE_SIZE);
         const uint filter_split_offset = g * FILTER_GROUPS_PITCH;
         const uint filter_offset = (f_block % (FILTER_OFM_NUM / FEATURE_SLICE_SIZE)) * filter_os_pitch;
@@ -173,11 +178,6 @@ KERNEL(convolution_bfyx_f16)(
         const uint filter_offset = f_block * filter_os_pitch;
 #endif // GROUPED
         const uint grouped_filter_offset = filter_offset + filter_split_offset;
-#else
-        const uint in_split_offset = f_block * input_fs_pitch;
-        const uint g = lid / (FEATURE_SLICE_SIZE / groups_per_sub_group);
-        const uint ofm_in_group = lid % (FEATURE_SLICE_SIZE / groups_per_sub_group);
-        const uint grouped_filter_offset = (group + g) * FILTER_GROUPS_PITCH;
 #endif // MULTIPLE_GROUPS_INPUT_PRELOAD
         const uint grouped_input_offset = input_offset + in_split_offset;
@@ -248,7 +248,11 @@ KERNEL(convolution_bfyx_f16)(
     vec_t src;
     __attribute__((opencl_unroll_hint(OUTPUT_X_BLOCK_SIZE)))
     for (int i = 0; i < OUTPUT_X_BLOCK_SIZE; i++) {
+#if FILTER_SIZE_X == 1 && DILATION_SIZE_X == 1 && STRIDE_SIZE_X == 1
+        src[i] = line_cache[i];
+#else
         src[i] = line_cache[kw*DILATION_SIZE_X + STRIDE_SIZE_X*i];
+#endif // FILTER_SIZE_X == 1 && DILATION_SIZE_X == 1 && STRIDE_SIZE_X == 1
     }
 #if MULTIPLE_GROUPS_INPUT_PRELOAD
     typedef MAKE_VECTOR_TYPE(FILTER_TYPE, FILTER_IFM_NUM) ifm_vec_t;
@@ -345,9 +349,9 @@ KERNEL(convolution_bfyx_f16)(
                 }
             }
         }
-#ifndef MULTIPLE_GROUPS_INPUT_PRELOAD
+#if GROUPED && !MULTIPLE_GROUPS_INPUT_PRELOAD
     }
-#endif // MULTIPLE_GROUPS_INPUT_PRELOAD
+#endif // GROUPED && !MULTIPLE_GROUPS_INPUT_PRELOAD
     dst = ACTIVATION(dst, ACTIVATION_PARAMS);
     typedef MAKE_VECTOR_TYPE(OUTPUT_TYPE, OUTPUT_X_BLOCK_SIZE) out_vec_t;
@@ -370,7 +374,7 @@ KERNEL(convolution_bfyx_f16)(
     else
 #endif // OUTPUT_LEFTOVERS
     {
-        if (x + OUTPUT_X_BLOCK_SIZE <= OUTPUT_SIZE_X) {
+        if (x + OUTPUT_X_BLOCK_SIZE <= OUTPUT_SIZE_X || OUTPUT_SIZE_X % OUTPUT_X_BLOCK_SIZE == 0) {
 #if HAS_FUSED_OPS
             FUSED_OPS_VEC;
             res = FUSED_OPS_RESULT_VEC;
@@ -390,8 +394,7 @@ KERNEL(convolution_bfyx_f16)(
 #   error convolution_gpu_bfyx_f16.cl: Unsupported output x block size.
 #endif
         } else {
-            const int x_tail = OUTPUT_SIZE_X - x;
-            for (int i = 0; i < x_tail; i++) {
+            for (int i = 0; i < OUTPUT_SIZE_X % OUTPUT_X_BLOCK_SIZE; i++) {
 #if HAS_FUSED_OPS
                 FUSED_OPS_SCALAR;
                 res[i] = FUSED_OPS_RESULT_SCALAR;

View File

@@ -208,21 +208,10 @@ KERNEL(convolution_b_fs_yx_fsv16_1x1)(
 #endif
     {
 #if !PADDED_OUTPUT
-        if (xy * X_BLOCK_SIZE + X_BLOCK_SIZE <= OUTPUT_SIZE_X * OUTPUT_SIZE_Y) {
-#if HAS_FUSED_OPS
-            FUSED_OPS_VEC;
-            dst = FUSED_OPS_RESULT_VEC;
-#endif
-#if X_BLOCK_SIZE == 8
-            UNIT_BLOCK_WRITE8(output, output_offset + y * output_y_pitch + x * output_x_pitch, dst);
-#elif X_BLOCK_SIZE == 4
-            UNIT_BLOCK_WRITE4(output, output_offset + y * output_y_pitch + x * output_x_pitch, dst);
-#elif X_BLOCK_SIZE == 2
-            UNIT_BLOCK_WRITE2(output, output_offset + y * output_y_pitch + x * output_x_pitch, dst);
-#endif
-        } else {
+        if (xy * X_BLOCK_SIZE + X_BLOCK_SIZE <= OUTPUT_SIZE_X * OUTPUT_SIZE_Y || (OUTPUT_SIZE_X * OUTPUT_SIZE_Y) % X_BLOCK_SIZE == 0) {
 #else
-        if (x * X_BLOCK_SIZE + X_BLOCK_SIZE <= OUTPUT_SIZE_X) {
+        if (x + X_BLOCK_SIZE <= OUTPUT_SIZE_X || OUTPUT_SIZE_X % X_BLOCK_SIZE == 0) {
 #endif
 #if HAS_FUSED_OPS
         FUSED_OPS_VEC;
         dst = FUSED_OPS_RESULT_VEC;
@@ -235,7 +224,6 @@ KERNEL(convolution_b_fs_yx_fsv16_1x1)(
             UNIT_BLOCK_WRITE2(output, output_offset + y * output_y_pitch + x * output_x_pitch, dst);
 #endif
     } else {
-#endif
         for (int i = 0; i < X_BLOCK_SIZE; i++) {
             if (xy * X_BLOCK_SIZE + i >= OUTPUT_SIZE_X * OUTPUT_SIZE_Y)
                 return;