[IE CLDNN] Improve kernel selection for b_fs_yx_fsv16 layout and optimize Convolution kernels (#730)

This commit is contained in:
Sergey Shlyapnikov 2020-06-03 13:42:15 +03:00 committed by GitHub
parent b457553593
commit 20ef9a9423
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 28 additions and 31 deletions

View File

@@ -78,6 +78,7 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_b_fs_yx_fsv16_1x1::SetDefault(
     auto autoTune = GetAutoTuneOptions(params, autoTuneIndex);
     kd.cldnnStyle.blockWidth = autoTune.blockWidth;
+    const auto& input = params.inputs[0];
     const auto& out = params.output;
     auto x = out.X().v;
     auto y = out.Y().v;
@@ -92,11 +93,16 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_b_fs_yx_fsv16_1x1::SetDefault(
     kd.lws1 = sub_group_size;
     kd.lws2 = 1;

+    auto bBlockSizeX = x % autoTune.blockWidth == 0;
+    auto bBlockSizeXY = out.X().pad.Total() + out.Y().pad.Total() == 0;
+    auto bInputPad = input.X().pad.Total() + input.Y().pad.Total() != 0;
     if (b == 1) {
-        if (x <= 8)
+        if ((bBlockSizeX || bBlockSizeXY) && !bInputPad) {
             kd.efficiency = FORCE_PRIORITY_1;
-        else
-            kd.efficiency = FORCE_PRIORITY_2;
+        } else {
+            kd.efficiency = FORCE_PRIORITY_3;
+        }
     } else {
         kd.efficiency = FORCE_PRIORITY_7;
     }

View File

@@ -1,4 +1,4 @@
-// Copyright (c) 2016-2019 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -161,9 +161,14 @@ KERNEL(convolution_bfyx_f16)(
     vec_t dst = INPUT0_VAL_ZERO;
 #endif // BIAS_TERM
-#ifndef MULTIPLE_GROUPS_INPUT_PRELOAD
-    for (uint g = group; g < group + groups_per_sub_group; g++) {
+#if MULTIPLE_GROUPS_INPUT_PRELOAD
+    const uint in_split_offset = f_block * input_fs_pitch;
+    const uint g = lid / (FEATURE_SLICE_SIZE / groups_per_sub_group);
+    const uint ofm_in_group = lid % (FEATURE_SLICE_SIZE / groups_per_sub_group);
+    const uint grouped_filter_offset = (group + g) * FILTER_GROUPS_PITCH;
+#else
 #if GROUPED
+    for (uint g = group; g < group + groups_per_sub_group; g++) {
         const uint in_split_offset = g * input_fs_pitch * (FILTER_IFM_NUM / FEATURE_SLICE_SIZE);
         const uint filter_split_offset = g * FILTER_GROUPS_PITCH;
         const uint filter_offset = (f_block % (FILTER_OFM_NUM / FEATURE_SLICE_SIZE)) * filter_os_pitch;
@@ -173,11 +178,6 @@ KERNEL(convolution_bfyx_f16)(
         const uint filter_offset = f_block * filter_os_pitch;
 #endif // GROUPED
         const uint grouped_filter_offset = filter_offset + filter_split_offset;
-#else
-        const uint in_split_offset = f_block * input_fs_pitch;
-        const uint g = lid / (FEATURE_SLICE_SIZE / groups_per_sub_group);
-        const uint ofm_in_group = lid % (FEATURE_SLICE_SIZE / groups_per_sub_group);
-        const uint grouped_filter_offset = (group + g) * FILTER_GROUPS_PITCH;
 #endif // MULTIPLE_GROUPS_INPUT_PRELOAD
         const uint grouped_input_offset = input_offset + in_split_offset;
@@ -248,7 +248,11 @@ KERNEL(convolution_bfyx_f16)(
     vec_t src;
     __attribute__((opencl_unroll_hint(OUTPUT_X_BLOCK_SIZE)))
     for (int i = 0; i < OUTPUT_X_BLOCK_SIZE; i++) {
+#if FILTER_SIZE_X == 1 && DILATION_SIZE_X == 1 && STRIDE_SIZE_X == 1
+        src[i] = line_cache[i];
+#else
         src[i] = line_cache[kw*DILATION_SIZE_X + STRIDE_SIZE_X*i];
+#endif // FILTER_SIZE_X == 1 && DILATION_SIZE_X == 1 && STRIDE_SIZE_X == 1
     }
 #if MULTIPLE_GROUPS_INPUT_PRELOAD
     typedef MAKE_VECTOR_TYPE(FILTER_TYPE, FILTER_IFM_NUM) ifm_vec_t;
@@ -345,9 +349,9 @@ KERNEL(convolution_bfyx_f16)(
                 }
             }
         }
-#ifndef MULTIPLE_GROUPS_INPUT_PRELOAD
+#if GROUPED && !MULTIPLE_GROUPS_INPUT_PRELOAD
     }
-#endif // MULTIPLE_GROUPS_INPUT_PRELOAD
+#endif // GROUPED && !MULTIPLE_GROUPS_INPUT_PRELOAD
     dst = ACTIVATION(dst, ACTIVATION_PARAMS);
     typedef MAKE_VECTOR_TYPE(OUTPUT_TYPE, OUTPUT_X_BLOCK_SIZE) out_vec_t;
@@ -370,7 +374,7 @@ KERNEL(convolution_bfyx_f16)(
     else
 #endif // OUTPUT_LEFTOVERS
     {
-        if (x + OUTPUT_X_BLOCK_SIZE <= OUTPUT_SIZE_X) {
+        if (x + OUTPUT_X_BLOCK_SIZE <= OUTPUT_SIZE_X || OUTPUT_SIZE_X % OUTPUT_X_BLOCK_SIZE == 0) {
 #if HAS_FUSED_OPS
             FUSED_OPS_VEC;
             res = FUSED_OPS_RESULT_VEC;
@@ -390,8 +394,7 @@ KERNEL(convolution_bfyx_f16)(
 #   error convolution_gpu_bfyx_f16.cl: Unsupported output x block size.
 #endif
         } else {
-            const int x_tail = OUTPUT_SIZE_X - x;
-            for (int i = 0; i < x_tail; i++) {
+            for (int i = 0; i < OUTPUT_SIZE_X % OUTPUT_X_BLOCK_SIZE; i++) {
 #if HAS_FUSED_OPS
                 FUSED_OPS_SCALAR;
                 res[i] = FUSED_OPS_RESULT_SCALAR;

View File

@@ -208,21 +208,10 @@ KERNEL(convolution_b_fs_yx_fsv16_1x1)(
 #endif
     {
 #if !PADDED_OUTPUT
-        if (xy * X_BLOCK_SIZE + X_BLOCK_SIZE <= OUTPUT_SIZE_X * OUTPUT_SIZE_Y) {
-#if HAS_FUSED_OPS
-            FUSED_OPS_VEC;
-            dst = FUSED_OPS_RESULT_VEC;
-#endif
-#if X_BLOCK_SIZE == 8
-            UNIT_BLOCK_WRITE8(output, output_offset + y * output_y_pitch + x * output_x_pitch, dst);
-#elif X_BLOCK_SIZE == 4
-            UNIT_BLOCK_WRITE4(output, output_offset + y * output_y_pitch + x * output_x_pitch, dst);
-#elif X_BLOCK_SIZE == 2
-            UNIT_BLOCK_WRITE2(output, output_offset + y * output_y_pitch + x * output_x_pitch, dst);
-#endif
-        } else {
+        if (xy * X_BLOCK_SIZE + X_BLOCK_SIZE <= OUTPUT_SIZE_X * OUTPUT_SIZE_Y || (OUTPUT_SIZE_X * OUTPUT_SIZE_Y) % X_BLOCK_SIZE == 0) {
 #else
-        if (x * X_BLOCK_SIZE + X_BLOCK_SIZE <= OUTPUT_SIZE_X) {
+        if (x + X_BLOCK_SIZE <= OUTPUT_SIZE_X || OUTPUT_SIZE_X % X_BLOCK_SIZE == 0) {
 #endif
 #if HAS_FUSED_OPS
         FUSED_OPS_VEC;
         dst = FUSED_OPS_RESULT_VEC;
@@ -235,7 +224,6 @@ KERNEL(convolution_b_fs_yx_fsv16_1x1)(
             UNIT_BLOCK_WRITE2(output, output_offset + y * output_y_pitch + x * output_x_pitch, dst);
 #endif
     } else {
-#endif
         for (int i = 0; i < X_BLOCK_SIZE; i++) {
             if (xy * X_BLOCK_SIZE + i >= OUTPUT_SIZE_X * OUTPUT_SIZE_Y)
                 return;