[IE CLDNN] Improve bfyx convolution performance for shallow output ch (#6399)
commit 5d6fa64621
parent afe60b3263
convolution_kernel_bfyx_os_iyx_osv16.cpp

@@ -9,7 +9,6 @@
 namespace kernel_selector {
 // Sub-group size used by "kernel_name_bfyx_os_iyx_osv16" kernel.
-constexpr size_t sub_group_size = 16;
 
 ConvolutionKernel_bfyx_os_iyx_osv16::ConvolutionKernel_bfyx_os_iyx_osv16()
     : ConvolutionKernelBase("convolution_gpu_bfyx_os_iyx_osv16") {
@@ -94,6 +93,9 @@ static void shrink_blocks_to_output_size(size_t output_x, size_t output_y, size_
 
     block_x -= unused_x / simds_x;
     block_y -= unused_y / simds_y;
+
+    block_x = Align(block_x, 2);
+    block_y = Align(block_y, 2);
 }
 
 ConvolutionKernel_bfyx_os_iyx_osv16::AutoTuneOption ConvolutionKernel_bfyx_os_iyx_osv16::GetAutoTuneOptions(
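A minimal standalone sketch of the updated shrink logic may help here. Only the two subtractions and the new Align calls are visible in this hunk, so the simds_*/unused_* computations below are reconstructed assumptions, and Align(v, n) is assumed to round v up to the nearest multiple of n, matching kernel_selector's helper:

    #include <cstddef>
    #include <cstdio>

    static size_t Align(size_t v, size_t n) { return (v + n - 1) / n * n; }  // round up

    // Reconstruction for illustration only; simds_*/unused_* are assumed.
    static void shrink_blocks_to_output_size(size_t output_x, size_t output_y,
                                             size_t& block_x, size_t& block_y) {
        size_t simds_x = (output_x + block_x - 1) / block_x;  // blocks covering X
        size_t simds_y = (output_y + block_y - 1) / block_y;  // blocks covering Y
        size_t unused_x = simds_x * block_x - output_x;       // overshoot in X
        size_t unused_y = simds_y * block_y - output_y;       // overshoot in Y

        block_x -= unused_x / simds_x;
        block_y -= unused_y / simds_y;

        // New in this commit: keep the shrunk block dimensions even.
        block_x = Align(block_x, 2);
        block_y = Align(block_y, 2);
    }

    int main() {
        size_t bx = 7, by = 5;
        shrink_blocks_to_output_size(30, 18, bx, by);
        std::printf("%zux%zu\n", bx, by);  // prints 6x6: both dimensions end up even
    }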
@@ -107,9 +109,11 @@ ConvolutionKernel_bfyx_os_iyx_osv16::AutoTuneOption ConvolutionKernel_bfyx_os_iy
 
     const convolution_params& cp = static_cast<const convolution_params&>(p);
 
+    const auto& sub_group_size = GetSubGroupSize(cp);
+
     if (cp.stride.x == 1 && cp.stride.y == 1) {
         if (cp.filterSize.x == 1 && cp.filterSize.y == 1) {
-            option.blockWidth = 16;
+            option.blockWidth = sub_group_size;
             option.blockHeight = 1;
             option.prefetch = 4;
         // if less than 16 values is required to compute one single row of output
@@ -143,13 +147,13 @@ ConvolutionKernel_bfyx_os_iyx_osv16::AutoTuneOption ConvolutionKernel_bfyx_os_iy
     if (cp.filterSize.x != 1 || cp.filterSize.y != 1 || cp.output.Batch().v != 1) {
         shrink_blocks_to_output_size(cp.output.X().v, cp.output.Y().v, option.blockWidth, option.blockHeight);
     }
 
     return option;
 }
 
 ConvolutionKernelBase::DispatchData ConvolutionKernel_bfyx_os_iyx_osv16::SetDefault(const convolution_params& cp,
                                                                                     int autoTuneIndex) const {
     DispatchData dispatchData = ConvolutionKernelBase::SetDefault(cp);
+    const auto& sub_group_size = GetSubGroupSize(cp);
 
     const auto of_maps = cp.output.Feature().v;
     const auto of_maps_per_group = of_maps / cp.groups;
@@ -196,6 +200,9 @@ bool ConvolutionKernel_bfyx_os_iyx_osv16::Validate(const Params& p, const option
 
 JitConstants ConvolutionKernel_bfyx_os_iyx_osv16::GetJitConstants(const convolution_params& params,
                                                                   const DispatchData& dispatchData) const {
+    const convolution_params& cp = static_cast<const convolution_params&>(params);
+    const auto& sub_group_size = GetSubGroupSize(cp);
+
     const auto of_maps = params.output.Feature().v;
     const auto of_maps_per_group = of_maps / params.groups;
     const size_t of_threads_per_batch = RoundUp(of_maps_per_group, sub_group_size);
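This RoundUp is where shallow output channels previously wasted lanes: with the sub-group size pinned at 16, a group with only 8 output feature maps still occupied 16 threads per batch. A back-of-envelope sketch, assuming RoundUp rounds up to the nearest multiple (matching kernel_selector's helper):

    #include <cstddef>
    #include <cstdio>

    static size_t RoundUp(size_t v, size_t m) { return (v + m - 1) / m * m; }

    int main() {
        const size_t of_maps_per_group = 8;                    // shallow output channels
        std::printf("%zu\n", RoundUp(of_maps_per_group, 16));  // before: 16 threads, 8 idle
        std::printf("%zu\n", RoundUp(of_maps_per_group, 8));   // after:  8 threads, 0 idle
    }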
@@ -209,7 +216,7 @@ JitConstants ConvolutionKernel_bfyx_os_iyx_osv16::GetJitConstants(const convolut
         jit.Merge(MakeFusedOpsJitConstants(params, {conf_scalar}));
     }
 
+    jit.AddConstant(MakeJitConstant("OSV_SIZE", 16));
     jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", dispatchData.lws[2]));
     jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_WIDTH", dispatchData.cldnnStyle.blockWidth));
     jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_HEIGHT", dispatchData.cldnnStyle.blockHeight));
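The key decoupling happens here: SUB_GROUP_SIZE (how many lanes execute, taken from lws[2]) can now be 8, while the new OSV_SIZE constant stays 16 because the weights remain in the osv16 layout. A hedged illustration (not part of the commit) of the values a shallow-channel case would get:

    // Illustration only; the field names mirror the jit constants above.
    struct KernelConstants { unsigned osv_size, sub_group_size; };

    // Shallow outputs (<= 8 feature maps) on a <= 24 EU device:
    constexpr KernelConstants shallow{ /*osv_size*/ 16, /*sub_group_size*/ 8 };
    // Everywhere else the two still coincide:
    constexpr KernelConstants regular{ /*osv_size*/ 16, /*sub_group_size*/ 16 };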
convolution_kernel_bfyx_os_iyx_osv16.h

@@ -4,6 +4,7 @@
 
 #pragma once
 
+#include "api/cldnn/runtime/device_info.hpp"
 #include "convolution_kernel_base.h"
 #include <string>
 #include <vector>
@@ -34,6 +35,16 @@ protected:
     bool Validate(const Params& p, const optional_params& o) const override;
     bool NeedPaddedInput() const override { return true; }
     DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
+    size_t GetSubGroupSize(const convolution_params& params) const {
+        if (params.engineInfo.computeUnitsCount <= 24) {
+            // A smaller # of EUs tends to be compute bound.
+            // In such a case, using a larger work size results in greater
+            // computational inefficiency w.r.t. the unaligned output features.
+            return (params.output.Feature().v > 8) ? 16 : 8;
+        } else {
+            return 16;
+        }
+    }
 
 private:
     struct AutoTuneOption {
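To make the heuristic concrete, a standalone rendering (illustration only, not part of the commit) with representative inputs:

    #include <cstddef>
    #include <cstdio>

    // Mirrors GetSubGroupSize above: small GPUs (<= 24 EUs) drop to SIMD8
    // when the output is shallow (<= 8 feature maps); everything else stays SIMD16.
    static size_t sub_group_size_for(size_t eu_count, size_t out_features) {
        if (eu_count <= 24)
            return (out_features > 8) ? 16 : 8;
        return 16;
    }

    int main() {
        std::printf("%zu\n", sub_group_size_for(24, 8));   // 8  -- the case this commit targets
        std::printf("%zu\n", sub_group_size_for(24, 32));  // 16
        std::printf("%zu\n", sub_group_size_for(96, 8));   // 16 -- large GPUs are unaffected
    }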
convolution_gpu_bfyx_os_iyx_osv16.cl

@@ -83,12 +83,11 @@ KERNEL(convolution_gpu_bfyx_os_iyx_osv16)(
     uint fmg = feature_idx / SUB_GROUP_SIZE;
     const uint g = split_idx;
 #endif
 
     UNIT_TYPE in[IN_BLOCK_ARRAY_SIZE];
     UNIT_TYPE out[OUTPUT_BLOCK_WIDTH * OUTPUT_BLOCK_HEIGHT];
     UNIT_TYPE w[PREFETCH];
     uint in_addr;
-    uint weight_addr = fmg * FILTER_IFM_NUM * FILTER_SIZE_X * FILTER_SIZE_Y * SUB_GROUP_SIZE + lid;
+    uint weight_addr = fmg * FILTER_IFM_NUM * FILTER_SIZE_X * FILTER_SIZE_Y * OSV_SIZE + lid;
 
 #if GROUPED
     weight_addr += g * FILTER_GROUPS_PITCH;
@@ -156,7 +155,7 @@ KERNEL(convolution_gpu_bfyx_os_iyx_osv16)(
     in_addr += INPUT0_FEATURE_PITCH;
 
     for(int pf=0; pf<PREFETCH; pf++) {
-        w[pf] = weights[weight_addr]; weight_addr += SUB_GROUP_SIZE;
+        w[pf] = weights[weight_addr]; weight_addr += OSV_SIZE;
     }
 
     uint wi = 0;
@@ -182,12 +181,12 @@ KERNEL(convolution_gpu_bfyx_os_iyx_osv16)(
             }
         }
         w[wi % PREFETCH] = weights[weight_addr];
-        weight_addr += SUB_GROUP_SIZE; // weights must be stored in just the right SIMD swizzled format for this to work, see host code for details.
+        weight_addr += OSV_SIZE; // weights must be stored in just the right SIMD swizzled format for this to work, see host code for details.
         wi++;
     });
 });
     // addr went beyond due to prefetch so move it back to correct location.
-    weight_addr -= PREFETCH * SUB_GROUP_SIZE;
+    weight_addr -= PREFETCH * OSV_SIZE;
 }
 
 uint out_split_offset = g * OUTPUT_FEATURE_PITCH * FILTER_OFM_NUM;
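Why the kernel stride is now OSV_SIZE rather than SUB_GROUP_SIZE: in the os_iyx_osv16 weight layout, each (ifm, y, x) step stores a block of 16 output-feature values, so the per-step stride is a property of the layout, not of the dispatch width. A hedged sketch of the address arithmetic (weight_offset is a hypothetical helper written for illustration):

    enum { OSV_SIZE = 16 };  // osv16 layout block: 16 output features per (ifm, y, x) step

    // Hypothetical host-side model of the kernel's weight_addr computation.
    unsigned weight_offset(unsigned fmg, unsigned ifm_num,
                           unsigned filter_x, unsigned filter_y, unsigned lid) {
        // Base of this output-feature group's block, plus this lane's slot.
        // With an 8-wide sub-group, lid runs 0..7 but still indexes a 16-entry block,
        // so advancing by OSV_SIZE per step stays correct; the old SUB_GROUP_SIZE
        // stride only worked because the sub-group size was hard-coded to 16.
        return fmg * ifm_num * filter_x * filter_y * OSV_SIZE + lid;
    }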