From 5d6fa64621f673fec0351d522313c2443fea6471 Mon Sep 17 00:00:00 2001
From: Taylor Yeonbok Lee <taylor.lee@intel.com>
Date: Thu, 8 Jul 2021 18:25:16 +0900
Subject: [PATCH]  [IE CLDNN] Improve bfyx convolution performance for shallow
 output ch (#6399)

---
 .../convolution_kernel_bfyx_os_iyx_osv16.cpp      | 15 +++++++++++----
 .../convolution_kernel_bfyx_os_iyx_osv16.h        | 11 +++++++++++
 .../convolution_gpu_bfyx_os_iyx_osv16.cl          |  9 ++++-----
 3 files changed, 26 insertions(+), 9 deletions(-)
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16.cpp
index a80696c0b60..bef3794a657 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16.cpp
@@ -9,7 +9,6 @@
 
 namespace kernel_selector {
 // Sub-group size used by "kernel_name_bfyx_os_iyx_osv16" kernel.
-constexpr size_t sub_group_size = 16;
 
 ConvolutionKernel_bfyx_os_iyx_osv16::ConvolutionKernel_bfyx_os_iyx_osv16()
     : ConvolutionKernelBase("convolution_gpu_bfyx_os_iyx_osv16") {
@@ -94,6 +93,9 @@ static void shrink_blocks_to_output_size(size_t output_x, size_t output_y, size_
 
     block_x -= unused_x / simds_x;
     block_y -= unused_y / simds_y;
+
+    block_x = Align(block_x, 2);
+    block_y = Align(block_y, 2);
 }
 
 ConvolutionKernel_bfyx_os_iyx_osv16::AutoTuneOption ConvolutionKernel_bfyx_os_iyx_osv16::GetAutoTuneOptions(
@@ -107,9 +109,11 @@ ConvolutionKernel_bfyx_os_iyx_osv16::AutoTuneOption ConvolutionKernel_bfyx_os_iy
 
     const convolution_params& cp = static_cast<const convolution_params&>(p);
 
+    const auto& sub_group_size = GetSubGroupSize(cp);
+
     if (cp.stride.x == 1 && cp.stride.y == 1) {
         if (cp.filterSize.x == 1 && cp.filterSize.y == 1) {
-            option.blockWidth = 16;
+            option.blockWidth = sub_group_size;
             option.blockHeight = 1;
             option.prefetch = 4;
         // if less than 16 values is required to compute one single row of output
@@ -143,13 +147,13 @@ ConvolutionKernel_bfyx_os_iyx_osv16::AutoTuneOption ConvolutionKernel_bfyx_os_iy
     if (cp.filterSize.x != 1 || cp.filterSize.y != 1 || cp.output.Batch().v != 1) {
         shrink_blocks_to_output_size(cp.output.X().v, cp.output.Y().v, option.blockWidth, option.blockHeight);
     }
-
     return option;
 }
 
 ConvolutionKernelBase::DispatchData ConvolutionKernel_bfyx_os_iyx_osv16::SetDefault(const convolution_params& cp,
                                                                                     int autoTuneIndex) const {
     DispatchData dispatchData = ConvolutionKernelBase::SetDefault(cp);
+    const auto& sub_group_size = GetSubGroupSize(cp);
 
     const auto of_maps = cp.output.Feature().v;
     const auto of_maps_per_group = of_maps / cp.groups;
@@ -196,6 +200,9 @@ bool ConvolutionKernel_bfyx_os_iyx_osv16::Validate(const Params& p, const option
 
 JitConstants ConvolutionKernel_bfyx_os_iyx_osv16::GetJitConstants(const convolution_params& params,
                                                                   const DispatchData& dispatchData) const {
+    const convolution_params& cp = static_cast<const convolution_params&>(params);
+    const auto& sub_group_size = GetSubGroupSize(cp);
+
     const auto of_maps = params.output.Feature().v;
     const auto of_maps_per_group = of_maps / params.groups;
     const size_t of_threads_per_batch = RoundUp(of_maps_per_group, sub_group_size);
@@ -209,7 +216,7 @@ JitConstants ConvolutionKernel_bfyx_os_iyx_osv16::GetJitConstants(const convolut
         jit.Merge(MakeFusedOpsJitConstants(params, {conf_scalar}));
     }
 
-
+    jit.AddConstant(MakeJitConstant("OSV_SIZE", 16));
     jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", dispatchData.lws[2]));
     jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_WIDTH", dispatchData.cldnnStyle.blockWidth));
     jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_HEIGHT", dispatchData.cldnnStyle.blockHeight));
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16.h
index 6bc617bb3f3..9da52609636 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16.h
@@ -4,6 +4,7 @@
 
 #pragma once
 
+#include "api/cldnn/runtime/device_info.hpp"
 #include "convolution_kernel_base.h"
 #include <string>
 #include <vector>
@@ -34,6 +35,16 @@ protected:
     bool Validate(const Params& p, const optional_params& o) const override;
     bool NeedPaddedInput() const override { return true; }
     DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
+    // Sub-group size: 8 when the device has <= 24 compute units and the output has <= 8 features, else 16.
+    size_t GetSubGroupSize(const convolution_params& params) const {
+        // A low EU count tends to mean the device is compute bound, where a wider sub-group
+        // wastes more work on an output feature count that is not aligned to it.
+        const bool few_eus = params.engineInfo.computeUnitsCount <= 24;
+        if (few_eus && params.output.Feature().v <= 8) {
+            return 8;
+        }
+        return 16;
+    }
 
 private:
     struct AutoTuneOption {
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_os_iyx_osv16.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_os_iyx_osv16.cl
index 679fa45de52..b622f1c1698 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_os_iyx_osv16.cl
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_os_iyx_osv16.cl
@@ -83,12 +83,11 @@ KERNEL(convolution_gpu_bfyx_os_iyx_osv16)(
     uint fmg = feature_idx / SUB_GROUP_SIZE;
     const uint g = split_idx;
 #endif
-
     UNIT_TYPE in[IN_BLOCK_ARRAY_SIZE];
     UNIT_TYPE out[OUTPUT_BLOCK_WIDTH * OUTPUT_BLOCK_HEIGHT];
     UNIT_TYPE w[PREFETCH];
     uint in_addr;
-    uint weight_addr = fmg * FILTER_IFM_NUM * FILTER_SIZE_X * FILTER_SIZE_Y * SUB_GROUP_SIZE + lid;
+    uint weight_addr = fmg * FILTER_IFM_NUM * FILTER_SIZE_X * FILTER_SIZE_Y * OSV_SIZE + lid;
 
 #if GROUPED
     weight_addr += g * FILTER_GROUPS_PITCH;
@@ -156,7 +155,7 @@ KERNEL(convolution_gpu_bfyx_os_iyx_osv16)(
         in_addr += INPUT0_FEATURE_PITCH;
 
         for(int pf=0; pf<PREFETCH; pf++) {
-            w[pf] = weights[weight_addr]; weight_addr += SUB_GROUP_SIZE;
+            w[pf] = weights[weight_addr]; weight_addr += OSV_SIZE;
         }
 
         uint wi = 0;
@@ -182,12 +181,12 @@ KERNEL(convolution_gpu_bfyx_os_iyx_osv16)(
                     }
                 }
                 w[wi % PREFETCH] = weights[weight_addr];
-                weight_addr += SUB_GROUP_SIZE; // weights must be stored in just the right SIMD swizzled format for this to work, see host code for details.
+                weight_addr += OSV_SIZE; // weights must be stored in just the right SIMD swizzled format for this to work, see host code for details.
                 wi++;
             });
         });
         // addr went beyond due to prefetch so move it back to correct location.
-        weight_addr -= PREFETCH * SUB_GROUP_SIZE;
+        weight_addr -= PREFETCH * OSV_SIZE;
     }
 
     uint out_split_offset = g * OUTPUT_FEATURE_PITCH * FILTER_OFM_NUM;