From 5d6fa64621f673fec0351d522313c2443fea6471 Mon Sep 17 00:00:00 2001 From: Taylor Yeonbok Lee Date: Thu, 8 Jul 2021 18:25:16 +0900 Subject: [PATCH] [IE CLDNN] Improve bfyx convolution performance for shallow output ch (#6399) --- .../convolution_kernel_bfyx_os_iyx_osv16.cpp | 15 +++++++++++---- .../convolution_kernel_bfyx_os_iyx_osv16.h | 11 +++++++++++ .../convolution_gpu_bfyx_os_iyx_osv16.cl | 9 ++++----- 3 files changed, 26 insertions(+), 9 deletions(-) diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16.cpp index a80696c0b60..bef3794a657 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16.cpp @@ -9,7 +9,6 @@ namespace kernel_selector { // Sub-group size used by "kernel_name_bfyx_os_iyx_osv16" kernel. 
-constexpr size_t sub_group_size = 16; ConvolutionKernel_bfyx_os_iyx_osv16::ConvolutionKernel_bfyx_os_iyx_osv16() : ConvolutionKernelBase("convolution_gpu_bfyx_os_iyx_osv16") { @@ -94,6 +93,9 @@ static void shrink_blocks_to_output_size(size_t output_x, size_t output_y, size_ block_x -= unused_x / simds_x; block_y -= unused_y / simds_y; + + block_x = Align(block_x, 2); + block_y = Align(block_y, 2); } ConvolutionKernel_bfyx_os_iyx_osv16::AutoTuneOption ConvolutionKernel_bfyx_os_iyx_osv16::GetAutoTuneOptions( @@ -107,9 +109,11 @@ ConvolutionKernel_bfyx_os_iyx_osv16::AutoTuneOption ConvolutionKernel_bfyx_os_iy const convolution_params& cp = static_cast(p); + const auto& sub_group_size = GetSubGroupSize(cp); + if (cp.stride.x == 1 && cp.stride.y == 1) { if (cp.filterSize.x == 1 && cp.filterSize.y == 1) { - option.blockWidth = 16; + option.blockWidth = sub_group_size; option.blockHeight = 1; option.prefetch = 4; // if less than 16 values is required to compute one single row of output @@ -143,13 +147,13 @@ ConvolutionKernel_bfyx_os_iyx_osv16::AutoTuneOption ConvolutionKernel_bfyx_os_iy if (cp.filterSize.x != 1 || cp.filterSize.y != 1 || cp.output.Batch().v != 1) { shrink_blocks_to_output_size(cp.output.X().v, cp.output.Y().v, option.blockWidth, option.blockHeight); } - return option; } ConvolutionKernelBase::DispatchData ConvolutionKernel_bfyx_os_iyx_osv16::SetDefault(const convolution_params& cp, int autoTuneIndex) const { DispatchData dispatchData = ConvolutionKernelBase::SetDefault(cp); + const auto& sub_group_size = GetSubGroupSize(cp); const auto of_maps = cp.output.Feature().v; const auto of_maps_per_group = of_maps / cp.groups; @@ -196,6 +200,9 @@ bool ConvolutionKernel_bfyx_os_iyx_osv16::Validate(const Params& p, const option JitConstants ConvolutionKernel_bfyx_os_iyx_osv16::GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const { + const convolution_params& cp = static_cast(params); + const auto& sub_group_size = 
GetSubGroupSize(cp); + const auto of_maps = params.output.Feature().v; const auto of_maps_per_group = of_maps / params.groups; const size_t of_threads_per_batch = RoundUp(of_maps_per_group, sub_group_size); @@ -209,7 +216,7 @@ JitConstants ConvolutionKernel_bfyx_os_iyx_osv16::GetJitConstants(const convolut jit.Merge(MakeFusedOpsJitConstants(params, {conf_scalar})); } - + jit.AddConstant(MakeJitConstant("OSV_SIZE", 16)); jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", dispatchData.lws[2])); jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_WIDTH", dispatchData.cldnnStyle.blockWidth)); jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_HEIGHT", dispatchData.cldnnStyle.blockHeight)); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16.h index 6bc617bb3f3..9da52609636 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16.h @@ -4,6 +4,7 @@ #pragma once +#include "api/cldnn/runtime/device_info.hpp" #include "convolution_kernel_base.h" #include #include @@ -34,6 +35,16 @@ protected: bool Validate(const Params& p, const optional_params& o) const override; bool NeedPaddedInput() const override { return true; } DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override; + size_t GetSubGroupSize(const convolution_params& params) const { + if (params.engineInfo.computeUnitsCount <= 24) { + // A smaller number of EUs (computeUnitsCount <= 24) tends to be compute-bound. + // In such a case, using a larger work size results in larger computational inefficiency + // w.r.t. the unaligned output feature count. + return (params.output.Feature().v > 8) ? 
16 : 8; + } else { + return 16; + } + } private: struct AutoTuneOption { diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_os_iyx_osv16.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_os_iyx_osv16.cl index 679fa45de52..b622f1c1698 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_os_iyx_osv16.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_os_iyx_osv16.cl @@ -83,12 +83,11 @@ KERNEL(convolution_gpu_bfyx_os_iyx_osv16)( uint fmg = feature_idx / SUB_GROUP_SIZE; const uint g = split_idx; #endif - UNIT_TYPE in[IN_BLOCK_ARRAY_SIZE]; UNIT_TYPE out[OUTPUT_BLOCK_WIDTH * OUTPUT_BLOCK_HEIGHT]; UNIT_TYPE w[PREFETCH]; uint in_addr; - uint weight_addr = fmg * FILTER_IFM_NUM * FILTER_SIZE_X * FILTER_SIZE_Y * SUB_GROUP_SIZE + lid; + uint weight_addr = fmg * FILTER_IFM_NUM * FILTER_SIZE_X * FILTER_SIZE_Y * OSV_SIZE + lid; #if GROUPED weight_addr += g * FILTER_GROUPS_PITCH; @@ -156,7 +155,7 @@ KERNEL(convolution_gpu_bfyx_os_iyx_osv16)( in_addr += INPUT0_FEATURE_PITCH; for(int pf=0; pf