[IE CLDNN] Improve bfyx convolution performance for shallow output ch (#6399)

This commit is contained in:
Taylor Yeonbok Lee 2021-07-08 18:25:16 +09:00 committed by GitHub
parent afe60b3263
commit 5d6fa64621
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 26 additions and 9 deletions

View File

@ -9,7 +9,6 @@
namespace kernel_selector {
// Sub-group size used by "kernel_name_bfyx_os_iyx_osv16" kernel.
constexpr size_t sub_group_size = 16;
ConvolutionKernel_bfyx_os_iyx_osv16::ConvolutionKernel_bfyx_os_iyx_osv16()
: ConvolutionKernelBase("convolution_gpu_bfyx_os_iyx_osv16") {
@ -94,6 +93,9 @@ static void shrink_blocks_to_output_size(size_t output_x, size_t output_y, size_
block_x -= unused_x / simds_x;
block_y -= unused_y / simds_y;
block_x = Align(block_x, 2);
block_y = Align(block_y, 2);
}
ConvolutionKernel_bfyx_os_iyx_osv16::AutoTuneOption ConvolutionKernel_bfyx_os_iyx_osv16::GetAutoTuneOptions(
@ -107,9 +109,11 @@ ConvolutionKernel_bfyx_os_iyx_osv16::AutoTuneOption ConvolutionKernel_bfyx_os_iy
const convolution_params& cp = static_cast<const convolution_params&>(p);
const auto& sub_group_size = GetSubGroupSize(cp);
if (cp.stride.x == 1 && cp.stride.y == 1) {
if (cp.filterSize.x == 1 && cp.filterSize.y == 1) {
option.blockWidth = 16;
option.blockWidth = sub_group_size;
option.blockHeight = 1;
option.prefetch = 4;
// if fewer than 16 values are required to compute one single row of output
@ -143,13 +147,13 @@ ConvolutionKernel_bfyx_os_iyx_osv16::AutoTuneOption ConvolutionKernel_bfyx_os_iy
if (cp.filterSize.x != 1 || cp.filterSize.y != 1 || cp.output.Batch().v != 1) {
shrink_blocks_to_output_size(cp.output.X().v, cp.output.Y().v, option.blockWidth, option.blockHeight);
}
return option;
}
ConvolutionKernelBase::DispatchData ConvolutionKernel_bfyx_os_iyx_osv16::SetDefault(const convolution_params& cp,
int autoTuneIndex) const {
DispatchData dispatchData = ConvolutionKernelBase::SetDefault(cp);
const auto& sub_group_size = GetSubGroupSize(cp);
const auto of_maps = cp.output.Feature().v;
const auto of_maps_per_group = of_maps / cp.groups;
@ -196,6 +200,9 @@ bool ConvolutionKernel_bfyx_os_iyx_osv16::Validate(const Params& p, const option
JitConstants ConvolutionKernel_bfyx_os_iyx_osv16::GetJitConstants(const convolution_params& params,
const DispatchData& dispatchData) const {
const convolution_params& cp = static_cast<const convolution_params&>(params);
const auto& sub_group_size = GetSubGroupSize(cp);
const auto of_maps = params.output.Feature().v;
const auto of_maps_per_group = of_maps / params.groups;
const size_t of_threads_per_batch = RoundUp(of_maps_per_group, sub_group_size);
@ -209,7 +216,7 @@ JitConstants ConvolutionKernel_bfyx_os_iyx_osv16::GetJitConstants(const convolut
jit.Merge(MakeFusedOpsJitConstants(params, {conf_scalar}));
}
jit.AddConstant(MakeJitConstant("OSV_SIZE", 16));
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", dispatchData.lws[2]));
jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_WIDTH", dispatchData.cldnnStyle.blockWidth));
jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_HEIGHT", dispatchData.cldnnStyle.blockHeight));

View File

@ -4,6 +4,7 @@
#pragma once
#include "api/cldnn/runtime/device_info.hpp"
#include "convolution_kernel_base.h"
#include <string>
#include <vector>
@ -34,6 +35,16 @@ protected:
bool Validate(const Params& p, const optional_params& o) const override;
bool NeedPaddedInput() const override { return true; }
DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
// Pick the sub-group (SIMD) size used by this convolution kernel.
// On devices with few EUs (<= 24) the kernel tends to be compute bound,
// so when the output feature count is small (<= 8) a sub-group of 8 is
// chosen to reduce the waste from rounding the feature dimension up to
// the sub-group size; in every other case the full width of 16 is used.
size_t GetSubGroupSize(const convolution_params& params) const {
    const bool few_compute_units = params.engineInfo.computeUnitsCount <= 24;
    const bool shallow_output = params.output.Feature().v <= 8;
    if (few_compute_units && shallow_output) {
        return 8;
    }
    return 16;
}
private:
struct AutoTuneOption {

View File

@ -83,12 +83,11 @@ KERNEL(convolution_gpu_bfyx_os_iyx_osv16)(
uint fmg = feature_idx / SUB_GROUP_SIZE;
const uint g = split_idx;
#endif
UNIT_TYPE in[IN_BLOCK_ARRAY_SIZE];
UNIT_TYPE out[OUTPUT_BLOCK_WIDTH * OUTPUT_BLOCK_HEIGHT];
UNIT_TYPE w[PREFETCH];
uint in_addr;
uint weight_addr = fmg * FILTER_IFM_NUM * FILTER_SIZE_X * FILTER_SIZE_Y * SUB_GROUP_SIZE + lid;
uint weight_addr = fmg * FILTER_IFM_NUM * FILTER_SIZE_X * FILTER_SIZE_Y * OSV_SIZE + lid;
#if GROUPED
weight_addr += g * FILTER_GROUPS_PITCH;
@ -156,7 +155,7 @@ KERNEL(convolution_gpu_bfyx_os_iyx_osv16)(
in_addr += INPUT0_FEATURE_PITCH;
for(int pf=0; pf<PREFETCH; pf++) {
w[pf] = weights[weight_addr]; weight_addr += SUB_GROUP_SIZE;
w[pf] = weights[weight_addr]; weight_addr += OSV_SIZE;
}
uint wi = 0;
@ -182,12 +181,12 @@ KERNEL(convolution_gpu_bfyx_os_iyx_osv16)(
}
}
w[wi % PREFETCH] = weights[weight_addr];
weight_addr += SUB_GROUP_SIZE; // weights must be stored in just the right SIMD swizzled format for this to work, see host code for details.
weight_addr += OSV_SIZE; // weights must be stored in just the right SIMD swizzled format for this to work, see host code for details.
wi++;
});
});
// addr went beyond due to prefetch so move it back to correct location.
weight_addr -= PREFETCH * SUB_GROUP_SIZE;
weight_addr -= PREFETCH * OSV_SIZE;
}
uint out_split_offset = g * OUTPUT_FEATURE_PITCH * FILTER_OFM_NUM;