[IE CLDNN] Improve bfyx convolution performance for shallow output ch (#6399)
commit 5d6fa64621
parent afe60b3263
convolution_kernel_bfyx_os_iyx_osv16.cpp

@@ -9,7 +9,6 @@
 namespace kernel_selector {
 // Sub-group size used by "kernel_name_bfyx_os_iyx_osv16" kernel.
-constexpr size_t sub_group_size = 16;
 
 ConvolutionKernel_bfyx_os_iyx_osv16::ConvolutionKernel_bfyx_os_iyx_osv16()
     : ConvolutionKernelBase("convolution_gpu_bfyx_os_iyx_osv16") {
@@ -94,6 +93,9 @@ static void shrink_blocks_to_output_size(size_t output_x, size_t output_y, size_
 
     block_x -= unused_x / simds_x;
     block_y -= unused_y / simds_y;
+
+    block_x = Align(block_x, 2);
+    block_y = Align(block_y, 2);
 }
 
 ConvolutionKernel_bfyx_os_iyx_osv16::AutoTuneOption ConvolutionKernel_bfyx_os_iyx_osv16::GetAutoTuneOptions(
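A minimal standalone sketch of the updated shrink logic may help here. Only the two subtractions and the new Align calls are visible in this hunk, so the simds_*/unused_* computations below are reconstructed assumptions, and Align(v, n) is assumed to round v up to the nearest multiple of n, matching kernel_selector's helper:

    #include <cstddef>
    #include <cstdio>

    static size_t Align(size_t v, size_t n) { return (v + n - 1) / n * n; }  // round up

    // Reconstruction for illustration only; simds_*/unused_* are assumed.
    static void shrink_blocks_to_output_size(size_t output_x, size_t output_y,
                                             size_t& block_x, size_t& block_y) {
        size_t simds_x = (output_x + block_x - 1) / block_x;  // blocks covering X
        size_t simds_y = (output_y + block_y - 1) / block_y;  // blocks covering Y
        size_t unused_x = simds_x * block_x - output_x;       // overshoot in X
        size_t unused_y = simds_y * block_y - output_y;       // overshoot in Y

        block_x -= unused_x / simds_x;
        block_y -= unused_y / simds_y;

        // New in this commit: keep the shrunk block dimensions even.
        block_x = Align(block_x, 2);
        block_y = Align(block_y, 2);
    }

    int main() {
        size_t bx = 7, by = 5;
        shrink_blocks_to_output_size(30, 18, bx, by);
        std::printf("%zux%zu\n", bx, by);  // prints 6x6: both dimensions end up even
    }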
@@ -107,9 +109,11 @@ ConvolutionKernel_bfyx_os_iyx_osv16::AutoTuneOption ConvolutionKernel_bfyx_os_iy
 
     const convolution_params& cp = static_cast<const convolution_params&>(p);
 
+    const auto& sub_group_size = GetSubGroupSize(cp);
+
     if (cp.stride.x == 1 && cp.stride.y == 1) {
         if (cp.filterSize.x == 1 && cp.filterSize.y == 1) {
-            option.blockWidth = 16;
+            option.blockWidth = sub_group_size;
             option.blockHeight = 1;
             option.prefetch = 4;
         // if less than 16 values is required to compute one single row of output
@@ -143,13 +147,13 @@ ConvolutionKernel_bfyx_os_iyx_osv16::AutoTuneOption ConvolutionKernel_bfyx_os_iy
     if (cp.filterSize.x != 1 || cp.filterSize.y != 1 || cp.output.Batch().v != 1) {
         shrink_blocks_to_output_size(cp.output.X().v, cp.output.Y().v, option.blockWidth, option.blockHeight);
     }
 
     return option;
 }
 
 ConvolutionKernelBase::DispatchData ConvolutionKernel_bfyx_os_iyx_osv16::SetDefault(const convolution_params& cp,
                                                                                     int autoTuneIndex) const {
     DispatchData dispatchData = ConvolutionKernelBase::SetDefault(cp);
+    const auto& sub_group_size = GetSubGroupSize(cp);
 
     const auto of_maps = cp.output.Feature().v;
     const auto of_maps_per_group = of_maps / cp.groups;
@@ -196,6 +200,9 @@ bool ConvolutionKernel_bfyx_os_iyx_osv16::Validate(const Params& p, const option
 
 JitConstants ConvolutionKernel_bfyx_os_iyx_osv16::GetJitConstants(const convolution_params& params,
                                                                   const DispatchData& dispatchData) const {
+    const convolution_params& cp = static_cast<const convolution_params&>(params);
+    const auto& sub_group_size = GetSubGroupSize(cp);
+
     const auto of_maps = params.output.Feature().v;
     const auto of_maps_per_group = of_maps / params.groups;
     const size_t of_threads_per_batch = RoundUp(of_maps_per_group, sub_group_size);
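This RoundUp is where shallow output channels previously wasted lanes: with the sub-group size pinned at 16, a group with only 8 output feature maps still occupied 16 threads per batch. A back-of-envelope sketch, assuming RoundUp rounds up to the nearest multiple (matching kernel_selector's helper):

    #include <cstddef>
    #include <cstdio>

    static size_t RoundUp(size_t v, size_t m) { return (v + m - 1) / m * m; }

    int main() {
        const size_t of_maps_per_group = 8;                    // shallow output channels
        std::printf("%zu\n", RoundUp(of_maps_per_group, 16));  // before: 16 threads, 8 idle
        std::printf("%zu\n", RoundUp(of_maps_per_group, 8));   // after:  8 threads, 0 idle
    }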
@@ -209,7 +216,7 @@ JitConstants ConvolutionKernel_bfyx_os_iyx_osv16::GetJitConstants(const convolut
         jit.Merge(MakeFusedOpsJitConstants(params, {conf_scalar}));
     }
 
+    jit.AddConstant(MakeJitConstant("OSV_SIZE", 16));
     jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", dispatchData.lws[2]));
     jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_WIDTH", dispatchData.cldnnStyle.blockWidth));
     jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_HEIGHT", dispatchData.cldnnStyle.blockHeight));
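The key decoupling happens here: SUB_GROUP_SIZE (how many lanes execute, taken from lws[2]) can now be 8, while the new OSV_SIZE constant stays 16 because the weights remain in the osv16 layout. A hedged illustration (not part of the commit) of the values a shallow-channel case would get:

    // Illustration only; the field names mirror the jit constants above.
    struct KernelConstants { unsigned osv_size, sub_group_size; };

    // Shallow outputs (<= 8 feature maps) on a <= 24 EU device:
    constexpr KernelConstants shallow{ /*osv_size*/ 16, /*sub_group_size*/ 8 };
    // Everywhere else the two still coincide:
    constexpr KernelConstants regular{ /*osv_size*/ 16, /*sub_group_size*/ 16 };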
convolution_kernel_bfyx_os_iyx_osv16.h

@@ -4,6 +4,7 @@
 
 #pragma once
 
+#include "api/cldnn/runtime/device_info.hpp"
 #include "convolution_kernel_base.h"
 #include <string>
 #include <vector>
@@ -34,6 +35,16 @@ protected:
     bool Validate(const Params& p, const optional_params& o) const override;
     bool NeedPaddedInput() const override { return true; }
     DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
+    size_t GetSubGroupSize(const convolution_params& params) const {
+        if (params.engineInfo.computeUnitsCount <= 24) {
+            // A smaller # of EUs tends to be compute bound.
+            // In such a case, using a larger work size results in greater
+            // computational inefficiency w.r.t. the unaligned output features.
+            return (params.output.Feature().v > 8) ? 16 : 8;
+        } else {
+            return 16;
+        }
+    }
 
 private:
     struct AutoTuneOption {
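To make the heuristic concrete, a standalone rendering (illustration only, not part of the commit) with representative inputs:

    #include <cstddef>
    #include <cstdio>

    // Mirrors GetSubGroupSize above: small GPUs (<= 24 EUs) drop to SIMD8
    // when the output is shallow (<= 8 feature maps); everything else stays SIMD16.
    static size_t sub_group_size_for(size_t eu_count, size_t out_features) {
        if (eu_count <= 24)
            return (out_features > 8) ? 16 : 8;
        return 16;
    }

    int main() {
        std::printf("%zu\n", sub_group_size_for(24, 8));   // 8  -- the case this commit targets
        std::printf("%zu\n", sub_group_size_for(24, 32));  // 16
        std::printf("%zu\n", sub_group_size_for(96, 8));   // 16 -- large GPUs are unaffected
    }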
convolution_gpu_bfyx_os_iyx_osv16.cl

@@ -83,12 +83,11 @@ KERNEL(convolution_gpu_bfyx_os_iyx_osv16)(
     uint fmg = feature_idx / SUB_GROUP_SIZE;
     const uint g = split_idx;
 #endif
 
     UNIT_TYPE in[IN_BLOCK_ARRAY_SIZE];
     UNIT_TYPE out[OUTPUT_BLOCK_WIDTH * OUTPUT_BLOCK_HEIGHT];
     UNIT_TYPE w[PREFETCH];
     uint in_addr;
-    uint weight_addr = fmg * FILTER_IFM_NUM * FILTER_SIZE_X * FILTER_SIZE_Y * SUB_GROUP_SIZE + lid;
+    uint weight_addr = fmg * FILTER_IFM_NUM * FILTER_SIZE_X * FILTER_SIZE_Y * OSV_SIZE + lid;
 
 #if GROUPED
     weight_addr += g * FILTER_GROUPS_PITCH;
@@ -156,7 +155,7 @@ KERNEL(convolution_gpu_bfyx_os_iyx_osv16)(
     in_addr += INPUT0_FEATURE_PITCH;
 
     for(int pf=0; pf<PREFETCH; pf++) {
-        w[pf] = weights[weight_addr]; weight_addr += SUB_GROUP_SIZE;
+        w[pf] = weights[weight_addr]; weight_addr += OSV_SIZE;
     }
 
     uint wi = 0;
@@ -182,12 +181,12 @@ KERNEL(convolution_gpu_bfyx_os_iyx_osv16)(
             }
         }
         w[wi % PREFETCH] = weights[weight_addr];
-        weight_addr += SUB_GROUP_SIZE; // weights must be stored in just the right SIMD swizzled format for this to work, see host code for details.
+        weight_addr += OSV_SIZE; // weights must be stored in just the right SIMD swizzled format for this to work, see host code for details.
         wi++;
     });
 });
     // addr went beyond due to prefetch so move it back to correct location.
-    weight_addr -= PREFETCH * SUB_GROUP_SIZE;
+    weight_addr -= PREFETCH * OSV_SIZE;
 }
 
 uint out_split_offset = g * OUTPUT_FEATURE_PITCH * FILTER_OFM_NUM;
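Why the kernel stride is now OSV_SIZE rather than SUB_GROUP_SIZE: in the os_iyx_osv16 weight layout, each (ifm, y, x) step stores a block of 16 output-feature values, so the per-step stride is a property of the layout, not of the dispatch width. A hedged sketch of the address arithmetic (weight_offset is a hypothetical helper written for illustration):

    enum { OSV_SIZE = 16 };  // osv16 layout block: 16 output features per (ifm, y, x) step

    // Hypothetical host-side model of the kernel's weight_addr computation.
    unsigned weight_offset(unsigned fmg, unsigned ifm_num,
                           unsigned filter_x, unsigned filter_y, unsigned lid) {
        // Base of this output-feature group's block, plus this lane's slot.
        // With an 8-wide sub-group, lid runs 0..7 but still indexes a 16-entry block,
        // so advancing by OSV_SIZE per step stays correct; the old SUB_GROUP_SIZE
        // stride only worked because the sub-group size was hard-coded to 16.
        return fmg * ifm_num * filter_x * filter_y * OSV_SIZE + lid;
    }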