[GPU] Deleted OpenCL max WGS limitation (#7107)
This commit is contained in:
parent
e034a072ea
commit
2079563e4d
@ -231,6 +231,7 @@ inline bool SimpleLayout(DataLayout l) {
|
||||
case DataLayout::yxfb:
|
||||
case DataLayout::byxf:
|
||||
case DataLayout::fyxb:
|
||||
case DataLayout::bfxy:
|
||||
case DataLayout::bfzyx:
|
||||
case DataLayout::bfwzyx:
|
||||
return true;
|
||||
@ -267,7 +268,7 @@ inline bool IsDynamicLSTMType(WeightsLayout l) {
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Tensor Exaplnation
|
||||
// Tensor Explanation
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// resource - 80x80
|
||||
|
@ -11,12 +11,17 @@ namespace kernel_selector {
|
||||
|
||||
ActivationKernelBase::DispatchData ActivationKernelBase::SetDefault(const activation_params& arg) const {
|
||||
const auto& out = arg.output;
|
||||
auto in_layout = arg.inputs[0].GetLayout();
|
||||
auto out_layout = arg.output.GetLayout();
|
||||
|
||||
DispatchData dispatchData;
|
||||
if (out.GetLayout() == DataLayout::yxfb) {
|
||||
if (out_layout == DataLayout::yxfb) {
|
||||
dispatchData.gws = {out.Feature().v * out.Batch().v, out.X().v, out.Y().v};
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, arg.engineInfo);
|
||||
} else if (out.GetLayout() == DataLayout::b_fs_yx_fsv16 || out.GetLayout() == DataLayout::b_fs_yx_fsv32) {
|
||||
std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH},
|
||||
{Tensor::DataChannelName::X},
|
||||
{Tensor::DataChannelName::Y}};
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, arg.engineInfo, in_layout, out_layout, dims_by_gws);
|
||||
} else if (out_layout == DataLayout::b_fs_yx_fsv16 || out_layout == DataLayout::b_fs_yx_fsv32) {
|
||||
dispatchData.gws = {Align(out.Feature().v, 16) * out.Batch().v, out.X().v, out.Y().v};
|
||||
dispatchData.lws = {16, 1, 1};
|
||||
} else if (out.GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv16 || out.GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv32) {
|
||||
@ -24,7 +29,10 @@ ActivationKernelBase::DispatchData ActivationKernelBase::SetDefault(const activa
|
||||
dispatchData.lws = {1, 16, 16};
|
||||
} else {
|
||||
dispatchData.gws = {out.X().v, out.Y().v * out.Z().v, out.Feature().v * out.Batch().v};
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, arg.engineInfo);
|
||||
std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{Tensor::DataChannelName::X},
|
||||
{Tensor::DataChannelName::Y, Tensor::DataChannelName::Z},
|
||||
{Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, arg.engineInfo, in_layout, out_layout, dims_by_gws);
|
||||
}
|
||||
|
||||
return dispatchData;
|
||||
|
@ -80,6 +80,7 @@ KernelsData ArgMaxMinKernelAxis::GetKernelsData(const Params& params, const opti
|
||||
size_t sort_size = orgParams.argMaxMinSortType == ArgMaxMinSortType::VALUE ? getSortSize(orgParams) : 1;
|
||||
|
||||
DispatchData dispatchData;
|
||||
|
||||
dispatchData.gws = { Align(getOperationNumber(orgParams), 32), sort_size, 1 };
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
|
||||
|
||||
|
@ -34,8 +34,15 @@ CommonDispatchData BatchToSpaceKernelBase::SetDefault(const batch_to_space_param
|
||||
dispatchData.gws = { out.Batch().v, out.Feature().v, out.Y().v * out.X().v };
|
||||
dispatchData.lws = { 1, 16, 1 };
|
||||
} else {
|
||||
auto in_layout = params.inputs[0].GetLayout();
|
||||
auto out_layout = out.GetLayout();
|
||||
std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::BATCH },
|
||||
{ Tensor::DataChannelName::FEATURE },
|
||||
{ Tensor::DataChannelName::X, Tensor::DataChannelName::Y,
|
||||
Tensor::DataChannelName::Z, Tensor::DataChannelName::W }};
|
||||
|
||||
dispatchData.gws = { out.Batch().v, out.Feature().v, out.W().v * out.Z().v * out.Y().v * out.X().v };
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
|
||||
}
|
||||
|
||||
return dispatchData;
|
||||
|
@ -68,16 +68,25 @@ bool BinaryConvolutionKernelBase::CheckWorkGroups(const BinaryConvolutionKernelB
|
||||
BinaryConvolutionKernelBase::DispatchData BinaryConvolutionKernelBase::SetDefault(const binary_convolution_params& params,
|
||||
int) const {
|
||||
DispatchData dispatchData;
|
||||
auto in_layout = params.inputs[0].GetLayout();
|
||||
auto out_layout = params.output.GetLayout();
|
||||
std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws;
|
||||
|
||||
const auto& out = params.output;
|
||||
std::vector<size_t> global;
|
||||
if (params.output.GetLayout() == DataLayout::bfyx || params.output.GetLayout() == DataLayout::byxf) {
|
||||
if (out_layout == DataLayout::bfyx || out_layout == DataLayout::byxf) {
|
||||
global = {out.X().v, out.Y().v, out.Feature().v * out.Batch().v};
|
||||
dims_by_gws = {{Tensor::DataChannelName::X},
|
||||
{Tensor::DataChannelName::Y},
|
||||
{Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
|
||||
} else {
|
||||
global = {out.Feature().v * out.Batch().v, out.X().v, out.Y().v};
|
||||
dims_by_gws = {{Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH},
|
||||
{Tensor::DataChannelName::X},
|
||||
{Tensor::DataChannelName::Y}};
|
||||
}
|
||||
|
||||
auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
|
||||
auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo, in_layout, out_layout, dims_by_gws);
|
||||
|
||||
dispatchData.gws = global;
|
||||
dispatchData.lws = local;
|
||||
|
@ -22,9 +22,14 @@ BorderKernelBase::DispatchData BorderKernelBase::SetDefault(const border_params&
|
||||
const auto& output = params.output;
|
||||
|
||||
DispatchData dispatchData;
|
||||
auto in_layout = params.inputs[0].GetLayout();
|
||||
auto out_layout = params.output.GetLayout();
|
||||
std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::X, Tensor::DataChannelName::Z },
|
||||
{ Tensor::DataChannelName::Y, Tensor::DataChannelName::W },
|
||||
{ Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH }};
|
||||
|
||||
dispatchData.gws = { output.X().v * output.Z().v, output.Y().v * output.W().v, output.Batch().v * output.Feature().v };
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
|
||||
|
||||
return dispatchData;
|
||||
}
|
||||
|
@ -19,9 +19,14 @@ BroadcastKernelBase::DispatchData BroadcastKernelBase::SetDefault(const broadcas
|
||||
const auto& output = params.output;
|
||||
|
||||
DispatchData dispatchData;
|
||||
auto in_layout = params.inputs[0].GetLayout();
|
||||
auto out_layout = params.output.GetLayout();
|
||||
std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::X },
|
||||
{ Tensor::DataChannelName::Y, Tensor::DataChannelName::Z },
|
||||
{ Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH }};
|
||||
|
||||
dispatchData.gws = { output.X().v, output.Y().v * output.Z().v, output.Batch().v * output.Feature().v };
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
|
||||
|
||||
return dispatchData;
|
||||
}
|
||||
|
@ -86,7 +86,14 @@ ConcatenationKernelBase::DispatchData ConcatenationKernel_simple_Ref::SetDefault
|
||||
dispatchData.gws = { input.X().v * input.Y().v,
|
||||
input.Z().v * input.W().v,
|
||||
input.Feature().v * input.Batch().v };
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
|
||||
|
||||
auto in_layout = params.inputs[0].GetLayout();
|
||||
auto out_layout = params.output.GetLayout();
|
||||
std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::X, Tensor::DataChannelName::Y },
|
||||
{ Tensor::DataChannelName::Z, Tensor::DataChannelName::W },
|
||||
{ Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH }};
|
||||
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
|
||||
|
||||
return dispatchData;
|
||||
}
|
||||
|
@ -20,6 +20,7 @@ namespace kernel_selector {
|
||||
|
||||
Convolution_kernel_b_fs_yx_fsv16_imad_1x1::Convolution_kernel_b_fs_yx_fsv16_imad_1x1()
|
||||
: ConvolutionKernelBase("convolution_gpu_b_fs_yx_fsv16_imad_1x1") {
|
||||
// TODO: can be potentially improved for GPUs with support of LWS > 256
|
||||
constexpr size_t max_block_elements = 32;
|
||||
for (size_t bs = 1; bs <= 2 * simd; ++bs) {
|
||||
for (size_t bf = 1; bf <= 4; ++bf) {
|
||||
|
@ -15,6 +15,7 @@ ConvolutionKernel_b_fs_yx_fsv_16_32_imad_dw::ConvolutionKernel_b_fs_yx_fsv_16_32
|
||||
std::vector<size_t> simd_sizes = { 8, 16 };
|
||||
std::vector<std::string> exe_modes = ConvolutionKernelBase::autoTuneOptions;
|
||||
|
||||
// TODO: can be potentially improved for GPUs with support of LWS > 256
|
||||
constexpr size_t max_block_size = 32 * 8;
|
||||
constexpr size_t max_lws_size = 256;
|
||||
|
||||
|
@ -146,17 +146,29 @@ bool ConvolutionKernelBase::CheckPitchForSplitOnly(const convolution_params& par
|
||||
|
||||
ConvolutionKernelBase::DispatchData ConvolutionKernelBase::SetDefault(const convolution_params& params, int) const {
|
||||
DispatchData dispatchData;
|
||||
auto in_layout = params.inputs[0].GetLayout();
|
||||
auto out_layout = params.output.GetLayout();
|
||||
std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws;
|
||||
|
||||
const auto& out = params.output;
|
||||
if (params.output.GetLayout() == DataLayout::bfyx || params.output.GetLayout() == DataLayout::byxf) {
|
||||
if (out_layout == DataLayout::bfyx || out_layout == DataLayout::byxf) {
|
||||
dispatchData.gws = {out.X().v, out.Y().v, out.Feature().v * out.Batch().v};
|
||||
} else if (params.output.GetLayout() == DataLayout::bfzyx) {
|
||||
dims_by_gws = {{Tensor::DataChannelName::X},
|
||||
{Tensor::DataChannelName::Y},
|
||||
{Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
|
||||
} else if (out_layout == DataLayout::bfzyx) {
|
||||
dispatchData.gws = {out.X().v, out.Y().v * out.Z().v, out.Feature().v * out.Batch().v};
|
||||
dims_by_gws = {{Tensor::DataChannelName::X},
|
||||
{Tensor::DataChannelName::Y, Tensor::DataChannelName::Z},
|
||||
{Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
|
||||
} else {
|
||||
dispatchData.gws = {out.Feature().v * out.Batch().v, out.X().v, out.Y().v};
|
||||
dims_by_gws = {{Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH},
|
||||
{Tensor::DataChannelName::X},
|
||||
{Tensor::DataChannelName::Y}};
|
||||
}
|
||||
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
|
||||
|
||||
dispatchData.cldnnStyle.blockWidth = 1;
|
||||
dispatchData.cldnnStyle.blockHeight = 1;
|
||||
|
@ -351,7 +351,13 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_imad_b_fs_yx_fsv4_dw::SetD
|
||||
if (autoTuneParam.tiled) {
|
||||
dispatchData.lws[0] = autoTuneParam.tiled_simd;
|
||||
} else {
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
|
||||
auto in_layout = params.inputs[0].GetLayout();
|
||||
auto out_layout = params.output.GetLayout();
|
||||
std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::X },
|
||||
{ Tensor::DataChannelName::Y },
|
||||
{ Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH }};
|
||||
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
|
||||
}
|
||||
|
||||
dispatchData.gemmStyle = { 0, 0, 0, 0, 0, 0 };
|
||||
|
@ -66,9 +66,14 @@ bool ConvolutionKernel_mmad_b_fs_yx_fsv32_dw::Validate(const Params& p, const op
|
||||
ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_b_fs_yx_fsv32_dw::SetDefault(const convolution_params& cp,
|
||||
int /*autoTuneIndex*/) const {
|
||||
DispatchData dispatchData = ConvolutionKernelBase::SetDefault(cp);
|
||||
auto in_layout = cp.inputs[0].GetLayout();
|
||||
auto out_layout = cp.output.GetLayout();
|
||||
std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::FEATURE },
|
||||
{ Tensor::DataChannelName::X, Tensor::DataChannelName::Y },
|
||||
{ Tensor::DataChannelName::BATCH }};
|
||||
|
||||
dispatchData.gws = { cp.output.Feature().v, cp.output.X().v * cp.output.Y().v, cp.output.Batch().v };
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, cp.engineInfo);
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, cp.engineInfo, in_layout, out_layout, dims_by_gws);
|
||||
|
||||
return dispatchData;
|
||||
}
|
||||
|
@ -99,8 +99,14 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_Ref::SetDefault(const conv
|
||||
// Just set the correct value for a particular implementation here,
|
||||
// until the whole hierarchy is re-written.
|
||||
const auto& out = params.output;
|
||||
auto in_layout = params.inputs[0].GetLayout();
|
||||
auto out_layout = params.output.GetLayout();
|
||||
std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{Tensor::DataChannelName::X},
|
||||
{Tensor::DataChannelName::Y, Tensor::DataChannelName::Z},
|
||||
{Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
|
||||
|
||||
dispatchData.gws = {out.X().v, out.Y().v * out.Z().v, out.Feature().v * out.Batch().v};
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
|
||||
return dispatchData;
|
||||
}
|
||||
|
||||
|
@ -72,10 +72,16 @@ JitConstants CumSumKernelBase::GetJitConstants(const cum_sum_params& params, Dis
|
||||
|
||||
CumSumKernelBase::DispatchData CumSumKernelBase::SetDefault(const cum_sum_params& params) const {
|
||||
DispatchData dispatchData;
|
||||
auto in_layout = params.inputs[0].GetLayout();
|
||||
auto out_layout = params.output.GetLayout();
|
||||
std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::BATCH },
|
||||
{ Tensor::DataChannelName::W, Tensor::DataChannelName::FEATURE },
|
||||
{ Tensor::DataChannelName::X, Tensor::DataChannelName::Y, Tensor::DataChannelName::Z }};
|
||||
|
||||
dispatchData.gws = { params.output.Batch().v,
|
||||
params.output.Feature().v * params.output.W().v,
|
||||
params.output.Z().v * params.output.Y().v * params.output.X().v };
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
|
||||
|
||||
return dispatchData;
|
||||
}
|
||||
|
@ -51,6 +51,11 @@ WeightsLayout DeconvolutionKernel_imad_ref::GetPreferredWeightsLayout(const deco
|
||||
|
||||
DeconvolutionKernelBase::DispatchData DeconvolutionKernel_imad_ref::SetDefault(const deconvolution_params& params) const {
|
||||
DispatchData dispatchData = Parent::SetDefault(params);
|
||||
auto in_layout = params.inputs[0].GetLayout();
|
||||
auto out_layout = params.output.GetLayout();
|
||||
std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::FEATURE },
|
||||
{ Tensor::DataChannelName::X, Tensor::DataChannelName::Y, Tensor::DataChannelName::Z },
|
||||
{ Tensor::DataChannelName::BATCH }};
|
||||
|
||||
dispatchData.gws = {
|
||||
params.output.Feature().v,
|
||||
@ -58,7 +63,7 @@ DeconvolutionKernelBase::DispatchData DeconvolutionKernel_imad_ref::SetDefault(c
|
||||
params.output.Batch().v
|
||||
};
|
||||
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
|
||||
|
||||
return dispatchData;
|
||||
}
|
||||
|
@ -30,13 +30,18 @@ ParamsKey DepthToSpaceKernelRef::GetSupportedKey() const {
|
||||
|
||||
CommonDispatchData DepthToSpaceKernelRef::SetDefault(const depth_to_space_params& params) const {
|
||||
CommonDispatchData dispatchData;
|
||||
auto in_layout = params.inputs[0].GetLayout();
|
||||
auto out_layout = params.output.GetLayout();
|
||||
std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::BATCH },
|
||||
{ Tensor::DataChannelName::FEATURE },
|
||||
{ Tensor::DataChannelName::X, Tensor::DataChannelName::Y, Tensor::DataChannelName::Z }};
|
||||
|
||||
dispatchData.gws = { params.output.Batch().v,
|
||||
params.output.Feature().v,
|
||||
params.output.Z().v * params.output.Y().v * params.output.X().v };
|
||||
|
||||
// this kernel only supports bfyx and b_fs_yx_fsv16 layout.
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes({1, dispatchData.gws[1], dispatchData.gws[2]}, params.engineInfo);
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
|
||||
return dispatchData;
|
||||
}
|
||||
|
||||
|
@ -258,7 +258,7 @@ KernelsData DetectionOutputKernelRef::GetKernelsData(const Params& params, const
|
||||
|
||||
auto jit = CreateJit(kernelName, cldnnJit, entryPoint);
|
||||
auto& kernel = kd.kernels[i];
|
||||
KernelBase::CheckDispatchData(kernelName, dispatchData);
|
||||
KernelBase::CheckDispatchData(kernelName, dispatchData, params.engineInfo.maxWorkGroupSize);
|
||||
kernel.params.workGroups.global = dispatchData.gws;
|
||||
kernel.params.workGroups.local = dispatchData.lws;
|
||||
kernel.code.kernelString = GetKernelString(kernelName, jit, entryPoint, params.engineInfo);
|
||||
|
@ -268,12 +268,17 @@ JitConstants EltwiseKernel_b_fs_yx_fsv4::GetJitConstants(const eltwise_params& p
|
||||
|
||||
EltwiseKernelBase::DispatchData EltwiseKernel_b_fs_yx_fsv4::SetDefault(const eltwise_params& params) const {
|
||||
DispatchData dispatchData;
|
||||
auto in_layout = params.inputs[0].GetLayout();
|
||||
auto out_layout = params.output.GetLayout();
|
||||
std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{Tensor::DataChannelName::X, Tensor::DataChannelName::Y},
|
||||
{Tensor::DataChannelName::FEATURE},
|
||||
{Tensor::DataChannelName::BATCH}};
|
||||
|
||||
dispatchData.gws[0] = params.output.X().v * params.output.Y().v;
|
||||
dispatchData.gws[1] = CeilDiv(params.output.Feature().v, 4);
|
||||
dispatchData.gws[2] = params.output.Batch().v;
|
||||
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
|
||||
dispatchData.lws[1] = 1;
|
||||
dispatchData.lws[2] = 1;
|
||||
|
||||
|
@ -585,7 +585,9 @@ EltwiseKernelBase::DispatchData EltwiseKernelBase::SetDefault(const eltwise_para
|
||||
|
||||
auto local = GetOptimalLocalWorkGroupSizes({dispatchData.gws[0], dispatchData.gws[1], dispatchData.gws[2]}, params.engineInfo);
|
||||
|
||||
const size_t optimal_lws_values[] = {256, 224, 192, 160, 128, 96, 64, 32, 16};
|
||||
// TODO: can be potentially improved for GPUs with support of LWS > 256
|
||||
const size_t optimal_lws_values[] = { 256, 224, 192, 160, 128, 96, 64, 32, 16 };
|
||||
|
||||
if ((params.output.GetLayout() == DataLayout::b_fs_yx_fsv16 ||
|
||||
params.output.GetLayout() == DataLayout::b_fs_zyx_fsv16 ||
|
||||
params.output.GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv16 ||
|
||||
|
@ -32,11 +32,16 @@ JitConstants EmbeddingBagKernelRef::GetJitConstants(const embedding_bag_params&
|
||||
|
||||
CommonDispatchData EmbeddingBagKernelRef::SetDefault(const embedding_bag_params& params) const {
|
||||
CommonDispatchData dispatchData;
|
||||
auto in_layout = params.inputs[0].GetLayout();
|
||||
auto out_layout = params.output.GetLayout();
|
||||
std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::BATCH },
|
||||
{ Tensor::DataChannelName::FEATURE },
|
||||
{ Tensor::DataChannelName::X, Tensor::DataChannelName::Y }};
|
||||
|
||||
dispatchData.gws = { params.output.Batch().v,
|
||||
params.output.Feature().v,
|
||||
params.output.Y().v * params.output.X().v };
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
|
||||
|
||||
return dispatchData;
|
||||
}
|
||||
@ -84,8 +89,6 @@ ParamsKey EmbeddingBagKernelRef::GetSupportedKey() const {
|
||||
k.EnableOutputDataType(Datatype::F16);
|
||||
k.EnableOutputDataType(Datatype::F32);
|
||||
|
||||
k.EnableInputLayout(DataLayout::bfxy);
|
||||
|
||||
k.EnableAllInputLayout();
|
||||
k.EnableAllOutputLayout();
|
||||
k.EnableTensorOffset();
|
||||
|
@ -42,11 +42,16 @@ JitConstants ExtractImagePatchesKernelBase::GetJitConstants(const extract_image_
|
||||
|
||||
ExtractImagePatchesKernelBase::DispatchData ExtractImagePatchesKernelBase::SetDefault(const extract_image_patches_params& params) const {
|
||||
DispatchData dispatchData;
|
||||
auto in_layout = params.inputs[0].GetLayout();
|
||||
auto out_layout = params.output.GetLayout();
|
||||
std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::BATCH },
|
||||
{ Tensor::DataChannelName::FEATURE },
|
||||
{ Tensor::DataChannelName::X, Tensor::DataChannelName::Y }};
|
||||
|
||||
dispatchData.gws = { params.output.Batch().v,
|
||||
params.output.Feature().v,
|
||||
params.output.Y().v * params.output.X().v };
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
|
||||
|
||||
return dispatchData;
|
||||
}
|
||||
|
@ -90,9 +90,16 @@ FullyConnectedKernelMMAD::FullyConnectedTuningData FullyConnectedKernelMMAD::Get
|
||||
bool slm_div_factor_exception = input_batch == 300 && input_feature == 2048 &&
|
||||
output_batch == 300 && (output_feature == 324 || output_feature == 81);
|
||||
|
||||
bool big_wgs_exception = params.engineInfo.computeUnitsCount == 96 && params.engineInfo.maxThreadsPerExecutionUnit == 7 &&
|
||||
input_feature == 9216 && output_feature == 4096;
|
||||
|
||||
size_t max_work_group_size = params.engineInfo.maxWorkGroupSize;
|
||||
if (max_work_group_size > 256 && !big_wgs_exception)
|
||||
max_work_group_size = 256;
|
||||
|
||||
if (tuning_data.feature_blocks_count && tuning_data.sub_group_size == 8 && !slm_div_factor_exception)
|
||||
while (tuning_data.feature_blocks_count % (tuning_data.slm_div_factor * 2) == 0 &&
|
||||
(tuning_data.slm_div_factor * 2 <= params.engineInfo.maxWorkGroupSize / tuning_data.sub_group_size))
|
||||
(tuning_data.slm_div_factor * 2 <= max_work_group_size / tuning_data.sub_group_size))
|
||||
tuning_data.slm_div_factor *= 2;
|
||||
|
||||
tuning_data.work_group_size = tuning_data.slm_div_factor * tuning_data.sub_group_size;
|
||||
|
@ -207,18 +207,26 @@ fused_conv_eltwise_kernel_base::DispatchData fused_conv_eltwise_kernel_base::Set
|
||||
const fused_conv_eltwise_params& params,
|
||||
int) const {
|
||||
DispatchData dispatchData;
|
||||
|
||||
auto in_layout = params.inputs[0].GetLayout();
|
||||
auto out_layout = params.output.GetLayout();
|
||||
std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws;
|
||||
const auto& out = params.output;
|
||||
|
||||
if (params.output.GetLayout() == DataLayout::bfyx || params.output.GetLayout() == DataLayout::byxf ||
|
||||
params.output.GetLayout() == DataLayout::bfzyx || params.output.GetLayout() == DataLayout::b_fs_zyx_fsv16 ||
|
||||
params.output.GetLayout() == DataLayout::bs_fs_zyx_bsv16_fsv16) {
|
||||
dispatchData.gws = {out.X().v, out.Y().v * out.Z().v, out.Feature().v * out.Batch().v};
|
||||
if (out_layout == DataLayout::bfyx || out_layout == DataLayout::byxf ||
|
||||
out_layout == DataLayout::bfzyx || out_layout == DataLayout::b_fs_zyx_fsv16 ||
|
||||
out_layout == DataLayout::bs_fs_zyx_bsv16_fsv16) {
|
||||
dispatchData.gws = { out.X().v, out.Y().v * out.Z().v, out.Feature().v * out.Batch().v };
|
||||
dims_by_gws = {{ Tensor::DataChannelName::X },
|
||||
{ Tensor::DataChannelName::Y, Tensor::DataChannelName::Z },
|
||||
{ Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH }};
|
||||
} else {
|
||||
dispatchData.gws = {out.Feature().v * out.Batch().v, out.X().v, out.Y().v * out.Z().v };
|
||||
dispatchData.gws = { out.Feature().v * out.Batch().v, out.X().v, out.Y().v * out.Z().v };
|
||||
dims_by_gws = {{ Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH },
|
||||
{ Tensor::DataChannelName::X },
|
||||
{ Tensor::DataChannelName::Y, Tensor::DataChannelName::Z }};
|
||||
}
|
||||
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
|
||||
|
||||
dispatchData.cldnnStyle.blockWidth = 1;
|
||||
dispatchData.cldnnStyle.blockHeight = 1;
|
||||
|
@ -69,20 +69,35 @@ static inline std::vector<std::string> GetDefaultOrder(size_t size) {
|
||||
|
||||
CommonDispatchData GatherElementsKernelRef::SetDefault(const gather_elements_params& params, const optional_params&) const {
|
||||
CommonDispatchData dispatchData;
|
||||
auto in_layout = params.inputs[0].GetLayout();
|
||||
auto out_layout = params.output.GetLayout();
|
||||
std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws;
|
||||
|
||||
const auto& output = params.output;
|
||||
|
||||
switch (params.inputs[1].GetLayout()) {
|
||||
case DataLayout::bfyx:
|
||||
dispatchData.gws = {output.X().v, output.Y().v, output.Feature().v * output.Batch().v};
|
||||
dims_by_gws = {{Tensor::DataChannelName::X},
|
||||
{Tensor::DataChannelName::Y},
|
||||
{Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
|
||||
|
||||
break;
|
||||
|
||||
case DataLayout::bfzyx:
|
||||
dispatchData.gws = {output.X().v, output.Y().v * output.Z().v, output.Feature().v * output.Batch().v};
|
||||
dims_by_gws = {{Tensor::DataChannelName::X},
|
||||
{Tensor::DataChannelName::Y, Tensor::DataChannelName::Z},
|
||||
{Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
|
||||
|
||||
break;
|
||||
|
||||
case DataLayout::bfwzyx:
|
||||
dispatchData.gws = {output.X().v * output.Y().v, output.Z().v * output.W().v, output.Feature().v * output.Batch().v};
|
||||
dims_by_gws = {{Tensor::DataChannelName::X, Tensor::DataChannelName::Y},
|
||||
{Tensor::DataChannelName::Z, Tensor::DataChannelName::W},
|
||||
{Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
|
||||
|
||||
break;
|
||||
|
||||
default:
|
||||
@ -90,7 +105,7 @@ CommonDispatchData GatherElementsKernelRef::SetDefault(const gather_elements_par
|
||||
break;
|
||||
}
|
||||
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
|
||||
|
||||
return dispatchData;
|
||||
}
|
||||
|
@ -155,16 +155,28 @@ static std::string GetIndecesIdxOrder(const gather_params& params, size_t axis,
|
||||
CommonDispatchData GatherKernelRef::SetDefault(const gather_params& params, const optional_params&) const {
|
||||
CommonDispatchData dispatchData;
|
||||
const auto& output = params.output;
|
||||
auto in_layout = params.inputs[0].GetLayout();
|
||||
auto out_layout = params.output.GetLayout();
|
||||
std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws;
|
||||
|
||||
if (output.GetLayout() == DataLayout::bfyx) {
|
||||
if (out_layout == DataLayout::bfyx) {
|
||||
dispatchData.gws = {output.X().v, output.Y().v, output.Feature().v * output.Batch().v};
|
||||
} else if (output.GetLayout() == DataLayout::bfzyx) {
|
||||
dims_by_gws = {{Tensor::DataChannelName::X},
|
||||
{Tensor::DataChannelName::Y},
|
||||
{Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
|
||||
} else if (out_layout == DataLayout::bfzyx) {
|
||||
dispatchData.gws = {output.X().v, output.Y().v * output.Z().v, output.Feature().v * output.Batch().v};
|
||||
dims_by_gws = {{Tensor::DataChannelName::X},
|
||||
{Tensor::DataChannelName::Y, Tensor::DataChannelName::Z},
|
||||
{Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
|
||||
} else {
|
||||
dispatchData.gws = {output.X().v * output.Y().v, output.Z().v * output.W().v, output.Feature().v * output.Batch().v};
|
||||
dims_by_gws = {{Tensor::DataChannelName::X, Tensor::DataChannelName::Y},
|
||||
{Tensor::DataChannelName::Z, Tensor::DataChannelName::W},
|
||||
{Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
|
||||
}
|
||||
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
|
||||
|
||||
return dispatchData;
|
||||
}
|
||||
|
@ -16,10 +16,15 @@ JitConstants GRNKernelBase::GetJitConstants(const grn_params& params, GRNKernelB
|
||||
|
||||
GRNKernelBase::DispatchData GRNKernelBase::SetDefault(const grn_params& params) const {
|
||||
const auto& output = params.output;
|
||||
auto in_layout = params.inputs[0].GetLayout();
|
||||
auto out_layout = output.GetLayout();
|
||||
std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::BATCH },
|
||||
{ Tensor::DataChannelName::Y },
|
||||
{ Tensor::DataChannelName::X }};
|
||||
|
||||
DispatchData dispatchData;
|
||||
dispatchData.gws = { output.Batch().v, output.Y().v, output.X().v };
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
|
||||
|
||||
return dispatchData;
|
||||
}
|
||||
|
@ -34,6 +34,11 @@ ParamsKey LRNKernelAcrossChannelMultipleFeaturesFSV16::GetSupportedKey() const {
|
||||
|
||||
CommonDispatchData LRNKernelAcrossChannelMultipleFeaturesFSV16::SetDefault(const lrn_params& params) const {
|
||||
CommonDispatchData dispatchData = LRNKernelBase::SetDefault(params);
|
||||
auto in_layout = params.inputs[0].GetLayout();
|
||||
auto out_layout = params.output.GetLayout();
|
||||
std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::FEATURE },
|
||||
{ Tensor::DataChannelName::X },
|
||||
{ Tensor::DataChannelName::Y, Tensor::DataChannelName::BATCH }};
|
||||
|
||||
const auto& out = params.output;
|
||||
const unsigned int alignment = 16;
|
||||
@ -41,7 +46,7 @@ CommonDispatchData LRNKernelAcrossChannelMultipleFeaturesFSV16::SetDefault(const
|
||||
dispatchData.gws = { Align(out.Feature().v, alignment),
|
||||
out.X().v,
|
||||
out.Y().v * out.Batch().v };
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
|
||||
|
||||
return dispatchData;
|
||||
}
|
||||
|
@ -61,11 +61,16 @@ JitConstants LRNKernelRef::GetJitConstants(const lrn_params& params, const LRNKe
|
||||
|
||||
// Builds the dispatch configuration for the reference LRN kernel.
// Starts from the parent's defaults, then sets the global work size to
// { X * Y, feature, batch } of the output and derives a matching local work size.
LRNKernelRef::Parent::DispatchData LRNKernelRef::SetDefault(const lrn_params& params) const {
    DispatchData dispatchData = Parent::SetDefault(params);
    auto in_layout = params.inputs[0].GetLayout();
    auto out_layout = params.output.GetLayout();
    // Tensor channels folded into each of the three GWS dimensions, in GWS order.
    std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::X, Tensor::DataChannelName::Y },
                                                                     { Tensor::DataChannelName::FEATURE },
                                                                     { Tensor::DataChannelName::BATCH }};

    const auto& out = params.output;

    dispatchData.gws = { out.X().v * out.Y().v, out.Feature().v, out.Batch().v };
    // Single, layout-aware local size computation.
    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);

    return dispatchData;
}
|
||||
|
@ -58,11 +58,16 @@ JitConstants LRNKernelWithinChannelByxfOpt::GetJitConstants(const lrn_params& pa
|
||||
// Builds the dispatch configuration for the byxf-optimized within-channel LRN kernel.
// Starts from the parent's defaults; the global work size is
// { X * Y, ceil(feature / 8), batch } of the output tensor.
LRNKernelWithinChannelByxfOpt::Parent::DispatchData LRNKernelWithinChannelByxfOpt::SetDefault(
    const lrn_params& params) const {
    DispatchData dispatchData = Parent::SetDefault(params);
    auto in_layout = params.inputs[0].GetLayout();
    auto out_layout = params.output.GetLayout();
    // Tensor channels folded into each of the three GWS dimensions, in GWS order.
    std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::X, Tensor::DataChannelName::Y },
                                                                     { Tensor::DataChannelName::FEATURE },
                                                                     { Tensor::DataChannelName::BATCH }};

    const auto& out = params.output;

    // The feature dimension is divided by 8 (rounded up) for the second GWS dimension.
    dispatchData.gws = { out.X().v * out.Y().v, CeilDiv(out.Feature().v, 8), out.Batch().v };
    // Single, layout-aware local size computation.
    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);

    return dispatchData;
}
|
||||
|
@ -68,11 +68,17 @@ KernelsData LSTM_DynamicInputKernelBfyxOpt::GetKernelsData(const Params& params,
|
||||
KernelData kd = KernelData::Default<lstm_dynamic_input_params>(params);
|
||||
lstm_dynamic_input_params& dlstm_params = *static_cast<lstm_dynamic_input_params*>(kd.params.get());
|
||||
|
||||
auto in_layout = dlstm_params.inputs[0].GetLayout();
|
||||
auto out_layout = dlstm_params.output.GetLayout();
|
||||
std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::X },
|
||||
{ Tensor::DataChannelName::Y, Tensor::DataChannelName::BATCH },
|
||||
{ Tensor::DataChannelName::FEATURE }};
|
||||
|
||||
const auto& out = dlstm_params.output;
|
||||
auto hidden_size = out.X().v;
|
||||
|
||||
dispatchData.gws = { hidden_size / simd_size, out.Batch().v * out.Y().v, out.Feature().v };
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
|
||||
|
||||
bool succeed = UpdateWeightsParams(dlstm_params,
|
||||
options,
|
||||
|
@ -26,11 +26,17 @@ JitConstants LSTM_DynamicInputKernelBase::GetJitConstants(const lstm_dynamic_inp
|
||||
// Builds the default dispatch configuration for dynamic LSTM input kernels.
// The global work size is { X, batch * Y, feature } of the output tensor; dims_by_gws lists
// the tensor channels behind each GWS dimension for the layout-aware local size selection.
LSTM_DynamicInputKernelBase::DispatchData LSTM_DynamicInputKernelBase::SetDefault(
    const lstm_dynamic_input_params& params) {
    DispatchData dispatchData;
    auto in_layout = params.inputs[0].GetLayout();
    auto out_layout = params.output.GetLayout();
    std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::X },
                                                                     { Tensor::DataChannelName::Y, Tensor::DataChannelName::BATCH },
                                                                     { Tensor::DataChannelName::FEATURE }};

    const auto& out = params.output;

    // 4 * hidden, batch * dir, seq_len
    dispatchData.gws = { out.X().v, out.Batch().v * out.Y().v, out.Feature().v };
    // Single, layout-aware local size computation.
    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);

    return dispatchData;
}
|
||||
|
@ -282,7 +282,7 @@ KernelsData NonMaxSuppressionKernelRef::GetKernelsData(const Params& params, con
|
||||
|
||||
auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
|
||||
auto& kernel = kd.kernels[i];
|
||||
KernelBase::CheckDispatchData(kernelName, dispatchData);
|
||||
KernelBase::CheckDispatchData(kernelName, dispatchData, params.engineInfo.maxWorkGroupSize);
|
||||
kernel.params.workGroups.global = dispatchData.gws;
|
||||
kernel.params.workGroups.local = dispatchData.lws;
|
||||
kernel.code.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo);
|
||||
|
@ -22,14 +22,23 @@ JitConstants OneHotKernelBase::GetJitConstants(const one_hot_params& params) con
|
||||
|
||||
// Builds the dispatch configuration for one-hot kernels.
// The global work size is taken from the first input. When the output has five dimensions
// the Z extent is folded into the second GWS dimension; otherwise it is
// { batch, feature, Y * X }. dims_by_gws mirrors whichever grouping was chosen.
OneHotKernelBase::DispatchData OneHotKernelBase::SetDefault(const one_hot_params& params) {
    const auto& input = params.inputs[0];
    auto in_layout = params.inputs[0].GetLayout();
    auto out_layout = params.output.GetLayout();
    std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws;

    DispatchData dispatchData;
    if (params.output.GetDims().size() == 5) {
        dispatchData.gws = { input.Batch().v, input.Feature().v * input.Z().v, input.Y().v * input.X().v };
        dims_by_gws = {{ Tensor::DataChannelName::BATCH },
                       { Tensor::DataChannelName::Z, Tensor::DataChannelName::FEATURE },
                       { Tensor::DataChannelName::X, Tensor::DataChannelName::Y }};
    } else {
        dispatchData.gws = { input.Batch().v, input.Feature().v, input.Y().v * input.X().v };
        dims_by_gws = {{ Tensor::DataChannelName::BATCH },
                       { Tensor::DataChannelName::FEATURE },
                       { Tensor::DataChannelName::X, Tensor::DataChannelName::Y }};
    }
    // Single, layout-aware local size computation for whichever GWS was selected above.
    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);

    return dispatchData;
}
|
||||
|
@ -31,10 +31,16 @@ ParamsKey PermuteKernelRef::GetSupportedKey() const {
|
||||
|
||||
// Builds the dispatch configuration for the reference permute kernel.
// The global work size is derived from the first input:
// { X, Y * Z * W, feature * batch }.
CommonDispatchData PermuteKernelRef::SetDefault(const permute_params& params) const {
    CommonDispatchData dispatchData;
    auto in_layout = params.inputs[0].GetLayout();
    auto out_layout = params.output.GetLayout();
    // Tensor channels folded into each of the three GWS dimensions, in GWS order.
    std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{Tensor::DataChannelName::X},
                                                                     {Tensor::DataChannelName::Y, Tensor::DataChannelName::Z, Tensor::DataChannelName::W},
                                                                     {Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};

    const auto& in = params.inputs[0];

    dispatchData.gws = {in.X().v, in.Y().v * in.Z().v * in.W().v, in.Feature().v * in.Batch().v};
    // Single, layout-aware local size computation.
    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);

    return dispatchData;
}
|
||||
|
@ -199,9 +199,10 @@ static std::vector<size_t> GetBestLwsFromGws(const permute_params& params, const
|
||||
std::vector<size_t> dims{0, 2, 1};
|
||||
|
||||
// SLM size: elemsize * tile_size * tile_size * work_items <= 64K
|
||||
size_t elem_size = params.output.ElementSize();
|
||||
size_t max_local_mem_size = params.engineInfo.maxLocalMemSize;
|
||||
size_t max_num_work_items = std::min((size_t)256, (size_t)max_local_mem_size / (elem_size * tile_size * tile_size));
|
||||
const size_t elem_size = params.output.ElementSize();
|
||||
const size_t max_local_mem_size = params.engineInfo.maxLocalMemSize;
|
||||
const size_t max_work_group_size = params.engineInfo.maxWorkGroupSize;
|
||||
size_t max_num_work_items = std::min(max_work_group_size, max_local_mem_size / (elem_size * tile_size * tile_size));
|
||||
|
||||
for (size_t i = 0; i < dims.size(); ++i) {
|
||||
size_t dim = dims[i];
|
||||
|
@ -33,12 +33,17 @@ ParamsKey PoolingKerneGPU_b_fs_yx_fsv4::GetSupportedKey() const {
|
||||
|
||||
// Builds the dispatch configuration for the b_fs_yx_fsv4 pooling kernel.
// Starts from PoolingKernelBase's defaults and overrides the global work size with
// { X, Y, ceil(feature / 4) * batch } of the output tensor.
PoolingKernelBase::DispatchData PoolingKerneGPU_b_fs_yx_fsv4::SetDefault(const pooling_params& params) const {
    DispatchData dispatchData = PoolingKernelBase::SetDefault(params);
    auto in_layout = params.inputs[0].GetLayout();
    auto out_layout = params.output.GetLayout();
    // Tensor channels folded into each of the three GWS dimensions, in GWS order.
    std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{Tensor::DataChannelName::X},
                                                                     {Tensor::DataChannelName::Y},
                                                                     {Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};

    dispatchData.gws[0] = params.output.X().v;  // X
    dispatchData.gws[1] = params.output.Y().v;  // Y
    // we got b_fs_yx_fsv4 format, we process 4 features per workitem
    dispatchData.gws[2] = CeilDiv(params.output.Feature().v, 4) * params.output.Batch().v;
    // Single, layout-aware local size computation.
    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);

    return dispatchData;
}
|
||||
|
@ -36,6 +36,9 @@ ParamsKey PoolingKernelGPU_b_fs_zyx_fsv16_imad::GetSupportedKey() const {
|
||||
|
||||
PoolingKernelBase::DispatchData PoolingKernelGPU_b_fs_zyx_fsv16_imad::SetDefault(const pooling_params& params) const {
|
||||
DispatchData dispatchData = PoolingKernelBase::SetDefault(params);
|
||||
auto in_layout = params.inputs[0].GetLayout();
|
||||
auto out_layout = params.output.GetLayout();
|
||||
std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws;
|
||||
|
||||
const auto& out = params.output;
|
||||
auto x = out.X().v;
|
||||
@ -61,7 +64,10 @@ PoolingKernelBase::DispatchData PoolingKernelGPU_b_fs_zyx_fsv16_imad::SetDefault
|
||||
// we got b_fs_yx_fsv16 format, we process 16 features per workitem
|
||||
dispatchData.gws[2] = CeilDiv(f, FEATURE_SLICE_SIZE) * b;
|
||||
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
|
||||
dims_by_gws = {{Tensor::DataChannelName::X},
|
||||
{Tensor::DataChannelName::Y, Tensor::DataChannelName::Z},
|
||||
{Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH }};
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
|
||||
}
|
||||
return dispatchData;
|
||||
}
|
||||
|
@ -33,13 +33,18 @@ ParamsKey PyramidROIAlignKernelRef::GetSupportedKey() const {
|
||||
|
||||
// Builds the dispatch configuration for the reference PyramidROIAlign kernel.
// Starts from the base class defaults, then sets the global work size to
// { X * Y, feature, batch } of the output tensor.
PyramidROIAlignKernelBase::DispatchData PyramidROIAlignKernelRef::SetDefault(const PyramidROIAlign_params& params) const {
    auto dispatchData = PyramidROIAlignKernelBase::SetDefault(params);
    auto in_layout = params.inputs[0].GetLayout();
    auto out_layout = params.output.GetLayout();
    // Tensor channels folded into each of the three GWS dimensions, in GWS order.
    std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::X, Tensor::DataChannelName::Y },
                                                                     { Tensor::DataChannelName::FEATURE },
                                                                     { Tensor::DataChannelName::BATCH }};

    dispatchData.gws = {
        params.output.X().v * params.output.Y().v,
        params.output.Feature().v,
        params.output.Batch().v };

    // Single, layout-aware local size computation.
    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);

    return dispatchData;
}
|
||||
|
@ -32,11 +32,16 @@ ParamsKey ReduceKernelRef::GetSupportedKey() const {
|
||||
|
||||
// Builds the dispatch configuration for the reference reduce kernel.
// The global work size is { X * Y, Z * W, batch * feature } of the output tensor.
CommonDispatchData ReduceKernelRef::SetDefault(const reduce_params& params, const optional_params&) const {
    CommonDispatchData dispatchData;
    auto in_layout = params.inputs[0].GetLayout();
    auto out_layout = params.output.GetLayout();
    // Tensor channels folded into each of the three GWS dimensions, in GWS order.
    std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::X, Tensor::DataChannelName::Y },
                                                                     { Tensor::DataChannelName::Z, Tensor::DataChannelName::W },
                                                                     { Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH }};

    dispatchData.gws = { params.output.X().v * params.output.Y().v,
                         params.output.Z().v * params.output.W().v,
                         params.output.Batch().v * params.output.Feature().v };
    // Single, layout-aware local size computation.
    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);

    return dispatchData;
}
|
||||
|
@ -50,11 +50,16 @@ JitConstants ReorderKernelBinary::GetJitConstants(const reorder_params& params)
|
||||
|
||||
// Builds the dispatch configuration for the binary reorder kernel.
// The global work size is derived from the first input:
// { batch, ceil(feature / 32), Y * X }.
ReorderKernelBinary::DispatchData ReorderKernelBinary::SetDefault(const reorder_params& params) const {
    DispatchData dispatchData;
    auto in_layout = params.inputs[0].GetLayout();
    auto out_layout = params.output.GetLayout();
    // Tensor channels folded into each of the three GWS dimensions, in GWS order.
    std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::BATCH },
                                                                     { Tensor::DataChannelName::FEATURE },
                                                                     { Tensor::DataChannelName::X, Tensor::DataChannelName::Y }};

    const auto& input = params.inputs[0];

    // The feature extent is divided by 32 (rounded up) for the second GWS dimension.
    dispatchData.gws = { input.Batch().v, CeilDiv(input.Feature().v, 32), input.Y().v * input.X().v };
    // Single, layout-aware local size computation.
    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);

    return dispatchData;
}
|
||||
|
@ -33,14 +33,23 @@ JitConstants ReorgYoloKernelRef::GetJitConstants(const reorg_yolo_params& ry) co
|
||||
}
|
||||
// Builds the dispatch configuration for the reference ReorgYolo kernel.
// For a bfyx input the global work size is { X, Y, feature }; for any other input layout it
// is { feature * batch, X, Y }. dims_by_gws mirrors the grouping that was chosen.
ReorgYoloKernelRef::DispatchData SetDefault(const reorg_yolo_params& params) {
    ReorgYoloKernelRef::DispatchData dispatchData;
    auto in_layout = params.inputs[0].GetLayout();
    auto out_layout = params.output.GetLayout();
    std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws;

    const auto& input = params.inputs[0];
    if (input.GetLayout() == DataLayout::bfyx) {
        dispatchData.gws = {input.X().v, input.Y().v, input.Feature().v};
        dims_by_gws = {{Tensor::DataChannelName::X},
                       {Tensor::DataChannelName::Y},
                       {Tensor::DataChannelName::FEATURE}};
    } else {
        dispatchData.gws = {input.Feature().v * input.Batch().v, input.X().v, input.Y().v};
        dims_by_gws = {{Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH},
                       {Tensor::DataChannelName::X},
                       {Tensor::DataChannelName::Y}};
    }
    // Single, layout-aware local size computation for whichever GWS was selected above.
    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);

    return dispatchData;
}
|
||||
|
@ -49,18 +49,35 @@ size_t ResampleKernelBase::GetFeatureBlockSize(const resample_params& params) co
|
||||
|
||||
ResampleKernelBase::DispatchData ResampleKernelBase::SetDefault(const kernel_selector::resample_params &arg) const {
|
||||
DispatchData dispatchData;
|
||||
auto in_layout = arg.inputs[0].GetLayout();
|
||||
auto out_layout = arg.output.GetLayout();
|
||||
std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws;
|
||||
|
||||
const auto& out = arg.output;
|
||||
|
||||
if (arg.resampleType == ResampleType::NEAREST_NEIGHBOR)
|
||||
if (arg.resampleType == ResampleType::NEAREST_NEIGHBOR) {
|
||||
dispatchData.gws = { out.X().v, out.Y().v * out.Z().v, out.Feature().v * out.Batch().v };
|
||||
else if (arg.resampleType == ResampleType::BILINEAR_INTERP || arg.resampleType == ResampleType::LINEAR_ONNX)
|
||||
dims_by_gws = {{ Tensor::DataChannelName::X },
|
||||
{ Tensor::DataChannelName::Y, Tensor::DataChannelName::Z },
|
||||
{ Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH }};
|
||||
} else if (arg.resampleType == ResampleType::BILINEAR_INTERP || arg.resampleType == ResampleType::LINEAR_ONNX) {
|
||||
dispatchData.gws = { Align(out.X().v, 32), out.Y().v, out.Batch().v };
|
||||
else if (arg.resampleType == ResampleType::CAFFE_BILINEAR_INTERP)
|
||||
dims_by_gws = {{ Tensor::DataChannelName::X },
|
||||
{ Tensor::DataChannelName::Y },
|
||||
{ Tensor::DataChannelName::BATCH }};
|
||||
} else if (arg.resampleType == ResampleType::CAFFE_BILINEAR_INTERP) {
|
||||
dispatchData.gws = { out.X().v * out.Y().v, CeilDiv(out.Feature().v, GetFeatureBlockSize(arg)), out.Batch().v * out.Z().v };
|
||||
else
|
||||
dims_by_gws = {{ Tensor::DataChannelName::X, Tensor::DataChannelName::Y },
|
||||
{ Tensor::DataChannelName::FEATURE },
|
||||
{ Tensor::DataChannelName::Z, Tensor::DataChannelName::BATCH }};
|
||||
} else {
|
||||
dispatchData.gws = { out.X().v, out.Y().v * out.Z().v, out.Feature().v * out.Batch().v };
|
||||
dims_by_gws = {{ Tensor::DataChannelName::X },
|
||||
{ Tensor::DataChannelName::Y, Tensor::DataChannelName::Z },
|
||||
{ Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH }};
|
||||
}
|
||||
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, arg.engineInfo);
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, arg.engineInfo, in_layout, out_layout, dims_by_gws);
|
||||
|
||||
if (arg.resampleType == ResampleType::BILINEAR_INTERP || arg.resampleType == ResampleType::LINEAR_ONNX) {
|
||||
dispatchData.lws[0] = 32;
|
||||
|
@ -62,6 +62,10 @@ ParamsKey ResampleKernelOpt::GetSupportedKey() const {
|
||||
|
||||
ResampleKernelBase::DispatchData ResampleKernelOpt::SetDefault(const kernel_selector::resample_params &arg) const {
|
||||
DispatchData dispatchData;
|
||||
auto in_layout = arg.inputs[0].GetLayout();
|
||||
auto out_layout = arg.output.GetLayout();
|
||||
std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws;
|
||||
|
||||
const auto& out = arg.output;
|
||||
|
||||
if (arg.resampleType == ResampleType::CAFFE_BILINEAR_INTERP) {
|
||||
@ -69,7 +73,10 @@ ResampleKernelBase::DispatchData ResampleKernelOpt::SetDefault(const kernel_sele
|
||||
dispatchData.gws[1] = CeilDiv(out.Feature().v, GetFeatureBlockSize(arg));
|
||||
dispatchData.gws[2] = arg.output.Batch().v;
|
||||
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, arg.engineInfo);
|
||||
dims_by_gws = {{Tensor::DataChannelName::X, Tensor::DataChannelName::Y},
|
||||
{Tensor::DataChannelName::FEATURE},
|
||||
{Tensor::DataChannelName::BATCH}};
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, arg.engineInfo, in_layout, out_layout, dims_by_gws);
|
||||
} else {
|
||||
auto opt_x_block_size = GetOptimalBlockSize(arg);
|
||||
if (out.X().v > 32 && opt_x_block_size == 1) {
|
||||
|
@ -116,11 +116,16 @@ JitConstants ResampleKernelRef::GetJitConstants(const resample_params& params) c
|
||||
|
||||
ResampleKernelBase::DispatchData ResampleKernelRef::SetDefault(const resample_params& arg) const {
|
||||
auto dispatchData = Parent::SetDefault(arg);
|
||||
auto in_layout = arg.inputs[0].GetLayout();
|
||||
auto out_layout = arg.output.GetLayout();
|
||||
std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::X },
|
||||
{ Tensor::DataChannelName::Y, Tensor::DataChannelName::Z },
|
||||
{ Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH }};
|
||||
|
||||
if (use_packing(arg)) {
|
||||
auto pack = packing_factor(arg);
|
||||
dispatchData.gws = { arg.output.X().v, arg.output.Y().v * arg.output.Z().v, CeilDiv(arg.output.Feature().v, pack) * arg.output.Batch().v };
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, arg.engineInfo);
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, arg.engineInfo, in_layout, out_layout, dims_by_gws);
|
||||
}
|
||||
|
||||
return dispatchData;
|
||||
|
@ -32,12 +32,17 @@ ParamsKey ReverseSequenceKernelRef::GetSupportedKey() const {
|
||||
// Builds the dispatch configuration for the reference reverse-sequence kernel.
// The global work size is { batch, feature, Y * X } of the output tensor.
CommonDispatchData ReverseSequenceKernelRef::SetDefault(const reverse_sequence_params& params,
                                                        const optional_params&) const {
    CommonDispatchData dispatchData;
    auto in_layout = params.inputs[0].GetLayout();
    auto out_layout = params.output.GetLayout();
    // Tensor channels folded into each of the three GWS dimensions, in GWS order.
    std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::BATCH },
                                                                     { Tensor::DataChannelName::FEATURE },
                                                                     { Tensor::DataChannelName::X, Tensor::DataChannelName::Y }};

    dispatchData.gws = { params.output.Batch().v,
                         params.output.Feature().v,
                         params.output.Y().v * params.output.X().v };

    // Single, layout-aware local size computation.
    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);

    return dispatchData;
}
|
||||
|
@ -70,6 +70,10 @@ static inline std::vector<std::string> GetDefaultOrder(size_t size) {
|
||||
|
||||
CommonDispatchData ScatterElementsUpdateKernelRef::SetDefault(const scatter_elements_update_params& params, const optional_params&, bool is_second) const {
|
||||
CommonDispatchData dispatchData;
|
||||
auto in_layout = params.inputs[0].GetLayout();
|
||||
auto out_layout = params.output.GetLayout();
|
||||
std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws;
|
||||
|
||||
const auto& output = params.output;
|
||||
const auto& indices = params.inputs[1];
|
||||
|
||||
@ -78,21 +82,30 @@ CommonDispatchData ScatterElementsUpdateKernelRef::SetDefault(const scatter_elem
|
||||
switch (params.inputs[0].GetLayout()) {
|
||||
case DataLayout::bfyx:
|
||||
dispatchData.gws = {scope.X().v, scope.Y().v, scope.Feature().v * scope.Batch().v};
|
||||
dims_by_gws = {{Tensor::DataChannelName::X},
|
||||
{Tensor::DataChannelName::Y},
|
||||
{Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
|
||||
break;
|
||||
|
||||
case DataLayout::bfzyx:
|
||||
dispatchData.gws = {scope.X().v * scope.Y().v, scope.Z().v, scope.Feature().v * scope.Batch().v};
|
||||
dims_by_gws = {{Tensor::DataChannelName::X, Tensor::DataChannelName::Y},
|
||||
{Tensor::DataChannelName::Z},
|
||||
{Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
|
||||
break;
|
||||
|
||||
case DataLayout::bfwzyx:
|
||||
dispatchData.gws = {scope.X().v * scope.Y().v, scope.Z().v * scope.W().v, scope.Feature().v * scope.Batch().v};
|
||||
dims_by_gws = {{Tensor::DataChannelName::X, Tensor::DataChannelName::Y},
|
||||
{Tensor::DataChannelName::Z, Tensor::DataChannelName::W},
|
||||
{Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
|
||||
break;
|
||||
default:
|
||||
throw std::invalid_argument("Unsupported data layout for scatter elements update primitive");
|
||||
break;
|
||||
}
|
||||
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
|
||||
|
||||
return dispatchData;
|
||||
}
|
||||
|
@ -44,11 +44,16 @@ bool ShuffleChannelsKernelRef::Validate(const Params& p, const optional_params&
|
||||
// Builds the dispatch configuration for the reference shuffle-channels kernel.
// The global work size is { batch, feature, Y * X } of the output tensor.
CommonDispatchData ShuffleChannelsKernelRef::SetDefault(const shuffle_channels_params& params,
                                                        const optional_params&) const {
    CommonDispatchData dispatchData;
    auto in_layout = params.inputs[0].GetLayout();
    auto out_layout = params.output.GetLayout();
    // Tensor channels folded into each of the three GWS dimensions, in GWS order.
    std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::BATCH },
                                                                     { Tensor::DataChannelName::FEATURE },
                                                                     { Tensor::DataChannelName::X, Tensor::DataChannelName::Y }};

    dispatchData.gws = { params.output.Batch().v,
                         params.output.Feature().v,
                         params.output.Y().v * params.output.X().v };
    // Single, layout-aware local size computation.
    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);

    return dispatchData;
}
|
||||
|
@ -28,14 +28,21 @@ bool SpaceToBatchKernelBase::Validate(const Params& p, const optional_params& o)
|
||||
|
||||
CommonDispatchData SpaceToBatchKernelBase::SetDefault(const space_to_batch_params& params, const optional_params&) const {
|
||||
const auto& out = params.output;
|
||||
auto in_layout = params.inputs[0].GetLayout();
|
||||
auto out_layout = params.output.GetLayout();
|
||||
std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws;
|
||||
|
||||
CommonDispatchData dispatchData;
|
||||
if (out.GetLayout() == DataLayout::b_fs_yx_fsv16 && out.Feature().v % 16 == 0) {
|
||||
dispatchData.gws = { out.Batch().v, out.Feature().v, out.Y().v * out.X().v };
|
||||
dispatchData.lws = {1, 16, 1};
|
||||
dispatchData.lws = { 1, 16, 1 };
|
||||
} else {
|
||||
dispatchData.gws = { out.Batch().v, out.Feature().v, out.W().v * out.Z().v * out.Y().v * out.X().v };
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
|
||||
dims_by_gws = {{ Tensor::DataChannelName::BATCH },
|
||||
{ Tensor::DataChannelName::FEATURE },
|
||||
{ Tensor::DataChannelName::X, Tensor::DataChannelName::Y,
|
||||
Tensor::DataChannelName::Z, Tensor::DataChannelName::W }};
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
|
||||
}
|
||||
|
||||
return dispatchData;
|
||||
|
@ -48,11 +48,16 @@ bool SpaceToDepthKernelRef::Validate(const Params& p, const optional_params& o)
|
||||
// Builds the dispatch configuration for the reference space-to-depth kernel.
// The global work size is { batch, feature, Z * Y * X } of the output tensor.
CommonDispatchData SpaceToDepthKernelRef::SetDefault(const space_to_depth_params& params,
                                                     const optional_params&) const {
    CommonDispatchData dispatchData;
    auto in_layout = params.inputs[0].GetLayout();
    auto out_layout = params.output.GetLayout();
    // Tensor channels folded into each of the three GWS dimensions, in GWS order.
    std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::BATCH },
                                                                     { Tensor::DataChannelName::FEATURE },
                                                                     { Tensor::DataChannelName::X, Tensor::DataChannelName::Y, Tensor::DataChannelName::Z }};

    dispatchData.gws = { params.output.Batch().v,
                         params.output.Feature().v,
                         params.output.Z().v * params.output.Y().v * params.output.X().v };
    // Single, layout-aware local size computation.
    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);

    return dispatchData;
}
|
||||
|
@ -84,6 +84,11 @@ bool StridedSliceKernelRef::Validate(const Params& p, const optional_params& o)
|
||||
|
||||
CommonDispatchData StridedSliceKernelRef::SetDefault(const strided_slice_params& params, const optional_params&) const {
|
||||
CommonDispatchData dispatchData;
|
||||
auto in_layout = params.inputs[0].GetLayout();
|
||||
auto out_layout = params.output.GetLayout();
|
||||
std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::BATCH },
|
||||
{ Tensor::DataChannelName::FEATURE },
|
||||
{ Tensor::DataChannelName::X, Tensor::DataChannelName::Y, Tensor::DataChannelName::Z }};
|
||||
|
||||
// If the new_axis_mask is set, then begin, end, and stride are ignored
|
||||
// and a new length 1 dimension is adding. Input data just copying to output
|
||||
@ -92,7 +97,7 @@ CommonDispatchData StridedSliceKernelRef::SetDefault(const strided_slice_params&
|
||||
params.output.Feature().v,
|
||||
params.output.Z().v * params.output.Y().v * params.output.X().v };
|
||||
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
|
||||
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
|
||||
|
||||
return dispatchData;
|
||||
}
|
||||
|
@ -34,11 +34,16 @@ ParamsKey TileKernelRef::GetSupportedKey() const {
|
||||
|
||||
// Builds the dispatch configuration for the reference tile kernel.
// The global work size is { X * Y, Z * W, batch * feature } of the output tensor.
CommonDispatchData TileKernelRef::SetDefault(const tile_params& params, const optional_params&) const {
    CommonDispatchData dispatchData;
    auto in_layout = params.inputs[0].GetLayout();
    auto out_layout = params.output.GetLayout();
    // Tensor channels folded into each of the three GWS dimensions, in GWS order.
    std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::X, Tensor::DataChannelName::Y },
                                                                     { Tensor::DataChannelName::Z, Tensor::DataChannelName::W },
                                                                     { Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH }};

    // Bound by reference: the output tensor is only read here, so no copy is needed.
    const auto& out = params.output;

    dispatchData.gws = { out.X().v * out.Y().v, out.Z().v * out.W().v, out.Batch().v * out.Feature().v };
    // Single, layout-aware local size computation.
    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);

    return dispatchData;
}
|
||||
|
@ -184,7 +184,7 @@ void KernelBaseOpenCL::FillCLKernelData(clKernelData& kernel,
|
||||
bool bias,
|
||||
int number_of_inputs,
|
||||
uint32_t number_of_inputs_for_fused_prims) const {
|
||||
KernelBase::CheckDispatchData(kernelMapName, dispatchData);
|
||||
KernelBase::CheckDispatchData(kernelMapName, dispatchData, engine_info.maxWorkGroupSize);
|
||||
kernel.code.kernelString = GetKernelString(kernelMapName, jit, entryPoint, engine_info, exeMode);
|
||||
kernel.params.workGroups.global = dispatchData.gws;
|
||||
kernel.params.workGroups.local = dispatchData.lws;
|
||||
|
@ -199,20 +199,213 @@ std::vector<size_t> GetTensorFriendlyWorkGroups(const DataTensor& t) {
|
||||
return sizes;
|
||||
}
|
||||
|
||||
std::vector<size_t> GetOptimalLocalWorkGroupSizes(std::vector<size_t> gws, const EngineInfo& info) {
|
||||
const size_t lws_max = info.maxWorkGroupSize;
|
||||
const size_t optimal_lws_values[] = {256, 227, 224, 192, 160, 128, 96, 64, 32, 16, 8, 7, 6, 5, 4, 2, 1};
|
||||
std::vector<size_t> GetOptimalLocalWorkGroupSizes(std::vector<size_t> gws, const EngineInfo& info,
|
||||
DataLayout input_layout, DataLayout output_layout,
|
||||
std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws) {
|
||||
enum axis { x, y, z, w, f, b, unused_axis };
|
||||
|
||||
// GWS/LWS priority order should be considered for better local WGS setting
|
||||
// and as a result more optimized data reading/writing inside kernels
|
||||
std::vector<size_t> priority_order = { 0, 1, 2 };
|
||||
std::vector<size_t> layout_order = { x, y, z, w, f, b };
|
||||
|
||||
const size_t gws_dims_num = priority_order.size();
|
||||
const size_t axis_num = layout_order.size();
|
||||
size_t first_axis_idx = 0;
|
||||
|
||||
std::vector<size_t> axis_by_gws = { unused_axis, unused_axis, unused_axis, unused_axis, unused_axis, unused_axis };
|
||||
for (size_t gws_idx = 0; gws_idx < gws_dims_num; gws_idx++) {
|
||||
for (size_t axis_idx = 0; axis_idx < dims_by_gws[gws_idx].size(); axis_idx++) {
|
||||
axis_by_gws[static_cast<size_t>(dims_by_gws[gws_idx][axis_idx])] = gws_idx;
|
||||
}
|
||||
}
|
||||
|
||||
auto calculate_optimized_priority_order = [&]() -> void {
|
||||
while (axis_by_gws[layout_order[first_axis_idx]] == unused_axis)
|
||||
first_axis_idx++;
|
||||
|
||||
for (size_t gws_idx = 0; gws_idx < gws_dims_num; gws_idx++) {
|
||||
for (size_t axis_idx = first_axis_idx; axis_idx < axis_num; axis_idx++) {
|
||||
if (axis_by_gws[layout_order[axis_idx]] != unused_axis) {
|
||||
bool is_already_exists = false;
|
||||
if (axis_idx > 0) {
|
||||
for (int i = axis_idx - 1; i >= 0; i--) {
|
||||
if (axis_by_gws[layout_order[axis_idx]] == axis_by_gws[layout_order[i]]) {
|
||||
is_already_exists = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
first_axis_idx++;
|
||||
if (!is_already_exists) {
|
||||
priority_order[gws_idx] = axis_by_gws[layout_order[axis_idx]];
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
auto one_layout = input_layout == output_layout;
|
||||
|
||||
auto simple_planar_layout = Tensor::SimpleLayout(output_layout);
|
||||
|
||||
auto blocked_fsv_layout = output_layout == DataLayout::b_fs_yx_fsv4 || output_layout == DataLayout::fs_b_yx_fsv32 ||
|
||||
output_layout == DataLayout::b_fs_yx_fsv16 || output_layout == DataLayout::b_fs_zyx_fsv16 ||
|
||||
output_layout == DataLayout::b_fs_yx_fsv32 || output_layout == DataLayout::b_fs_zyx_fsv32;
|
||||
|
||||
auto blocked_bsv_fsv_layout = output_layout == DataLayout::bs_fs_yx_bsv16_fsv16 || output_layout == DataLayout::bs_fs_zyx_bsv16_fsv16;
|
||||
|
||||
auto try_change_priority_order = (simple_planar_layout || blocked_fsv_layout || blocked_bsv_fsv_layout) && one_layout;
|
||||
|
||||
if (try_change_priority_order) {
|
||||
if (simple_planar_layout) {
|
||||
switch (output_layout) {
|
||||
case DataLayout::bf:
|
||||
layout_order = { f, b, x, y, z, w };
|
||||
break;
|
||||
case DataLayout::fb:
|
||||
layout_order = { b, f, x, y, z, w };
|
||||
break;
|
||||
case DataLayout::bfyx:
|
||||
layout_order = { x, y, f, b, z, w };
|
||||
break;
|
||||
case DataLayout::yxfb:
|
||||
layout_order = { b, f, x, y, z, w };
|
||||
break;
|
||||
case DataLayout::byxf:
|
||||
layout_order = { f, x, y, b, z, w };
|
||||
break;
|
||||
case DataLayout::fyxb:
|
||||
layout_order = { b, x, y, f, z, w };
|
||||
break;
|
||||
case DataLayout::bfxy:
|
||||
layout_order = { y, x, f, b, z, w };
|
||||
break;
|
||||
case DataLayout::bfzyx:
|
||||
layout_order = { x, y, z, f, b, w };
|
||||
break;
|
||||
case DataLayout::bfwzyx:
|
||||
layout_order = { x, y, z, w, f, b };
|
||||
break;
|
||||
default:
|
||||
layout_order = { x, y, z, w, f, b };
|
||||
break;
|
||||
}
|
||||
} else if (blocked_fsv_layout) {
|
||||
if (output_layout == DataLayout::b_fs_yx_fsv4 || output_layout == DataLayout::b_fs_yx_fsv16 || output_layout == DataLayout::b_fs_yx_fsv32)
|
||||
layout_order = { f, x, y, b, z, w };
|
||||
else if (output_layout == DataLayout::b_fs_zyx_fsv16 || output_layout == DataLayout::b_fs_zyx_fsv32)
|
||||
layout_order = { f, x, y, z, b, w };
|
||||
else // output_layout == DataLayout::fs_b_yx_fsv32
|
||||
layout_order = { f, x, y, b, z, w };
|
||||
} else if (blocked_bsv_fsv_layout) {
|
||||
layout_order = { f, b, x, y, z, w };
|
||||
}
|
||||
|
||||
calculate_optimized_priority_order();
|
||||
|
||||
// Fall back to the default priority order if the computed one is invalid (duplicate or out-of-range GWS indices)
|
||||
if (priority_order[0] == priority_order[1] || priority_order[0] == priority_order[2] || priority_order[1] == priority_order[2] ||
|
||||
priority_order[0] > 2 || priority_order[1] > 2 || priority_order[2] > 2) {
|
||||
priority_order = { 0, 1, 2 };
|
||||
}
|
||||
}
|
||||
|
||||
size_t lws_max = info.maxWorkGroupSize;
|
||||
const size_t optimal_lws_values[] = { 1024, 960, 896, 832, 768, 704, 640, 576,
|
||||
512, 480, 448, 416, 384, 352, 320, 288,
|
||||
256, 227, 224, 192, 160, 128, 96, 64, 32, 16, 8, 7, 6, 5, 4, 2, 1 };
|
||||
const size_t suboptimal_lws_values[] = { 1024, 960, 896, 832, 768, 704, 640, 576,
|
||||
512, 480, 448, 416, 384, 352, 320, 288,
|
||||
256, 227, 224, 192, 160, 128, 96, 64, 32, 16,
|
||||
15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 };
|
||||
|
||||
size_t first_lws_idx = lws_max == 1024 ? 0:
|
||||
lws_max == 512 ? 8:
|
||||
16;
|
||||
// Reduces max local wgs for some cases on Gen12+ devices
|
||||
if (lws_max >= 512) {
|
||||
auto two_dims_are_odd_and_equal = (gws[0] % 2 && gws[0] > 7 && (gws[0] == gws[1] || gws[0] == gws[2])) ||
|
||||
(gws[1] % 2 && gws[1] > 7 && gws[1] == gws[2]);
|
||||
|
||||
// Known cases when lws_max = 256 works better than lws_max > 256
|
||||
auto max_wgs_exception1 = gws[priority_order[0]] == 1278 && gws[priority_order[1]] == 718 && gws[priority_order[2]] % 10 == 0;
|
||||
auto max_wgs_exception2 = gws[priority_order[0]] == 28 && gws[priority_order[1]] == 168 && gws[priority_order[2]] == 128;
|
||||
auto max_wgs_exception3 = gws[priority_order[0]] == 1000 && gws[priority_order[1]] == 1 && gws[priority_order[2]] == 64;
|
||||
auto max_wgs_exception4 = gws[priority_order[0]] == 180 && gws[priority_order[1]] == 320 && gws[priority_order[2]] == 56;
|
||||
auto max_wgs_exception5 = gws[priority_order[0]] == 1 && gws[priority_order[1]] > 256 && gws[priority_order[2]] == 1;
|
||||
auto max_wgs_exception6 = gws[priority_order[0]] == 64 && gws[priority_order[1]] == 16 && gws[priority_order[2]] == 1 &&
|
||||
priority_order[1] == 2 && priority_order[2] == 1;
|
||||
if (two_dims_are_odd_and_equal || max_wgs_exception1 || max_wgs_exception2 || max_wgs_exception3 || max_wgs_exception4 ||
|
||||
max_wgs_exception5 || max_wgs_exception6) {
|
||||
lws_max = 256;
|
||||
first_lws_idx = 16;
|
||||
}
|
||||
}
|
||||
|
||||
size_t total_lws = 1;
|
||||
std::vector<size_t> lws;
|
||||
size_t total_gws = 1;
|
||||
std::vector<size_t> lws = { 1, 1, 1 };
|
||||
|
||||
for (size_t i = 0; i < gws.size(); ++i) {
|
||||
auto rest_lws = lws_max / total_lws;
|
||||
size_t lws_idx = 0;
|
||||
while (rest_lws < optimal_lws_values[lws_idx]) lws_idx++;
|
||||
size_t lws_idx = first_lws_idx;
|
||||
size_t max_optimal_lws0_value = lws_max;
|
||||
if (try_change_priority_order && axis_by_gws[f] != unused_axis) {
|
||||
if (output_layout == DataLayout::b_fs_yx_fsv16 || output_layout == DataLayout::b_fs_zyx_fsv16 || output_layout == DataLayout::fs_b_yx_fsv32) {
|
||||
max_optimal_lws0_value = 16;
|
||||
} else if (output_layout == DataLayout::b_fs_yx_fsv32 || output_layout == DataLayout::b_fs_zyx_fsv32) {
|
||||
max_optimal_lws0_value = 32;
|
||||
} else if ((output_layout == DataLayout::bs_fs_yx_bsv16_fsv16 || output_layout == DataLayout::bs_fs_zyx_bsv16_fsv16) &&
|
||||
(axis_by_gws[b] == axis_by_gws[f])) {
|
||||
max_optimal_lws0_value = 256;
|
||||
} else if ((output_layout == DataLayout::bs_fs_yx_bsv16_fsv16 || output_layout == DataLayout::bs_fs_zyx_bsv16_fsv16) &&
|
||||
(axis_by_gws[b] != axis_by_gws[f]) && (axis_by_gws[b] != unused_axis)) {
|
||||
max_optimal_lws0_value = 16;
|
||||
}
|
||||
}
|
||||
|
||||
while (gws[i] % optimal_lws_values[lws_idx]) lws_idx++;
|
||||
auto can_use_suboptimal_lws1 = (i == 1) && ((gws[priority_order[0]] % 32 == 0) || (gws[priority_order[0]] == 1 && gws[priority_order[2]] % 16 != 0));
|
||||
auto can_use_suboptimal_lws2 = (i == 2) && (total_lws == total_gws);
|
||||
const size_t* lws_values = can_use_suboptimal_lws1 || can_use_suboptimal_lws2 ?
|
||||
suboptimal_lws_values :
|
||||
optimal_lws_values;
|
||||
|
||||
lws.push_back(optimal_lws_values[lws_idx]);
|
||||
total_lws *= optimal_lws_values[lws_idx];
|
||||
while (rest_lws < lws_values[lws_idx]) lws_idx++;
|
||||
if (i == 0) {
|
||||
while (lws_values[lws_idx] > max_optimal_lws0_value) lws_idx++;
|
||||
}
|
||||
while (gws[priority_order[i]] % lws_values[lws_idx]) lws_idx++;
|
||||
|
||||
if (lws_max == 256 || total_lws == total_gws) {
|
||||
lws[priority_order[i]] = lws_values[lws_idx];
|
||||
} else {
|
||||
lws[priority_order[i]] = i == 2 && gws[priority_order[0]] != 1 ? 1 : lws_values[lws_idx];
|
||||
if (total_gws > 100 && total_lws < 8 && i == 2)
|
||||
lws[priority_order[i]] = lws_values[lws_idx];
|
||||
}
|
||||
|
||||
total_lws *= lws_values[lws_idx];
|
||||
total_gws *= gws[priority_order[i]];
|
||||
}
|
||||
|
||||
// For cases with lws { 1, 1, 1 } try to use suboptimal values to increase work group size
|
||||
if (lws[0] == 1 && lws[1] == 1 && lws[2] == 1) {
|
||||
total_lws = 1;
|
||||
for (size_t i = 0; i < gws.size(); ++i) {
|
||||
auto rest_lws = lws_max / total_lws;
|
||||
size_t lws_idx = first_lws_idx;
|
||||
|
||||
const size_t* lws_values = suboptimal_lws_values;
|
||||
|
||||
while (rest_lws < lws_values[lws_idx]) lws_idx++;
|
||||
while (gws[priority_order[i]] % lws_values[lws_idx]) lws_idx++;
|
||||
|
||||
lws[priority_order[i]] = lws_values[lws_idx];
|
||||
|
||||
total_lws *= lws_values[lws_idx];
|
||||
}
|
||||
}
|
||||
|
||||
return lws;
|
||||
|
@ -8,7 +8,6 @@
|
||||
#include <vector>
|
||||
|
||||
namespace kernel_selector {
|
||||
|
||||
struct weight_bias_params;
|
||||
struct optional_params;
|
||||
struct WeightsReorderParams;
|
||||
@ -24,6 +23,11 @@ bool UpdateWeightsParams(weight_bias_params& newParams,
|
||||
bool rotate = false);
|
||||
JitConstants GetTensorFriendlyWorkGroupsJit(const DataTensor& t);
|
||||
std::vector<size_t> GetTensorFriendlyWorkGroups(const DataTensor& t);
|
||||
std::vector<size_t> GetOptimalLocalWorkGroupSizes(std::vector<size_t> gws, const EngineInfo& info);
|
||||
std::vector<size_t> GetOptimalLocalWorkGroupSizes(std::vector<size_t> gws, const EngineInfo& info,
|
||||
DataLayout input_layout = DataLayout::bfyx, DataLayout output_layout = DataLayout::bfyx,
|
||||
std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws =
|
||||
{{ Tensor::DataChannelName::X, Tensor::DataChannelName::Y },
|
||||
{ Tensor::DataChannelName::FEATURE },
|
||||
{ Tensor::DataChannelName::BATCH }});
|
||||
bool CheckInputsOutputNoPitchSameDims(const base_params& params);
|
||||
} // namespace kernel_selector
|
||||
|
@ -24,15 +24,16 @@ std::string toString(const kernel_selector::CommonDispatchData& dispatchData) {
|
||||
return os.str();
|
||||
}
|
||||
|
||||
void KernelBase::CheckDispatchData(const std::string& kernelName, const kernel_selector::CommonDispatchData& dispatchData) {
|
||||
void KernelBase::CheckDispatchData(const std::string& kernelName, const kernel_selector::CommonDispatchData& dispatchData,
|
||||
const size_t maxWorkGroupSize) {
|
||||
if (dispatchData.gws.size() != 3 || dispatchData.lws.size() != 3)
|
||||
throw std::runtime_error("ERROR: Invalid dispatch data for kernel: " + kernelName + ": " +
|
||||
": LWS and GWS size is expected to be equal to 3. Actual: " +
|
||||
toString(dispatchData));
|
||||
|
||||
if (dispatchData.lws[0] * dispatchData.lws[1] * dispatchData.lws[2] > 256) {
|
||||
if (dispatchData.lws[0] * dispatchData.lws[1] * dispatchData.lws[2] > maxWorkGroupSize) {
|
||||
throw std::runtime_error("ERROR: Invalid dispatch data for kernel: " + kernelName +
|
||||
": LWS cannot be greater than 256. Actual: " +
|
||||
": LWS cannot be greater than " + std::to_string(static_cast<int>(maxWorkGroupSize)) + ". Actual: " +
|
||||
toString(dispatchData));
|
||||
}
|
||||
for (size_t i = 0; i < dispatchData.gws.size(); i++) {
|
||||
|
@ -61,7 +61,8 @@ protected:
|
||||
static const primitive_db db;
|
||||
const std::string kernelName;
|
||||
|
||||
static void CheckDispatchData(const std::string& kernelName, const kernel_selector::CommonDispatchData& dispatchData);
|
||||
static void CheckDispatchData(const std::string& kernelName, const kernel_selector::CommonDispatchData& dispatchData,
|
||||
const size_t maxWorkGroupSize);
|
||||
virtual Datatype GetUnitType(const base_params& params) const;
|
||||
|
||||
bool IsFusedPrimitiveSupported(const fused_operation_desc& fused_op) const;
|
||||
|
@ -199,10 +199,6 @@ device_info init_device_info(const cl::Device& device) {
|
||||
|
||||
info.max_work_group_size = static_cast<uint64_t>(device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
|
||||
|
||||
// looks like WA. Do we still need it?
|
||||
if (info.max_work_group_size > 256)
|
||||
info.max_work_group_size = 256;
|
||||
|
||||
info.max_local_mem_size = static_cast<uint64_t>(device.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>());
|
||||
info.max_global_mem_size = static_cast<uint64_t>(device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>());
|
||||
info.max_alloc_mem_size = static_cast<uint64_t>(device.getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>());
|
||||
|
Loading…
Reference in New Issue
Block a user