diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/common/tensor_type.h b/inference-engine/thirdparty/clDNN/kernel_selector/common/tensor_type.h
index 8e945a6f0fb..725da375516 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/common/tensor_type.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/common/tensor_type.h
@@ -231,6 +231,7 @@ inline bool SimpleLayout(DataLayout l) {
     case DataLayout::yxfb:
     case DataLayout::byxf:
     case DataLayout::fyxb:
+    case DataLayout::bfxy:
     case DataLayout::bfzyx:
     case DataLayout::bfwzyx:
         return true;
@@ -267,7 +268,7 @@ inline bool IsDynamicLSTMType(WeightsLayout l) {
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// Tensor Exaplnation
+// Tensor Explanation
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 //
 // resource - 80x80
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_base.cpp
index 48249760d63..020fe9aaf1d 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_base.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_base.cpp
@@ -11,12 +11,17 @@ namespace kernel_selector {
 ActivationKernelBase::DispatchData ActivationKernelBase::SetDefault(const activation_params& arg) const {
     const auto& out = arg.output;
+    auto in_layout = arg.inputs[0].GetLayout();
+    auto out_layout = arg.output.GetLayout();
 
     DispatchData dispatchData;
-    if (out.GetLayout() == DataLayout::yxfb) {
+    if (out_layout == DataLayout::yxfb) {
         dispatchData.gws = {out.Feature().v * out.Batch().v, out.X().v, out.Y().v};
-        dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, arg.engineInfo);
-    } else if (out.GetLayout() == DataLayout::b_fs_yx_fsv16 || out.GetLayout() == DataLayout::b_fs_yx_fsv32) {
+        std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH},
+                                                                         {Tensor::DataChannelName::X},
+                                                                         {Tensor::DataChannelName::Y}};
+        dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, arg.engineInfo, in_layout, out_layout, dims_by_gws);
+    } else if (out_layout == DataLayout::b_fs_yx_fsv16 || out_layout == DataLayout::b_fs_yx_fsv32) {
         dispatchData.gws = {Align(out.Feature().v, 16) * out.Batch().v, out.X().v, out.Y().v};
         dispatchData.lws = {16, 1, 1};
     } else if (out.GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv16 || out.GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv32) {
@@ -24,7 +29,10 @@ ActivationKernelBase::DispatchData ActivationKernelBase::SetDefault(const activa
         dispatchData.lws = {1, 16, 16};
     } else {
         dispatchData.gws = {out.X().v, out.Y().v * out.Z().v, out.Feature().v * out.Batch().v};
-        dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, arg.engineInfo);
+        std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{Tensor::DataChannelName::X},
+                                                                         {Tensor::DataChannelName::Y, Tensor::DataChannelName::Z},
+                                                                         {Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
+        dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, arg.engineInfo, in_layout, out_layout, dims_by_gws);
     }
 
     return dispatchData;
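Note on the recurring pattern in this patch: GetOptimalLocalWorkGroupSizes now also receives the input/output layouts and dims_by_gws, which records which tensor channels each GWS dimension folds together, so the chooser can pick local sizes that respect the layout's blocking and the device's real maxWorkGroupSize instead of an implicit 256. Below is a minimal, self-contained sketch of only the divisor-under-budget part of such a selection; the function name, the heuristic, and the numbers are illustrative assumptions, not the clDNN implementation.

```cpp
// Illustrative sketch only: pick, per GWS dimension, the largest divisor that
// keeps the product of local sizes within max_wgs (a stand-in for
// engineInfo.maxWorkGroupSize). Not the actual clDNN heuristic.
#include <cstdio>
#include <vector>

std::vector<size_t> PickLocalWorkGroupSizes(const std::vector<size_t>& gws,
                                            size_t max_wgs) {
    std::vector<size_t> lws(gws.size(), 1);
    size_t total = 1;
    for (size_t i = 0; i < gws.size(); ++i) {
        // Scan divisors from largest to smallest so gws stays evenly
        // divisible by lws, which OpenCL requires for uniform work-groups.
        for (size_t candidate = gws[i]; candidate >= 1; --candidate) {
            if (gws[i] % candidate == 0 && total * candidate <= max_wgs) {
                lws[i] = candidate;
                total *= candidate;
                break;
            }
        }
    }
    return lws;
}

int main() {
    // GWS shaped like the activation kernel's default branch: {X, Y*Z, F*B}.
    std::vector<size_t> gws = {56, 56, 64};
    std::vector<size_t> lws = PickLocalWorkGroupSizes(gws, 256);
    std::printf("lws = {%zu, %zu, %zu}\n", lws[0], lws[1], lws[2]);  // {56, 4, 1}
    return 0;
}
```

The dims_by_gws argument added throughout the patch would additionally tell such a chooser which channels each GWS dimension mixes, so it can avoid splitting a blocked channel (e.g. the 16-wide feature slice of b_fs_yx_fsv16) across work-groups.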
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/arg_max_min/arg_max_min_kernel_axis.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/arg_max_min/arg_max_min_kernel_axis.cpp
index 7c02191576e..c23599ce74f 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/arg_max_min/arg_max_min_kernel_axis.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/arg_max_min/arg_max_min_kernel_axis.cpp
@@ -80,6 +80,7 @@ KernelsData ArgMaxMinKernelAxis::GetKernelsData(const Params& params, const opti
     size_t sort_size = orgParams.argMaxMinSortType == ArgMaxMinSortType::VALUE ? getSortSize(orgParams) : 1;
 
     DispatchData dispatchData;
+
     dispatchData.gws = { Align(getOperationNumber(orgParams), 32), sort_size, 1 };
     dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
 
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_to_space/batch_to_space_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_to_space/batch_to_space_kernel_base.cpp
index a28500034e2..f34f2045b23 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_to_space/batch_to_space_kernel_base.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_to_space/batch_to_space_kernel_base.cpp
@@ -34,8 +34,15 @@ CommonDispatchData BatchToSpaceKernelBase::SetDefault(const batch_to_space_param
         dispatchData.gws = { out.Batch().v, out.Feature().v, out.Y().v * out.X().v };
         dispatchData.lws = { 1, 16, 1 };
     } else {
+        auto in_layout = params.inputs[0].GetLayout();
+        auto out_layout = out.GetLayout();
+        std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::BATCH },
+                                                                         { Tensor::DataChannelName::FEATURE },
+                                                                         { Tensor::DataChannelName::X, Tensor::DataChannelName::Y,
+                                                                           Tensor::DataChannelName::Z, Tensor::DataChannelName::W }};
+
         dispatchData.gws = { out.Batch().v, out.Feature().v, out.W().v * out.Z().v * out.Y().v * out.X().v };
-        dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
+        dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
     }
 
     return dispatchData;
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/binary_convolution/binary_convolution_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/binary_convolution/binary_convolution_kernel_base.cpp
index 142fda4352f..faf87893020 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/binary_convolution/binary_convolution_kernel_base.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/binary_convolution/binary_convolution_kernel_base.cpp
@@ -68,16 +68,25 @@ bool BinaryConvolutionKernelBase::CheckWorkGroups(const BinaryConvolutionKernelB
 BinaryConvolutionKernelBase::DispatchData BinaryConvolutionKernelBase::SetDefault(const binary_convolution_params& params, int) const {
     DispatchData dispatchData;
+    auto in_layout = params.inputs[0].GetLayout();
+    auto out_layout = params.output.GetLayout();
+    std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws;
 
     const auto& out = params.output;
 
     std::vector<size_t> global;
-    if (params.output.GetLayout() == DataLayout::bfyx || params.output.GetLayout() == DataLayout::byxf) {
+    if (out_layout == DataLayout::bfyx || out_layout == DataLayout::byxf) {
         global = {out.X().v, out.Y().v, out.Feature().v * out.Batch().v};
+        dims_by_gws = {{Tensor::DataChannelName::X},
+                       {Tensor::DataChannelName::Y},
+                       {Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
     } else {
         global = {out.Feature().v * out.Batch().v, out.X().v, out.Y().v};
+        dims_by_gws = {{Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH},
+                       {Tensor::DataChannelName::X},
+                       {Tensor::DataChannelName::Y}};
     }
 
-    auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
+    auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo, in_layout, out_layout, dims_by_gws);
 
     dispatchData.gws = global;
     dispatchData.lws = local;
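Several hunks in this patch lean on the rounding helpers Align (arg_max_min above) and CeilDiv (the eltwise and pooling hunks below). A hedged sketch of their usual semantics; clDNN ships its own definitions, these are for illustration only:

```cpp
// Assumed semantics: CeilDiv rounds a division up, Align pads a count up to
// the next multiple of b. Useful for padding a GWS dimension to a SIMD width.
#include <cstdio>

constexpr size_t CeilDiv(size_t a, size_t b) { return (a + b - 1) / b; }
constexpr size_t Align(size_t a, size_t b)   { return CeilDiv(a, b) * b; }

int main() {
    // e.g. an operation count of 70 padded to a multiple of 32 for the GWS:
    std::printf("CeilDiv(70, 32) = %zu\n", CeilDiv(70, 32));  // 3
    std::printf("Align(70, 32)   = %zu\n", Align(70, 32));    // 96
    return 0;
}
```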
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/border/border_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/border/border_kernel_base.cpp
index e63a56b1947..059569b29da 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/border/border_kernel_base.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/border/border_kernel_base.cpp
@@ -22,9 +22,14 @@ BorderKernelBase::DispatchData BorderKernelBase::SetDefault(const border_params&
     const auto& output = params.output;
 
     DispatchData dispatchData;
+    auto in_layout = params.inputs[0].GetLayout();
+    auto out_layout = params.output.GetLayout();
+    std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::X, Tensor::DataChannelName::Z },
+                                                                     { Tensor::DataChannelName::Y, Tensor::DataChannelName::W },
+                                                                     { Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH }};
 
     dispatchData.gws = { output.X().v * output.Z().v, output.Y().v * output.W().v, output.Batch().v * output.Feature().v };
-    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
+    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
 
     return dispatchData;
 }
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/broadcast/broadcast_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/broadcast/broadcast_kernel_base.cpp
index 23b7bfcf54f..cffb2ea5972 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/broadcast/broadcast_kernel_base.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/broadcast/broadcast_kernel_base.cpp
@@ -19,9 +19,14 @@ BroadcastKernelBase::DispatchData BroadcastKernelBase::SetDefault(const broadcas
     const auto& output = params.output;
 
     DispatchData dispatchData;
+    auto in_layout = params.inputs[0].GetLayout();
+    auto out_layout = params.output.GetLayout();
+    std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::X },
+                                                                     { Tensor::DataChannelName::Y, Tensor::DataChannelName::Z },
+                                                                     { Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH }};
 
     dispatchData.gws = { output.X().v, output.Y().v * output.Z().v, output.Batch().v * output.Feature().v };
-    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
+    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
 
     return dispatchData;
 }
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/concatenation/concatenation_kernel_simple_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/concatenation/concatenation_kernel_simple_ref.cpp
index 7832297e27a..f6acf813ddc 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/concatenation/concatenation_kernel_simple_ref.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/concatenation/concatenation_kernel_simple_ref.cpp
@@ -86,7 +86,14 @@ ConcatenationKernelBase::DispatchData ConcatenationKernel_simple_Ref::SetDefault
     dispatchData.gws = { input.X().v * input.Y().v,
                          input.Z().v * input.W().v,
                          input.Feature().v * input.Batch().v };
-    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
+
+    auto in_layout = params.inputs[0].GetLayout();
+    auto out_layout = params.output.GetLayout();
+    std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::X, Tensor::DataChannelName::Y },
+                                                                     { Tensor::DataChannelName::Z, Tensor::DataChannelName::W },
+                                                                     { Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH }};
+
+    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
 
     return dispatchData;
 }
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16_imad_1x1.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16_imad_1x1.cpp
index b168508d60e..cf379ea5a20 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16_imad_1x1.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16_imad_1x1.cpp
@@ -20,6 +20,7 @@ namespace kernel_selector {
 Convolution_kernel_b_fs_yx_fsv16_imad_1x1::Convolution_kernel_b_fs_yx_fsv16_imad_1x1()
     : ConvolutionKernelBase("convolution_gpu_b_fs_yx_fsv16_imad_1x1") {
+    // TODO: can be potentially improved for GPUs with support of LWS > 256
     constexpr size_t max_block_elements = 32;
     for (size_t bs = 1; bs <= 2 * simd; ++bs) {
         for (size_t bf = 1; bf <= 4; ++bf) {
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv_16_32_imad_dw.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv_16_32_imad_dw.cpp
index e419e88d6d6..1441132b9a6 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv_16_32_imad_dw.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv_16_32_imad_dw.cpp
@@ -15,6 +15,7 @@ ConvolutionKernel_b_fs_yx_fsv_16_32_imad_dw::ConvolutionKernel_b_fs_yx_fsv_16_32
     std::vector<size_t> simd_sizes = { 8, 16 };
     std::vector<std::string> exe_modes = ConvolutionKernelBase::autoTuneOptions;
 
+    // TODO: can be potentially improved for GPUs with support of LWS > 256
     constexpr size_t max_block_size = 32 * 8;
     constexpr size_t max_lws_size = 256;
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_base.cpp
index ca7c2dab7fe..42f926d67f1 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_base.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_base.cpp
@@ -146,17 +146,29 @@ bool ConvolutionKernelBase::CheckPitchForSplitOnly(const convolution_params& par
 ConvolutionKernelBase::DispatchData ConvolutionKernelBase::SetDefault(const convolution_params& params, int) const {
     DispatchData dispatchData;
+    auto in_layout = params.inputs[0].GetLayout();
+    auto out_layout = params.output.GetLayout();
+    std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws;
     const auto& out = params.output;
 
-    if (params.output.GetLayout() == DataLayout::bfyx || params.output.GetLayout() == DataLayout::byxf) {
+    if (out_layout == DataLayout::bfyx || out_layout == DataLayout::byxf) {
         dispatchData.gws = {out.X().v, out.Y().v, out.Feature().v * out.Batch().v};
-    } else if (params.output.GetLayout() == DataLayout::bfzyx) {
+        dims_by_gws = {{Tensor::DataChannelName::X},
+                       {Tensor::DataChannelName::Y},
+                       {Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
+    } else if (out_layout == DataLayout::bfzyx) {
         dispatchData.gws = {out.X().v, out.Y().v * out.Z().v, out.Feature().v * out.Batch().v};
+        dims_by_gws = {{Tensor::DataChannelName::X},
+                       {Tensor::DataChannelName::Y, Tensor::DataChannelName::Z},
+                       {Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
     } else {
         dispatchData.gws = {out.Feature().v * out.Batch().v, out.X().v, out.Y().v};
+        dims_by_gws = {{Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH},
+                       {Tensor::DataChannelName::X},
+                       {Tensor::DataChannelName::Y}};
     }
 
-    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
+    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
 
     dispatchData.cldnnStyle.blockWidth = 1;
     dispatchData.cldnnStyle.blockHeight = 1;
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_b_fs_yx_fsv4_dw.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_b_fs_yx_fsv4_dw.cpp
index 0823f40e869..51e0f8ca7a3 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_b_fs_yx_fsv4_dw.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad_b_fs_yx_fsv4_dw.cpp
@@ -351,7 +351,13 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_imad_b_fs_yx_fsv4_dw::SetD
     if (autoTuneParam.tiled) {
         dispatchData.lws[0] = autoTuneParam.tiled_simd;
     } else {
-        dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
+        auto in_layout = params.inputs[0].GetLayout();
+        auto out_layout = params.output.GetLayout();
+        std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::X },
+                                                                         { Tensor::DataChannelName::Y },
+                                                                         { Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH }};
+
+        dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
     }
 
     dispatchData.gemmStyle = { 0, 0, 0, 0, 0, 0 };
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_b_fs_yx_fsv32_dw.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_b_fs_yx_fsv32_dw.cpp
index bac9526d728..b9893fcc3d5 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_b_fs_yx_fsv32_dw.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_b_fs_yx_fsv32_dw.cpp
@@ -66,9 +66,14 @@ bool ConvolutionKernel_mmad_b_fs_yx_fsv32_dw::Validate(const Params& p, const op
 ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_b_fs_yx_fsv32_dw::SetDefault(const convolution_params& cp, int /*autoTuneIndex*/) const {
     DispatchData dispatchData = ConvolutionKernelBase::SetDefault(cp);
+    auto in_layout = cp.inputs[0].GetLayout();
+    auto out_layout = cp.output.GetLayout();
+    std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::FEATURE },
+                                                                     { Tensor::DataChannelName::X, Tensor::DataChannelName::Y },
+                                                                     { Tensor::DataChannelName::BATCH }};
 
     dispatchData.gws = { cp.output.Feature().v, cp.output.X().v * cp.output.Y().v, cp.output.Batch().v };
-    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, cp.engineInfo);
+    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, cp.engineInfo, in_layout, out_layout, dims_by_gws);
 
     return dispatchData;
 }
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_ref.cpp
index ae3b01f9681..951a67d58c3 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_ref.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_ref.cpp
@@ -99,8 +99,14 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_Ref::SetDefault(const conv
     // Just set the correct value for a particular implementation here,
     // until the whole hierarchy is re-written.
     const auto& out = params.output;
+    auto in_layout = params.inputs[0].GetLayout();
+    auto out_layout = params.output.GetLayout();
+    std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{Tensor::DataChannelName::X},
+                                                                     {Tensor::DataChannelName::Y, Tensor::DataChannelName::Z},
+                                                                     {Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
+
     dispatchData.gws = {out.X().v, out.Y().v * out.Z().v, out.Feature().v * out.Batch().v};
-    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
+    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
 
     return dispatchData;
 }
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/cum_sum/cum_sum_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/cum_sum/cum_sum_kernel_base.cpp
index 42b573e5c61..204a8f3c460 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/cum_sum/cum_sum_kernel_base.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/cum_sum/cum_sum_kernel_base.cpp
@@ -72,10 +72,16 @@ JitConstants CumSumKernelBase::GetJitConstants(const cum_sum_params& params, Dis
 CumSumKernelBase::DispatchData CumSumKernelBase::SetDefault(const cum_sum_params& params) const {
     DispatchData dispatchData;
+    auto in_layout = params.inputs[0].GetLayout();
+    auto out_layout = params.output.GetLayout();
+    std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::BATCH },
+                                                                     { Tensor::DataChannelName::W, Tensor::DataChannelName::FEATURE },
+                                                                     { Tensor::DataChannelName::X, Tensor::DataChannelName::Y, Tensor::DataChannelName::Z }};
+
     dispatchData.gws = { params.output.Batch().v,
                          params.output.Feature().v * params.output.W().v,
                          params.output.Z().v * params.output.Y().v * params.output.X().v };
-    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
+    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
 
     return dispatchData;
 }
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_imad_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_imad_ref.cpp
index 384efdde91c..f8e22436759 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_imad_ref.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_imad_ref.cpp
@@ -51,6 +51,11 @@ WeightsLayout DeconvolutionKernel_imad_ref::GetPreferredWeightsLayout(const deco
 DeconvolutionKernelBase::DispatchData DeconvolutionKernel_imad_ref::SetDefault(const deconvolution_params& params) const {
     DispatchData dispatchData = Parent::SetDefault(params);
+    auto in_layout = params.inputs[0].GetLayout();
+    auto out_layout = params.output.GetLayout();
+    std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::FEATURE },
+                                                                     { Tensor::DataChannelName::X, Tensor::DataChannelName::Y, Tensor::DataChannelName::Z },
+                                                                     { Tensor::DataChannelName::BATCH }};
 
     dispatchData.gws = {
         params.output.Feature().v,
@@ -58,7 +63,7 @@ DeconvolutionKernelBase::DispatchData DeconvolutionKernel_imad_ref::SetDefault(c
         params.output.Batch().v
     };
 
-    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
+    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
 
     return dispatchData;
 }
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_ref.cpp
index fd2c7135327..dd3af1c0c27 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_ref.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_ref.cpp
@@ -30,13 +30,18 @@ ParamsKey DepthToSpaceKernelRef::GetSupportedKey() const {
 CommonDispatchData DepthToSpaceKernelRef::SetDefault(const depth_to_space_params& params) const {
     CommonDispatchData dispatchData;
+    auto in_layout = params.inputs[0].GetLayout();
+    auto out_layout = params.output.GetLayout();
+    std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::BATCH },
+                                                                     { Tensor::DataChannelName::FEATURE },
+                                                                     { Tensor::DataChannelName::X, Tensor::DataChannelName::Y, Tensor::DataChannelName::Z }};
 
     dispatchData.gws = { params.output.Batch().v,
                          params.output.Feature().v,
                          params.output.Z().v * params.output.Y().v * params.output.X().v };
 
     // this kernel only supports bfyx and b_fs_yx_fsv16 layout.
-    dispatchData.lws = GetOptimalLocalWorkGroupSizes({1, dispatchData.gws[1], dispatchData.gws[2]}, params.engineInfo);
+    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
 
     return dispatchData;
 }
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_ref.cpp
index 85c98f48a38..67f599570a8 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_ref.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_ref.cpp
@@ -258,7 +258,7 @@ KernelsData DetectionOutputKernelRef::GetKernelsData(const Params& params, const
         auto jit = CreateJit(kernelName, cldnnJit, entryPoint);
         auto& kernel = kd.kernels[i];
 
-        KernelBase::CheckDispatchData(kernelName, dispatchData);
+        KernelBase::CheckDispatchData(kernelName, dispatchData, params.engineInfo.maxWorkGroupSize);
         kernel.params.workGroups.global = dispatchData.gws;
         kernel.params.workGroups.local = dispatchData.lws;
         kernel.code.kernelString = GetKernelString(kernelName, jit, entryPoint, params.engineInfo);
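The hunk above (and the matching non_max_suppression hunk later in this patch) threads params.engineInfo.maxWorkGroupSize into KernelBase::CheckDispatchData. A speculative, self-contained sketch of the kind of validation such a helper performs; the actual checks in clDNN may differ in detail:

```cpp
// Hedged sketch: what a CheckDispatchData(name, dd, maxWorkGroupSize)
// validator plausibly enforces. Illustration only, not the clDNN code.
#include <stdexcept>
#include <string>
#include <vector>

struct DispatchData {
    std::vector<size_t> gws;
    std::vector<size_t> lws;
};

void CheckDispatchData(const std::string& kernel_name, const DispatchData& dd,
                       size_t max_work_group_size) {
    if (dd.gws.size() != 3 || dd.lws.size() != 3)
        throw std::runtime_error(kernel_name + ": expected 3D gws/lws");
    size_t total_lws = 1;
    for (size_t i = 0; i < 3; ++i) {
        if (dd.gws[i] == 0 || dd.lws[i] == 0)
            throw std::runtime_error(kernel_name + ": zero work size");
        if (dd.gws[i] % dd.lws[i] != 0)
            throw std::runtime_error(kernel_name + ": gws not divisible by lws");
        total_lws *= dd.lws[i];
    }
    // This is what the new parameter enables: the local-size budget comes
    // from the device instead of a hardcoded constant such as 256.
    if (total_lws > max_work_group_size)
        throw std::runtime_error(kernel_name + ": lws exceeds device limit");
}

int main() {
    DispatchData dd{{256, 2, 2}, {256, 1, 1}};
    CheckDispatchData("example_kernel", dd, 256);  // passes: 256 <= 256
    return 0;
}
```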
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv4.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv4.cpp
index 401452d5516..1e2addd2041 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv4.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv4.cpp
@@ -268,12 +268,17 @@ JitConstants EltwiseKernel_b_fs_yx_fsv4::GetJitConstants(const eltwise_params& p
 EltwiseKernelBase::DispatchData EltwiseKernel_b_fs_yx_fsv4::SetDefault(const eltwise_params& params) const {
     DispatchData dispatchData;
+    auto in_layout = params.inputs[0].GetLayout();
+    auto out_layout = params.output.GetLayout();
+    std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{Tensor::DataChannelName::X, Tensor::DataChannelName::Y},
+                                                                     {Tensor::DataChannelName::FEATURE},
+                                                                     {Tensor::DataChannelName::BATCH}};
 
     dispatchData.gws[0] = params.output.X().v * params.output.Y().v;
     dispatchData.gws[1] = CeilDiv(params.output.Feature().v, 4);
     dispatchData.gws[2] = params.output.Batch().v;
 
-    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
+    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
     dispatchData.lws[1] = 1;
     dispatchData.lws[2] = 1;
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_base.cpp
index 96c7590ddd0..9874507ae0b 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_base.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_base.cpp
@@ -585,7 +585,9 @@ EltwiseKernelBase::DispatchData EltwiseKernelBase::SetDefault(const eltwise_para
     auto local = GetOptimalLocalWorkGroupSizes({dispatchData.gws[0], dispatchData.gws[1], dispatchData.gws[2]}, params.engineInfo);
 
-    const size_t optimal_lws_values[] = {256, 224, 192, 160, 128, 96, 64, 32, 16};
+    // TODO: can be potentially improved for GPUs with support of LWS > 256
+    const size_t optimal_lws_values[] = { 256, 224, 192, 160, 128, 96, 64, 32, 16 };
+
     if ((params.output.GetLayout() == DataLayout::b_fs_yx_fsv16 ||
          params.output.GetLayout() == DataLayout::b_fs_zyx_fsv16 ||
          params.output.GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv16 ||
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embedding_bag/embedding_bag_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embedding_bag/embedding_bag_kernel_ref.cpp
index e8d228023c6..0ffdce34630 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embedding_bag/embedding_bag_kernel_ref.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embedding_bag/embedding_bag_kernel_ref.cpp
@@ -32,11 +32,16 @@ JitConstants EmbeddingBagKernelRef::GetJitConstants(const embedding_bag_params&
 CommonDispatchData EmbeddingBagKernelRef::SetDefault(const embedding_bag_params& params) const {
     CommonDispatchData dispatchData;
+    auto in_layout = params.inputs[0].GetLayout();
+    auto out_layout = params.output.GetLayout();
+    std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::BATCH },
+                                                                     { Tensor::DataChannelName::FEATURE },
+                                                                     { Tensor::DataChannelName::X, Tensor::DataChannelName::Y }};
 
     dispatchData.gws = { params.output.Batch().v,
                          params.output.Feature().v,
                          params.output.Y().v * params.output.X().v };
-    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
+    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
 
     return dispatchData;
 }
@@ -84,8 +89,6 @@ ParamsKey EmbeddingBagKernelRef::GetSupportedKey() const {
     k.EnableOutputDataType(Datatype::F16);
     k.EnableOutputDataType(Datatype::F32);
 
-    k.EnableInputLayout(DataLayout::bfxy);
-
     k.EnableAllInputLayout();
     k.EnableAllOutputLayout();
     k.EnableTensorOffset();
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/extract_image_patches/extract_image_patches_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/extract_image_patches/extract_image_patches_kernel_base.cpp
index 38a89e948bb..a883ed9ec0b 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/extract_image_patches/extract_image_patches_kernel_base.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/extract_image_patches/extract_image_patches_kernel_base.cpp
@@ -42,11 +42,16 @@ JitConstants ExtractImagePatchesKernelBase::GetJitConstants(const extract_image_
 ExtractImagePatchesKernelBase::DispatchData ExtractImagePatchesKernelBase::SetDefault(const extract_image_patches_params& params) const {
     DispatchData dispatchData;
+    auto in_layout = params.inputs[0].GetLayout();
+    auto out_layout = params.output.GetLayout();
+    std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::BATCH },
+                                                                     { Tensor::DataChannelName::FEATURE },
+                                                                     { Tensor::DataChannelName::X, Tensor::DataChannelName::Y }};
 
     dispatchData.gws = { params.output.Batch().v,
                          params.output.Feature().v,
                          params.output.Y().v * params.output.X().v };
-    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
+    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
 
     return dispatchData;
 }
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad.cpp
index ad717d00891..ae27cfca8e4 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad.cpp
@@ -90,9 +90,16 @@ FullyConnectedKernelMMAD::FullyConnectedTuningData FullyConnectedKernelMMAD::Get
     bool slm_div_factor_exception = input_batch == 300 && input_feature == 2048 &&
                                     output_batch == 300 && (output_feature == 324 || output_feature == 81);
 
+    bool big_wgs_exception = params.engineInfo.computeUnitsCount == 96 && params.engineInfo.maxThreadsPerExecutionUnit == 7 &&
+                             input_feature == 9216 && output_feature == 4096;
+
+    size_t max_work_group_size = params.engineInfo.maxWorkGroupSize;
+    if (max_work_group_size > 256 && !big_wgs_exception)
+        max_work_group_size = 256;
+
     if (tuning_data.feature_blocks_count && tuning_data.sub_group_size == 8 && !slm_div_factor_exception)
         while (tuning_data.feature_blocks_count % (tuning_data.slm_div_factor * 2) == 0 &&
-               (tuning_data.slm_div_factor * 2 <= params.engineInfo.maxWorkGroupSize / tuning_data.sub_group_size))
+               (tuning_data.slm_div_factor * 2 <= max_work_group_size / tuning_data.sub_group_size))
             tuning_data.slm_div_factor *= 2;
 
     tuning_data.work_group_size = tuning_data.slm_div_factor * tuning_data.sub_group_size;
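To see what the new clamp changes, here is the slm_div_factor loop from the hunk above run on made-up numbers; feature_blocks_count = 64 is a hypothetical tuning input, and the exception flag mirrors the special 96-EU case in the code:

```cpp
// Worked example of the slm_div_factor doubling loop. Illustrative inputs only.
#include <cstdio>

int main() {
    const size_t sub_group_size = 8;
    size_t feature_blocks_count = 64;   // hypothetical tuning input
    size_t max_work_group_size = 512;   // device limit
    bool big_wgs_exception = false;     // the 96-EU / 9216x4096 case above

    if (max_work_group_size > 256 && !big_wgs_exception)
        max_work_group_size = 256;      // clamp added by this patch

    size_t slm_div_factor = 1;
    while (feature_blocks_count % (slm_div_factor * 2) == 0 &&
           slm_div_factor * 2 <= max_work_group_size / sub_group_size)
        slm_div_factor *= 2;

    // With the clamp: slm_div_factor = 32, work_group_size = 256.
    // With big_wgs_exception = true: 64 and 512 respectively.
    std::printf("slm_div_factor = %zu, work_group_size = %zu\n",
                slm_div_factor, slm_div_factor * sub_group_size);
    return 0;
}
```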
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_base.cpp
index c628dc98fdd..6d273cc487c 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_base.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_base.cpp
@@ -207,18 +207,26 @@ fused_conv_eltwise_kernel_base::DispatchData fused_conv_eltwise_kernel_base::Set
     const fused_conv_eltwise_params& params, int) const {
     DispatchData dispatchData;
-
+    auto in_layout = params.inputs[0].GetLayout();
+    auto out_layout = params.output.GetLayout();
+    std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws;
     const auto& out = params.output;
 
-    if (params.output.GetLayout() == DataLayout::bfyx || params.output.GetLayout() == DataLayout::byxf ||
-        params.output.GetLayout() == DataLayout::bfzyx || params.output.GetLayout() == DataLayout::b_fs_zyx_fsv16 ||
-        params.output.GetLayout() == DataLayout::bs_fs_zyx_bsv16_fsv16) {
-        dispatchData.gws = {out.X().v, out.Y().v * out.Z().v, out.Feature().v * out.Batch().v};
+    if (out_layout == DataLayout::bfyx || out_layout == DataLayout::byxf ||
+        out_layout == DataLayout::bfzyx || out_layout == DataLayout::b_fs_zyx_fsv16 ||
+        out_layout == DataLayout::bs_fs_zyx_bsv16_fsv16) {
+        dispatchData.gws = { out.X().v, out.Y().v * out.Z().v, out.Feature().v * out.Batch().v };
+        dims_by_gws = {{ Tensor::DataChannelName::X },
+                       { Tensor::DataChannelName::Y, Tensor::DataChannelName::Z },
+                       { Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH }};
     } else {
-        dispatchData.gws = {out.Feature().v * out.Batch().v, out.X().v, out.Y().v * out.Z().v };
+        dispatchData.gws = { out.Feature().v * out.Batch().v, out.X().v, out.Y().v * out.Z().v };
+        dims_by_gws = {{ Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH },
+                       { Tensor::DataChannelName::X },
+                       { Tensor::DataChannelName::Y, Tensor::DataChannelName::Z }};
     }
 
-    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
+    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
 
     dispatchData.cldnnStyle.blockWidth = 1;
     dispatchData.cldnnStyle.blockHeight = 1;
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_elements_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_elements_kernel_ref.cpp
index eb01e12a12f..191581c1b81 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_elements_kernel_ref.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_elements_kernel_ref.cpp
@@ -69,20 +69,35 @@ static inline std::vector<std::string> GetDefaultOrder(size_t size) {
 CommonDispatchData GatherElementsKernelRef::SetDefault(const gather_elements_params& params, const optional_params&) const {
     CommonDispatchData dispatchData;
+    auto in_layout = params.inputs[0].GetLayout();
+    auto out_layout = params.output.GetLayout();
+    std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws;
     const auto& output = params.output;
 
     switch (params.inputs[1].GetLayout()) {
     case DataLayout::bfyx:
         dispatchData.gws = {output.X().v, output.Y().v, output.Feature().v * output.Batch().v};
+        dims_by_gws = {{Tensor::DataChannelName::X},
+                       {Tensor::DataChannelName::Y},
+                       {Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
+
         break;
 
     case DataLayout::bfzyx:
         dispatchData.gws = {output.X().v, output.Y().v * output.Z().v, output.Feature().v * output.Batch().v};
+        dims_by_gws = {{Tensor::DataChannelName::X},
+                       {Tensor::DataChannelName::Y, Tensor::DataChannelName::Z},
+                       {Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
+
        break;
 
     case DataLayout::bfwzyx:
         dispatchData.gws = {output.X().v * output.Y().v, output.Z().v * output.W().v, output.Feature().v * output.Batch().v};
+        dims_by_gws = {{Tensor::DataChannelName::X, Tensor::DataChannelName::Y},
+                       {Tensor::DataChannelName::Z, Tensor::DataChannelName::W},
+                       {Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
+
         break;
 
     default:
@@ -90,7 +105,7 @@ CommonDispatchData GatherElementsKernelRef::SetDefault(const gather_elements_par
         break;
     }
 
-    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
+    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
 
     return dispatchData;
 }
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_kernel_ref.cpp
index ce4a6036cd9..b6115b8f356 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_kernel_ref.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/gather/gather_kernel_ref.cpp
@@ -155,16 +155,28 @@ static std::string GetIndecesIdxOrder(const gather_params& params, size_t axis,
 CommonDispatchData GatherKernelRef::SetDefault(const gather_params& params, const optional_params&) const {
     CommonDispatchData dispatchData;
     const auto& output = params.output;
+    auto in_layout = params.inputs[0].GetLayout();
+    auto out_layout = params.output.GetLayout();
+    std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws;
 
-    if (output.GetLayout() == DataLayout::bfyx) {
+    if (out_layout == DataLayout::bfyx) {
         dispatchData.gws = {output.X().v, output.Y().v, output.Feature().v * output.Batch().v};
-    } else if (output.GetLayout() == DataLayout::bfzyx) {
+        dims_by_gws = {{Tensor::DataChannelName::X},
+                       {Tensor::DataChannelName::Y},
+                       {Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
+    } else if (out_layout == DataLayout::bfzyx) {
         dispatchData.gws = {output.X().v, output.Y().v * output.Z().v, output.Feature().v * output.Batch().v};
+        dims_by_gws = {{Tensor::DataChannelName::X},
+                       {Tensor::DataChannelName::Y, Tensor::DataChannelName::Z},
+                       {Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
     } else {
         dispatchData.gws = {output.X().v * output.Y().v, output.Z().v * output.W().v, output.Feature().v * output.Batch().v};
+        dims_by_gws = {{Tensor::DataChannelName::X, Tensor::DataChannelName::Y},
+                       {Tensor::DataChannelName::Z, Tensor::DataChannelName::W},
+                       {Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
     }
 
-    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
+    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
 
     return dispatchData;
 }
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/grn/grn_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/grn/grn_kernel_base.cpp
index 3e332e0cff2..662757d0edc 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/grn/grn_kernel_base.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/grn/grn_kernel_base.cpp
@@ -16,10 +16,15 @@ JitConstants GRNKernelBase::GetJitConstants(const grn_params& params, GRNKernelB
 GRNKernelBase::DispatchData GRNKernelBase::SetDefault(const grn_params& params) const {
     const auto& output = params.output;
+    auto in_layout = params.inputs[0].GetLayout();
+    auto out_layout = output.GetLayout();
+    std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::BATCH },
+                                                                     { Tensor::DataChannelName::Y },
+                                                                     { Tensor::DataChannelName::X }};
 
     DispatchData dispatchData;
     dispatchData.gws = { output.Batch().v, output.Y().v, output.X().v };
-    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
+    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
 
     return dispatchData;
 }
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_across_channel_multiple_features_fsv16.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_across_channel_multiple_features_fsv16.cpp
index 8e3834b03af..37ad41cc630 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_across_channel_multiple_features_fsv16.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_across_channel_multiple_features_fsv16.cpp
@@ -34,6 +34,11 @@ ParamsKey LRNKernelAcrossChannelMultipleFeaturesFSV16::GetSupportedKey() const {
 CommonDispatchData LRNKernelAcrossChannelMultipleFeaturesFSV16::SetDefault(const lrn_params& params) const {
     CommonDispatchData dispatchData = LRNKernelBase::SetDefault(params);
+    auto in_layout = params.inputs[0].GetLayout();
+    auto out_layout = params.output.GetLayout();
+    std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::FEATURE },
+                                                                     { Tensor::DataChannelName::X },
+                                                                     { Tensor::DataChannelName::Y, Tensor::DataChannelName::BATCH }};
 
     const auto& out = params.output;
     const unsigned int alignment = 16;
@@ -41,7 +46,7 @@ CommonDispatchData LRNKernelAcrossChannelMultipleFeaturesFSV16::SetDefault(const
     dispatchData.gws = { Align(out.Feature().v, alignment),
                          out.X().v,
                          out.Y().v * out.Batch().v };
-    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
+    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
 
     return dispatchData;
 }
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_ref.cpp
index bfd02f0859c..a81ad854d32 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_ref.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_ref.cpp
@@ -61,11 +61,16 @@ JitConstants LRNKernelRef::GetJitConstants(const lrn_params& params, const LRNKe
 LRNKernelRef::Parent::DispatchData LRNKernelRef::SetDefault(const lrn_params& params) const {
     DispatchData dispatchData = Parent::SetDefault(params);
+    auto in_layout = params.inputs[0].GetLayout();
+    auto out_layout = params.output.GetLayout();
+    std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::X, Tensor::DataChannelName::Y },
+                                                                     { Tensor::DataChannelName::FEATURE },
+                                                                     { Tensor::DataChannelName::BATCH }};
 
     const auto& out = params.output;
     dispatchData.gws = { out.X().v * out.Y().v, out.Feature().v, out.Batch().v };
-    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
+    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
 
     return dispatchData;
 }
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_within_channel_byxf_opt.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_within_channel_byxf_opt.cpp
index 0f43916899b..b08476ecd62 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_within_channel_byxf_opt.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_within_channel_byxf_opt.cpp
@@ -58,11 +58,16 @@ JitConstants LRNKernelWithinChannelByxfOpt::GetJitConstants(const lrn_params& pa
 LRNKernelWithinChannelByxfOpt::Parent::DispatchData LRNKernelWithinChannelByxfOpt::SetDefault(
     const lrn_params& params) const {
     DispatchData dispatchData = Parent::SetDefault(params);
+    auto in_layout = params.inputs[0].GetLayout();
+    auto out_layout = params.output.GetLayout();
+    std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::X, Tensor::DataChannelName::Y },
+                                                                     { Tensor::DataChannelName::FEATURE },
+                                                                     { Tensor::DataChannelName::BATCH }};
 
     const auto& out = params.output;
     dispatchData.gws = { out.X().v * out.Y().v, CeilDiv(out.Feature().v, 8), out.Batch().v };
-    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
+    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
 
     return dispatchData;
 }
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm_dynamic/lstm_dynamic_input_bfyx_opt.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm_dynamic/lstm_dynamic_input_bfyx_opt.cpp
index ad73129bff5..5f2bc7f2e68 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm_dynamic/lstm_dynamic_input_bfyx_opt.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm_dynamic/lstm_dynamic_input_bfyx_opt.cpp
@@ -68,11 +68,17 @@ KernelsData LSTM_DynamicInputKernelBfyxOpt::GetKernelsData(const Params& params,
     KernelData kd = KernelData::Default<lstm_dynamic_input_params>(params);
     lstm_dynamic_input_params& dlstm_params = *static_cast<lstm_dynamic_input_params*>(kd.params.get());
+    auto in_layout = dlstm_params.inputs[0].GetLayout();
+    auto out_layout = dlstm_params.output.GetLayout();
+    std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::X },
+                                                                     { Tensor::DataChannelName::Y, Tensor::DataChannelName::BATCH },
+                                                                     { Tensor::DataChannelName::FEATURE }};
+
     const auto& out = dlstm_params.output;
 
     auto hidden_size = out.X().v;
     dispatchData.gws = { hidden_size / simd_size, out.Batch().v * out.Y().v, out.Feature().v };
-    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
+    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
 
     bool succeed = UpdateWeightsParams(dlstm_params,
                                        options,
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm_dynamic/lstm_dynamic_input_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm_dynamic/lstm_dynamic_input_kernel_base.cpp
index cdfbe15aa26..e6c56f463cc 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm_dynamic/lstm_dynamic_input_kernel_base.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lstm_dynamic/lstm_dynamic_input_kernel_base.cpp
@@ -26,11 +26,17 @@ JitConstants LSTM_DynamicInputKernelBase::GetJitConstants(const lstm_dynamic_inp
 LSTM_DynamicInputKernelBase::DispatchData LSTM_DynamicInputKernelBase::SetDefault(
     const lstm_dynamic_input_params& params) {
     DispatchData dispatchData;
+    auto in_layout = params.inputs[0].GetLayout();
+    auto out_layout = params.output.GetLayout();
+    std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::X },
+                                                                     { Tensor::DataChannelName::Y, Tensor::DataChannelName::BATCH },
+                                                                     { Tensor::DataChannelName::FEATURE }};
+
     const auto& out = params.output;
 
     // 4 * hidden, batch * dir, seq_len
     dispatchData.gws = { out.X().v, out.Batch().v * out.Y().v, out.Feature().v };
-    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
+    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
 
     return dispatchData;
 }
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/non_max_suppression/non_max_suppression_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/non_max_suppression/non_max_suppression_kernel_ref.cpp
index 4f6bc44facd..38634da1b0e 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/non_max_suppression/non_max_suppression_kernel_ref.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/non_max_suppression/non_max_suppression_kernel_ref.cpp
@@ -282,7 +282,7 @@ KernelsData NonMaxSuppressionKernelRef::GetKernelsData(const Params& params, con
         auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
         auto& kernel = kd.kernels[i];
 
-        KernelBase::CheckDispatchData(kernelName, dispatchData);
+        KernelBase::CheckDispatchData(kernelName, dispatchData, params.engineInfo.maxWorkGroupSize);
        kernel.params.workGroups.global = dispatchData.gws;
        kernel.params.workGroups.local = dispatchData.lws;
        kernel.code.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo);
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/one_hot/one_hot_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/one_hot/one_hot_kernel_base.cpp
index 1b18f86367f..6cee4fda8c3 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/one_hot/one_hot_kernel_base.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/one_hot/one_hot_kernel_base.cpp
@@ -22,14 +22,23 @@ JitConstants OneHotKernelBase::GetJitConstants(const one_hot_params& params) con
 OneHotKernelBase::DispatchData OneHotKernelBase::SetDefault(const one_hot_params& params) {
     const auto& input = params.inputs[0];
+    auto in_layout = params.inputs[0].GetLayout();
+    auto out_layout = params.output.GetLayout();
+    std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws;
 
     DispatchData dispatchData;
     if (params.output.GetDims().size() == 5) {
         dispatchData.gws = { input.Batch().v, input.Feature().v * input.Z().v, input.Y().v * input.X().v };
+        dims_by_gws = {{ Tensor::DataChannelName::BATCH },
+                       { Tensor::DataChannelName::Z, Tensor::DataChannelName::FEATURE },
+                       { Tensor::DataChannelName::X, Tensor::DataChannelName::Y }};
     } else {
         dispatchData.gws = { input.Batch().v, input.Feature().v, input.Y().v * input.X().v };
+        dims_by_gws = {{ Tensor::DataChannelName::BATCH },
+                       { Tensor::DataChannelName::FEATURE },
+                       { Tensor::DataChannelName::X, Tensor::DataChannelName::Y }};
     }
-    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
+    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
 
     return dispatchData;
 }
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/permute/permute_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/permute/permute_kernel_ref.cpp
index ef97125be09..284a6ae028f 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/permute/permute_kernel_ref.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/permute/permute_kernel_ref.cpp
@@ -31,10 +31,16 @@ ParamsKey PermuteKernelRef::GetSupportedKey() const {
 CommonDispatchData PermuteKernelRef::SetDefault(const permute_params& params) const {
     CommonDispatchData dispatchData;
+    auto in_layout = params.inputs[0].GetLayout();
+    auto out_layout = params.output.GetLayout();
+    std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{Tensor::DataChannelName::X},
+                                                                     {Tensor::DataChannelName::Y, Tensor::DataChannelName::Z, Tensor::DataChannelName::W},
+                                                                     {Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
+
     const auto& in = params.inputs[0];
     dispatchData.gws = {in.X().v, in.Y().v * in.Z().v * in.W().v, in.Feature().v * in.Batch().v};
-    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
+    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
 
     return dispatchData;
 }
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/permute/permute_kernel_tile_8x8_4x4.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/permute/permute_kernel_tile_8x8_4x4.cpp
index ad39e79dc9c..fdb3d75c163 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/permute/permute_kernel_tile_8x8_4x4.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/permute/permute_kernel_tile_8x8_4x4.cpp
@@ -199,9 +199,10 @@ static std::vector<size_t> GetBestLwsFromGws(const permute_params& params, const
     std::vector<size_t> dims{0, 2, 1};
 
     // SLM size: elemsize * tile_size * tile_size * work_items <= 64K
-    size_t elem_size = params.output.ElementSize();
-    size_t max_local_mem_size = params.engineInfo.maxLocalMemSize;
-    size_t max_num_work_items = std::min((size_t)256, (size_t)max_local_mem_size / (elem_size * tile_size * tile_size));
+    const size_t elem_size = params.output.ElementSize();
+    const size_t max_local_mem_size = params.engineInfo.maxLocalMemSize;
+    const size_t max_work_group_size = params.engineInfo.maxWorkGroupSize;
+    size_t max_num_work_items = std::min(max_work_group_size, max_local_mem_size / (elem_size * tile_size * tile_size));
 
     for (size_t i = 0; i < dims.size(); ++i) {
         size_t dim = dims[i];
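The rewritten bound in GetBestLwsFromGws above is easy to sanity-check numerically; with illustrative values (fp16 output, 8x8 tiles, 64 KiB of SLM, a 512-wide device) the SLM term and the device limit coincide:

```cpp
// Worked example of max_num_work_items =
//   min(maxWorkGroupSize, maxLocalMemSize / (elem_size * tile_size^2)).
// All numbers are illustrative, not from a specific device.
#include <algorithm>
#include <cstdio>

int main() {
    const size_t elem_size = 2;               // e.g. fp16 output
    const size_t tile_size = 8;               // 8x8 tiles in this kernel
    const size_t max_local_mem_size = 65536;  // 64 KiB of SLM
    const size_t max_work_group_size = 512;   // no longer capped at 256

    size_t max_num_work_items =
        std::min(max_work_group_size,
                 max_local_mem_size / (elem_size * tile_size * tile_size));
    // 65536 / (2 * 64) = 512 -> min(512, 512) = 512 work items.
    std::printf("max_num_work_items = %zu\n", max_num_work_items);
    return 0;
}
```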
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv4.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv4.cpp
index 46fc97ddbca..88550ff91e8 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv4.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv4.cpp
@@ -33,12 +33,17 @@ ParamsKey PoolingKerneGPU_b_fs_yx_fsv4::GetSupportedKey() const {
 PoolingKernelBase::DispatchData PoolingKerneGPU_b_fs_yx_fsv4::SetDefault(const pooling_params& params) const {
     DispatchData dispatchData = PoolingKernelBase::SetDefault(params);
+    auto in_layout = params.inputs[0].GetLayout();
+    auto out_layout = params.output.GetLayout();
+    std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{Tensor::DataChannelName::X},
+                                                                     {Tensor::DataChannelName::Y},
+                                                                     {Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
 
     dispatchData.gws[0] = params.output.X().v;  // X
     dispatchData.gws[1] = params.output.Y().v;  // Y
     // we got b_fs_yx_fsv4 format, we process 4 features per workitem
     dispatchData.gws[2] = CeilDiv(params.output.Feature().v, 4) * params.output.Batch().v;
-    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
+    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
 
     return dispatchData;
 }
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_zyx_fsv16_imad.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_zyx_fsv16_imad.cpp
index 15f39ee8b91..71544ea144c 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_zyx_fsv16_imad.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_zyx_fsv16_imad.cpp
@@ -36,6 +36,9 @@ ParamsKey PoolingKernelGPU_b_fs_zyx_fsv16_imad::GetSupportedKey() const {
 PoolingKernelBase::DispatchData PoolingKernelGPU_b_fs_zyx_fsv16_imad::SetDefault(const pooling_params& params) const {
     DispatchData dispatchData = PoolingKernelBase::SetDefault(params);
+    auto in_layout = params.inputs[0].GetLayout();
+    auto out_layout = params.output.GetLayout();
+    std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws;
 
     const auto& out = params.output;
     auto x = out.X().v;
@@ -61,7 +64,10 @@ PoolingKernelBase::DispatchData PoolingKernelGPU_b_fs_zyx_fsv16_imad::SetDefault
         // we got b_fs_yx_fsv16 format, we process 16 features per workitem
         dispatchData.gws[2] = CeilDiv(f, FEATURE_SLICE_SIZE) * b;
-        dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
+        dims_by_gws = {{Tensor::DataChannelName::X},
+                       {Tensor::DataChannelName::Y, Tensor::DataChannelName::Z},
+                       {Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
+        dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
     }
     return dispatchData;
 }
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pyramid_roi_align/pyramid_roi_align_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pyramid_roi_align/pyramid_roi_align_kernel_ref.cpp
index 5bd147a37b7..b69f7d114cc 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pyramid_roi_align/pyramid_roi_align_kernel_ref.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pyramid_roi_align/pyramid_roi_align_kernel_ref.cpp
@@ -33,13 +33,18 @@ ParamsKey PyramidROIAlignKernelRef::GetSupportedKey() const {
 PyramidROIAlignKernelBase::DispatchData PyramidROIAlignKernelRef::SetDefault(const PyramidROIAlign_params& params) const {
     auto dispatchData = PyramidROIAlignKernelBase::SetDefault(params);
+    auto in_layout = params.inputs[0].GetLayout();
+    auto out_layout = params.output.GetLayout();
+    std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::X, Tensor::DataChannelName::Y },
+                                                                     { Tensor::DataChannelName::FEATURE },
+                                                                     { Tensor::DataChannelName::BATCH }};
 
     dispatchData.gws = { params.output.X().v * params.output.Y().v,
                          params.output.Feature().v,
                          params.output.Batch().v };
-    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
+    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
 
     return dispatchData;
 }
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reduce/reduce_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reduce/reduce_kernel_ref.cpp
index c3172c6fda4..fd3844272b9 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reduce/reduce_kernel_ref.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reduce/reduce_kernel_ref.cpp
@@ -32,11 +32,16 @@ ParamsKey ReduceKernelRef::GetSupportedKey() const {
 CommonDispatchData ReduceKernelRef::SetDefault(const reduce_params& params, const optional_params&) const {
     CommonDispatchData dispatchData;
+    auto in_layout = params.inputs[0].GetLayout();
+    auto out_layout = params.output.GetLayout();
+    std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::X, Tensor::DataChannelName::Y },
+                                                                     { Tensor::DataChannelName::Z, Tensor::DataChannelName::W },
+                                                                     { Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH }};
 
     dispatchData.gws = { params.output.X().v * params.output.Y().v,
                          params.output.Z().v * params.output.W().v,
                          params.output.Batch().v * params.output.Feature().v };
-    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
+    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
 
     return dispatchData;
 }
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reduce/reduce_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reduce/reduce_kernel_ref.cpp index c3172c6fda4..fd3844272b9 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reduce/reduce_kernel_ref.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reduce/reduce_kernel_ref.cpp @@ -32,11 +32,16 @@ ParamsKey ReduceKernelRef::GetSupportedKey() const { CommonDispatchData ReduceKernelRef::SetDefault(const reduce_params& params, const optional_params&) const { CommonDispatchData dispatchData; + auto in_layout = params.inputs[0].GetLayout(); + auto out_layout = params.output.GetLayout(); + std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::X, Tensor::DataChannelName::Y }, + { Tensor::DataChannelName::Z, Tensor::DataChannelName::W }, + { Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH }}; dispatchData.gws = { params.output.X().v * params.output.Y().v, params.output.Z().v * params.output.W().v, params.output.Batch().v * params.output.Feature().v }; - dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo); + dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws); return dispatchData; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_binary.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_binary.cpp index 62af69c727c..d1e1258f8ad 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_binary.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_binary.cpp @@ -50,11 +50,16 @@ JitConstants ReorderKernelBinary::GetJitConstants(const reorder_params& params) ReorderKernelBinary::DispatchData ReorderKernelBinary::SetDefault(const reorder_params& params) const { DispatchData dispatchData; + auto in_layout = params.inputs[0].GetLayout(); + auto out_layout = params.output.GetLayout(); + std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::BATCH }, + { Tensor::DataChannelName::FEATURE }, + { Tensor::DataChannelName::X, Tensor::DataChannelName::Y }}; const auto& input = params.inputs[0]; dispatchData.gws = { input.Batch().v, CeilDiv(input.Feature().v, 32), input.Y().v * input.X().v }; - dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo); + dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws); return dispatchData; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorg_yolo/reorg_yolo_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorg_yolo/reorg_yolo_kernel_ref.cpp index 7a07f52d351..46170734b0e 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorg_yolo/reorg_yolo_kernel_ref.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorg_yolo/reorg_yolo_kernel_ref.cpp @@ -33,14 +33,23 @@ JitConstants ReorgYoloKernelRef::GetJitConstants(const reorg_yolo_params& ry) co } ReorgYoloKernelRef::DispatchData SetDefault(const reorg_yolo_params& params) { ReorgYoloKernelRef::DispatchData dispatchData; + auto in_layout = params.inputs[0].GetLayout(); + auto out_layout = params.output.GetLayout(); + std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws; const auto& input = params.inputs[0]; if (input.GetLayout() == DataLayout::bfyx) { dispatchData.gws = {input.X().v, input.Y().v, input.Feature().v}; + dims_by_gws = {{Tensor::DataChannelName::X}, + {Tensor::DataChannelName::Y}, + {Tensor::DataChannelName::FEATURE}}; } else { dispatchData.gws = {input.Feature().v * input.Batch().v, input.X().v, input.Y().v}; + dims_by_gws = {{Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}, + {Tensor::DataChannelName::X}, + {Tensor::DataChannelName::Y}}; } - dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo); + dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws); return dispatchData; }
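The binary reorder above launches one work item per pack of 32 features via CeilDiv. A standalone sketch of the assumed CeilDiv semantics (round-up integer division, as used throughout kernel_selector):

#include <cstddef>

// Smallest n such that n * divisor >= value.
constexpr size_t CeilDiv(size_t value, size_t divisor) {
    return (value + divisor - 1) / divisor;
}

static_assert(CeilDiv(33, 32) == 2, "33 binary features need two 32-feature packs");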
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/resample/resample_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/resample/resample_kernel_base.cpp index 7b8079c0770..2af23d5f6d3 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/resample/resample_kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/resample/resample_kernel_base.cpp @@ -49,18 +49,35 @@ size_t ResampleKernelBase::GetFeatureBlockSize(const resample_params& params) co ResampleKernelBase::DispatchData ResampleKernelBase::SetDefault(const kernel_selector::resample_params &arg) const { DispatchData dispatchData; + auto in_layout = arg.inputs[0].GetLayout(); + auto out_layout = arg.output.GetLayout(); + std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws; + const auto& out = arg.output; - if (arg.resampleType == ResampleType::NEAREST_NEIGHBOR) + if (arg.resampleType == ResampleType::NEAREST_NEIGHBOR) { dispatchData.gws = { out.X().v, out.Y().v * out.Z().v, out.Feature().v * out.Batch().v }; - else if (arg.resampleType == ResampleType::BILINEAR_INTERP || arg.resampleType == ResampleType::LINEAR_ONNX) + dims_by_gws = {{ Tensor::DataChannelName::X }, + { Tensor::DataChannelName::Y, Tensor::DataChannelName::Z }, + { Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH }}; + } else if (arg.resampleType == ResampleType::BILINEAR_INTERP || arg.resampleType == ResampleType::LINEAR_ONNX) { dispatchData.gws = { Align(out.X().v, 32), out.Y().v, out.Batch().v }; - else if (arg.resampleType == ResampleType::CAFFE_BILINEAR_INTERP) + dims_by_gws = {{ Tensor::DataChannelName::X }, + { Tensor::DataChannelName::Y }, + { Tensor::DataChannelName::BATCH }}; + } else if (arg.resampleType == ResampleType::CAFFE_BILINEAR_INTERP) { dispatchData.gws = { out.X().v * out.Y().v, CeilDiv(out.Feature().v, GetFeatureBlockSize(arg)), out.Batch().v * out.Z().v }; - else + dims_by_gws = {{ Tensor::DataChannelName::X, Tensor::DataChannelName::Y }, + { Tensor::DataChannelName::FEATURE }, + { Tensor::DataChannelName::Z, Tensor::DataChannelName::BATCH }}; + } else { dispatchData.gws = { out.X().v, out.Y().v * out.Z().v, out.Feature().v * out.Batch().v }; + dims_by_gws = {{ Tensor::DataChannelName::X }, + { Tensor::DataChannelName::Y, Tensor::DataChannelName::Z }, + { Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH }}; + } - dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, arg.engineInfo); + dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, arg.engineInfo, in_layout, out_layout, dims_by_gws); if (arg.resampleType == ResampleType::BILINEAR_INTERP || arg.resampleType == ResampleType::LINEAR_ONNX) { dispatchData.lws[0] = 32;
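The bilinear path above aligns gws[0] to 32 and then pins lws[0] = 32, so the global size must stay divisible by the local size. A standalone sketch of the assumed Align semantics (round up to a multiple):

#include <cstddef>

// Round value up to the nearest multiple of align.
constexpr size_t Align(size_t value, size_t align) {
    return ((value + align - 1) / align) * align;
}

static_assert(Align(50, 32) == 64, "gws[0] stays divisible by the fixed lws[0] = 32");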
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/resample/resample_kernel_opt.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/resample/resample_kernel_opt.cpp index 69bb21d2b64..d0ce4e9dcef 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/resample/resample_kernel_opt.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/resample/resample_kernel_opt.cpp @@ -62,6 +62,10 @@ ParamsKey ResampleKernelOpt::GetSupportedKey() const { ResampleKernelBase::DispatchData ResampleKernelOpt::SetDefault(const kernel_selector::resample_params &arg) const { DispatchData dispatchData; + auto in_layout = arg.inputs[0].GetLayout(); + auto out_layout = arg.output.GetLayout(); + std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws; + const auto& out = arg.output; if (arg.resampleType == ResampleType::CAFFE_BILINEAR_INTERP) { @@ -69,7 +73,10 @@ ResampleKernelBase::DispatchData ResampleKernelOpt::SetDefault(const kernel_sele dispatchData.gws[1] = CeilDiv(out.Feature().v, GetFeatureBlockSize(arg)); dispatchData.gws[2] = arg.output.Batch().v; - dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, arg.engineInfo); + dims_by_gws = {{Tensor::DataChannelName::X, Tensor::DataChannelName::Y}, + {Tensor::DataChannelName::FEATURE}, + {Tensor::DataChannelName::BATCH}}; + dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, arg.engineInfo, in_layout, out_layout, dims_by_gws); } else { auto opt_x_block_size = GetOptimalBlockSize(arg); if (out.X().v > 32 && opt_x_block_size == 1) { diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/resample/resample_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/resample/resample_kernel_ref.cpp index fad6d77dd06..7cf2a0cc293 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/resample/resample_kernel_ref.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/resample/resample_kernel_ref.cpp @@ -116,11 +116,16 @@ JitConstants ResampleKernelRef::GetJitConstants(const resample_params& params) c ResampleKernelBase::DispatchData ResampleKernelRef::SetDefault(const resample_params& arg) const { auto dispatchData = Parent::SetDefault(arg); + auto in_layout = arg.inputs[0].GetLayout(); + auto out_layout = arg.output.GetLayout(); + std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::X }, + { Tensor::DataChannelName::Y, Tensor::DataChannelName::Z }, + { Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH }}; if (use_packing(arg)) { auto pack = packing_factor(arg); dispatchData.gws = { arg.output.X().v, arg.output.Y().v * arg.output.Z().v, CeilDiv(arg.output.Feature().v, pack) * arg.output.Batch().v }; - dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, arg.engineInfo); + dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, arg.engineInfo, in_layout, out_layout, dims_by_gws); } return dispatchData; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reverse_sequence/reverse_sequence_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reverse_sequence/reverse_sequence_kernel_ref.cpp index 78f92d798fb..f155085b0d5 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reverse_sequence/reverse_sequence_kernel_ref.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reverse_sequence/reverse_sequence_kernel_ref.cpp @@ -32,12 +32,17 @@ ParamsKey ReverseSequenceKernelRef::GetSupportedKey() const { CommonDispatchData ReverseSequenceKernelRef::SetDefault(const reverse_sequence_params& params, const optional_params&) const { CommonDispatchData dispatchData; + auto in_layout = params.inputs[0].GetLayout(); + auto out_layout = params.output.GetLayout(); + std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::BATCH }, + { Tensor::DataChannelName::FEATURE }, + { Tensor::DataChannelName::X, Tensor::DataChannelName::Y }}; dispatchData.gws = { params.output.Batch().v, params.output.Feature().v, params.output.Y().v * params.output.X().v }; - dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo); + dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws); return dispatchData; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scatter_update/scatter_elements_update_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scatter_update/scatter_elements_update_kernel_ref.cpp index 50027615a04..68b8295a1e9 100644 ---
a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scatter_update/scatter_elements_update_kernel_ref.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scatter_update/scatter_elements_update_kernel_ref.cpp @@ -70,6 +70,10 @@ static inline std::vector<std::string> GetDefaultOrder(size_t size) { CommonDispatchData ScatterElementsUpdateKernelRef::SetDefault(const scatter_elements_update_params& params, const optional_params&, bool is_second) const { CommonDispatchData dispatchData; + auto in_layout = params.inputs[0].GetLayout(); + auto out_layout = params.output.GetLayout(); + std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws; + const auto& output = params.output; const auto& indices = params.inputs[1]; @@ -78,21 +82,30 @@ CommonDispatchData ScatterElementsUpdateKernelRef::SetDefault(const scatter_elem switch (params.inputs[0].GetLayout()) { case DataLayout::bfyx: dispatchData.gws = {scope.X().v, scope.Y().v, scope.Feature().v * scope.Batch().v}; + dims_by_gws = {{Tensor::DataChannelName::X}, + {Tensor::DataChannelName::Y}, + {Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}}; break; case DataLayout::bfzyx: dispatchData.gws = {scope.X().v * scope.Y().v, scope.Z().v, scope.Feature().v * scope.Batch().v}; + dims_by_gws = {{Tensor::DataChannelName::X, Tensor::DataChannelName::Y}, + {Tensor::DataChannelName::Z}, + {Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}}; break; case DataLayout::bfwzyx: dispatchData.gws = {scope.X().v * scope.Y().v, scope.Z().v * scope.W().v, scope.Feature().v * scope.Batch().v}; + dims_by_gws = {{Tensor::DataChannelName::X, Tensor::DataChannelName::Y}, + {Tensor::DataChannelName::Z, Tensor::DataChannelName::W}, + {Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}}; break; default: throw std::invalid_argument("Unsupported data layout for scatter elements update primitive"); break; } - dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo); + dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws); return dispatchData; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/shuffle_channels/shuffle_channels_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/shuffle_channels/shuffle_channels_kernel_ref.cpp index 0c2c2d79552..3ef3097df40 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/shuffle_channels/shuffle_channels_kernel_ref.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/shuffle_channels/shuffle_channels_kernel_ref.cpp @@ -44,11 +44,16 @@ bool ShuffleChannelsKernelRef::Validate(const Params& p, const optional_params& CommonDispatchData ShuffleChannelsKernelRef::SetDefault(const shuffle_channels_params& params, const optional_params&) const { CommonDispatchData dispatchData; + auto in_layout = params.inputs[0].GetLayout(); + auto out_layout = params.output.GetLayout(); + std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::BATCH }, + { Tensor::DataChannelName::FEATURE }, + { Tensor::DataChannelName::X, Tensor::DataChannelName::Y }}; dispatchData.gws = { params.output.Batch().v, params.output.Feature().v, params.output.Y().v * params.output.X().v }; - dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo); + dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws); return dispatchData; }
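The scatter_elements_update switch above has to keep the channel grouping in lockstep with how the dimensions were multiplied into gws. A self-contained sketch of that correspondence (hypothetical Channel enum and helper, not the project's API):

#include <stdexcept>
#include <vector>

enum class Channel { X, Y, Z, W, FEATURE, BATCH };

// The grouping must mirror the gws packing chosen per layout rank.
std::vector<std::vector<Channel>> DimsByGwsForRank(int spatial_rank) {
    switch (spatial_rank) {
    case 2:  // bfyx:   gws = { X, Y, F*B }
        return {{Channel::X}, {Channel::Y}, {Channel::FEATURE, Channel::BATCH}};
    case 3:  // bfzyx:  gws = { X*Y, Z, F*B }
        return {{Channel::X, Channel::Y}, {Channel::Z}, {Channel::FEATURE, Channel::BATCH}};
    case 4:  // bfwzyx: gws = { X*Y, Z*W, F*B }
        return {{Channel::X, Channel::Y}, {Channel::Z, Channel::W}, {Channel::FEATURE, Channel::BATCH}};
    default:
        throw std::invalid_argument("unsupported spatial rank");
    }
}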
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/space_to_batch/space_to_batch_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/space_to_batch/space_to_batch_kernel_base.cpp index 83b58c27774..fcab0c7dc0d 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/space_to_batch/space_to_batch_kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/space_to_batch/space_to_batch_kernel_base.cpp @@ -28,14 +28,21 @@ bool SpaceToBatchKernelBase::Validate(const Params& p, const optional_params& o) CommonDispatchData SpaceToBatchKernelBase::SetDefault(const space_to_batch_params& params, const optional_params&) const { const auto& out = params.output; + auto in_layout = params.inputs[0].GetLayout(); + auto out_layout = params.output.GetLayout(); + std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws; CommonDispatchData dispatchData; if (out.GetLayout() == DataLayout::b_fs_yx_fsv16 && out.Feature().v % 16 == 0) { dispatchData.gws = { out.Batch().v, out.Feature().v, out.Y().v * out.X().v }; - dispatchData.lws = {1, 16, 1}; + dispatchData.lws = { 1, 16, 1 }; } else { dispatchData.gws = { out.Batch().v, out.Feature().v, out.W().v * out.Z().v * out.Y().v * out.X().v }; - dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo); + dims_by_gws = {{ Tensor::DataChannelName::BATCH }, + { Tensor::DataChannelName::FEATURE }, + { Tensor::DataChannelName::X, Tensor::DataChannelName::Y, + Tensor::DataChannelName::Z, Tensor::DataChannelName::W }}; + dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws); } return dispatchData; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/space_to_depth/space_to_depth_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/space_to_depth/space_to_depth_kernel_ref.cpp index 2975c1c40f0..8e32117802d 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/space_to_depth/space_to_depth_kernel_ref.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/space_to_depth/space_to_depth_kernel_ref.cpp @@ -48,11 +48,16 @@ bool SpaceToDepthKernelRef::Validate(const Params& p, const optional_params& o) CommonDispatchData SpaceToDepthKernelRef::SetDefault(const space_to_depth_params& params, const optional_params&) const { CommonDispatchData dispatchData; + auto in_layout = params.inputs[0].GetLayout(); + auto out_layout = params.output.GetLayout(); + std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::BATCH }, + { Tensor::DataChannelName::FEATURE }, + { Tensor::DataChannelName::X, Tensor::DataChannelName::Y, Tensor::DataChannelName::Z }}; dispatchData.gws = { params.output.Batch().v, params.output.Feature().v, params.output.Z().v * params.output.Y().v * params.output.X().v }; - dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo); + dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws); return dispatchData; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/strided_slice/strided_slice_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/strided_slice/strided_slice_kernel_ref.cpp index 46dd741815b..6326c208519 100644 ---
a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/strided_slice/strided_slice_kernel_ref.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/strided_slice/strided_slice_kernel_ref.cpp @@ -84,6 +84,11 @@ bool StridedSliceKernelRef::Validate(const Params& p, const optional_params& o) CommonDispatchData StridedSliceKernelRef::SetDefault(const strided_slice_params& params, const optional_params&) const { CommonDispatchData dispatchData; + auto in_layout = params.inputs[0].GetLayout(); + auto out_layout = params.output.GetLayout(); + std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::BATCH }, + { Tensor::DataChannelName::FEATURE }, + { Tensor::DataChannelName::X, Tensor::DataChannelName::Y, Tensor::DataChannelName::Z }}; // If the new_axis_mask is set, then begin, end, and stride are ignored // and a new dimension of length 1 is added. Input data is just copied to the output @@ -92,7 +97,7 @@ CommonDispatchData StridedSliceKernelRef::SetDefault(const strided_slice_params& params.output.Feature().v, params.output.Z().v * params.output.Y().v * params.output.X().v }; - dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo); + dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws); return dispatchData; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/tile/tile_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/tile/tile_kernel_ref.cpp index 40a81b62a5c..f8b7b0d45eb 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/tile/tile_kernel_ref.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/tile/tile_kernel_ref.cpp @@ -34,11 +34,16 @@ ParamsKey TileKernelRef::GetSupportedKey() const { CommonDispatchData TileKernelRef::SetDefault(const tile_params& params, const optional_params&) const { CommonDispatchData dispatchData; + auto in_layout = params.inputs[0].GetLayout(); + auto out_layout = params.output.GetLayout(); + std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::X, Tensor::DataChannelName::Y }, + { Tensor::DataChannelName::Z, Tensor::DataChannelName::W }, + { Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH }}; auto out = params.output; - dispatchData.gws = {out.X().v * out.Y().v, out.Z().v * out.W().v, out.Batch().v * out.Feature().v}; - dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo); + dispatchData.gws = { out.X().v * out.Y().v, out.Z().v * out.W().v, out.Batch().v * out.Feature().v }; + dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws); return dispatchData; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/kernel_base_opencl.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/kernel_base_opencl.cpp index a3cb28236ef..a95605641a4 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/kernel_base_opencl.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/kernel_base_opencl.cpp @@ -184,7 +184,7 @@ void KernelBaseOpenCL::FillCLKernelData(clKernelData& kernel, bool bias, int number_of_inputs, uint32_t number_of_inputs_for_fused_prims) const { - KernelBase::CheckDispatchData(kernelMapName, dispatchData); + KernelBase::CheckDispatchData(kernelMapName, dispatchData, engine_info.maxWorkGroupSize); kernel.code.kernelString =
GetKernelString(kernelMapName, jit, entryPoint, engine_info, exeMode); kernel.params.workGroups.global = dispatchData.gws; kernel.params.workGroups.local = dispatchData.lws; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/kernel_selector_utils.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/kernel_selector_utils.cpp index 14a9e21e8c7..d230508ce91 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/kernel_selector_utils.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/kernel_selector_utils.cpp @@ -199,20 +199,213 @@ std::vector<size_t> GetTensorFriendlyWorkGroups(const DataTensor& t) { return sizes; } -std::vector<size_t> GetOptimalLocalWorkGroupSizes(std::vector<size_t> gws, const EngineInfo& info) { - const size_t lws_max = info.maxWorkGroupSize; - const size_t optimal_lws_values[] = {256, 227, 224, 192, 160, 128, 96, 64, 32, 16, 8, 7, 6, 5, 4, 2, 1}; +std::vector<size_t> GetOptimalLocalWorkGroupSizes(std::vector<size_t> gws, const EngineInfo& info, + DataLayout input_layout, DataLayout output_layout, + std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws) { + enum axis { x, y, z, w, f, b, unused_axis }; + + // Take the GWS/LWS priority order into account to pick a better local work-group size + // and, as a result, get more efficient data reads/writes inside kernels + std::vector<size_t> priority_order = { 0, 1, 2 }; + std::vector<size_t> layout_order = { x, y, z, w, f, b }; + + const size_t gws_dims_num = priority_order.size(); + const size_t axis_num = layout_order.size(); + size_t first_axis_idx = 0; + + std::vector<size_t> axis_by_gws = { unused_axis, unused_axis, unused_axis, unused_axis, unused_axis, unused_axis }; + for (size_t gws_idx = 0; gws_idx < gws_dims_num; gws_idx++) { + for (size_t axis_idx = 0; axis_idx < dims_by_gws[gws_idx].size(); axis_idx++) { + axis_by_gws[static_cast<size_t>(dims_by_gws[gws_idx][axis_idx])] = gws_idx; + } + } + + auto calculate_optimized_priority_order = [&]() -> void { + while (axis_by_gws[layout_order[first_axis_idx]] == unused_axis) + first_axis_idx++; + + for (size_t gws_idx = 0; gws_idx < gws_dims_num; gws_idx++) { + for (size_t axis_idx = first_axis_idx; axis_idx < axis_num; axis_idx++) { + if (axis_by_gws[layout_order[axis_idx]] != unused_axis) { + bool is_already_exists = false; + if (axis_idx > 0) { + for (int i = axis_idx - 1; i >= 0; i--) { + if (axis_by_gws[layout_order[axis_idx]] == axis_by_gws[layout_order[i]]) { + is_already_exists = true; + break; + } + } + } + first_axis_idx++; + if (!is_already_exists) { + priority_order[gws_idx] = axis_by_gws[layout_order[axis_idx]]; + break; + } + } + } + } + }; + + auto one_layout = input_layout == output_layout; + + auto simple_planar_layout = Tensor::SimpleLayout(output_layout); + + auto blocked_fsv_layout = output_layout == DataLayout::b_fs_yx_fsv4 || output_layout == DataLayout::fs_b_yx_fsv32 || + output_layout == DataLayout::b_fs_yx_fsv16 || output_layout == DataLayout::b_fs_zyx_fsv16 || + output_layout == DataLayout::b_fs_yx_fsv32 || output_layout == DataLayout::b_fs_zyx_fsv32; + + auto blocked_bsv_fsv_layout = output_layout == DataLayout::bs_fs_yx_bsv16_fsv16 || output_layout == DataLayout::bs_fs_zyx_bsv16_fsv16; + + auto try_change_priority_order = (simple_planar_layout || blocked_fsv_layout || blocked_bsv_fsv_layout) && one_layout; + + if (try_change_priority_order) { + if (simple_planar_layout) { + switch (output_layout) { + case DataLayout::bf: + layout_order = { f, b, x, y, z, w }; + break; + case DataLayout::fb: + layout_order = { b, f, x, y, z, w }; + break; + case DataLayout::bfyx: + layout_order = {
x, y, f, b, z, w }; + break; + case DataLayout::yxfb: + layout_order = { b, f, x, y, z, w }; + break; + case DataLayout::byxf: + layout_order = { f, x, y, b, z, w }; + break; + case DataLayout::fyxb: + layout_order = { b, x, y, f, z, w }; + break; + case DataLayout::bfxy: + layout_order = { y, x, f, b, z, w }; + break; + case DataLayout::bfzyx: + layout_order = { x, y, z, f, b, w }; + break; + case DataLayout::bfwzyx: + layout_order = { x, y, z, w, f, b }; + break; + default: + layout_order = { x, y, z, w, f, b }; + break; + } + } else if (blocked_fsv_layout) { + if (output_layout == DataLayout::b_fs_yx_fsv4 || output_layout == DataLayout::b_fs_yx_fsv16 || output_layout == DataLayout::b_fs_yx_fsv32) + layout_order = { f, x, y, b, z, w }; + else if (output_layout == DataLayout::b_fs_zyx_fsv16 || output_layout == DataLayout::b_fs_zyx_fsv32) + layout_order = { f, x, y, z, b, w }; + else // output_layout == DataLayout::fs_b_yx_fsv32 + layout_order = { f, x, y, b, z, w }; + } else if (blocked_bsv_fsv_layout) { + layout_order = { f, b, x, y, z, w }; + } + + calculate_optimized_priority_order(); + + // Revert basic priority if something is wrong + if (priority_order[0] == priority_order[1] || priority_order[0] == priority_order[2] || priority_order[1] == priority_order[2] || + priority_order[0] > 2 || priority_order[1] > 2 || priority_order[2] > 2) { + priority_order = { 0, 1, 2 }; + } + } + + size_t lws_max = info.maxWorkGroupSize; + const size_t optimal_lws_values[] = { 1024, 960, 896, 832, 768, 704, 640, 576, + 512, 480, 448, 416, 384, 352, 320, 288, + 256, 227, 224, 192, 160, 128, 96, 64, 32, 16, 8, 7, 6, 5, 4, 2, 1 }; + const size_t suboptimal_lws_values[] = { 1024, 960, 896, 832, 768, 704, 640, 576, + 512, 480, 448, 416, 384, 352, 320, 288, + 256, 227, 224, 192, 160, 128, 96, 64, 32, 16, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 }; + + size_t first_lws_idx = lws_max == 1024 ? 0: + lws_max == 512 ? 
8: + 16; + // Reduces max local wgs for some cases on Gen12+ devices + if (lws_max >= 512) { + auto two_dims_are_odd_and_equal = (gws[0] % 2 && gws[0] > 7 && (gws[0] == gws[1] || gws[0] == gws[2])) || + (gws[1] % 2 && gws[1] > 7 && gws[1] == gws[2]); + + // Known cases when lws_max = 256 works better than lws_max > 256 + auto max_wgs_exception1 = gws[priority_order[0]] == 1278 && gws[priority_order[1]] == 718 && gws[priority_order[2]] % 10 == 0; + auto max_wgs_exception2 = gws[priority_order[0]] == 28 && gws[priority_order[1]] == 168 && gws[priority_order[2]] == 128; + auto max_wgs_exception3 = gws[priority_order[0]] == 1000 && gws[priority_order[1]] == 1 && gws[priority_order[2]] == 64; + auto max_wgs_exception4 = gws[priority_order[0]] == 180 && gws[priority_order[1]] == 320 && gws[priority_order[2]] == 56; + auto max_wgs_exception5 = gws[priority_order[0]] == 1 && gws[priority_order[1]] > 256 && gws[priority_order[2]] == 1; + auto max_wgs_exception6 = gws[priority_order[0]] == 64 && gws[priority_order[1]] == 16 && gws[priority_order[2]] == 1 && + priority_order[1] == 2 && priority_order[2] == 1; + if (two_dims_are_odd_and_equal || max_wgs_exception1 || max_wgs_exception2 || max_wgs_exception3 || max_wgs_exception4 || + max_wgs_exception5 || max_wgs_exception6) { + lws_max = 256; + first_lws_idx = 16; + } + } + size_t total_lws = 1; - std::vector<size_t> lws; + size_t total_gws = 1; + std::vector<size_t> lws = { 1, 1, 1 }; + for (size_t i = 0; i < gws.size(); ++i) { auto rest_lws = lws_max / total_lws; - size_t lws_idx = 0; - while (rest_lws < optimal_lws_values[lws_idx]) lws_idx++; + size_t lws_idx = first_lws_idx; + size_t max_optimal_lws0_value = lws_max; + if (try_change_priority_order && axis_by_gws[f] != unused_axis) { + if (output_layout == DataLayout::b_fs_yx_fsv16 || output_layout == DataLayout::b_fs_zyx_fsv16 || output_layout == DataLayout::fs_b_yx_fsv32) { + max_optimal_lws0_value = 16; + } else if (output_layout == DataLayout::b_fs_yx_fsv32 || output_layout == DataLayout::b_fs_zyx_fsv32) { + max_optimal_lws0_value = 32; + } else if ((output_layout == DataLayout::bs_fs_yx_bsv16_fsv16 || output_layout == DataLayout::bs_fs_zyx_bsv16_fsv16) && + (axis_by_gws[b] == axis_by_gws[f])) { + max_optimal_lws0_value = 256; + } else if ((output_layout == DataLayout::bs_fs_yx_bsv16_fsv16 || output_layout == DataLayout::bs_fs_zyx_bsv16_fsv16) && + (axis_by_gws[b] != axis_by_gws[f]) && (axis_by_gws[b] != unused_axis)) { + max_optimal_lws0_value = 16; + } + } - while (gws[i] % optimal_lws_values[lws_idx]) lws_idx++; + auto can_use_suboptimal_lws1 = (i == 1) && ((gws[priority_order[0]] % 32 == 0) || (gws[priority_order[0]] == 1 && gws[priority_order[2]] % 16 != 0)); + auto can_use_suboptimal_lws2 = (i == 2) && (total_lws == total_gws); + const size_t* lws_values = can_use_suboptimal_lws1 || can_use_suboptimal_lws2 ? + suboptimal_lws_values : + optimal_lws_values; - lws.push_back(optimal_lws_values[lws_idx]); - total_lws *= optimal_lws_values[lws_idx]; + while (rest_lws < lws_values[lws_idx]) lws_idx++; + if (i == 0) { + while (lws_values[lws_idx] > max_optimal_lws0_value) lws_idx++; + } + while (gws[priority_order[i]] % lws_values[lws_idx]) lws_idx++; + + if (lws_max == 256 || total_lws == total_gws) { + lws[priority_order[i]] = lws_values[lws_idx]; + } else { + lws[priority_order[i]] = i == 2 && gws[priority_order[0]] != 1 ?
1 : lws_values[lws_idx]; + if (total_gws > 100 && total_lws < 8 && i == 2) + lws[priority_order[i]] = lws_values[lws_idx]; + } + + total_lws *= lws_values[lws_idx]; + total_gws *= gws[priority_order[i]]; + } + + // For cases with lws { 1, 1, 1 } try to use suboptimal values to increase work group size + if (lws[0] == 1 && lws[1] == 1 && lws[2] == 1) { + total_lws = 1; + for (size_t i = 0; i < gws.size(); ++i) { + auto rest_lws = lws_max / total_lws; + size_t lws_idx = first_lws_idx; + + const size_t* lws_values = suboptimal_lws_values; + + while (rest_lws < lws_values[lws_idx]) lws_idx++; + while (gws[priority_order[i]] % lws_values[lws_idx]) lws_idx++; + + lws[priority_order[i]] = lws_values[lws_idx]; + + total_lws *= lws_values[lws_idx]; + } } return lws; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/kernel_selector_utils.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/kernel_selector_utils.h index 018516b29b7..6444a8873f9 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/kernel_selector_utils.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/kernel_selector_utils.h @@ -8,7 +8,6 @@ #include namespace kernel_selector { - struct weight_bias_params; struct optional_params; struct WeightsReorderParams; @@ -24,6 +23,11 @@ bool UpdateWeightsParams(weight_bias_params& newParams, bool rotate = false); JitConstants GetTensorFriendlyWorkGroupsJit(const DataTensor& t); std::vector<size_t> GetTensorFriendlyWorkGroups(const DataTensor& t); -std::vector<size_t> GetOptimalLocalWorkGroupSizes(std::vector<size_t> gws, const EngineInfo& info); +std::vector<size_t> GetOptimalLocalWorkGroupSizes(std::vector<size_t> gws, const EngineInfo& info, + DataLayout input_layout = DataLayout::bfyx, DataLayout output_layout = DataLayout::bfyx, + std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = + {{ Tensor::DataChannelName::X, Tensor::DataChannelName::Y }, + { Tensor::DataChannelName::FEATURE }, + { Tensor::DataChannelName::BATCH }}); bool CheckInputsOutputNoPitchSameDims(const base_params& params); } // namespace kernel_selector
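The default arguments above keep every legacy call site compiling unchanged. Stripped of the new layout-aware priority ordering and the suboptimal-value fallbacks, the core selection strategy remains the original greedy divisor search; a simplified, self-contained sketch of that loop (illustrative only, not the patched implementation):

#include <cstddef>
#include <vector>

// For each GWS dimension pick the largest candidate that divides the
// dimension and still fits into the remaining work-group budget.
std::vector<size_t> PickLws(const std::vector<size_t>& gws, size_t lws_max) {
    static const size_t candidates[] = {256, 227, 224, 192, 160, 128, 96, 64,
                                        32, 16, 8, 7, 6, 5, 4, 2, 1};
    std::vector<size_t> lws;
    size_t total = 1;
    for (size_t g : gws) {
        size_t pick = 1;  // 1 always divides g and always fits the budget
        for (size_t c : candidates) {
            if (c <= lws_max / total && g % c == 0) { pick = c; break; }
        }
        lws.push_back(pick);
        total *= pick;
    }
    return lws;  // the product never exceeds lws_max; each lws[i] divides gws[i]
}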
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_base.cpp index d44c8d07553..a9a12b721bb 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_base.cpp @@ -24,15 +24,16 @@ std::string toString(const kernel_selector::CommonDispatchData& dispatchData) { return os.str(); } -void KernelBase::CheckDispatchData(const std::string& kernelName, const kernel_selector::CommonDispatchData& dispatchData) { +void KernelBase::CheckDispatchData(const std::string& kernelName, const kernel_selector::CommonDispatchData& dispatchData, + const size_t maxWorkGroupSize) { if (dispatchData.gws.size() != 3 || dispatchData.lws.size() != 3) throw std::runtime_error("ERROR: Invalid dispatch data for kernel: " + kernelName + ": " + ": LWS and GWS size is expected to be equal to 3. Actual: " + toString(dispatchData)); - if (dispatchData.lws[0] * dispatchData.lws[1] * dispatchData.lws[2] > 256) { + if (dispatchData.lws[0] * dispatchData.lws[1] * dispatchData.lws[2] > maxWorkGroupSize) { throw std::runtime_error("ERROR: Invalid dispatch data for kernel: " + kernelName + - ": LWS cannot be greater than 256. Actual: " + + ": LWS cannot be greater than " + std::to_string(static_cast<size_t>(maxWorkGroupSize)) + ". Actual: " + toString(dispatchData)); } for (size_t i = 0; i < dispatchData.gws.size(); i++) { diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_base.h index c5d996104e5..c7bd0375c51 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_base.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_base.h @@ -61,7 +61,8 @@ protected: static const primitive_db db; const std::string kernelName; - static void CheckDispatchData(const std::string& kernelName, const kernel_selector::CommonDispatchData& dispatchData); + static void CheckDispatchData(const std::string& kernelName, const kernel_selector::CommonDispatchData& dispatchData, + const size_t maxWorkGroupSize); virtual Datatype GetUnitType(const base_params& params) const; bool IsFusedPrimitiveSupported(const fused_operation_desc& fused_op) const; diff --git a/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_device.cpp b/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_device.cpp index 821d790c822..85e6c4a5258 100644 --- a/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_device.cpp +++ b/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_device.cpp @@ -199,10 +199,6 @@ device_info init_device_info(const cl::Device& device) { info.max_work_group_size = static_cast<size_t>(device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>()); - // looks like WA. Do we still need it? - if (info.max_work_group_size > 256) - info.max_work_group_size = 256; - info.max_local_mem_size = static_cast<size_t>(device.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>()); info.max_global_mem_size = static_cast<size_t>(device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>()); info.max_alloc_mem_size = static_cast<size_t>(device.getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>());
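With the hard 256 cap removed from both the checker and the device query, the one remaining invariant is that the LWS product fits the device limit. A standalone sketch of that check (hypothetical helper, mirroring the intent of CheckDispatchData above):

#include <functional>
#include <numeric>
#include <stdexcept>
#include <string>
#include <vector>

// Validate the local work-group size against the device's real limit
// instead of the previously hard-coded 256.
void CheckLws(const std::vector<size_t>& lws, size_t max_work_group_size) {
    size_t total = std::accumulate(lws.begin(), lws.end(), size_t{1},
                                   std::multiplies<size_t>());
    if (total > max_work_group_size)
        throw std::runtime_error("LWS " + std::to_string(total) +
                                 " exceeds device limit " +
                                 std::to_string(max_work_group_size));
}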