From 1e6fd56e0124aa30cf921c4d2c43583aed1c52d4 Mon Sep 17 00:00:00 2001
From: Jade Cho
Date: Wed, 29 Sep 2021 17:10:53 +0900
Subject: [PATCH] [GPU] Merge kernel updates (#7699)

+ fix a bug due to bias type mismatching
  - convolution_gpu_bfyx_to_bfyx_f16
+ refactoring
  - depth_to_space_kernel_base
  - depth_to_space_kernel_ref
+ Adjusting LWS
  - eltwise_kernel_base
---
Reviewer notes (placed after the three-dash line, so they are not part of the
commit message):

  * depth_to_space: SetDefault() is removed from DepthToSpaceKernelBase, its
    declaration becomes pure virtual, and an override is added to
    DepthToSpaceKernelRef. The override builds the same gws as the removed
    code but passes {1, gws[1], gws[2]} to GetOptimalLocalWorkGroupSizes()
    instead of the full gws.
  * eltwise: a new branch handles bs_fs_yx_bsv32_fsv16 outputs when
    Feature % 16 != 0 or gws[1] % 16 != 0. It calls
    GetLimitedOptimalLocalWorkGroupSizes() on {gws[1], gws[2], gws[0]} with
    limits {16, 32, maxWorkGroupSize} and maps the result back to
    lws[1], lws[2], lws[0] respectively.
  * convolution_gpu_bfyx_to_bfyx_f16.cl: the bias is now read with
    DT_BIAS_BLOCK_READ instead of DT_INPUT_BLOCK_READ; compilation stops with
    #error when BIAS_TYPE_SIZE is defined and differs from FILTER_TYPE_SIZE;
    `res` is declared as OUTPUT_PACKED_TYPE and, when HAS_FUSED_OPS is not
    defined, is assigned through TO_OUTPUT_PACKED_TYPE, i.e.
    CAT(convert_, OUTPUT_PACKED_TYPE).
  * NOTE(review): DT_BIAS_BLOCK_READ is not defined in any hunk of this
    patch - presumably it already exists in the .cl file; confirm.
  * NOTE(review): line breaks, blank context lines and indentation inside the
    hunks were restored to match the @@ line counts and common formatting;
    verify against the target tree (or apply with whitespace tolerance).

 .../depth_to_space/depth_to_space_kernel_base.cpp  | 12 ------------
 .../depth_to_space/depth_to_space_kernel_base.h    |  2 +-
 .../depth_to_space/depth_to_space_kernel_ref.cpp   | 12 ++++++++++++
 .../depth_to_space/depth_to_space_kernel_ref.h     |  1 +
 .../actual_kernels/eltwise/eltwise_kernel_base.cpp |  7 +++++++
 .../cl_kernels/convolution_gpu_bfyx_to_bfyx_f16.cl | 14 +++++++++++---
 6 files changed, 32 insertions(+), 16 deletions(-)

diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_base.cpp
index 2a1631d8d02..2d9d82dbe3d 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_base.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_base.cpp
@@ -27,18 +27,6 @@ bool DepthToSpaceKernelBase::Validate(const Params& p, const optional_params& o)
     return true;
 }
 
-CommonDispatchData DepthToSpaceKernelBase::SetDefault(const depth_to_space_params& params) const {
-    CommonDispatchData dispatchData;
-
-    dispatchData.gws = { params.output.Batch().v,
-                         params.output.Feature().v,
-                         params.output.Z().v * params.output.Y().v * params.output.X().v };
-
-    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
-
-    return dispatchData;
-}
-
 JitConstants DepthToSpaceKernelBase::GetJitConstants(const depth_to_space_params& params) const {
     JitConstants jit = MakeBaseParamsJitConstants(params);
 
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_base.h
index fee66457080..7c4ac7e05b9 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_base.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_base.h
@@ -45,7 +45,7 @@ public:
 protected:
     bool Validate(const Params&, const optional_params&) const override;
     virtual JitConstants GetJitConstants(const depth_to_space_params& params) const;
-    virtual CommonDispatchData SetDefault(const depth_to_space_params& params) const;
+    virtual CommonDispatchData SetDefault(const depth_to_space_params& params) const = 0;
     KernelsData GetCommonKernelsData(const Params& params, const optional_params&) const;
 };
 }  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_ref.cpp
index 8a247a2cc4d..fd2c7135327 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_ref.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_ref.cpp
@@ -28,6 +28,18 @@ ParamsKey DepthToSpaceKernelRef::GetSupportedKey() const {
     return k;
 }
 
+CommonDispatchData DepthToSpaceKernelRef::SetDefault(const depth_to_space_params& params) const {
+    CommonDispatchData dispatchData;
+
+    dispatchData.gws = { params.output.Batch().v,
+                         params.output.Feature().v,
+                         params.output.Z().v * params.output.Y().v * params.output.X().v };
+
+    // this kernel only supports bfyx and b_fs_yx_fsv16 layout.
+    dispatchData.lws = GetOptimalLocalWorkGroupSizes({1, dispatchData.gws[1], dispatchData.gws[2]}, params.engineInfo);
+    return dispatchData;
+}
+
 KernelsData DepthToSpaceKernelRef::GetKernelsData(const Params& params, const optional_params& options) const {
     return GetCommonKernelsData(params, options);
 }
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_ref.h
index 14f28944f19..9d15d0eabff 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_ref.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_ref.h
@@ -14,6 +14,7 @@ public:
     DepthToSpaceKernelRef() : DepthToSpaceKernelBase("depth_to_space_ref") {}
     virtual ~DepthToSpaceKernelRef() {}
 
+    CommonDispatchData SetDefault(const depth_to_space_params& params) const override;
     KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
     KernelsPriority GetKernelsPriority(const Params& params, const optional_params& options) const override;
     ParamsKey GetSupportedKey() const override;
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_base.cpp
index e4cf66c57bb..9d1a0bb601c 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_base.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_base.cpp
@@ -620,6 +620,13 @@ EltwiseKernelBase::DispatchData EltwiseKernelBase::SetDefault(const eltwise_para
             dispatchData.lws[1] = bs_fsv32_local[2];
             dispatchData.lws[2] = bs_fsv32_local[0];
         }
+    } else if (params.output.GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv16 &&
+               (params.output.Feature().v % 16 != 0 || dispatchData.gws[1] % 16 != 0)) {
+        auto bs_fsv16_local = GetLimitedOptimalLocalWorkGroupSizes({dispatchData.gws[1], dispatchData.gws[2], dispatchData.gws[0]},
+                                                                   params.engineInfo, {16, 32, params.engineInfo.maxWorkGroupSize});
+        dispatchData.lws[0] = bs_fsv16_local[2];
+        dispatchData.lws[1] = bs_fsv16_local[0];
+        dispatchData.lws[2] = bs_fsv16_local[1];
     } else {
         dispatchData.lws[0] = local[0];
         dispatchData.lws[1] = local[1];
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_to_bfyx_f16.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_to_bfyx_f16.cl
index 21ff35819d6..d3c8b435118 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_to_bfyx_f16.cl
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_to_bfyx_f16.cl
@@ -9,6 +9,14 @@
 
 #define DT_OUTPUT_BLOCK_WRITEN(ptr, offset, val) BLOCK_WRITEN(OUTPUT_TYPE, OUTPUT_X_BLOCK_SIZE, ptr, offset, val)
 
+#define OUTPUT_PACKED_TYPE MAKE_VECTOR_TYPE(OUTPUT_TYPE, OUTPUT_X_BLOCK_SIZE)
+
+#define TO_OUTPUT_PACKED_TYPE CAT(convert_, OUTPUT_PACKED_TYPE)
+
+#if defined(BIAS_TYPE_SIZE) && FILTER_TYPE_SIZE != BIAS_TYPE_SIZE
+#error "convolution_gpu_bfyx_to_bfyx_f16: Filter and bias has different data type."
+#endif
+
 __attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
 __attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE, 1)))
 KERNEL(convolution_bfyx_to_bfyx_f16)(
@@ -89,7 +97,7 @@ KERNEL(convolution_bfyx_to_bfyx_f16)(
     bias_offset += split_idx * BIAS_LENGTH;
 #   endif
 
-    MAKE_VECTOR_TYPE(INPUT0_TYPE, OUTPUT_X_BLOCK_SIZE) dst = (MAKE_VECTOR_TYPE(INPUT0_TYPE, OUTPUT_X_BLOCK_SIZE))(DT_INPUT_BLOCK_READ(biases, bias_offset));
+    MAKE_VECTOR_TYPE(INPUT0_TYPE, OUTPUT_X_BLOCK_SIZE) dst = (MAKE_VECTOR_TYPE(INPUT0_TYPE, OUTPUT_X_BLOCK_SIZE))(DT_BIAS_BLOCK_READ(biases, bias_offset));
 #else
     MAKE_VECTOR_TYPE(INPUT0_TYPE, OUTPUT_X_BLOCK_SIZE) dst = INPUT0_VAL_ZERO;
 #endif
@@ -143,9 +151,9 @@ KERNEL(convolution_bfyx_to_bfyx_f16)(
         }
     }
 
-    MAKE_VECTOR_TYPE(OUTPUT_TYPE, OUTPUT_X_BLOCK_SIZE) res;
+    OUTPUT_PACKED_TYPE res;
 #ifndef HAS_FUSED_OPS
-    res = ACTIVATION(dst, ACTIVATION_PARAMS);
+    res = TO_OUTPUT_PACKED_TYPE(ACTIVATION(dst, ACTIVATION_PARAMS));
 #endif
 
 #if OUTPUT_LEFTOVERS