[GPU] Merge kernel updates (#7699)

+ fix a bug caused by a bias data-type mismatch
  - convolution_gpu_bfyx_to_bfyx_f16

+ refactoring
  - depth_to_space_kernel_base
  - depth_to_space_kernel_ref

+ Adjusting LWS
  - eltwise_kernel_base
This commit is contained in:
Jade Cho 2021-09-29 17:10:53 +09:00 committed by GitHub
parent ee93823b3a
commit 1e6fd56e01
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 32 additions and 16 deletions

View File

@ -27,18 +27,6 @@ bool DepthToSpaceKernelBase::Validate(const Params& p, const optional_params& o)
return true; return true;
} }
CommonDispatchData DepthToSpaceKernelBase::SetDefault(const depth_to_space_params& params) const {
    // Map the output tensor onto a 3D dispatch: {batch, feature, flattened spatial}.
    CommonDispatchData dd;
    const auto& out = params.output;
    const size_t spatial = out.Z().v * out.Y().v * out.X().v;
    dd.gws = { out.Batch().v, out.Feature().v, spatial };
    // Let the helper derive a local size that fits the device limits for this gws.
    dd.lws = GetOptimalLocalWorkGroupSizes(dd.gws, params.engineInfo);
    return dd;
}
JitConstants DepthToSpaceKernelBase::GetJitConstants(const depth_to_space_params& params) const { JitConstants DepthToSpaceKernelBase::GetJitConstants(const depth_to_space_params& params) const {
JitConstants jit = MakeBaseParamsJitConstants(params); JitConstants jit = MakeBaseParamsJitConstants(params);

View File

@ -45,7 +45,7 @@ public:
protected: protected:
bool Validate(const Params&, const optional_params&) const override; bool Validate(const Params&, const optional_params&) const override;
virtual JitConstants GetJitConstants(const depth_to_space_params& params) const; virtual JitConstants GetJitConstants(const depth_to_space_params& params) const;
virtual CommonDispatchData SetDefault(const depth_to_space_params& params) const; virtual CommonDispatchData SetDefault(const depth_to_space_params& params) const = 0;
KernelsData GetCommonKernelsData(const Params& params, const optional_params&) const; KernelsData GetCommonKernelsData(const Params& params, const optional_params&) const;
}; };
} // namespace kernel_selector } // namespace kernel_selector

View File

@ -28,6 +28,18 @@ ParamsKey DepthToSpaceKernelRef::GetSupportedKey() const {
return k; return k;
} }
CommonDispatchData DepthToSpaceKernelRef::SetDefault(const depth_to_space_params& params) const {
    // Global size: one work-item per output element, laid out as
    // {batch, feature, z*y*x}.
    CommonDispatchData dispatch;
    const auto& out = params.output;
    dispatch.gws = { out.Batch().v, out.Feature().v, out.Z().v * out.Y().v * out.X().v };
    // This kernel only supports the bfyx and b_fs_yx_fsv16 layouts, so the
    // batch dimension is pinned to 1 when choosing the optimal local size.
    dispatch.lws = GetOptimalLocalWorkGroupSizes({1, dispatch.gws[1], dispatch.gws[2]}, params.engineInfo);
    return dispatch;
}
KernelsData DepthToSpaceKernelRef::GetKernelsData(const Params& params, const optional_params& options) const { KernelsData DepthToSpaceKernelRef::GetKernelsData(const Params& params, const optional_params& options) const {
return GetCommonKernelsData(params, options); return GetCommonKernelsData(params, options);
} }

View File

@ -14,6 +14,7 @@ public:
DepthToSpaceKernelRef() : DepthToSpaceKernelBase("depth_to_space_ref") {} DepthToSpaceKernelRef() : DepthToSpaceKernelBase("depth_to_space_ref") {}
virtual ~DepthToSpaceKernelRef() {} virtual ~DepthToSpaceKernelRef() {}
CommonDispatchData SetDefault(const depth_to_space_params& params) const override;
KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
KernelsPriority GetKernelsPriority(const Params& params, const optional_params& options) const override; KernelsPriority GetKernelsPriority(const Params& params, const optional_params& options) const override;
ParamsKey GetSupportedKey() const override; ParamsKey GetSupportedKey() const override;

View File

@ -620,6 +620,13 @@ EltwiseKernelBase::DispatchData EltwiseKernelBase::SetDefault(const eltwise_para
dispatchData.lws[1] = bs_fsv32_local[2]; dispatchData.lws[1] = bs_fsv32_local[2];
dispatchData.lws[2] = bs_fsv32_local[0]; dispatchData.lws[2] = bs_fsv32_local[0];
} }
} else if (params.output.GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv16 &&
(params.output.Feature().v % 16 != 0 || dispatchData.gws[1] % 16 != 0)) {
auto bs_fsv16_local = GetLimitedOptimalLocalWorkGroupSizes({dispatchData.gws[1], dispatchData.gws[2], dispatchData.gws[0]},
params.engineInfo, {16, 32, params.engineInfo.maxWorkGroupSize});
dispatchData.lws[0] = bs_fsv16_local[2];
dispatchData.lws[1] = bs_fsv16_local[0];
dispatchData.lws[2] = bs_fsv16_local[1];
} else { } else {
dispatchData.lws[0] = local[0]; dispatchData.lws[0] = local[0];
dispatchData.lws[1] = local[1]; dispatchData.lws[1] = local[1];

View File

@ -9,6 +9,14 @@
#define DT_OUTPUT_BLOCK_WRITEN(ptr, offset, val) BLOCK_WRITEN(OUTPUT_TYPE, OUTPUT_X_BLOCK_SIZE, ptr, offset, val) #define DT_OUTPUT_BLOCK_WRITEN(ptr, offset, val) BLOCK_WRITEN(OUTPUT_TYPE, OUTPUT_X_BLOCK_SIZE, ptr, offset, val)
#define OUTPUT_PACKED_TYPE MAKE_VECTOR_TYPE(OUTPUT_TYPE, OUTPUT_X_BLOCK_SIZE)
#define TO_OUTPUT_PACKED_TYPE CAT(convert_, OUTPUT_PACKED_TYPE)
#if defined(BIAS_TYPE_SIZE) && FILTER_TYPE_SIZE != BIAS_TYPE_SIZE
#error "convolution_gpu_bfyx_to_bfyx_f16: Filter and bias has different data type."
#endif
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) __attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
__attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE, 1))) __attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE, 1)))
KERNEL(convolution_bfyx_to_bfyx_f16)( KERNEL(convolution_bfyx_to_bfyx_f16)(
@ -89,7 +97,7 @@ KERNEL(convolution_bfyx_to_bfyx_f16)(
bias_offset += split_idx * BIAS_LENGTH; bias_offset += split_idx * BIAS_LENGTH;
# endif # endif
MAKE_VECTOR_TYPE(INPUT0_TYPE, OUTPUT_X_BLOCK_SIZE) dst = (MAKE_VECTOR_TYPE(INPUT0_TYPE, OUTPUT_X_BLOCK_SIZE))(DT_INPUT_BLOCK_READ(biases, bias_offset)); MAKE_VECTOR_TYPE(INPUT0_TYPE, OUTPUT_X_BLOCK_SIZE) dst = (MAKE_VECTOR_TYPE(INPUT0_TYPE, OUTPUT_X_BLOCK_SIZE))(DT_BIAS_BLOCK_READ(biases, bias_offset));
#else #else
MAKE_VECTOR_TYPE(INPUT0_TYPE, OUTPUT_X_BLOCK_SIZE) dst = INPUT0_VAL_ZERO; MAKE_VECTOR_TYPE(INPUT0_TYPE, OUTPUT_X_BLOCK_SIZE) dst = INPUT0_VAL_ZERO;
#endif #endif
@ -143,9 +151,9 @@ KERNEL(convolution_bfyx_to_bfyx_f16)(
} }
} }
MAKE_VECTOR_TYPE(OUTPUT_TYPE, OUTPUT_X_BLOCK_SIZE) res; OUTPUT_PACKED_TYPE res;
#ifndef HAS_FUSED_OPS #ifndef HAS_FUSED_OPS
res = ACTIVATION(dst, ACTIVATION_PARAMS); res = TO_OUTPUT_PACKED_TYPE(ACTIVATION(dst, ACTIVATION_PARAMS));
#endif #endif
#if OUTPUT_LEFTOVERS #if OUTPUT_LEFTOVERS