[GPU] Merge kernel updates (#7699)
+ Fix a bug caused by a bias type mismatch in convolution_gpu_bfyx_to_bfyx_f16
+ Refactor depth_to_space_kernel_base and depth_to_space_kernel_ref
+ Adjust LWS selection in eltwise_kernel_base
parent ee93823b3a
commit 1e6fd56e01
@@ -27,18 +27,6 @@ bool DepthToSpaceKernelBase::Validate(const Params& p, const optional_params& o)
     return true;
 }
 
-CommonDispatchData DepthToSpaceKernelBase::SetDefault(const depth_to_space_params& params) const {
-    CommonDispatchData dispatchData;
-
-    dispatchData.gws = { params.output.Batch().v,
-                         params.output.Feature().v,
-                         params.output.Z().v * params.output.Y().v * params.output.X().v };
-
-    dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);
-
-    return dispatchData;
-}
-
 JitConstants DepthToSpaceKernelBase::GetJitConstants(const depth_to_space_params& params) const {
     JitConstants jit = MakeBaseParamsJitConstants(params);
 
@@ -45,7 +45,7 @@ public:
 protected:
     bool Validate(const Params&, const optional_params&) const override;
     virtual JitConstants GetJitConstants(const depth_to_space_params& params) const;
-    virtual CommonDispatchData SetDefault(const depth_to_space_params& params) const;
+    virtual CommonDispatchData SetDefault(const depth_to_space_params& params) const = 0;
     KernelsData GetCommonKernelsData(const Params& params, const optional_params&) const;
 };
 }  // namespace kernel_selector
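Note: the two hunks above remove the shared dispatch logic from the base class and make SetDefault pure virtual, so each depth_to_space kernel now has to supply its own work sizes. A minimal C++ sketch of the resulting pattern, using simplified stand-in types rather than the real kernel_selector declarations:

    #include <array>
    #include <cstddef>

    // Simplified stand-ins for kernel_selector types (illustrative only).
    struct CommonDispatchData {
        std::array<size_t, 3> gws{1, 1, 1};  // global work size
        std::array<size_t, 3> lws{1, 1, 1};  // local work size
    };
    struct depth_to_space_params { size_t batch, feature, spatial; };

    class DepthToSpaceKernelBase {
    protected:
        // After this change the base no longer provides a default dispatch:
        // every concrete kernel must decide its own GWS/LWS.
        virtual CommonDispatchData SetDefault(const depth_to_space_params& params) const = 0;
    };

    class DepthToSpaceKernelRef : public DepthToSpaceKernelBase {
    protected:
        CommonDispatchData SetDefault(const depth_to_space_params& params) const override {
            CommonDispatchData d;
            d.gws = {params.batch, params.feature, params.spatial};
            d.lws = {1, 1, 1};  // kernel-specific choice; see the next hunk
            return d;
        }
    };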
@@ -28,6 +28,18 @@ ParamsKey DepthToSpaceKernelRef::GetSupportedKey() const {
     return k;
 }
 
+CommonDispatchData DepthToSpaceKernelRef::SetDefault(const depth_to_space_params& params) const {
+    CommonDispatchData dispatchData;
+
+    dispatchData.gws = { params.output.Batch().v,
+                         params.output.Feature().v,
+                         params.output.Z().v * params.output.Y().v * params.output.X().v };
+
+    // this kernel only supports bfyx and b_fs_yx_fsv16 layout.
+    dispatchData.lws = GetOptimalLocalWorkGroupSizes({1, dispatchData.gws[1], dispatchData.gws[2]}, params.engineInfo);
+    return dispatchData;
+}
+
 KernelsData DepthToSpaceKernelRef::GetKernelsData(const Params& params, const optional_params& options) const {
     return GetCommonKernelsData(params, options);
 }
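The ref kernel's new SetDefault asks for local sizes over {1, gws[1], gws[2]} rather than the full GWS, i.e. the batch axis is pinned to 1 while a local work-group shape is searched over feature and spatial sizes only. A rough divisor-based picker in that spirit (illustrative only; the real GetOptimalLocalWorkGroupSizes in kernel_selector uses its own heuristics):

    #include <array>
    #include <cstddef>

    // Toy heuristic: per axis, take the largest divisor of gws[i] that keeps
    // the total work-group size under max_wg. Not the real implementation.
    std::array<size_t, 3> pick_lws(std::array<size_t, 3> gws, size_t max_wg) {
        std::array<size_t, 3> lws{1, 1, 1};
        size_t total = 1;
        for (int i = 2; i >= 0; --i) {              // favour the fastest-varying axes
            for (size_t d = gws[i]; d >= 1; --d) {
                if (gws[i] % d == 0 && total * d <= max_wg) {
                    lws[i] = d;
                    total *= d;
                    break;
                }
            }
        }
        return lws;
    }

    // Passing {1, gws[1], gws[2]}, as the new SetDefault does, forces lws for
    // the batch axis to 1, so work-groups never span the batch dimension.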
@@ -14,6 +14,7 @@ public:
     DepthToSpaceKernelRef() : DepthToSpaceKernelBase("depth_to_space_ref") {}
     virtual ~DepthToSpaceKernelRef() {}
 
+    CommonDispatchData SetDefault(const depth_to_space_params& params) const override;
     KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
     KernelsPriority GetKernelsPriority(const Params& params, const optional_params& options) const override;
     ParamsKey GetSupportedKey() const override;
@@ -620,6 +620,13 @@ EltwiseKernelBase::DispatchData EltwiseKernelBase::SetDefault(const eltwise_para
             dispatchData.lws[1] = bs_fsv32_local[2];
             dispatchData.lws[2] = bs_fsv32_local[0];
         }
+    } else if (params.output.GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv16 &&
+               (params.output.Feature().v % 16 != 0 || dispatchData.gws[1] % 16 != 0)) {
+        auto bs_fsv16_local = GetLimitedOptimalLocalWorkGroupSizes({dispatchData.gws[1], dispatchData.gws[2], dispatchData.gws[0]},
+                                                                   params.engineInfo, {16, 32, params.engineInfo.maxWorkGroupSize});
+        dispatchData.lws[0] = bs_fsv16_local[2];
+        dispatchData.lws[1] = bs_fsv16_local[0];
+        dispatchData.lws[2] = bs_fsv16_local[1];
     } else {
         dispatchData.lws[0] = local[0];
         dispatchData.lws[1] = local[1];
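In the new bs_fs_yx_bsv32_fsv16 branch, candidate local sizes are computed over a re-ordered triple {gws[1], gws[2], gws[0]} (feature, spatial, batch) with per-axis caps {16, 32, maxWorkGroupSize}, then mapped back onto lws[0..2]. A small sketch of that index shuffle, with limited_lws as a hypothetical stand-in for GetLimitedOptimalLocalWorkGroupSizes (grossly simplified; the real helper picks divisors and bounds the total size):

    #include <algorithm>
    #include <array>
    #include <cstddef>

    // Hypothetical stand-in: clamp each axis to its cap (real code picks divisors).
    std::array<size_t, 3> limited_lws(std::array<size_t, 3> gws, std::array<size_t, 3> caps) {
        return { std::min(gws[0], caps[0]), std::min(gws[1], caps[1]), std::min(gws[2], caps[2]) };
    }

    void set_bsv32_fsv16_lws(const std::array<size_t, 3>& gws, size_t max_wg,
                             std::array<size_t, 3>& lws) {
        // Search order is {feature, spatial, batch}; caps are {16, 32, max_wg}.
        auto local = limited_lws({gws[1], gws[2], gws[0]}, {16, 32, max_wg});
        // Map the reordered result back onto the original axes.
        lws[0] = local[2];  // batch
        lws[1] = local[0];  // feature
        lws[2] = local[1];  // spatial (y*x)
    }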
@@ -9,6 +9,14 @@
 
 #define DT_OUTPUT_BLOCK_WRITEN(ptr, offset, val) BLOCK_WRITEN(OUTPUT_TYPE, OUTPUT_X_BLOCK_SIZE, ptr, offset, val)
 
+#define OUTPUT_PACKED_TYPE MAKE_VECTOR_TYPE(OUTPUT_TYPE, OUTPUT_X_BLOCK_SIZE)
+
+#define TO_OUTPUT_PACKED_TYPE CAT(convert_, OUTPUT_PACKED_TYPE)
+
+#if defined(BIAS_TYPE_SIZE) && FILTER_TYPE_SIZE != BIAS_TYPE_SIZE
+#error "convolution_gpu_bfyx_to_bfyx_f16: Filter and bias has different data type."
+#endif
+
 __attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
 __attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE, 1)))
 KERNEL(convolution_bfyx_to_bfyx_f16)(
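The new #if block turns an unsupported filter/bias type combination into a build error: if BIAS_TYPE_SIZE is defined and differs from FILTER_TYPE_SIZE, the kernel fails to compile instead of silently misreading the bias buffer. The same guard pattern in plain C++ preprocessor terms (the defines below are illustrative placeholders for what the jitter would emit):

    // Illustrative sizes standing in for the jitted values.
    #define FILTER_TYPE_SIZE 2   // e.g. half
    #define BIAS_TYPE_SIZE   2   // change to 4 (float) to trigger the error

    #if defined(BIAS_TYPE_SIZE) && FILTER_TYPE_SIZE != BIAS_TYPE_SIZE
    #error "filter and bias element sizes differ; the kernel would read bias bits with the wrong type"
    #endif

    int main() { return 0; }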
@@ -89,7 +97,7 @@ KERNEL(convolution_bfyx_to_bfyx_f16)(
     bias_offset += split_idx * BIAS_LENGTH;
 #   endif
 
-    MAKE_VECTOR_TYPE(INPUT0_TYPE, OUTPUT_X_BLOCK_SIZE) dst = (MAKE_VECTOR_TYPE(INPUT0_TYPE, OUTPUT_X_BLOCK_SIZE))(DT_INPUT_BLOCK_READ(biases, bias_offset));
+    MAKE_VECTOR_TYPE(INPUT0_TYPE, OUTPUT_X_BLOCK_SIZE) dst = (MAKE_VECTOR_TYPE(INPUT0_TYPE, OUTPUT_X_BLOCK_SIZE))(DT_BIAS_BLOCK_READ(biases, bias_offset));
 #else
     MAKE_VECTOR_TYPE(INPUT0_TYPE, OUTPUT_X_BLOCK_SIZE) dst = INPUT0_VAL_ZERO;
 #endif
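This is the bias fix itself: the bias buffer is now read with a bias-typed block read (DT_BIAS_BLOCK_READ) instead of the input-typed one, so when the bias element type differs from the input type the bytes are interpreted correctly. A C++ analogy of the bug class, not the kernel code: reading a buffer through a read typed for a different element reinterprets bits instead of converting values.

    #include <cstdint>
    #include <cstring>
    #include <iostream>

    int main() {
        float bias = 1.5f;                    // bias stored as float

        uint16_t wrong;                       // "block read" typed for a 16-bit element
        std::memcpy(&wrong, &bias, sizeof(wrong));   // reinterprets the low bytes

        float right = bias;                   // read with the buffer's own type

        std::cout << "reinterpreted: " << wrong << ", converted: " << right << "\n";
    }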
@@ -143,9 +151,9 @@ KERNEL(convolution_bfyx_to_bfyx_f16)(
         }
     }
 
-    MAKE_VECTOR_TYPE(OUTPUT_TYPE, OUTPUT_X_BLOCK_SIZE) res;
+    OUTPUT_PACKED_TYPE res;
 #ifndef HAS_FUSED_OPS
-    res = ACTIVATION(dst, ACTIVATION_PARAMS);
+    res = TO_OUTPUT_PACKED_TYPE(ACTIVATION(dst, ACTIVATION_PARAMS));
 #endif
 
 #if OUTPUT_LEFTOVERS
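Here res is declared with the new OUTPUT_PACKED_TYPE and the activation result, which is computed in the input element type, is explicitly converted with TO_OUTPUT_PACKED_TYPE before being stored, rather than relying on an assignment between differently typed vectors. Roughly the same idea in C++ terms (the types below are placeholders for the jitted OUTPUT_TYPE and block size):

    #include <array>
    #include <cstddef>
    #include <cstdint>

    using output_t = int8_t;   // placeholder for OUTPUT_TYPE

    // Explicit per-lane conversion of a float accumulator vector into the
    // packed output vector, mirroring TO_OUTPUT_PACKED_TYPE(...) in the kernel.
    template <size_t N>
    std::array<output_t, N> to_output_packed(const std::array<float, N>& acc) {
        std::array<output_t, N> out{};
        for (size_t i = 0; i < N; ++i)
            out[i] = static_cast<output_t>(acc[i]);
        return out;
    }

    int main() {
        std::array<float, 8> acc{};            // OUTPUT_X_BLOCK_SIZE == 8, say
        auto res = to_output_packed(acc);      // explicit conversion, as in the hunk
        (void)res;
    }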