[GPU] Added a couple of supported SIMD size checks (#7919)

2021-10-13 12:22:08 +03:00 · 2021-10-13 12:22:08 +03:00 · efcdaac4a6
commit efcdaac4a6
parent 82f8f19d11
15 changed files with 48 additions and 4 deletions
--- a/inference-engine/thirdparty/clDNN/api/cldnn/runtime/device_info.hpp
+++ b/inference-engine/thirdparty/clDNN/api/cldnn/runtime/device_info.hpp
@ -5,6 +5,7 @@
 #pragma once

 #include <string>
+#include <vector>

 namespace cldnn {
 /// @addtogroup cpp_api C++ API
@ -54,6 +55,8 @@ struct device_info {

    bool supports_usm;                          ///< Does engine support unified shared memory.

+    std::vector<size_t> supported_simd_sizes;   ///< List of SIMD sizes supported by current device and compiler
+
    uint32_t vendor_id;                         ///< Vendor ID
    std::string dev_name;                       ///< Device ID string
    std::string driver_version;                 ///< Version of OpenCL driver
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_1x1_opt.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_1x1_opt.cpp
@ -93,6 +93,9 @@ bool convolution_kernel_bfyx_1x1_opt::Validate(const Params& p, const optional_p
    }
    const convolution_params& cp = static_cast<const convolution_params&>(p);

+    if (!IsSIMDSizeSupported(cp.engineInfo, 8))
+        return false;
+
    if (cp.stride.x != 1 || cp.stride.y != 1)
        return false;

--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_gemm_like.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_gemm_like.cpp
@ -98,6 +98,9 @@ bool ConvolutionKernel_bfyx_GEMMLike::Validate(const Params& p, const optional_p

    const auto& params = static_cast<const convolution_params&>(p);

+    if (!IsSIMDSizeSupported(p.engineInfo, 8) && params.inputs[0].GetDType() == Datatype::F32)
+        return false;
+
    if (!params.engineInfo.bSubGroupShortSupport && params.inputs[0].GetDType() == Datatype::F16) {
        return false;
    }
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16.h
@ -40,7 +40,7 @@ protected:
            // Smaller # EU tends to be computation bounds.
            // In such case, using larger worksize will result in larger computational inefficiency
            // w.r.t the unalined output feature
-            return (params.output.Feature().v > 8) ? 16 : 8;
+            return (params.output.Feature().v > 8 || !IsSIMDSizeSupported(params.engineInfo, 8)) ? 16 : 8;
        } else {
            return 16;
        }
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bs_f_bsv8_af8.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bs_f_bsv8_af8.cpp
@ -57,6 +57,9 @@ bool FullyConnected_bs_f_bsv8_af8::Validate(const Params& p, const optional_para
        return false;
    }

+    if (!IsSIMDSizeSupported(p.engineInfo, 8))
+        return false;
+
    const auto& params = static_cast<const fully_connected_params&>(p);
    const auto& optParams = static_cast<const fully_connected_optional_params&>(o);

--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_io_b8_f8.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_io_b8_f8.cpp
@ -56,6 +56,9 @@ bool FullyConnected_fb_io_b8_f8::Validate(const Params& p, const optional_params
        return false;
    }

+    if (!IsSIMDSizeSupported(p.engineInfo, 8))
+        return false;
+
    const auto& params = static_cast<const fully_connected_params&>(p);

    const auto& output = params.output;
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad.cpp
@ -68,8 +68,8 @@ FullyConnectedKernelMMAD::FullyConnectedTuningData FullyConnectedKernelMMAD::Get
        output_feature = output.Y().v;
    }

-    tuning_data.sub_group_size = 8;
-    if (input.X().v == 1 && input.Z().v == 1 && input.Batch().v == 1 &&
+    tuning_data.sub_group_size = IsSIMDSizeSupported(params.engineInfo, 8) ? 8 : 16;
+    if (tuning_data.sub_group_size == 8 && input.X().v == 1 && input.Z().v == 1 && input.Batch().v == 1 &&
        ((input.Y().v == 1 && output.GetLayout() != DataLayout::bfyx) || (input.Feature().v == 1 && output.GetLayout() == DataLayout::bfyx)) ) {
        // Known cases for TGL where simd16 works better than simd8
        bool simd16_exception_1 = input.Feature().v == 25088 && output.Feature().v == 512;
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_across_channel_opt_b8.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_across_channel_opt_b8.cpp
@ -38,6 +38,9 @@ bool LRNKernelAcrossChannel_b8::Validate(const Params& p, const optional_params&
        return false;
    }

+    if (!IsSIMDSizeSupported(p.engineInfo, 8))
+        return false;
+
    const lrn_params& params = static_cast<const lrn_params&>(p);
    const auto& out = params.output;

--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_to_yxfb_batched.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_to_yxfb_batched.cpp
@ -26,6 +26,9 @@ bool ReorderKernel_to_yxfb_batched::Validate(const Params& params, const optiona
        return false;
    }

+    if (!IsSIMDSizeSupported(params.engineInfo, 8))
+        return false;
+
    const reorder_params& r_params = static_cast<const reorder_params&>(params);

    const auto& output = r_params.output;
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_base.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_base.cpp
@ -98,6 +98,14 @@ JitConstants KernelBase::MakeBaseParamsJitConstants(const base_params& params) c
    return jit;
 }

+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// IsSIMDSizeSupported - returns true iff simd_size is listed in info.supportedSimdSizes (an empty list yields false)
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+bool KernelBase::IsSIMDSizeSupported(const EngineInfo &info, size_t simd_size) const {
+    const auto& supported_sizes = info.supportedSimdSizes;  // bind by reference: no vector copy per call
+    return std::find(supported_sizes.begin(), supported_sizes.end(), simd_size) != supported_sizes.end();
+}
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // MakeBaseParamsJitConstants
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_base.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_base.h
@ -65,6 +65,7 @@ protected:
    virtual Datatype GetUnitType(const base_params& params) const;

    bool IsFusedPrimitiveSupported(const fused_operation_desc& fused_op) const;
+    bool IsSIMDSizeSupported(const EngineInfo& info, size_t simd_size) const;
    JitConstants MakeBaseParamsJitConstants(const base_params& params) const;
    virtual std::vector<FusedOpType> GetSupportedFusedOps() const;
    virtual JitConstants MakeFusedOpsJitConstants(const base_params &params, const std::vector<FusedOpsConfiguration> &conf) const;
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_params.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_params.h
@ -384,6 +384,7 @@ struct EngineInfo {
    uint64_t maxImage2dHeight = 0;
    std::string deviceId = "";
    std::string driverVersion = "";
+    std::vector<size_t> supportedSimdSizes = {};
    std::shared_ptr<TuningCache> deviceCache;
 };

--- a/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_device.cpp
+++ b/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_device.cpp
@ -233,6 +233,14 @@ device_info init_device_info(const cl::Device& device) {

    info.supports_queue_families = extensions.find("cl_intel_command_queue_families") != std::string::npos;

+    bool sub_group_sizes_supported = extensions.find("cl_intel_required_subgroup_size") != std::string::npos;
+    if (sub_group_sizes_supported) {
+        info.supported_simd_sizes = device.getInfo<CL_DEVICE_SUB_GROUP_SIZES_INTEL>();
+    } else {
+        // Set these values as reasonable default for most of the supported platforms
+        info.supported_simd_sizes = {8, 16, 32};
+    }
+
    bool device_attr_supported = extensions.find("cl_intel_device_attribute_query") != std::string::npos;

    if (device_attr_supported) {
--- a/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_ext.hpp
+++ b/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_ext.hpp
@ -19,6 +19,9 @@ typedef cl_va_api_device_source_intel cl_device_source_intel;
 typedef cl_va_api_device_set_intel    cl_device_set_intel;
 #endif

+// cl_intel_required_subgroup_size
+#define CL_DEVICE_SUB_GROUP_SIZES_INTEL           0x4108
+
 // cl_intel_device_attribute_query
 #define CL_DEVICE_IP_VERSION_INTEL                0x4250
 #define CL_DEVICE_ID_INTEL                        0x4251
@ -43,6 +46,7 @@ typedef cl_bitfield         cl_device_feature_capabilities_intel;

 namespace cl {
 namespace detail {
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_SUB_GROUP_SIZES_INTEL, vector<size_type>)
 CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_IP_VERSION_INTEL, cl_uint)
 CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_ID_INTEL, cl_uint)
 CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_NUM_SLICES_INTEL, cl_uint)
--- a/inference-engine/thirdparty/clDNN/src/kernel_selector_helper.cpp
+++ b/inference-engine/thirdparty/clDNN/src/kernel_selector_helper.cpp
@ -836,6 +836,7 @@ void set_params(const program_node& node, kernel_selector::params& params) {
    params.engineInfo.maxThreadsPerDevice = params.engineInfo.maxThreadsPerExecutionUnit * device_info.execution_units_count;
    params.engineInfo.deviceCache = program.get_tuning_cache();
    params.engineInfo.driverVersion = device_info.driver_version;
+    params.engineInfo.supportedSimdSizes = device_info.supported_simd_sizes;

    auto impl_forcing_bo = program.get_options().get<build_option_type::force_implementations>();
    const auto& impl_forcing = impl_forcing_bo->forcing;