diff --git a/inference-engine/thirdparty/clDNN/api/cldnn/runtime/device_info.hpp b/inference-engine/thirdparty/clDNN/api/cldnn/runtime/device_info.hpp
index d0c90088305..f1398341304 100644
--- a/inference-engine/thirdparty/clDNN/api/cldnn/runtime/device_info.hpp
+++ b/inference-engine/thirdparty/clDNN/api/cldnn/runtime/device_info.hpp
@@ -5,6 +5,7 @@
 #pragma once
 
 #include <cstdint>
+#include <vector>
 
 namespace cldnn {
 /// @addtogroup cpp_api C++ API
@@ -54,6 +55,8 @@ struct device_info {
     bool supports_usm;  ///< Does engine support unified shared memory.
 
+    std::vector<size_t> supported_simd_sizes;  ///< List of SIMD sizes supported by current device and compiler
+
     uint32_t vendor_id;  ///< Vendor ID
     std::string dev_name;  ///< Device ID string
     std::string driver_version;  ///< Version of OpenCL driver
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_1x1_opt.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_1x1_opt.cpp
index e46c2189521..b76582f8683 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_1x1_opt.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_1x1_opt.cpp
@@ -93,6 +93,9 @@ bool convolution_kernel_bfyx_1x1_opt::Validate(const Params& p, const optional_p
     }
 
     const convolution_params& cp = static_cast<const convolution_params&>(p);
+    if (!IsSIMDSizeSupported(cp.engineInfo, 8))
+        return false;
+
     if (cp.stride.x != 1 || cp.stride.y != 1)
         return false;
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_gemm_like.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_gemm_like.cpp
index 62fd322cb2f..abd7298d6fa 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_gemm_like.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_gemm_like.cpp
@@ -98,6 +98,9 @@ bool ConvolutionKernel_bfyx_GEMMLike::Validate(const Params& p, const optional_p
 
     const auto& params = static_cast<const convolution_params&>(p);
 
+    if (!IsSIMDSizeSupported(p.engineInfo, 8) && params.inputs[0].GetDType() == Datatype::F32)
+        return false;
+
     if (!params.engineInfo.bSubGroupShortSupport && params.inputs[0].GetDType() == Datatype::F16) {
         return false;
     }
@@ -118,4 +121,4 @@ KernelsData ConvolutionKernel_bfyx_GEMMLike::GetKernelsData(const Params& params
                                                             const optional_params& options) const {
     return GetTunedKernelsDataByIndex(params, options);
 }
-}  // namespace kernel_selector
\ No newline at end of file
+}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16.h
index 9da52609636..aa0f0610626 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16.h
@@ -40,7 +40,7 @@ protected:
             // Smaller # EU tends to be computation bounds.
             // In such case, using larger worksize will result in larger computational inefficiency
             // w.r.t the unaligned output feature
-            return (params.output.Feature().v > 8) ? 16 : 8;
+            return (params.output.Feature().v > 8 || !IsSIMDSizeSupported(params.engineInfo, 8)) ? 16 : 8;
         } else {
             return 16;
         }
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bs_f_bsv8_af8.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bs_f_bsv8_af8.cpp
index 8ca26cff653..551c6650702 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bs_f_bsv8_af8.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_bs_f_bsv8_af8.cpp
@@ -57,6 +57,9 @@ bool FullyConnected_bs_f_bsv8_af8::Validate(const Params& p, const optional_para
         return false;
     }
 
+    if (!IsSIMDSizeSupported(p.engineInfo, 8))
+        return false;
+
     const auto& params = static_cast<const fully_connected_params&>(p);
     const auto& optParams = static_cast<const fully_connected_optional_params&>(o);
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_io_b8_f8.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_io_b8_f8.cpp
index 7fada83beed..94a1c0c371d 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_io_b8_f8.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_fb_io_b8_f8.cpp
@@ -56,6 +56,9 @@ bool FullyConnected_fb_io_b8_f8::Validate(const Params& p, const optional_params
         return false;
     }
 
+    if (!IsSIMDSizeSupported(p.engineInfo, 8))
+        return false;
+
     const auto& params = static_cast<const fully_connected_params&>(p);
     const auto& output = params.output;
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad.cpp
index f3d020c015e..ad717d00891 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad.cpp
@@ -68,8 +68,8 @@ FullyConnectedKernelMMAD::FullyConnectedTuningData FullyConnectedKernelMMAD::Get
         output_feature = output.Y().v;
     }
 
-    tuning_data.sub_group_size = 8;
-    if (input.X().v == 1 && input.Z().v == 1 && input.Batch().v == 1 &&
+    tuning_data.sub_group_size = IsSIMDSizeSupported(params.engineInfo, 8) ? 8 : 16;
+    if (tuning_data.sub_group_size == 8 && input.X().v == 1 && input.Z().v == 1 && input.Batch().v == 1 &&
         ((input.Y().v == 1 && output.GetLayout() != DataLayout::bfyx) ||
          (input.Feature().v == 1 && output.GetLayout() == DataLayout::bfyx)) ) {
         // Known cases for TGL where simd16 works better than simd8
         bool simd16_exception_1 = input.Feature().v == 25088 && output.Feature().v == 512;
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_across_channel_opt_b8.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_across_channel_opt_b8.cpp
index a624e11e82e..637fbc50cf0 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_across_channel_opt_b8.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lrn/lrn_kernel_across_channel_opt_b8.cpp
@@ -38,6 +38,9 @@ bool LRNKernelAcrossChannel_b8::Validate(const Params& p, const optional_params&
         return false;
     }
 
+    if (!IsSIMDSizeSupported(p.engineInfo, 8))
+        return false;
+
     const lrn_params& params = static_cast<const lrn_params&>(p);
     const auto& out = params.output;
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_to_yxfb_batched.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_to_yxfb_batched.cpp
index 40e5479341e..f149dd11bf6 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_to_yxfb_batched.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_to_yxfb_batched.cpp
@@ -26,6 +26,9 @@ bool ReorderKernel_to_yxfb_batched::Validate(const Params& params, const optiona
         return false;
     }
 
+    if (!IsSIMDSizeSupported(params.engineInfo, 8))
+        return false;
+
     const reorder_params& r_params = static_cast<const reorder_params&>(params);
     const auto& output = r_params.output;
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_base.cpp
index 7b977147465..d44c8d07553 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_base.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_base.cpp
@@ -98,6 +98,14 @@ JitConstants KernelBase::MakeBaseParamsJitConstants(const base_params& params) c
     return jit;
 }
 
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// IsSIMDSizeSupported
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+bool KernelBase::IsSIMDSizeSupported(const EngineInfo &info, size_t simd_size) const {
+    auto supported_sizes = info.supportedSimdSizes;
+    return std::find(supported_sizes.begin(), supported_sizes.end(), simd_size) != supported_sizes.end();
+}
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // MakeBaseParamsJitConstants
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_base.h
index 1c9335c1a56..c5d996104e5 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_base.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_base.h
@@ -65,6 +65,7 @@ protected:
     virtual Datatype GetUnitType(const base_params& params) const;
 
     bool IsFusedPrimitiveSupported(const fused_operation_desc& fused_op) const;
+    bool IsSIMDSizeSupported(const EngineInfo& info, size_t simd_size) const;
     JitConstants MakeBaseParamsJitConstants(const base_params& params) const;
     virtual std::vector<FusedOpType> GetSupportedFusedOps() const;
     virtual JitConstants MakeFusedOpsJitConstants(const base_params &params, const std::vector<FusedOpsConfiguration> &conf) const;
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_params.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_params.h
index b68054dad7c..b45139d8057 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_params.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_params.h
@@ -384,6 +384,7 @@ struct EngineInfo {
     uint64_t maxImage2dHeight = 0;
     std::string deviceId = "";
     std::string driverVersion = "";
+    std::vector<size_t> supportedSimdSizes = {};
     std::shared_ptr<TuningCache> deviceCache;
 };
diff --git a/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_device.cpp b/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_device.cpp
index 5a5a36919e0..821d790c822 100644
--- a/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_device.cpp
+++ b/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_device.cpp
@@ -233,6 +233,14 @@ device_info init_device_info(const cl::Device& device) {
     info.supports_queue_families = extensions.find("cl_intel_command_queue_families") != std::string::npos;
 
+    bool sub_group_sizes_supported = extensions.find("cl_intel_required_subgroup_size") != std::string::npos;
+    if (sub_group_sizes_supported) {
+        info.supported_simd_sizes = device.getInfo<CL_DEVICE_SUB_GROUP_SIZES_INTEL>();
+    } else {
+        // Set these values as reasonable default for most of the supported platforms
+        info.supported_simd_sizes = {8, 16, 32};
+    }
+
     bool device_attr_supported = extensions.find("cl_intel_device_attribute_query") != std::string::npos;
 
     if (device_attr_supported) {
diff --git a/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_ext.hpp b/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_ext.hpp
index fc10e713d8e..34686cc67a9 100644
--- a/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_ext.hpp
+++ b/inference-engine/thirdparty/clDNN/runtime/ocl/ocl_ext.hpp
@@ -19,6 +19,9 @@ typedef cl_va_api_device_source_intel cl_device_source_intel;
 typedef cl_va_api_device_set_intel cl_device_set_intel;
 #endif
 
+// cl_intel_required_subgroup_size
+#define CL_DEVICE_SUB_GROUP_SIZES_INTEL 0x4108
+
 // cl_intel_device_attribute_query
 #define CL_DEVICE_IP_VERSION_INTEL 0x4250
 #define CL_DEVICE_ID_INTEL 0x4251
@@ -43,6 +46,7 @@ typedef cl_bitfield cl_device_feature_capabilities_intel;
 namespace cl {
 namespace detail {
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_SUB_GROUP_SIZES_INTEL, vector<size_t>)
 CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_IP_VERSION_INTEL, cl_uint)
 CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_ID_INTEL, cl_uint)
 CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_NUM_SLICES_INTEL, cl_uint)
diff --git a/inference-engine/thirdparty/clDNN/src/kernel_selector_helper.cpp b/inference-engine/thirdparty/clDNN/src/kernel_selector_helper.cpp
index 4c7cd17da9c..09c981cf1d3 100644
--- a/inference-engine/thirdparty/clDNN/src/kernel_selector_helper.cpp
+++ b/inference-engine/thirdparty/clDNN/src/kernel_selector_helper.cpp
@@ -836,6 +836,7 @@ void set_params(const program_node& node, kernel_selector::params& params) {
     params.engineInfo.maxThreadsPerDevice = params.engineInfo.maxThreadsPerExecutionUnit * device_info.execution_units_count;
     params.engineInfo.deviceCache = program.get_tuning_cache();
     params.engineInfo.driverVersion = device_info.driver_version;
+    params.engineInfo.supportedSimdSizes = device_info.supported_simd_sizes;
 
     auto impl_forcing_bo = program.get_options().get<build_option_type::force_implementations>();
     const auto& impl_forcing = impl_forcing_bo->forcing;
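
For reference, below is a minimal standalone sketch (not part of the patch) of the query pattern the ocl_device.cpp hunk relies on, written against the raw OpenCL C API rather than the cl.hpp wrapper the patch uses. It checks the device extension string for cl_intel_required_subgroup_size, reads CL_DEVICE_SUB_GROUP_SIZES_INTEL (0x4108, as defined in the ocl_ext.hpp hunk) when the extension is present, and otherwise falls back to the same {8, 16, 32} default the patch chooses. The helper name query_simd_sizes and the single-GPU device selection are illustrative assumptions, and error handling is abbreviated.

// Illustrative sketch only; query_simd_sizes is a hypothetical helper, not part of the patch.
#include <CL/cl.h>
#include <cstdio>
#include <string>
#include <vector>

#ifndef CL_DEVICE_SUB_GROUP_SIZES_INTEL
#define CL_DEVICE_SUB_GROUP_SIZES_INTEL 0x4108  // from cl_intel_required_subgroup_size
#endif

static std::vector<size_t> query_simd_sizes(cl_device_id device) {
    // Read the device extension string.
    size_t ext_size = 0;
    clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, nullptr, &ext_size);
    std::string extensions(ext_size, '\0');
    clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_size, &extensions[0], nullptr);

    if (extensions.find("cl_intel_required_subgroup_size") != std::string::npos) {
        // The query returns an array of size_t, one entry per supported sub-group size.
        size_t bytes = 0;
        clGetDeviceInfo(device, CL_DEVICE_SUB_GROUP_SIZES_INTEL, 0, nullptr, &bytes);
        std::vector<size_t> sizes(bytes / sizeof(size_t));
        clGetDeviceInfo(device, CL_DEVICE_SUB_GROUP_SIZES_INTEL, bytes, sizes.data(), nullptr);
        return sizes;
    }
    // Same default the patch applies when the extension is unavailable.
    return {8, 16, 32};
}

int main() {
    cl_platform_id platform = nullptr;
    cl_device_id device = nullptr;
    if (clGetPlatformIDs(1, &platform, nullptr) != CL_SUCCESS ||
        clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, nullptr) != CL_SUCCESS) {
        std::fprintf(stderr, "no OpenCL GPU device found\n");
        return 1;
    }
    for (size_t s : query_simd_sizes(device))
        std::printf("supported SIMD size: %zu\n", s);
    return 0;
}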