[GPU] Added a couple of supported SIMD size checks (#7919)

This commit is contained in:
Vladimir Paramuzov 2021-10-13 12:22:08 +03:00 committed by GitHub
parent 82f8f19d11
commit efcdaac4a6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
15 changed files with 48 additions and 4 deletions

View File

@ -5,6 +5,7 @@
#pragma once #pragma once
#include <string> #include <string>
#include <vector>
namespace cldnn { namespace cldnn {
/// @addtogroup cpp_api C++ API /// @addtogroup cpp_api C++ API
@ -54,6 +55,8 @@ struct device_info {
bool supports_usm; ///< Does engine support unified shared memory. bool supports_usm; ///< Does engine support unified shared memory.
std::vector<size_t> supported_simd_sizes; ///< List of SIMD sizes supported by current device and compiler
uint32_t vendor_id; ///< Vendor ID uint32_t vendor_id; ///< Vendor ID
std::string dev_name; ///< Device ID string std::string dev_name; ///< Device ID string
std::string driver_version; ///< Version of OpenCL driver std::string driver_version; ///< Version of OpenCL driver

View File

@ -93,6 +93,9 @@ bool convolution_kernel_bfyx_1x1_opt::Validate(const Params& p, const optional_p
} }
const convolution_params& cp = static_cast<const convolution_params&>(p); const convolution_params& cp = static_cast<const convolution_params&>(p);
if (!IsSIMDSizeSupported(cp.engineInfo, 8))
return false;
if (cp.stride.x != 1 || cp.stride.y != 1) if (cp.stride.x != 1 || cp.stride.y != 1)
return false; return false;

View File

@ -98,6 +98,9 @@ bool ConvolutionKernel_bfyx_GEMMLike::Validate(const Params& p, const optional_p
const auto& params = static_cast<const convolution_params&>(p); const auto& params = static_cast<const convolution_params&>(p);
if (!IsSIMDSizeSupported(p.engineInfo, 8) && params.inputs[0].GetDType() == Datatype::F32)
return false;
if (!params.engineInfo.bSubGroupShortSupport && params.inputs[0].GetDType() == Datatype::F16) { if (!params.engineInfo.bSubGroupShortSupport && params.inputs[0].GetDType() == Datatype::F16) {
return false; return false;
} }
@ -118,4 +121,4 @@ KernelsData ConvolutionKernel_bfyx_GEMMLike::GetKernelsData(const Params& params
const optional_params& options) const { const optional_params& options) const {
return GetTunedKernelsDataByIndex(params, options); return GetTunedKernelsDataByIndex(params, options);
} }
} // namespace kernel_selector } // namespace kernel_selector

View File

@ -40,7 +40,7 @@ protected:
// Smaller # EU tends to be computation bounds. // Smaller # EU tends to be computation bounds.
// In such case, using larger worksize will result in larger computational inefficiency // In such case, using larger worksize will result in larger computational inefficiency
// w.r.t the unalined output feature // w.r.t the unalined output feature
return (params.output.Feature().v > 8) ? 16 : 8; return (params.output.Feature().v > 8 || !IsSIMDSizeSupported(params.engineInfo, 8)) ? 16 : 8;
} else { } else {
return 16; return 16;
} }

View File

@ -57,6 +57,9 @@ bool FullyConnected_bs_f_bsv8_af8::Validate(const Params& p, const optional_para
return false; return false;
} }
if (!IsSIMDSizeSupported(p.engineInfo, 8))
return false;
const auto& params = static_cast<const fully_connected_params&>(p); const auto& params = static_cast<const fully_connected_params&>(p);
const auto& optParams = static_cast<const fully_connected_optional_params&>(o); const auto& optParams = static_cast<const fully_connected_optional_params&>(o);

View File

@ -56,6 +56,9 @@ bool FullyConnected_fb_io_b8_f8::Validate(const Params& p, const optional_params
return false; return false;
} }
if (!IsSIMDSizeSupported(p.engineInfo, 8))
return false;
const auto& params = static_cast<const fully_connected_params&>(p); const auto& params = static_cast<const fully_connected_params&>(p);
const auto& output = params.output; const auto& output = params.output;

View File

@ -68,8 +68,8 @@ FullyConnectedKernelMMAD::FullyConnectedTuningData FullyConnectedKernelMMAD::Get
output_feature = output.Y().v; output_feature = output.Y().v;
} }
tuning_data.sub_group_size = 8; tuning_data.sub_group_size = IsSIMDSizeSupported(params.engineInfo, 8) ? 8 : 16;
if (input.X().v == 1 && input.Z().v == 1 && input.Batch().v == 1 && if (tuning_data.sub_group_size == 8 && input.X().v == 1 && input.Z().v == 1 && input.Batch().v == 1 &&
((input.Y().v == 1 && output.GetLayout() != DataLayout::bfyx) || (input.Feature().v == 1 && output.GetLayout() == DataLayout::bfyx)) ) { ((input.Y().v == 1 && output.GetLayout() != DataLayout::bfyx) || (input.Feature().v == 1 && output.GetLayout() == DataLayout::bfyx)) ) {
// Known cases for TGL where simd16 works better than simd8 // Known cases for TGL where simd16 works better than simd8
bool simd16_exception_1 = input.Feature().v == 25088 && output.Feature().v == 512; bool simd16_exception_1 = input.Feature().v == 25088 && output.Feature().v == 512;

View File

@ -38,6 +38,9 @@ bool LRNKernelAcrossChannel_b8::Validate(const Params& p, const optional_params&
return false; return false;
} }
if (!IsSIMDSizeSupported(p.engineInfo, 8))
return false;
const lrn_params& params = static_cast<const lrn_params&>(p); const lrn_params& params = static_cast<const lrn_params&>(p);
const auto& out = params.output; const auto& out = params.output;

View File

@ -26,6 +26,9 @@ bool ReorderKernel_to_yxfb_batched::Validate(const Params& params, const optiona
return false; return false;
} }
if (!IsSIMDSizeSupported(params.engineInfo, 8))
return false;
const reorder_params& r_params = static_cast<const reorder_params&>(params); const reorder_params& r_params = static_cast<const reorder_params&>(params);
const auto& output = r_params.output; const auto& output = r_params.output;

View File

@ -98,6 +98,14 @@ JitConstants KernelBase::MakeBaseParamsJitConstants(const base_params& params) c
return jit; return jit;
} }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// IsSIMDSizeSupported
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Returns true if the device/compiler reported simd_size among the SIMD widths it supports
// (EngineInfo::supportedSimdSizes is filled from CL_DEVICE_SUB_GROUP_SIZES_INTEL when the
// cl_intel_required_subgroup_size extension is present, otherwise defaults to {8, 16, 32}).
bool KernelBase::IsSIMDSizeSupported(const EngineInfo &info, size_t simd_size) const {
    // Bind by const reference — the original copied the whole vector just to search it.
    const auto& supported_sizes = info.supportedSimdSizes;
    return std::find(supported_sizes.begin(), supported_sizes.end(), simd_size) != supported_sizes.end();
}
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// MakeBaseParamsJitConstants // MakeBaseParamsJitConstants
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -65,6 +65,7 @@ protected:
virtual Datatype GetUnitType(const base_params& params) const; virtual Datatype GetUnitType(const base_params& params) const;
bool IsFusedPrimitiveSupported(const fused_operation_desc& fused_op) const; bool IsFusedPrimitiveSupported(const fused_operation_desc& fused_op) const;
bool IsSIMDSizeSupported(const EngineInfo& info, size_t simd_size) const;
JitConstants MakeBaseParamsJitConstants(const base_params& params) const; JitConstants MakeBaseParamsJitConstants(const base_params& params) const;
virtual std::vector<FusedOpType> GetSupportedFusedOps() const; virtual std::vector<FusedOpType> GetSupportedFusedOps() const;
virtual JitConstants MakeFusedOpsJitConstants(const base_params &params, const std::vector<FusedOpsConfiguration> &conf) const; virtual JitConstants MakeFusedOpsJitConstants(const base_params &params, const std::vector<FusedOpsConfiguration> &conf) const;

View File

@ -384,6 +384,7 @@ struct EngineInfo {
uint64_t maxImage2dHeight = 0; uint64_t maxImage2dHeight = 0;
std::string deviceId = ""; std::string deviceId = "";
std::string driverVersion = ""; std::string driverVersion = "";
std::vector<size_t> supportedSimdSizes = {};
std::shared_ptr<TuningCache> deviceCache; std::shared_ptr<TuningCache> deviceCache;
}; };

View File

@ -233,6 +233,14 @@ device_info init_device_info(const cl::Device& device) {
info.supports_queue_families = extensions.find("cl_intel_command_queue_families") != std::string::npos; info.supports_queue_families = extensions.find("cl_intel_command_queue_families") != std::string::npos;
bool sub_group_sizes_supported = extensions.find("cl_intel_required_subgroup_size") != std::string::npos;
if (sub_group_sizes_supported) {
info.supported_simd_sizes = device.getInfo<CL_DEVICE_SUB_GROUP_SIZES_INTEL>();
} else {
// Set these values as reasonable default for most of the supported platforms
info.supported_simd_sizes = {8, 16, 32};
}
bool device_attr_supported = extensions.find("cl_intel_device_attribute_query") != std::string::npos; bool device_attr_supported = extensions.find("cl_intel_device_attribute_query") != std::string::npos;
if (device_attr_supported) { if (device_attr_supported) {

View File

@ -19,6 +19,9 @@ typedef cl_va_api_device_source_intel cl_device_source_intel;
typedef cl_va_api_device_set_intel cl_device_set_intel; typedef cl_va_api_device_set_intel cl_device_set_intel;
#endif #endif
// cl_intel_required_subgroup_size
#define CL_DEVICE_SUB_GROUP_SIZES_INTEL 0x4108
// cl_intel_device_attribute_query // cl_intel_device_attribute_query
#define CL_DEVICE_IP_VERSION_INTEL 0x4250 #define CL_DEVICE_IP_VERSION_INTEL 0x4250
#define CL_DEVICE_ID_INTEL 0x4251 #define CL_DEVICE_ID_INTEL 0x4251
@ -43,6 +46,7 @@ typedef cl_bitfield cl_device_feature_capabilities_intel;
namespace cl { namespace cl {
namespace detail { namespace detail {
CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_SUB_GROUP_SIZES_INTEL, vector<size_type>)
CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_IP_VERSION_INTEL, cl_uint) CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_IP_VERSION_INTEL, cl_uint)
CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_ID_INTEL, cl_uint) CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_ID_INTEL, cl_uint)
CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_NUM_SLICES_INTEL, cl_uint) CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_NUM_SLICES_INTEL, cl_uint)

View File

@ -836,6 +836,7 @@ void set_params(const program_node& node, kernel_selector::params& params) {
params.engineInfo.maxThreadsPerDevice = params.engineInfo.maxThreadsPerExecutionUnit * device_info.execution_units_count; params.engineInfo.maxThreadsPerDevice = params.engineInfo.maxThreadsPerExecutionUnit * device_info.execution_units_count;
params.engineInfo.deviceCache = program.get_tuning_cache(); params.engineInfo.deviceCache = program.get_tuning_cache();
params.engineInfo.driverVersion = device_info.driver_version; params.engineInfo.driverVersion = device_info.driver_version;
params.engineInfo.supportedSimdSizes = device_info.supported_simd_sizes;
auto impl_forcing_bo = program.get_options().get<build_option_type::force_implementations>(); auto impl_forcing_bo = program.get_options().get<build_option_type::force_implementations>();
const auto& impl_forcing = impl_forcing_bo->forcing; const auto& impl_forcing = impl_forcing_bo->forcing;