[GPU] Added a couple of supported SIMD size checks (#7919)
This commit is contained in:
parent
82f8f19d11
commit
efcdaac4a6
@ -5,6 +5,7 @@
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace cldnn {
|
||||
/// @addtogroup cpp_api C++ API
|
||||
@ -54,6 +55,8 @@ struct device_info {
|
||||
|
||||
bool supports_usm; ///< Does engine support unified shared memory.
|
||||
|
||||
std::vector<size_t> supported_simd_sizes; ///< List of SIMD sizes supported by current device and compiler
|
||||
|
||||
uint32_t vendor_id; ///< Vendor ID
|
||||
std::string dev_name; ///< Device ID string
|
||||
std::string driver_version; ///< Version of OpenCL driver
|
||||
|
@ -93,6 +93,9 @@ bool convolution_kernel_bfyx_1x1_opt::Validate(const Params& p, const optional_p
|
||||
}
|
||||
const convolution_params& cp = static_cast<const convolution_params&>(p);
|
||||
|
||||
if (!IsSIMDSizeSupported(cp.engineInfo, 8))
|
||||
return false;
|
||||
|
||||
if (cp.stride.x != 1 || cp.stride.y != 1)
|
||||
return false;
|
||||
|
||||
|
@ -98,6 +98,9 @@ bool ConvolutionKernel_bfyx_GEMMLike::Validate(const Params& p, const optional_p
|
||||
|
||||
const auto& params = static_cast<const convolution_params&>(p);
|
||||
|
||||
if (!IsSIMDSizeSupported(p.engineInfo, 8) && params.inputs[0].GetDType() == Datatype::F32)
|
||||
return false;
|
||||
|
||||
if (!params.engineInfo.bSubGroupShortSupport && params.inputs[0].GetDType() == Datatype::F16) {
|
||||
return false;
|
||||
}
|
||||
|
@ -40,7 +40,7 @@ protected:
|
||||
// Devices with a smaller number of EUs tend to be computation bound.
|
||||
// In such case, using larger worksize will result in larger computational inefficiency
|
||||
// w.r.t. the unaligned output feature
|
||||
return (params.output.Feature().v > 8) ? 16 : 8;
|
||||
return (params.output.Feature().v > 8 || !IsSIMDSizeSupported(params.engineInfo, 8)) ? 16 : 8;
|
||||
} else {
|
||||
return 16;
|
||||
}
|
||||
|
@ -57,6 +57,9 @@ bool FullyConnected_bs_f_bsv8_af8::Validate(const Params& p, const optional_para
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!IsSIMDSizeSupported(p.engineInfo, 8))
|
||||
return false;
|
||||
|
||||
const auto& params = static_cast<const fully_connected_params&>(p);
|
||||
const auto& optParams = static_cast<const fully_connected_optional_params&>(o);
|
||||
|
||||
|
@ -56,6 +56,9 @@ bool FullyConnected_fb_io_b8_f8::Validate(const Params& p, const optional_params
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!IsSIMDSizeSupported(p.engineInfo, 8))
|
||||
return false;
|
||||
|
||||
const auto& params = static_cast<const fully_connected_params&>(p);
|
||||
|
||||
const auto& output = params.output;
|
||||
|
@ -68,8 +68,8 @@ FullyConnectedKernelMMAD::FullyConnectedTuningData FullyConnectedKernelMMAD::Get
|
||||
output_feature = output.Y().v;
|
||||
}
|
||||
|
||||
tuning_data.sub_group_size = 8;
|
||||
if (input.X().v == 1 && input.Z().v == 1 && input.Batch().v == 1 &&
|
||||
tuning_data.sub_group_size = IsSIMDSizeSupported(params.engineInfo, 8) ? 8 : 16;
|
||||
if (tuning_data.sub_group_size == 8 && input.X().v == 1 && input.Z().v == 1 && input.Batch().v == 1 &&
|
||||
((input.Y().v == 1 && output.GetLayout() != DataLayout::bfyx) || (input.Feature().v == 1 && output.GetLayout() == DataLayout::bfyx)) ) {
|
||||
// Known cases for TGL where simd16 works better than simd8
|
||||
bool simd16_exception_1 = input.Feature().v == 25088 && output.Feature().v == 512;
|
||||
|
@ -38,6 +38,9 @@ bool LRNKernelAcrossChannel_b8::Validate(const Params& p, const optional_params&
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!IsSIMDSizeSupported(p.engineInfo, 8))
|
||||
return false;
|
||||
|
||||
const lrn_params& params = static_cast<const lrn_params&>(p);
|
||||
const auto& out = params.output;
|
||||
|
||||
|
@ -26,6 +26,9 @@ bool ReorderKernel_to_yxfb_batched::Validate(const Params& params, const optiona
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!IsSIMDSizeSupported(params.engineInfo, 8))
|
||||
return false;
|
||||
|
||||
const reorder_params& r_params = static_cast<const reorder_params&>(params);
|
||||
|
||||
const auto& output = r_params.output;
|
||||
|
@ -98,6 +98,14 @@ JitConstants KernelBase::MakeBaseParamsJitConstants(const base_params& params) c
|
||||
return jit;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// IsSIMDSizeSupported
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
bool KernelBase::IsSIMDSizeSupported(const EngineInfo &info, size_t simd_size) const {
|
||||
auto supported_sizes = info.supportedSimdSizes;
|
||||
return std::find(supported_sizes.begin(), supported_sizes.end(), simd_size) != supported_sizes.end();
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// MakeBaseParamsJitConstants
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -65,6 +65,7 @@ protected:
|
||||
virtual Datatype GetUnitType(const base_params& params) const;
|
||||
|
||||
bool IsFusedPrimitiveSupported(const fused_operation_desc& fused_op) const;
|
||||
bool IsSIMDSizeSupported(const EngineInfo& info, size_t simd_size) const;
|
||||
JitConstants MakeBaseParamsJitConstants(const base_params& params) const;
|
||||
virtual std::vector<FusedOpType> GetSupportedFusedOps() const;
|
||||
virtual JitConstants MakeFusedOpsJitConstants(const base_params ¶ms, const std::vector<FusedOpsConfiguration> &conf) const;
|
||||
|
@ -384,6 +384,7 @@ struct EngineInfo {
|
||||
uint64_t maxImage2dHeight = 0;
|
||||
std::string deviceId = "";
|
||||
std::string driverVersion = "";
|
||||
std::vector<size_t> supportedSimdSizes = {};
|
||||
std::shared_ptr<TuningCache> deviceCache;
|
||||
};
|
||||
|
||||
|
@ -233,6 +233,14 @@ device_info init_device_info(const cl::Device& device) {
|
||||
|
||||
info.supports_queue_families = extensions.find("cl_intel_command_queue_families") != std::string::npos;
|
||||
|
||||
bool sub_group_sizes_supported = extensions.find("cl_intel_required_subgroup_size") != std::string::npos;
|
||||
if (sub_group_sizes_supported) {
|
||||
info.supported_simd_sizes = device.getInfo<CL_DEVICE_SUB_GROUP_SIZES_INTEL>();
|
||||
} else {
|
||||
// Fall back to these values as a reasonable default for most of the supported platforms
|
||||
info.supported_simd_sizes = {8, 16, 32};
|
||||
}
|
||||
|
||||
bool device_attr_supported = extensions.find("cl_intel_device_attribute_query") != std::string::npos;
|
||||
|
||||
if (device_attr_supported) {
|
||||
|
@ -19,6 +19,9 @@ typedef cl_va_api_device_source_intel cl_device_source_intel;
|
||||
typedef cl_va_api_device_set_intel cl_device_set_intel;
|
||||
#endif
|
||||
|
||||
// cl_intel_required_subgroup_size
|
||||
#define CL_DEVICE_SUB_GROUP_SIZES_INTEL 0x4108
|
||||
|
||||
// cl_intel_device_attribute_query
|
||||
#define CL_DEVICE_IP_VERSION_INTEL 0x4250
|
||||
#define CL_DEVICE_ID_INTEL 0x4251
|
||||
@ -43,6 +46,7 @@ typedef cl_bitfield cl_device_feature_capabilities_intel;
|
||||
|
||||
namespace cl {
|
||||
namespace detail {
|
||||
CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_SUB_GROUP_SIZES_INTEL, vector<size_type>)
|
||||
CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_IP_VERSION_INTEL, cl_uint)
|
||||
CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_ID_INTEL, cl_uint)
|
||||
CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_NUM_SLICES_INTEL, cl_uint)
|
||||
|
@ -836,6 +836,7 @@ void set_params(const program_node& node, kernel_selector::params& params) {
|
||||
params.engineInfo.maxThreadsPerDevice = params.engineInfo.maxThreadsPerExecutionUnit * device_info.execution_units_count;
|
||||
params.engineInfo.deviceCache = program.get_tuning_cache();
|
||||
params.engineInfo.driverVersion = device_info.driver_version;
|
||||
params.engineInfo.supportedSimdSizes = device_info.supported_simd_sizes;
|
||||
|
||||
auto impl_forcing_bo = program.get_options().get<build_option_type::force_implementations>();
|
||||
const auto& impl_forcing = impl_forcing_bo->forcing;
|
||||
|
Loading…
Reference in New Issue
Block a user