[IE CLDNN] Add two early optimization capabilities (#1155)

This change adds checks, macros and defines for two early/experimental
features:
- local memory block reads
- built-in optimization hints, i.e. __builtin_assume
Konrad Dobros 2020-06-30 17:29:34 +02:00 committed by GitHub
parent b8b8a21dc7
commit 66f620f97e
9 changed files with 146 additions and 13 deletions

View File

@ -29,3 +29,9 @@
#define OFFSET_GLOBAL_PTR(elem_type, ptr, byte_offset) ((__global elem_type*)((__global char*)(ptr) + (byte_offset)))
#define MULTIPLY_OFFSET(elem_type, byte_offset) ((byte_offset) * sizeof(elem_type))
#if OPT_HINTS_SUPPORTED
# define ASSUME_HINT(x) __builtin_assume(x)
#else
# define ASSUME_HINT(x) do { } while (0)
#endif
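
For illustration, a minimal sketch of how the new macro could be used inside a kernel; the kernel name, arguments and the multiple-of-16 assumption are hypothetical, and ASSUME_HINT collapses to a no-op whenever OPT_HINTS_SUPPORTED is not defined:

__kernel void example_assume_hint(__global const float* src, __global float* dst, int count)
{
    // Hypothetical hint: promise the compiler that "count" is positive and a
    // multiple of 16, which may enable more aggressive unrolling/vectorization.
    ASSUME_HINT(count > 0);
    ASSUME_HINT((count % 16) == 0);

    const int gid = (int)get_global_id(0);
    if (gid < count)
        dst[gid] = src[gid] * 2.0f;
}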

View File

@ -58,6 +58,11 @@
// For more details and description of intel_sub_group_block_read/write functions, please
// refer to cl_intel_subgroups extension documentation.
//
// BLOCK_READN_SLM(type, vector_size, ptr, offset)
// - performs the same operation as BLOCK_READN, but with "ptr" in the __local address space.
// BLOCK_WRITEN_SLM(type, vector_size, ptr, offset, val)
// - performs the same operation as BLOCK_WRITEN, but with "ptr" in the __local address space.
//
// type [PP] - Must evaluate to a non-vectorized type, e.g. float, half, char, etc.
// vector_size [PP] - Number of elements to read/write, e.g. 2 for intel_sub_group_block_read2.
// ptr - Pointer to memory to read from/write to: __global for BLOCK_READN/BLOCK_WRITEN, __local for the _SLM variants.
@ -101,17 +106,22 @@
#define BLOCK_WRITEN_FUNC_size4(vector_size) BLOCK_WRITEN_FUNC_SIZE_DEF(4, vector_size)
#define BLOCK_WRITEN_FUNC(type_size, vector_size) CAT(BLOCK_WRITEN_FUNC_size, type_size)(vector_size)
#define BLOCK_READN_RAW(type_size, vector_size, ptr, offset) \
BLOCK_READN_FUNC(type_size, vector_size)((const __global BLOCK_RW_TYPE(type_size)*)(ptr) + (offset))
#define BLOCK_WRITEN_RAW(type_size, vector_size, ptr, offset, val) \
#define BLOCK_READN_RAW(type_size, vector_size, addr_space, ptr, offset) \
BLOCK_READN_FUNC(type_size, vector_size)((const addr_space BLOCK_RW_TYPE(type_size)*)(ptr) + (offset))
#define BLOCK_WRITEN_RAW(type_size, vector_size, addr_space, ptr, offset, val) \
BLOCK_WRITEN_FUNC(type_size, vector_size)( \
(__global BLOCK_RW_TYPE(type_size)*)(ptr) + (offset), \
(addr_space BLOCK_RW_TYPE(type_size)*)(ptr) + (offset), \
AS_TYPE(MAKE_VECTOR_TYPE(BLOCK_RW_TYPE(type_size), vector_size), val))
#define BLOCK_READN(type, vector_size, ptr, offset) \
AS_TYPE(MAKE_VECTOR_TYPE(type, vector_size), BLOCK_READN_RAW(TYPE_SIZE(type), vector_size, ptr, offset))
AS_TYPE(MAKE_VECTOR_TYPE(type, vector_size), BLOCK_READN_RAW(TYPE_SIZE(type), vector_size, __global, ptr, offset))
#define BLOCK_WRITEN(type, vector_size, ptr, offset, val) \
BLOCK_WRITEN_RAW(TYPE_SIZE(type), vector_size, ptr, offset, val)
BLOCK_WRITEN_RAW(TYPE_SIZE(type), vector_size, __global, ptr, offset, val)
#define BLOCK_READN_SLM(type, vector_size, ptr, offset) \
AS_TYPE(MAKE_VECTOR_TYPE(type, vector_size), BLOCK_READN_RAW(TYPE_SIZE(type), vector_size, __local, ptr, offset))
#define BLOCK_WRITEN_SLM(type, vector_size, ptr, offset, val) \
BLOCK_WRITEN_RAW(TYPE_SIZE(type), vector_size, __local, ptr, offset, val)
#define DT_INPUT_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT0_TYPE, 1, ptr, offset)
#define DT_INPUT_BLOCK_READ2(ptr, offset) BLOCK_READN(INPUT0_TYPE, 2, ptr, offset)
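
A minimal sketch of the new SLM variants, assuming a hypothetical kernel that stages a 128-element tile in local memory; tile size, work-group shape and offsets are illustrative only, the point being that the pointer handed to the *_SLM macros lives in the __local address space:

__kernel void example_slm_block_io(__global const float* src, __global float* dst)
{
    __local float slm_tile[128];

    // Stage one tile of the input in shared local memory (illustrative copy loop).
    for (uint i = get_local_id(0); i < 128; i += get_local_size(0))
        slm_tile[i] = src[get_group_id(0) * 128 + i];
    barrier(CLK_LOCAL_MEM_FENCE);

    // Sub-group block read from __local memory, same interface as BLOCK_READN.
    const uint sg_offset = get_sub_group_id() * 2 * get_max_sub_group_size();
    float2 v = BLOCK_READN_SLM(float, 2, slm_tile, sg_offset);
    v *= 2.0f;

    // Results go back to global memory through the existing __global variant.
    BLOCK_WRITEN(float, 2, dst, get_group_id(0) * 128 + sg_offset, v);
}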

View File

@ -521,7 +521,7 @@ inline void FUNC(sub_group_block_write_uchar16)(__global uchar* outPtr, uchar16
#endif
}
inline uchar16 FUNC(sub_group_block_read_uchar16)(const __global uchar* ptr)
inline uchar16 FUNC(sub_group_block_read_uchar16)(const __global uchar* ptr) __attribute__((overloadable))
{
#ifdef cl_intel_subgroups_char
// WA for compiler support
@ -553,6 +553,38 @@ inline uchar16 FUNC(sub_group_block_read_uchar16)(const __global uchar* ptr)
#endif
}
inline uchar16 FUNC(sub_group_block_read_uchar16)(const __local uchar* ptr) __attribute__((overloadable))
{
#if defined(cl_intel_subgroup_local_block_io) && defined(cl_intel_subgroups_char)
// WA for compiler support
// return intel_sub_group_block_read_uc16(ptr);
return (uchar16)(intel_sub_group_block_read_uc8(ptr), intel_sub_group_block_read_uc8(ptr + 8 * get_max_sub_group_size()));
#else
uint idx = get_sub_group_local_id();
uchar16 ret;
ret.s0 = ptr[idx]; idx += get_max_sub_group_size();
ret.s1 = ptr[idx]; idx += get_max_sub_group_size();
ret.s2 = ptr[idx]; idx += get_max_sub_group_size();
ret.s3 = ptr[idx]; idx += get_max_sub_group_size();
ret.s4 = ptr[idx]; idx += get_max_sub_group_size();
ret.s5 = ptr[idx]; idx += get_max_sub_group_size();
ret.s6 = ptr[idx]; idx += get_max_sub_group_size();
ret.s7 = ptr[idx]; idx += get_max_sub_group_size();
ret.s8 = ptr[idx]; idx += get_max_sub_group_size();
ret.s9 = ptr[idx]; idx += get_max_sub_group_size();
ret.sa = ptr[idx]; idx += get_max_sub_group_size();
ret.sb = ptr[idx]; idx += get_max_sub_group_size();
ret.sc = ptr[idx]; idx += get_max_sub_group_size();
ret.sd = ptr[idx]; idx += get_max_sub_group_size();
ret.se = ptr[idx]; idx += get_max_sub_group_size();
ret.sf = ptr[idx]; idx += get_max_sub_group_size();
return ret;
#endif
}
inline void FUNC(sub_group_block_write_uchar8)(__global uchar* outPtr, uchar8 v)
{
#ifdef cl_intel_subgroups_char
@ -571,7 +603,7 @@ inline void FUNC(sub_group_block_write_uchar8)(__global uchar* outPtr, uchar8 v)
#endif
}
inline uchar8 FUNC(sub_group_block_read_uchar8)(const __global uchar* ptr)
inline uchar8 FUNC(sub_group_block_read_uchar8)(const __global uchar* ptr) __attribute__((overloadable))
{
#ifdef cl_intel_subgroups_char
return intel_sub_group_block_read_uc8(ptr);
@ -590,7 +622,28 @@ inline uchar8 FUNC(sub_group_block_read_uchar8)(const __global uchar* ptr)
ret.s7 = ptr[idx]; idx += get_max_sub_group_size();
return ret;
#endif
}
inline uchar8 FUNC(sub_group_block_read_uchar8)(const __local uchar* ptr) __attribute__((overloadable))
{
#if defined(cl_intel_subgroup_local_block_io) && defined(cl_intel_subgroups_char)
return intel_sub_group_block_read_uc8(ptr);
#else
uint idx = get_sub_group_local_id();
uchar8 ret;
ret.s0 = ptr[idx]; idx += get_max_sub_group_size();
ret.s1 = ptr[idx]; idx += get_max_sub_group_size();
ret.s2 = ptr[idx]; idx += get_max_sub_group_size();
ret.s3 = ptr[idx]; idx += get_max_sub_group_size();
ret.s4 = ptr[idx]; idx += get_max_sub_group_size();
ret.s5 = ptr[idx]; idx += get_max_sub_group_size();
ret.s6 = ptr[idx]; idx += get_max_sub_group_size();
ret.s7 = ptr[idx]; idx += get_max_sub_group_size();
return ret;
#endif
}
@ -608,7 +661,7 @@ inline void FUNC(sub_group_block_write_uchar4)(__global uchar* outPtr, uchar4 v)
#endif
}
inline uchar4 FUNC(sub_group_block_read_uchar4)(const __global uchar* ptr)
inline uchar4 FUNC(sub_group_block_read_uchar4)(const __global uchar* ptr) __attribute__((overloadable))
{
#ifdef cl_intel_subgroups_char
return intel_sub_group_block_read_uc4(ptr);
@ -626,6 +679,24 @@ inline uchar4 FUNC(sub_group_block_read_uchar4)(const __global uchar* ptr)
#endif
}
inline uchar4 FUNC(sub_group_block_read_uchar4)(const __local uchar* ptr) __attribute__((overloadable))
{
#if defined(cl_intel_subgroup_local_block_io) && defined(cl_intel_subgroups_char)
return intel_sub_group_block_read_uc4(ptr);
#else
uint idx = get_sub_group_local_id();
uchar4 ret;
ret.s0 = ptr[idx]; idx += get_max_sub_group_size();
ret.s1 = ptr[idx]; idx += get_max_sub_group_size();
ret.s2 = ptr[idx]; idx += get_max_sub_group_size();
ret.s3 = ptr[idx]; idx += get_max_sub_group_size();
return ret;
#endif
}
inline void FUNC(sub_group_block_write_uchar2)(__global uchar* outPtr, uchar2 v)
{
#ifdef cl_intel_subgroups_char
@ -638,7 +709,7 @@ inline void FUNC(sub_group_block_write_uchar2)(__global uchar* outPtr, uchar2 v)
#endif
}
inline uchar2 FUNC(sub_group_block_read_uchar2)(const __global uchar* ptr)
inline uchar2 FUNC(sub_group_block_read_uchar2)(const __global uchar* ptr) __attribute__((overloadable))
{
#ifdef cl_intel_subgroups_char
return intel_sub_group_block_read_uc2(ptr);
@ -654,6 +725,22 @@ inline uchar2 FUNC(sub_group_block_read_uchar2)(const __global uchar* ptr)
#endif
}
inline uchar2 FUNC(sub_group_block_read_uchar2)(const __local uchar* ptr) __attribute__((overloadable))
{
#if defined(cl_intel_subgroup_local_block_io) && defined(cl_intel_subgroups_char)
return intel_sub_group_block_read_uc2(ptr);
#else
uint idx = get_sub_group_local_id();
uchar2 ret;
ret.s0 = ptr[idx]; idx += get_max_sub_group_size();
ret.s1 = ptr[idx]; idx += get_max_sub_group_size();
return ret;
#endif
}
inline void FUNC(sub_group_block_write_uchar)(__global uchar* outPtr, uchar v)
{
#ifdef cl_intel_subgroups_char
@ -665,7 +752,7 @@ inline void FUNC(sub_group_block_write_uchar)(__global uchar* outPtr, uchar v)
#endif
}
inline uchar FUNC(sub_group_block_read_uchar)(const __global uchar* ptr)
inline uchar FUNC(sub_group_block_read_uchar)(const __global uchar* ptr) __attribute__((overloadable))
{
#ifdef cl_intel_subgroups_char
return intel_sub_group_block_read_uc(ptr);
@ -680,6 +767,21 @@ inline uchar FUNC(sub_group_block_read_uchar)(const __global uchar* ptr)
#endif
}
inline uchar FUNC(sub_group_block_read_uchar)(const __local uchar* ptr) __attribute__((overloadable))
{
#if defined(cl_intel_subgroup_local_block_io) && defined(cl_intel_subgroups_char)
return intel_sub_group_block_read_uc(ptr);
#else
uint idx = get_sub_group_local_id();
uchar ret;
ret = ptr[idx];
return ret;
#endif
}
#define MMAD_8(A, B, C) FUNC_CALL(mmad8)(A, B, C)
#define MMAD_4x8(A, B, C) FUNC_CALL(mmad4x8)(A, B, C)
#define MMAD_8x8(A, B, C) FUNC_CALL(mmad8x8)(A, B, C)
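
To show what the added __attribute__((overloadable)) qualifiers and __local overloads buy, a hypothetical snippet; the same FUNC_CALL name now resolves by the pointer's address space, using the intel_sub_group_block_read_uc* intrinsics when cl_intel_subgroup_local_block_io (and cl_intel_subgroups_char) is available, and the scalar per-work-item fallback above otherwise:

__kernel void example_uchar_block_read(__global const uchar* src, __global uint* dst)
{
    __local uchar slm_buf[256];

    // Illustrative staging of data into local memory.
    for (uint i = get_local_id(0); i < 256; i += get_local_size(0))
        slm_buf[i] = src[i];
    barrier(CLK_LOCAL_MEM_FENCE);

    // Picks the __global overload.
    uchar4 g = FUNC_CALL(sub_group_block_read_uchar4)(src);
    // Picks the new __local overload.
    uchar4 l = FUNC_CALL(sub_group_block_read_uchar4)(slm_buf);

    dst[get_global_id(0)] = (uint)g.s0 + (uint)l.s0;
}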

View File

@ -149,7 +149,7 @@ Arguments common_kernel_base::GetArgsDesc(uint32_t num_of_input,
std::shared_ptr<KernelString> common_kernel_base::GetKernelString(const std::string& name,
const std::string& jit,
const std::string& entry_point,
const EngineInfo& /*engine_info*/,
const EngineInfo& engine_info,
const std::string& exe_mode) const {
std::shared_ptr<KernelString> kernel_string = std::make_shared<KernelString>();
@ -159,6 +159,10 @@ std::shared_ptr<KernelString> common_kernel_base::GetKernelString(const std::str
kernel_string->str = codes[0];
kernel_string->jit = jit;
kernel_string->options = exe_mode + " -cl-mad-enable";
if (engine_info.bOptHintsSupport)
kernel_string->options += " -DOPT_HINTS_SUPPORTED=1";
if (engine_info.bLocalBlockIOSupport)
kernel_string->options += " -Dcl_intel_subgroup_local_block_io";
kernel_string->entry_point = entry_point;
kernel_string->batch_compilation = true;
}
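
As a hedged sketch of the end-to-end effect: with both flags set, the options string built above should look roughly like "<exe_mode> -cl-mad-enable -DOPT_HINTS_SUPPORTED=1 -Dcl_intel_subgroup_local_block_io", which a kernel (hypothetical name below) can probe through the preprocessor:

__kernel void example_build_option_probe(__global int* out)
{
#if defined(OPT_HINTS_SUPPORTED) && defined(cl_intel_subgroup_local_block_io)
    out[0] = 1;  // both early-optimization paths were enabled at compile time
#else
    out[0] = 0;
#endif
}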

View File

@ -385,6 +385,8 @@ struct EngineInfo {
bool bImageSupport = false;
bool bIMADSupport = false;
bool bIMMADSupport = false;
bool bOptHintsSupport = false;
bool bLocalBlockIOSupport = false;
uint32_t computeUnitsCount = 0;
uint64_t maxWorkGroupSize = 0;
uint64_t maxLocalMemSize = 0;

View File

@ -68,6 +68,9 @@ device_info_internal::device_info_internal(const cl::Device& device) {
vendor_id = static_cast<uint32_t>(device.getInfo<CL_DEVICE_VENDOR_ID>());
supports_usm = extensions.find("cl_intel_unified_shared_memory") != std::string::npos;
supports_optimization_hints = false;
supports_local_block_io = extensions.find("cl_intel_subgroup_local_block_io") != std::string::npos;
}
} // namespace gpu
} // namespace cldnn

View File

@ -29,6 +29,8 @@ struct device_info_internal : cldnn::device_info {
uint32_t dev_type;
uint32_t vendor_id;
uint8_t supports_usm;
bool supports_optimization_hints;
bool supports_local_block_io;
explicit device_info_internal(const cl::Device& device);

View File

@ -133,7 +133,9 @@ gpu_toolkit::gpu_toolkit(const device_impl& device_impl, const configuration& co
<< " local memory size: " << device_info.max_local_mem_size << "\n"
<< " fp16: " << std::boolalpha << (device_info.supports_fp16 != 0) << "\n"
<< " fp16 denorms: " << std::boolalpha << (device_info.supports_fp16_denorms != 0) << "\n"
<< " subgroups short: " << std::boolalpha << (device_info.supports_subgroups_short != 0) << std::endl;
<< " subgroups short: " << std::boolalpha << (device_info.supports_subgroups_short != 0) << "\n"
<< " local block io: " << std::boolalpha << device_info.supports_local_block_io << "\n"
<< " optimization hints: " << std::boolalpha << device_info.supports_optimization_hints << std::endl;
}
}

View File

@ -703,6 +703,8 @@ void set_params(const program_node& node, kernel_selector::params& params) {
params.engineInfo.bIMADSupport = device_info.supports_imad != 0;
params.engineInfo.bIMMADSupport = device_info.supports_immad != 0;
params.engineInfo.bImageSupport = device_info.supports_image != 0;
params.engineInfo.bOptHintsSupport = device_info.supports_optimization_hints;
params.engineInfo.bLocalBlockIOSupport = device_info.supports_local_block_io;
params.engineInfo.maxWorkGroupSize = device_info.max_work_group_size;
params.engineInfo.maxLocalMemSize = device_info.max_local_mem_size;
params.engineInfo.maxImage2dWidth = device_info.max_image2d_width;