diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/common.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/common.cl index 88e4f708a47..d7af01e934d 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/common.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/common.cl @@ -29,3 +29,9 @@ #define OFFSET_GLOBAL_PTR(elem_type, ptr, byte_offset) ((__global elem_type*)((__global char*)(ptr) + (byte_offset))) #define MULTIPLY_OFFSET(elem_type, byte_offset) ((byte_offset) * sizeof(elem_type)) + +#if OPT_HINTS_SUPPORTED +# define ASSUME_HINT(x) __builtin_assume(x) +#else +# define ASSUME_HINT(x) do { } while (0) +#endif diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/data_types.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/data_types.cl index eb8032f1f75..b0ad478ed65 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/data_types.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/data_types.cl @@ -58,6 +58,11 @@ // For more details and description of intel_sub_group_block_read/write functions please, // refer to cl_intel_subgroups extension documentation. // +// BLOCK_READN_SLM(type, vector_size, ptr, offset) +// - performs same operation as BLOCK_READN, but with "ptr" being in __local address space. +// BLOCK_WRITEN_SLM(type, vector_size, ptr, offset, val) +// - performs same operation as BLOCK_WRITEN, but with "ptr" being in __local address space. +// // type [PP] - Must evaluate to non-vectorized type, ex. float, half, char, etc.. // vector_size [PP] - Number of elements to read/write, ex 2 for intel_sub_group_block_read2. // ptr - Pointer to global memory where to read from/write to. 
@@ -101,17 +106,22 @@ #define BLOCK_WRITEN_FUNC_size4(vector_size) BLOCK_WRITEN_FUNC_SIZE_DEF(4, vector_size) #define BLOCK_WRITEN_FUNC(type_size, vector_size) CAT(BLOCK_WRITEN_FUNC_size, type_size)(vector_size) -#define BLOCK_READN_RAW(type_size, vector_size, ptr, offset) \ - BLOCK_READN_FUNC(type_size, vector_size)((const __global BLOCK_RW_TYPE(type_size)*)(ptr) + (offset)) -#define BLOCK_WRITEN_RAW(type_size, vector_size, ptr, offset, val) \ +#define BLOCK_READN_RAW(type_size, vector_size, addr_space, ptr, offset) \ + BLOCK_READN_FUNC(type_size, vector_size)((const addr_space BLOCK_RW_TYPE(type_size)*)(ptr) + (offset)) +#define BLOCK_WRITEN_RAW(type_size, vector_size, addr_space, ptr, offset, val) \ BLOCK_WRITEN_FUNC(type_size, vector_size)( \ - (__global BLOCK_RW_TYPE(type_size)*)(ptr) + (offset), \ + (addr_space BLOCK_RW_TYPE(type_size)*)(ptr) + (offset), \ AS_TYPE(MAKE_VECTOR_TYPE(BLOCK_RW_TYPE(type_size), vector_size), val)) #define BLOCK_READN(type, vector_size, ptr, offset) \ - AS_TYPE(MAKE_VECTOR_TYPE(type, vector_size), BLOCK_READN_RAW(TYPE_SIZE(type), vector_size, ptr, offset)) + AS_TYPE(MAKE_VECTOR_TYPE(type, vector_size), BLOCK_READN_RAW(TYPE_SIZE(type), vector_size, __global, ptr, offset)) #define BLOCK_WRITEN(type, vector_size, ptr, offset, val) \ - BLOCK_WRITEN_RAW(TYPE_SIZE(type), vector_size, ptr, offset, val) + BLOCK_WRITEN_RAW(TYPE_SIZE(type), vector_size, __global, ptr, offset, val) + +#define BLOCK_READN_SLM(type, vector_size, ptr, offset) \ + AS_TYPE(MAKE_VECTOR_TYPE(type, vector_size), BLOCK_READN_RAW(TYPE_SIZE(type), vector_size, __local, ptr, offset)) +#define BLOCK_WRITEN_SLM(type, vector_size, ptr, offset, val) \ + BLOCK_WRITEN_RAW(TYPE_SIZE(type), vector_size, __local, ptr, offset, val) #define DT_INPUT_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT0_TYPE, 1, ptr, offset) #define DT_INPUT_BLOCK_READ2(ptr, offset) BLOCK_READN(INPUT0_TYPE, 2, ptr, offset) diff --git 
a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/mmad.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/mmad.cl index 3aab5038624..6ea974f1436 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/mmad.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/mmad.cl @@ -521,7 +521,7 @@ inline void FUNC(sub_group_block_write_uchar16)(__global uchar* outPtr, uchar16 #endif } -inline uchar16 FUNC(sub_group_block_read_uchar16)(const __global uchar* ptr) +inline uchar16 FUNC(sub_group_block_read_uchar16)(const __global uchar* ptr) __attribute__((overloadable)) { #ifdef cl_intel_subgroups_char // WA for compiler support @@ -553,6 +553,38 @@ inline uchar16 FUNC(sub_group_block_read_uchar16)(const __global uchar* ptr) #endif } +inline uchar16 FUNC(sub_group_block_read_uchar16)(const __local uchar* ptr) __attribute__((overloadable)) +{ +#if defined(cl_intel_subgroup_local_block_io) && defined(cl_intel_subgroups_char) + // WA for compiler support + // return intel_sub_group_block_read_uc16(ptr); + return (uchar16)(intel_sub_group_block_read_uc8(ptr), intel_sub_group_block_read_uc8(ptr + 8 * get_max_sub_group_size())); +#else + uint idx = get_sub_group_local_id(); + + uchar16 ret; + + ret.s0 = ptr[idx]; idx += get_max_sub_group_size(); + ret.s1 = ptr[idx]; idx += get_max_sub_group_size(); + ret.s2 = ptr[idx]; idx += get_max_sub_group_size(); + ret.s3 = ptr[idx]; idx += get_max_sub_group_size(); + ret.s4 = ptr[idx]; idx += get_max_sub_group_size(); + ret.s5 = ptr[idx]; idx += get_max_sub_group_size(); + ret.s6 = ptr[idx]; idx += get_max_sub_group_size(); + ret.s7 = ptr[idx]; idx += get_max_sub_group_size(); + ret.s8 = ptr[idx]; idx += get_max_sub_group_size(); + ret.s9 = ptr[idx]; idx += get_max_sub_group_size(); + ret.sa = ptr[idx]; idx += get_max_sub_group_size(); + ret.sb = ptr[idx]; idx += get_max_sub_group_size(); + ret.sc = ptr[idx]; idx += 
get_max_sub_group_size(); + ret.sd = ptr[idx]; idx += get_max_sub_group_size(); + ret.se = ptr[idx]; idx += get_max_sub_group_size(); + ret.sf = ptr[idx]; idx += get_max_sub_group_size(); + + return ret; +#endif +} + inline void FUNC(sub_group_block_write_uchar8)(__global uchar* outPtr, uchar8 v) { #ifdef cl_intel_subgroups_char @@ -571,7 +603,7 @@ inline void FUNC(sub_group_block_write_uchar8)(__global uchar* outPtr, uchar8 v) #endif } -inline uchar8 FUNC(sub_group_block_read_uchar8)(const __global uchar* ptr) +inline uchar8 FUNC(sub_group_block_read_uchar8)(const __global uchar* ptr) __attribute__((overloadable)) { #ifdef cl_intel_subgroups_char return intel_sub_group_block_read_uc8(ptr); @@ -590,7 +622,28 @@ inline uchar8 FUNC(sub_group_block_read_uchar8)(const __global uchar* ptr) ret.s7 = ptr[idx]; idx += get_max_sub_group_size(); return ret; +#endif +} +inline uchar8 FUNC(sub_group_block_read_uchar8)(const __local uchar* ptr) __attribute__((overloadable)) +{ +#if defined(cl_intel_subgroup_local_block_io) && defined(cl_intel_subgroups_char) + return intel_sub_group_block_read_uc8(ptr); +#else + uint idx = get_sub_group_local_id(); + + uchar8 ret; + + ret.s0 = ptr[idx]; idx += get_max_sub_group_size(); + ret.s1 = ptr[idx]; idx += get_max_sub_group_size(); + ret.s2 = ptr[idx]; idx += get_max_sub_group_size(); + ret.s3 = ptr[idx]; idx += get_max_sub_group_size(); + ret.s4 = ptr[idx]; idx += get_max_sub_group_size(); + ret.s5 = ptr[idx]; idx += get_max_sub_group_size(); + ret.s6 = ptr[idx]; idx += get_max_sub_group_size(); + ret.s7 = ptr[idx]; idx += get_max_sub_group_size(); + + return ret; #endif } @@ -608,7 +661,7 @@ inline void FUNC(sub_group_block_write_uchar4)(__global uchar* outPtr, uchar4 v) #endif } -inline uchar4 FUNC(sub_group_block_read_uchar4)(const __global uchar* ptr) +inline uchar4 FUNC(sub_group_block_read_uchar4)(const __global uchar* ptr) __attribute__((overloadable)) { #ifdef cl_intel_subgroups_char return intel_sub_group_block_read_uc4(ptr); 
@@ -626,6 +679,24 @@ inline uchar4 FUNC(sub_group_block_read_uchar4)(const __global uchar* ptr) #endif } +inline uchar4 FUNC(sub_group_block_read_uchar4)(const __local uchar* ptr) __attribute__((overloadable)) +{ +#if defined(cl_intel_subgroup_local_block_io) && defined(cl_intel_subgroups_char) + return intel_sub_group_block_read_uc4(ptr); +#else + uint idx = get_sub_group_local_id(); + + uchar4 ret; + + ret.s0 = ptr[idx]; idx += get_max_sub_group_size(); + ret.s1 = ptr[idx]; idx += get_max_sub_group_size(); + ret.s2 = ptr[idx]; idx += get_max_sub_group_size(); + ret.s3 = ptr[idx]; idx += get_max_sub_group_size(); + + return ret; +#endif +} + inline void FUNC(sub_group_block_write_uchar2)(__global uchar* outPtr, uchar2 v) { #ifdef cl_intel_subgroups_char @@ -638,7 +709,7 @@ inline void FUNC(sub_group_block_write_uchar2)(__global uchar* outPtr, uchar2 v) #endif } -inline uchar2 FUNC(sub_group_block_read_uchar2)(const __global uchar* ptr) +inline uchar2 FUNC(sub_group_block_read_uchar2)(const __global uchar* ptr) __attribute__((overloadable)) { #ifdef cl_intel_subgroups_char return intel_sub_group_block_read_uc2(ptr); @@ -654,6 +725,22 @@ inline uchar2 FUNC(sub_group_block_read_uchar2)(const __global uchar* ptr) #endif } +inline uchar2 FUNC(sub_group_block_read_uchar2)(const __local uchar* ptr) __attribute__((overloadable)) +{ +#if defined(cl_intel_subgroup_local_block_io) && defined(cl_intel_subgroups_char) + return intel_sub_group_block_read_uc2(ptr); +#else + uint idx = get_sub_group_local_id(); + + uchar2 ret; + + ret.s0 = ptr[idx]; idx += get_max_sub_group_size(); + ret.s1 = ptr[idx]; idx += get_max_sub_group_size(); + + return ret; +#endif +} + inline void FUNC(sub_group_block_write_uchar)(__global uchar* outPtr, uchar v) { #ifdef cl_intel_subgroups_char @@ -665,7 +752,7 @@ inline void FUNC(sub_group_block_write_uchar)(__global uchar* outPtr, uchar v) #endif } -inline uchar FUNC(sub_group_block_read_uchar)(const __global uchar* ptr) +inline uchar 
FUNC(sub_group_block_read_uchar)(const __global uchar* ptr) __attribute__((overloadable)) { #ifdef cl_intel_subgroups_char return intel_sub_group_block_read_uc(ptr); @@ -680,6 +767,21 @@ inline uchar FUNC(sub_group_block_read_uchar)(const __global uchar* ptr) #endif } +inline uchar FUNC(sub_group_block_read_uchar)(const __local uchar* ptr) __attribute__((overloadable)) +{ +#if defined(cl_intel_subgroup_local_block_io) && defined(cl_intel_subgroups_char) + return intel_sub_group_block_read_uc(ptr); +#else + uint idx = get_sub_group_local_id(); + + uchar ret; + + ret = ptr[idx]; + + return ret; +#endif +} + #define MMAD_8(A, B, C) FUNC_CALL(mmad8)(A, B, C) #define MMAD_4x8(A, B, C) FUNC_CALL(mmad4x8)(A, B, C) #define MMAD_8x8(A, B, C) FUNC_CALL(mmad8x8)(A, B, C) diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/common_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/common_kernel_base.cpp index cd8ebb76a20..fb84aa9c33d 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/common_kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/common_kernel_base.cpp @@ -149,7 +149,7 @@ Arguments common_kernel_base::GetArgsDesc(uint32_t num_of_input, std::shared_ptr common_kernel_base::GetKernelString(const std::string& name, const std::string& jit, const std::string& entry_point, - const EngineInfo& /*engine_info*/, + const EngineInfo& engine_info, const std::string& exe_mode) const { std::shared_ptr kernel_string = std::make_shared(); @@ -159,6 +159,10 @@ std::shared_ptr common_kernel_base::GetKernelString(const std::str kernel_string->str = codes[0]; kernel_string->jit = jit; kernel_string->options = exe_mode + " -cl-mad-enable"; + if (engine_info.bOptHintsSupport) + kernel_string->options += " -DOPT_HINTS_SUPPORTED=1"; + if (engine_info.bLocalBlockIOSupport) + kernel_string->options += " -Dcl_intel_subgroup_local_block_io"; kernel_string->entry_point = entry_point; 
kernel_string->batch_compilation = true; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_params.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_params.h index d50e91cef56..436928b2fe3 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_params.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_params.h @@ -385,6 +385,8 @@ struct EngineInfo { bool bImageSupport = false; bool bIMADSupport = false; bool bIMMADSupport = false; + bool bOptHintsSupport = false; + bool bLocalBlockIOSupport = false; uint32_t computeUnitsCount = 0; uint64_t maxWorkGroupSize = 0; uint64_t maxLocalMemSize = 0; diff --git a/inference-engine/thirdparty/clDNN/src/gpu/device_info.cpp b/inference-engine/thirdparty/clDNN/src/gpu/device_info.cpp index 1402e05d551..f9b1930e1e4 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/device_info.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/device_info.cpp @@ -68,6 +68,9 @@ device_info_internal::device_info_internal(const cl::Device& device) { vendor_id = static_cast(device.getInfo()); supports_usm = extensions.find("cl_intel_unified_shared_memory") != std::string::npos; + + supports_optimization_hints = false; + supports_local_block_io = extensions.find("cl_intel_subgroup_local_block_io") != std::string::npos; } } // namespace gpu } // namespace cldnn diff --git a/inference-engine/thirdparty/clDNN/src/gpu/device_info.h b/inference-engine/thirdparty/clDNN/src/gpu/device_info.h index 4ca0aaa2ec4..076bf76b034 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/device_info.h +++ b/inference-engine/thirdparty/clDNN/src/gpu/device_info.h @@ -29,6 +29,8 @@ struct device_info_internal : cldnn::device_info { uint32_t dev_type; uint32_t vendor_id; uint8_t supports_usm; + bool supports_optimization_hints; + bool supports_local_block_io; explicit device_info_internal(const cl::Device& device); diff --git 
a/inference-engine/thirdparty/clDNN/src/gpu/ocl_toolkit.cpp b/inference-engine/thirdparty/clDNN/src/gpu/ocl_toolkit.cpp index 5e7ba6da211..c3da4f20027 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/ocl_toolkit.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/ocl_toolkit.cpp @@ -133,7 +133,9 @@ gpu_toolkit::gpu_toolkit(const device_impl& device_impl, const configuration& co << " local memory size: " << device_info.max_local_mem_size << "\n" << " fp16: " << std::boolalpha << (device_info.supports_fp16 != 0) << "\n" << " fp16 denorms: " << std::boolalpha << (device_info.supports_fp16_denorms != 0) << "\n" - << " subgroups short: " << std::boolalpha << (device_info.supports_subgroups_short != 0) << std::endl; + << " subgroups short: " << std::boolalpha << (device_info.supports_subgroups_short != 0) << "\n" + << " local block io: " << std::boolalpha << device_info.supports_local_block_io << "\n" + << " optimization hints: " << std::boolalpha << device_info.supports_optimization_hints << std::endl; } } diff --git a/inference-engine/thirdparty/clDNN/src/kernel_selector_helper.cpp b/inference-engine/thirdparty/clDNN/src/kernel_selector_helper.cpp index 94ef808e9ec..88c47643528 100644 --- a/inference-engine/thirdparty/clDNN/src/kernel_selector_helper.cpp +++ b/inference-engine/thirdparty/clDNN/src/kernel_selector_helper.cpp @@ -703,6 +703,8 @@ void set_params(const program_node& node, kernel_selector::params& params) { params.engineInfo.bIMADSupport = device_info.supports_imad != 0; params.engineInfo.bIMMADSupport = device_info.supports_immad != 0; params.engineInfo.bImageSupport = device_info.supports_image != 0; + params.engineInfo.bOptHintsSupport = device_info.supports_optimization_hints; + params.engineInfo.bLocalBlockIOSupport = device_info.supports_local_block_io; params.engineInfo.maxWorkGroupSize = device_info.max_work_group_size; params.engineInfo.maxLocalMemSize = device_info.max_local_mem_size; params.engineInfo.maxImage2dWidth = 
device_info.max_image2d_width;