[IE CLDNN] Add additional check for local block io support (#1211)

This change is needed because some OpenCL compiler versions may advertise
support for the extension but fail to compile some of its functions.
This commit is contained in:
Konrad Dobros 2020-07-05 17:56:13 +02:00 committed by GitHub
parent df772e082a
commit fee4a01b26
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 52 additions and 7 deletions

View File

@ -555,7 +555,7 @@ inline uchar16 FUNC(sub_group_block_read_uchar16)(const __global uchar* ptr) __a
inline uchar16 FUNC(sub_group_block_read_uchar16)(const __local uchar* ptr) __attribute__((overloadable))
{
#if defined(cl_intel_subgroup_local_block_io) && defined(cl_intel_subgroups_char)
#if LOCAL_BLOCK_IO_SUPPORTED && defined(cl_intel_subgroup_local_block_io) && defined(cl_intel_subgroups_char)
// WA for compiler support
// return intel_sub_group_block_read_uc16(ptr);
return (uchar16)(intel_sub_group_block_read_uc8(ptr), intel_sub_group_block_read_uc8(ptr + 8 * get_max_sub_group_size()));
@ -627,7 +627,7 @@ inline uchar8 FUNC(sub_group_block_read_uchar8)(const __global uchar* ptr) __att
inline uchar8 FUNC(sub_group_block_read_uchar8)(const __local uchar* ptr) __attribute__((overloadable))
{
#if defined(cl_intel_subgroup_local_block_io) && defined(cl_intel_subgroups_char)
#if LOCAL_BLOCK_IO_SUPPORTED && defined(cl_intel_subgroup_local_block_io) && defined(cl_intel_subgroups_char)
return intel_sub_group_block_read_uc8(ptr);
#else
uint idx = get_sub_group_local_id();
@ -681,7 +681,7 @@ inline uchar4 FUNC(sub_group_block_read_uchar4)(const __global uchar* ptr) __att
inline uchar4 FUNC(sub_group_block_read_uchar4)(const __local uchar* ptr) __attribute__((overloadable))
{
#if defined(cl_intel_subgroup_local_block_io) && defined(cl_intel_subgroups_char)
#if LOCAL_BLOCK_IO_SUPPORTED && defined(cl_intel_subgroup_local_block_io) && defined(cl_intel_subgroups_char)
return intel_sub_group_block_read_uc4(ptr);
#else
uint idx = get_sub_group_local_id();
@ -727,7 +727,7 @@ inline uchar2 FUNC(sub_group_block_read_uchar2)(const __global uchar* ptr) __att
inline uchar2 FUNC(sub_group_block_read_uchar2)(const __local uchar* ptr) __attribute__((overloadable))
{
#if defined(cl_intel_subgroup_local_block_io) && defined(cl_intel_subgroups_char)
#if LOCAL_BLOCK_IO_SUPPORTED && defined(cl_intel_subgroup_local_block_io) && defined(cl_intel_subgroups_char)
return intel_sub_group_block_read_uc2(ptr);
#else
uint idx = get_sub_group_local_id();
@ -769,7 +769,7 @@ inline uchar FUNC(sub_group_block_read_uchar)(const __global uchar* ptr) __attri
inline uchar FUNC(sub_group_block_read_uchar)(const __local uchar* ptr) __attribute__((overloadable))
{
#if defined(cl_intel_subgroup_local_block_io) && defined(cl_intel_subgroups_char)
#if LOCAL_BLOCK_IO_SUPPORTED && defined(cl_intel_subgroup_local_block_io) && defined(cl_intel_subgroups_char)
return intel_sub_group_block_read_uc(ptr);
#else
uint idx = get_sub_group_local_id();

View File

@ -152,7 +152,7 @@ std::shared_ptr<KernelString> common_kernel_base::GetKernelString(const std::str
if (engine_info.bOptHintsSupport)
kernel_string->options += " -DOPT_HINS_SUPPORTED=1";
if (engine_info.bLocalBlockIOSupport)
kernel_string->options += " -Dcl_intel_subgroup_local_block_io";
kernel_string->options += " -Dcl_intel_subgroup_local_block_io -DLOCAL_BLOCK_IO_SUPPORTED=1";
kernel_string->entry_point = entry_point;
kernel_string->batch_compilation = true;
}

View File

@ -30,6 +30,50 @@
namespace cldnn {
namespace gpu {
namespace {
// Probes whether sub-group block reads/writes on local memory actually work on
// `device`, instead of trusting the advertised extension string alone (some
// OpenCL compiler versions advertise the extension but fail to compile some of
// its functions).
//
// The probe creates a temporary context, builds a tiny kernel that round-trips
// data through local memory with intel_sub_group_block_write_uc2 /
// intel_sub_group_block_read_uc2, runs it on one work-group of 8 work-items and
// compares the output with the expected pattern.
//
// Returns true only when the kernel builds, runs and yields the expected
// values. Any OpenCL failure - reported either as an exception or as a
// non-success status code - is treated as "not supported" and yields false.
bool is_local_block_io_supported(const cl::Device& device) {
    try {
        cl::Context ctx(device);
        // A uchar2 block access moves 2 bytes per work-item, i.e. 2 * 8 = 16
        // bytes for the 8-wide sub-group, so the local scratch array must hold
        // 16 bytes (8 would be overrun by the write and the read).
        // NOTE(review): the 16-byte alignment is believed to be required by the
        // extension for local block I/O pointers - confirm against the spec.
        std::string kernel_code =
            "__attribute__((intel_reqd_sub_group_size(8)))"
            "__attribute__((reqd_work_group_size(8, 1, 1)))"
            "void kernel is_local_block_io_supported(global uchar* dst) {"
            "    uint lid = get_sub_group_local_id();"
            "    uchar val = (uchar)lid * 2;"
            "    __local uchar tmp_slm[16] __attribute__((aligned(16)));"
            "    intel_sub_group_block_write_uc2(tmp_slm, (uchar2)(val));"
            "    barrier(CLK_LOCAL_MEM_FENCE);"
            "    uchar2 read = intel_sub_group_block_read_uc2(tmp_slm);"
            "    dst[lid] = read.s0 + 1;"
            "}";
        cl::Program program(ctx, kernel_code);
        if (program.build({ device }, "-Dcl_intel_subgroup_local_block_io") != CL_SUCCESS)
            return false;

        cl::Buffer buffer(ctx, CL_MEM_READ_WRITE, sizeof(uint8_t) * 8);
        cl::Kernel kernel(program, "is_local_block_io_supported");
        if (kernel.setArg(0, buffer) != CL_SUCCESS)
            return false;

        // Status codes are checked explicitly so the probe also fails cleanly
        // when the C++ bindings are built without exception support.
        cl::Event ev;
        cl::CommandQueue queue(ctx, device);
        if (queue.enqueueNDRangeKernel(kernel, cl::NDRange(), cl::NDRange(8), cl::NDRange(8), nullptr, &ev) != CL_SUCCESS)
            return false;
        if (ev.wait() != CL_SUCCESS)
            return false;

        // Zero-initialized so that a read which silently fails can never match
        // the (all non-zero) expected pattern.
        uint8_t result[8] = {};
        // Work-item `lid` writes lid * 2 and the kernel stores read.s0 + 1.
        const uint8_t expected[8] = { 1, 3, 5, 7, 9, 11, 13, 15 };
        if (queue.enqueueReadBuffer(buffer, CL_TRUE, 0, sizeof(result), result) != CL_SUCCESS)
            return false;

        for (int i = 0; i < 8; ++i) {
            if (result[i] != expected[i])
                return false;
        }
        return true;
    } catch (...) {
        // Deliberate best-effort: any failure while probing means the feature
        // is reported as unsupported rather than aborting device discovery.
        return false;
    }
}
} // namespace
device_info_internal::device_info_internal(const cl::Device& device) {
dev_name = device.getInfo<CL_DEVICE_NAME>();
driver_version = device.getInfo<CL_DRIVER_VERSION>();
@ -70,7 +114,8 @@ device_info_internal::device_info_internal(const cl::Device& device) {
supports_usm = extensions.find("cl_intel_unified_shared_memory") != std::string::npos;
supports_optimization_hints = false;
supports_local_block_io = extensions.find("cl_intel_subgroup_local_block_io") != std::string::npos;
supports_local_block_io = extensions.find("cl_intel_subgroup_local_block_io") != std::string::npos &&
is_local_block_io_supported(device);
}
} // namespace gpu
} // namespace cldnn