[IE CLDNN] Add additional check for local block io support (#1211)

This change is needed because some OpenCL compiler versions may advertise support for the extension but fail to compile some of its functions.
2020-07-05 17:56:13 +02:00 · 2020-07-05 17:56:13 +02:00 · fee4a01b26
commit fee4a01b26
parent df772e082a
3 changed files with 52 additions and 7 deletions
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/mmad.cl
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/mmad.cl
@ -555,7 +555,7 @@ inline uchar16 FUNC(sub_group_block_read_uchar16)(const __global uchar* ptr) __a

 inline uchar16 FUNC(sub_group_block_read_uchar16)(const __local uchar* ptr) __attribute__((overloadable))
 {
-#if defined(cl_intel_subgroup_local_block_io) && defined(cl_intel_subgroups_char)
+#if LOCAL_BLOCK_IO_SUPPORTED && defined(cl_intel_subgroup_local_block_io) && defined(cl_intel_subgroups_char)
    // WA for compiler support
    // return intel_sub_group_block_read_uc16(ptr);
    return (uchar16)(intel_sub_group_block_read_uc8(ptr), intel_sub_group_block_read_uc8(ptr + 8 * get_max_sub_group_size()));
@ -627,7 +627,7 @@ inline uchar8 FUNC(sub_group_block_read_uchar8)(const __global uchar* ptr) __att

 inline uchar8 FUNC(sub_group_block_read_uchar8)(const __local uchar* ptr) __attribute__((overloadable))
 {
-#if defined(cl_intel_subgroup_local_block_io) && defined(cl_intel_subgroups_char)
+#if LOCAL_BLOCK_IO_SUPPORTED && defined(cl_intel_subgroup_local_block_io) && defined(cl_intel_subgroups_char)
    return intel_sub_group_block_read_uc8(ptr);
 #else
    uint idx = get_sub_group_local_id();
@ -681,7 +681,7 @@ inline uchar4 FUNC(sub_group_block_read_uchar4)(const __global uchar* ptr) __att

 inline uchar4 FUNC(sub_group_block_read_uchar4)(const __local uchar* ptr) __attribute__((overloadable))
 {
-#if defined(cl_intel_subgroup_local_block_io) && defined(cl_intel_subgroups_char)
+#if LOCAL_BLOCK_IO_SUPPORTED && defined(cl_intel_subgroup_local_block_io) && defined(cl_intel_subgroups_char)
    return intel_sub_group_block_read_uc4(ptr);
 #else
    uint idx = get_sub_group_local_id();
@ -727,7 +727,7 @@ inline uchar2 FUNC(sub_group_block_read_uchar2)(const __global uchar* ptr) __att

 inline uchar2 FUNC(sub_group_block_read_uchar2)(const __local uchar* ptr) __attribute__((overloadable))
 {
-#if defined(cl_intel_subgroup_local_block_io) && defined(cl_intel_subgroups_char)
+#if LOCAL_BLOCK_IO_SUPPORTED && defined(cl_intel_subgroup_local_block_io) && defined(cl_intel_subgroups_char)
    return intel_sub_group_block_read_uc2(ptr);
 #else
    uint idx = get_sub_group_local_id();
@ -769,7 +769,7 @@ inline uchar FUNC(sub_group_block_read_uchar)(const __global uchar* ptr) __attri

 inline uchar FUNC(sub_group_block_read_uchar)(const __local uchar* ptr) __attribute__((overloadable))
 {
-#if defined(cl_intel_subgroup_local_block_io) && defined(cl_intel_subgroups_char)
+#if LOCAL_BLOCK_IO_SUPPORTED && defined(cl_intel_subgroup_local_block_io) && defined(cl_intel_subgroups_char)
    return intel_sub_group_block_read_uc(ptr);
 #else
    uint idx = get_sub_group_local_id();
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/common_kernel_base.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/common_kernel_base.cpp
@ -152,7 +152,7 @@ std::shared_ptr<KernelString> common_kernel_base::GetKernelString(const std::str
        if (engine_info.bOptHintsSupport)
            kernel_string->options += " -DOPT_HINS_SUPPORTED=1";
        if (engine_info.bLocalBlockIOSupport)
-            kernel_string->options += " -Dcl_intel_subgroup_local_block_io";
+            kernel_string->options += " -Dcl_intel_subgroup_local_block_io -DLOCAL_BLOCK_IO_SUPPORTED=1";
        kernel_string->entry_point = entry_point;
        kernel_string->batch_compilation = true;
    }
--- a/inference-engine/thirdparty/clDNN/src/gpu/device_info.cpp
+++ b/inference-engine/thirdparty/clDNN/src/gpu/device_info.cpp
@ -30,6 +30,50 @@
 namespace cldnn {
 namespace gpu {

+namespace {
+
+// Probes whether local-memory sub-group block I/O really works on `device`.
+//
+// A small test kernel is compiled and executed: each of the 8 work-items
+// block-writes a uchar2 holding (lid * 2) into local memory, block-reads it
+// back and stores (read.s0 + 1) to the output buffer.
+//
+// Returns true only when the program builds and every output byte matches the
+// expected value. Returns false when the build fails, when any value differs,
+// or when any step throws - every failure is treated as "not supported".
+bool is_local_block_io_supported(const cl::Device& device) {
+    try {
+        cl::Context ctx(device);
+        std::string kernel_code =
+            "__attribute__((intel_reqd_sub_group_size(8)))"
+            "__attribute__((reqd_work_group_size(8, 1, 1)))"
+            "void kernel is_local_block_io_supported(global uchar* dst) {"
+            "    uint lid = get_sub_group_local_id();"
+            "    uchar val = (uchar)lid * 2;"
+            // A uc2 block access with sub-group size 8 touches p[lid] and p[lid + 8],
+            // i.e. 2 * 8 = 16 bytes, so the scratch buffer must hold 16 uchars.
+            "    __local uchar tmp_slm[16];"
+            "    intel_sub_group_block_write_uc2(tmp_slm, (uchar2)(val));"
+            "    barrier(CLK_LOCAL_MEM_FENCE);"
+            "    uchar2 read = intel_sub_group_block_read_uc2(tmp_slm);"
+            "    dst[lid] = read.s0 + 1;"
+            "}";
+        cl::Program program(ctx, kernel_code);
+        // A build failure means the compiler cannot handle the block I/O functions.
+        if (program.build({ device }, "-Dcl_intel_subgroup_local_block_io") != CL_SUCCESS)
+            return false;
+        cl::Buffer buffer(ctx, CL_MEM_READ_WRITE, sizeof(uint8_t) * 8);
+        cl::Kernel kernel(program, "is_local_block_io_supported");
+        kernel.setArg(0, buffer);
+
+        // Launch a single work-group of 8 work-items (one sub-group of size 8).
+        cl::Event ev;
+        cl::CommandQueue queue(ctx, device);
+        queue.enqueueNDRangeKernel(kernel, cl::NDRange(), cl::NDRange(8), cl::NDRange(8), nullptr, &ev);
+        ev.wait();
+
+        // Lane i wrote (i * 2) and the kernel stored that value plus one.
+        uint8_t result[8];
+        uint8_t expected[8] = { 1, 3, 5, 7, 9, 11, 13, 15 };
+        queue.enqueueReadBuffer(buffer, CL_TRUE, 0, sizeof(uint8_t) * 8, &result);
+        for (int i = 0; i < 8; ++i) {
+            if (result[i] != expected[i])
+                return false;
+        }
+        return true;
+    } catch (...) {
+        // Deliberate catch-all: any OpenCL error during the probe means "unsupported".
+        return false;
+    }
+}
+
+}  // namespace
+
 device_info_internal::device_info_internal(const cl::Device& device) {
    dev_name = device.getInfo<CL_DEVICE_NAME>();
    driver_version = device.getInfo<CL_DRIVER_VERSION>();
@ -70,7 +114,8 @@ device_info_internal::device_info_internal(const cl::Device& device) {
    supports_usm = extensions.find("cl_intel_unified_shared_memory") != std::string::npos;

    supports_optimization_hints = false;
-    supports_local_block_io = extensions.find("cl_intel_subgroup_local_block_io") != std::string::npos;
+    supports_local_block_io = extensions.find("cl_intel_subgroup_local_block_io") != std::string::npos &&
+                              is_local_block_io_supported(device);
 }
 }  // namespace gpu
 }  // namespace cldnn