[IE CLDNN] Added int8 output support into bfyx_to_fsv16 fp kernel (#2906)
parent 924988590a
commit cf00a2f442
@@ -50,6 +50,8 @@ ParamsKey ConvolutionKernel_bfyx_to_bfyx_f16::GetSupportedKey() const {
     k.EnableInputDataType(Datatype::F32);
     k.EnableOutputDataType(Datatype::F16);
     k.EnableOutputDataType(Datatype::F32);
+    k.EnableOutputDataType(Datatype::UINT8);
+    k.EnableOutputDataType(Datatype::INT8);
     k.EnableInputWeightsType(WeightsType::F16);
     k.EnableInputWeightsType(WeightsType::F32);
     k.EnableInputLayout(DataLayout::bfyx);
@@ -67,6 +69,7 @@ ParamsKey ConvolutionKernel_bfyx_to_bfyx_f16::GetSupportedKey() const {
     k.EnableBatching();
     k.EnableSubGroup();
     k.EnableSubGroupShort();
+    k.EnableDifferentTypes();
    return k;
 }
 
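The two hunks above widen the kernel's ParamsKey: the fp kernel now advertises UINT8/INT8 output and, via EnableDifferentTypes(), accepts an output precision different from the input. A minimal sketch of how such a capability mask can gate kernel selection; the types and fields here are invented for illustration, not the clDNN API:

```cpp
#include <cstdint>
#include <iostream>

// Hypothetical capability mask in the spirit of kernel_selector's ParamsKey.
enum class Datatype : uint32_t { F16 = 1 << 0, F32 = 1 << 1, UINT8 = 1 << 2, INT8 = 1 << 3 };

struct ParamsKey {
    uint32_t inputs = 0, outputs = 0;
    bool different_types = false;
    void EnableInputDataType(Datatype t)  { inputs  |= static_cast<uint32_t>(t); }
    void EnableOutputDataType(Datatype t) { outputs |= static_cast<uint32_t>(t); }
    void EnableDifferentTypes()           { different_types = true; }
    // A kernel is a candidate only if both dtypes are enabled and, when they
    // differ, the kernel has opted in to mixed input/output precision.
    bool Supports(Datatype in, Datatype out) const {
        bool io_ok = (inputs & static_cast<uint32_t>(in)) && (outputs & static_cast<uint32_t>(out));
        return io_ok && (in == out || different_types);
    }
};

int main() {
    ParamsKey k;
    k.EnableInputDataType(Datatype::F16);
    k.EnableOutputDataType(Datatype::UINT8);
    k.EnableOutputDataType(Datatype::INT8);
    k.EnableDifferentTypes();
    std::cout << std::boolalpha << k.Supports(Datatype::F16, Datatype::INT8) << "\n"; // true
}
```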
||||
@@ -132,7 +135,7 @@ JitConstants ConvolutionKernel_bfyx_to_bfyx_f16::GetJitConstants(const convoluti
     auto blockWidth = dispatchData.cldnnStyle.blockWidth;
 
     if (!params.fused_ops.empty()) {
-        auto input_dt = GetUnitType(params);
+        auto input_dt = GetActivationType(params);
         FusedOpsConfiguration conf_vec = { "_VEC",
                                            {"b", "(f_block*16)", "y", "x"},
                                            "dst",
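This GetJitConstants change swaps GetUnitType for GetActivationType, so fused ops are generated against the activation precision rather than the storage type: with an int8 output, accumulation still happens in floating point and only the final store converts. A hedged illustration of that saturating store in plain C++ (the helper name is invented):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Hypothetical helper mirroring what an fp kernel with int8 output must do:
// keep the fp accumulator through activation/fused ops, then round and
// saturate to the int8 range on the final write.
static int8_t store_as_int8(float v) {
    return static_cast<int8_t>(std::max(-128.f, std::min(127.f, std::round(v))));
}

int main() {
    float acc = 131.7f;                        // fp accumulator after conv + post-ops
    std::printf("%d\n", store_as_int8(acc));   // prints 127 (saturated)
}
```

The remaining hunks below are from the OpenCL kernel source itself.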
@@ -1,4 +1,4 @@
-// Copyright (c) 2016-2019 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -13,13 +13,11 @@
 // limitations under the License.
 
 #include "include/include_all.cl"
-#include "include/unit_type.cl"
-#include "include/mmad.cl"
 
 #define FEATURE_SLICE_SIZE 16
 
 // OUTPUT_X_BLOCK_SIZE is one of 2, 4, 8
-#define UNIT_BLOCK_WRITEN(ptr, offset, val) CAT(UNIT_BLOCK_WRITE, OUTPUT_X_BLOCK_SIZE)(ptr, offset, val)
+#define DT_OUTPUT_BLOCK_WRITEN(ptr, offset, val) BLOCK_WRITEN(OUTPUT_TYPE, OUTPUT_X_BLOCK_SIZE, ptr, offset, val)
 
 __attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
 __attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE, 1)))
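The old UNIT_BLOCK_WRITEN macro pasted the block width onto UNIT_BLOCK_WRITE; the new DT_OUTPUT_BLOCK_WRITEN delegates to BLOCK_WRITEN, which additionally selects the writer by OUTPUT_TYPE, so the same call site can emit fp16, fp32, or int8 stores. A toy demonstration of the token-pasting dispatch (all names here are invented for the demo; the real BLOCK_WRITEN lives in clDNN's OpenCL headers):

```cpp
#include <cstdio>

#define CAT_(a, b) a##b
#define CAT(a, b) CAT_(a, b)   // extra level so macro arguments expand first

#define WRITE2(v) std::printf("write 2 elems: %d\n", v)
#define WRITE4(v) std::printf("write 4 elems: %d\n", v)
#define WRITE8(v) std::printf("write 8 elems: %d\n", v)

#define OUTPUT_X_BLOCK_SIZE 8
// One macro dispatches to WRITE2/WRITE4/WRITE8 depending on the block width.
#define BLOCK_WRITEN(val) CAT(WRITE, OUTPUT_X_BLOCK_SIZE)(val)

int main() { BLOCK_WRITEN(42); } // expands to WRITE8(42)
```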
@@ -43,9 +41,6 @@ KERNEL(convolution_bfyx_to_bfyx_f16)(
     const int x = (xy % X_BLOCKS) * OUTPUT_X_BLOCK_SIZE;
     const int y = (xy / X_BLOCKS);
 
-    typedef MAKE_VECTOR_TYPE(UNIT_TYPE, OUTPUT_X_BLOCK_SIZE) vec_t;
-    typedef MAKE_VECTOR_TYPE(UNIT_TYPE, 8) wei_t;
-
     const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X;
     const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y;
 
@@ -104,12 +99,12 @@ KERNEL(convolution_bfyx_to_bfyx_f16)(
     bias_offset += split_idx * BIAS_LENGTH;
 #   endif
 
-    vec_t dst = (vec_t)(UNIT_BLOCK_READ(biases, bias_offset));
+    MAKE_VECTOR_TYPE(INPUT0_TYPE, OUTPUT_X_BLOCK_SIZE) dst = (MAKE_VECTOR_TYPE(INPUT0_TYPE, OUTPUT_X_BLOCK_SIZE))(DT_INPUT_BLOCK_READ(biases, bias_offset));
 #else
-    vec_t dst = UNIT_VAL_ZERO;
+    MAKE_VECTOR_TYPE(INPUT0_TYPE, OUTPUT_X_BLOCK_SIZE) dst = INPUT0_VAL_ZERO;
 #endif
 
-    UNIT_TYPE line_cache[INPUT0_FEATURE_NUM * INPUT_BLOCK_SIZE];
+    INPUT0_TYPE line_cache[INPUT0_FEATURE_NUM * INPUT_BLOCK_SIZE];
     for (int ic = 0; ic < INPUT0_FEATURE_NUM; ic++)
     {
         __attribute__((opencl_unroll_hint(INPUT_BLOCK_SIZE)))
@@ -125,11 +120,10 @@ KERNEL(convolution_bfyx_to_bfyx_f16)(
                                    xb * input_x_pitch +
                                    yb * input_y_pitch];
             else
-                line_cache[ic * INPUT_BLOCK_SIZE + i] = UNIT_VAL_ZERO;
+                line_cache[ic * INPUT_BLOCK_SIZE + i] = INPUT0_VAL_ZERO;
         }
     }
-
 
     __attribute__((opencl_unroll_hint(FILTER_SIZE_Y)))
     for (int kh = 0; kh < FILTER_SIZE_Y; kh++)
     {
@@ -138,10 +132,10 @@ KERNEL(convolution_bfyx_to_bfyx_f16)(
         {
             uint offset = filter_offset + kh * filter_y_pitch + kw * filter_x_pitch;
 
-            UNIT_TYPE wei[INPUT0_FEATURE_NUM];
+            FILTER_TYPE wei[INPUT0_FEATURE_NUM];
             __attribute__((opencl_unroll_hint(INPUT0_FEATURE_NUM)))
             for (int ic = 0; ic < INPUT0_FEATURE_NUM; ic++)
-                wei[ic] = UNIT_BLOCK_READ(weights, offset + ic * filter_isv_pitch);
+                wei[ic] = DT_FILTER_BLOCK_READ(weights, offset + ic * filter_isv_pitch);
 
             __attribute__((opencl_unroll_hint(OUTPUT_X_BLOCK_SIZE)))
             for (int i = 0; i < OUTPUT_X_BLOCK_SIZE; i++)
@@ -149,7 +143,7 @@ KERNEL(convolution_bfyx_to_bfyx_f16)(
                 const uint buf_offset = (kw*DILATION_SIZE_X + STRIDE_SIZE_X * i + (kh) * INPUT_LINE_SIZE) / SUB_GROUP_SIZE;
                 const uint buf_group  = (kw*DILATION_SIZE_X + STRIDE_SIZE_X * i + (kh) * INPUT_LINE_SIZE) % SUB_GROUP_SIZE;
 
-                UNIT_TYPE src[INPUT0_FEATURE_NUM];
+                INPUT0_TYPE src[INPUT0_FEATURE_NUM];
                 __attribute__((opencl_unroll_hint(INPUT0_FEATURE_NUM)))
                 for (int ic = 0; ic < INPUT0_FEATURE_NUM; ic++) {
                     src[ic] = intel_sub_group_shuffle(line_cache[ic * INPUT_BLOCK_SIZE + buf_offset], buf_group);
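For reference, intel_sub_group_shuffle is what lets the line cache stay distributed across the sub-group: each work item stores a slice of the input line, and the shuffle broadcasts lane buf_group's element to every lane. A simplified CPU-side model of those semantics (real shuffles operate on per-lane private registers, modeled here as an array indexed by lane):

```cpp
#include <array>
#include <cstdio>

constexpr int SUB_GROUP_SIZE = 16;

// value[l] stands for what lane l holds in the shuffled register; shuffle
// returns the chosen lane's copy, identical for all callers in the sub-group.
float sub_group_shuffle(const std::array<float, SUB_GROUP_SIZE>& value, int lane) {
    return value[lane];
}

int main() {
    std::array<float, SUB_GROUP_SIZE> reg{};
    for (int l = 0; l < SUB_GROUP_SIZE; ++l) reg[l] = 100.f + l; // per-lane data
    std::printf("%.0f\n", sub_group_shuffle(reg, 5));            // 105: lane 5's element
}
```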
@@ -159,17 +153,20 @@ KERNEL(convolution_bfyx_to_bfyx_f16)(
             }
         }
 
-    dst = ACTIVATION(dst, ACTIVATION_PARAMS);
+    MAKE_VECTOR_TYPE(OUTPUT_TYPE, OUTPUT_X_BLOCK_SIZE) res;
+#ifndef HAS_FUSED_OPS
+    res = ACTIVATION(dst, ACTIVATION_PARAMS);
+#endif
 
 #if OUTPUT_LEFTOVERS
     if ((f_block+1)*FEATURE_SLICE_SIZE >= OUTPUT_FEATURE_NUM) {
         for (int i = 0; i < OUTPUT_X_BLOCK_SIZE; i++) {
 #if HAS_FUSED_OPS
             FUSED_OPS_SCALAR;
-            dst[i] = FUSED_OPS_RESULT_SCALAR;
+            res[i] = FUSED_OPS_RESULT_SCALAR;
 #endif
             if ((f_block*FEATURE_SLICE_SIZE + lid < OUTPUT_FEATURE_NUM) && (x + i) < OUTPUT_SIZE_X)
-                output[output_offset + i * output_x_pitch + lid] = dst[i];
+                output[output_offset + i * output_x_pitch + lid] = res[i];
         }
     }
     else
@@ -178,17 +175,17 @@ KERNEL(convolution_bfyx_to_bfyx_f16)(
     if (x + OUTPUT_X_BLOCK_SIZE <= OUTPUT_SIZE_X) {
 #if HAS_FUSED_OPS
         FUSED_OPS_VEC;
-        dst = FUSED_OPS_RESULT_VEC;
+        res = FUSED_OPS_RESULT_VEC;
 #endif
-        UNIT_BLOCK_WRITEN(output, output_offset, dst);
+        DT_OUTPUT_BLOCK_WRITEN(output, output_offset, res);
     } else {
-        const int x_tail = OUTPUT_SIZE_X - x;
+        const int x_tail = OUTPUT_SIZE_X % OUTPUT_X_BLOCK_SIZE;
         for (int i = 0; i < x_tail; i++) {
 #if HAS_FUSED_OPS
-            FUSED_OPS_SCALAR;
-            dst[i] = FUSED_OPS_RESULT_SCALAR;
+            FUSED_OPS_SCALAR;
+            res[i] = FUSED_OPS_RESULT_SCALAR;
 #endif
-            UNIT_BLOCK_WRITE(output, output_offset + i * output_x_pitch, dst[i]);
+            DT_OUTPUT_BLOCK_WRITE(output, output_offset + i * output_x_pitch, res[i]);
         }
     }
 }
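One behavioral nit in the tail path: x_tail changes from OUTPUT_SIZE_X - x to OUTPUT_SIZE_X % OUTPUT_X_BLOCK_SIZE. Since the else branch only runs for the last, partial X block, the two expressions are equal there, but the modulo form no longer depends on x. A quick check of that equivalence, assuming x walks block starts as in the kernel:

```cpp
#include <cassert>

int main() {
    const int OUTPUT_X_BLOCK_SIZE = 8;
    for (int OUTPUT_SIZE_X = 1; OUTPUT_SIZE_X < 100; ++OUTPUT_SIZE_X) {
        // Start of the last block; only this block can take the tail branch.
        int x = (OUTPUT_SIZE_X / OUTPUT_X_BLOCK_SIZE) * OUTPUT_X_BLOCK_SIZE;
        if (x + OUTPUT_X_BLOCK_SIZE > OUTPUT_SIZE_X && x < OUTPUT_SIZE_X)
            assert(OUTPUT_SIZE_X - x == OUTPUT_SIZE_X % OUTPUT_X_BLOCK_SIZE);
    }
}
```

The remaining hunks are from the fusing tests.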
@@ -472,6 +472,7 @@ public:
 #define CASE_CONV_FP32_11 {1, 32, 4, 5, 4}, {1, 16, 2, 3, 2}, {1, 1, 3, 3, 3}, tensor{1}, tensor{0}, tensor{1}, 2, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::os_is_zyx_isv16_osv16, data_types::f32, format::bfzyx
 #define CASE_CONV_FP32_12 {1, 16, 4, 5, 4}, {1, 16, 2, 3, 2}, {1, 1, 3, 3, 3}, tensor{1}, tensor{0}, tensor{1}, 2, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::os_is_zyx_isv16_osv16, data_types::f32, format::bfzyx
 #define CASE_CONV_FP32_13 {1, 16, 18, 5, 4}, {1, 16, 16, 3, 2}, {1, 1, 3, 3, 3}, tensor{1}, tensor{0}, tensor{1}, 2, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::os_is_zyx_isv16_osv16, data_types::f32, format::bfzyx
+#define CASE_CONV_FP32_14 {1, 3, 4, 5}, {1, 30, 2, 3}, {1, 1, 3, 3}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::f32, format::bfyx, data_types::f32, format::bfyx, data_types::f32, format::bfyx
 
 #define CASE_CONV_FP16_1 {1, 15, 4, 5}, {1, 30, 2, 3}, {1, 1, 3, 3}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::f16, format::bfyx, data_types::f16, format::bfyx, data_types::f16, format::bfyx
 #define CASE_CONV_FP16_2 {1, 16, 4, 5}, {1, 32, 2, 3}, {1, 1, 3, 3}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::f16, format::b_fs_yx_fsv16, data_types::f16, format::os_is_yx_isv16_osv16, data_types::f16, format::bfyx
@@ -853,6 +854,31 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_fp32_eltwise_b_fs_zyx_fsv16,
                         bc_test_params{CASE_CONV_FP16_12, 2, 3},
                         }), );
 
+class conv_fp32_quantize_u8_first_conv : public ConvFusingTest {};
+TEST_P(conv_fp32_quantize_u8_first_conv, basic) {
+    auto p = GetParam();
+    create_topologies(input_layout("input", get_input_layout(p)),
+                      data("weights", get_mem(get_weights_layout(p))),
+                      data("bias", get_mem(get_bias_layout(p))),
+                      data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
+                      data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)),
+                      data("out_lo", get_mem(get_single_element_layout(p), 0)),
+                      data("out_hi", get_mem(get_single_element_layout(p), 255)),
+                      reorder("reordered_input", "input", format::b_fs_yx_fsv16, p.data_type),
+                      convolution("conv_prim", "reordered_input", {"weights"}, {"bias"}, p.groups, p.stride, p.pad, p.dilation),
+                      quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::u8),
+                      reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32)
+    );
+
+    tolerance = 1.0f;
+    execute(p);
+}
+
+INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_fp32_quantize_u8_first_conv,
+                        ::testing::ValuesIn(std::vector<bc_test_params>{
+                        bc_test_params{CASE_CONV_FP32_14, 2, 3},
+                        }), );
+
 class conv_fp32_quantize_u8 : public ConvFusingTest {};
 TEST_P(conv_fp32_quantize_u8, basic) {
     auto p = GetParam();
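The new conv_fp32_quantize_u8_first_conv test feeds a first convolution (3-channel bfyx input, CASE_CONV_FP32_14) into a quantize primitive with 256 levels and a [0, 255] output range, exercising the kernel's new u8/s8 output path. Roughly, the fused quantize stage follows standard FakeQuantize math; a sketch under that assumption, not lifted from the clDNN sources:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>

// clamp input to [in_lo, in_hi], snap to one of `levels` steps,
// then rescale into [out_lo, out_hi].
float fake_quantize(float x, float in_lo, float in_hi,
                    float out_lo, float out_hi, int levels) {
    x = std::clamp(x, in_lo, in_hi);
    float q = std::round((x - in_lo) / (in_hi - in_lo) * (levels - 1));
    return q / (levels - 1) * (out_hi - out_lo) + out_lo;
}

int main() {
    // in_lo/in_hi are random per channel in the test; [-1, 1] stands in here.
    std::printf("%.1f\n", fake_quantize(0.3f, -1.f, 1.f, 0.f, 255.f, 256)); // 166.0
}
```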
|
Loading…
Reference in New Issue
Block a user