[GPU] Merge activation_ref kernel update. (#7767)

+ Support the b_fs_yx_fsv32, bs_fs_yx_bsv32_fsv32 and bs_fs_yx_bsv32_fsv16 formats.
This commit is contained in:
Jade Cho
2021-10-06 12:38:54 +09:00
committed by GitHub
parent dffe4a4251
commit d29648164a
2 changed files with 220 additions and 32 deletions

View File

@@ -33,43 +33,53 @@ KERNEL(activation)(
const unsigned x = get_global_id(0);
const uint y = (uint)get_global_id(1) % OUTPUT_SIZE_Y;
const uint z = (uint)get_global_id(1) / OUTPUT_SIZE_Y;
#if OUTPUT_BATCH_NUM == 1
const unsigned feature = (uint)get_global_id(2);
const unsigned batch = 0;
#if OUTPUT_BATCH_NUM == 1
const unsigned feature = (uint)get_global_id(2);
const unsigned batch = 0;
#else
const unsigned feature = (uint)get_global_id(2) % OUTPUT_FEATURE_NUM;
const unsigned batch = (uint)get_global_id(2) / OUTPUT_FEATURE_NUM;
#endif
#else
const unsigned feature = (uint)get_global_id(2) % OUTPUT_FEATURE_NUM;
const unsigned batch = (uint)get_global_id(2) / OUTPUT_FEATURE_NUM;
#endif
#else
#if defined OUTPUT_LAYOUT_YXFB || defined OUTPUT_LAYOUT_B_FS_YX_FSV16
const unsigned x = (uint)get_global_id(1);
const unsigned y = (uint)get_global_id(2);
#define z 0
#if OUTPUT_BATCH_NUM == 1
const unsigned feature = (uint)get_global_id(0);
const unsigned batch = 0;
#else
const unsigned feature = (uint)get_global_id(0) % OUTPUT_FEATURE_NUM;
const unsigned batch = (uint)get_global_id(0) / OUTPUT_FEATURE_NUM;
#endif
#else
#define z 0
const unsigned x = (uint)get_global_id(0);
const unsigned y = (uint)get_global_id(1);
#if OUTPUT_BATCH_NUM == 1
const unsigned feature = (uint)get_global_id(2);
const unsigned batch = 0;
#else
const unsigned feature = (uint)get_global_id(2) % OUTPUT_FEATURE_NUM;
const unsigned batch = (uint)get_global_id(2) / OUTPUT_FEATURE_NUM;
#endif
#endif
#if defined OUTPUT_LAYOUT_YXFB || defined OUTPUT_LAYOUT_B_FS_YX_FSV16 || defined OUTPUT_LAYOUT_B_FS_YX_FSV32
const unsigned x = (uint)get_global_id(1);
const unsigned y = (uint)get_global_id(2);
#define z 0
#if OUTPUT_BATCH_NUM == 1
const unsigned feature = (uint)get_global_id(0);
const unsigned batch = 0;
#else
const unsigned feature = (uint)get_global_id(0) % OUTPUT_FEATURE_NUM;
const unsigned batch = (uint)get_global_id(0) / OUTPUT_FEATURE_NUM;
#endif
#elif defined OUTPUT_LAYOUT_BS_FS_YX_BSV32_FSV32 || defined OUTPUT_LAYOUT_BS_FS_YX_BSV32_FSV16
const unsigned x = (uint)get_global_id(0) % OUTPUT_SIZE_X;
const unsigned y = (uint)get_global_id(0) / OUTPUT_SIZE_X;
const unsigned feature = (uint)get_global_id(1);
const unsigned batch = (uint)get_global_id(2);
#else
#define z 0
const unsigned x = (uint)get_global_id(0);
const unsigned y = (uint)get_global_id(1);
#if OUTPUT_BATCH_NUM == 1
const unsigned feature = (uint)get_global_id(2);
const unsigned batch = 0;
#else
const unsigned feature = (uint)get_global_id(2) % OUTPUT_FEATURE_NUM;
const unsigned batch = (uint)get_global_id(2) / OUTPUT_FEATURE_NUM;
#endif
#endif
#endif
#if defined(OUTPUT_LAYOUT_B_FS_YX_FSV16) && OUTPUT_FEATURE_NUM % 16 != 0
// b_fs_yx_fsv16 has dispatch features aligned to multiple of 16
// GWS.feature and GWS.batch is aligned to 16. Otherwise, there are some idling WIs.
#if (defined(OUTPUT_LAYOUT_B_FS_YX_FSV16) || defined(OUTPUT_LAYOUT_B_FS_YX_FSV32)) \
&& OUTPUT_FEATURE_NUM % 16 != 0
if (feature >= OUTPUT_FEATURE_NUM)
return;
#elif (defined(OUTPUT_LAYOUT_BS_FS_YX_BSV32_FSV16) || defined(OUTPUT_LAYOUT_BS_FS_YX_BSV32_FSV32)) \
&& (OUTPUT_FEATURE_NUM % 16 != 0 || OUTPUT_BATCH_NUM % 16 != 0)
if (batch >= OUTPUT_BATCH_NUM || feature >= OUTPUT_FEATURE_NUM)
return;
#endif
const unsigned src_index = GET_INDEX(INPUT,0,ORDER);

View File

@@ -1459,3 +1459,181 @@ TEST(activation_f32_fw_gpu, b_fs_yx_fsv16_prelu) {
EXPECT_EQ(expected[i], out_ptr[i]) << "at i=" << i;
}
}
// Parameter pack for activation_random_test.
// NOTE: field order matters - instances below are aggregate-initialized
// positionally (see INSTANTIATE_TEST_SUITE_P); do not reorder.
struct activation_random_test_params {
data_types input_type;                          // element type of input/output (f32/f16/i8/u8)
format::type input_format;                      // blocked layout under test (reference always runs in bfyx)
tensor input_size;                              // input dimensions; assumes (batch, feature, x, y) ctor order - TODO confirm
activation_func func_type;                      // activation function to apply
activation_additional_params additional_params; // extra args for funcs that need them (e.g. slope)
padding padd;                                   // output padding; unused by current instantiations
};
// Randomized cross-check of the activation primitive: runs the same input
// through a plain-bfyx reference network and through a network in the blocked
// format under test (with the activation_ref kernel forced), then compares
// outputs element-wise.
struct activation_random_test : testing::TestWithParam<activation_random_test_params>
{
    bool enable_profiling = false;

    // Element distance between two x-adjacent entries of `layout`.
    // Lets compare_outputs() resolve one linear offset per row and then step
    // along x, instead of computing a full offset for every element.
    size_t get_x_pitch(layout& layout) {
        auto tensor_x0 = tensor(batch(0), feature(0), spatial(0, 0, 0, 0));
        auto tensor_x1 = tensor(batch(0), feature(0), spatial(1, 0, 0, 0));
        auto x0 = layout.get_linear_offset(tensor_x0);
        auto x1 = layout.get_linear_offset(tensor_x1);
        return (x1 - x0);
    }

    // Fills `mem` with deterministic pseudo-random values in [min, max]
    // (`k` is forwarded to generate_random_4d and controls value granularity).
    template <typename T>
    void fill_random_typed(memory::ptr mem, int min, int max, int k) {
        auto mem_layout = mem->get_layout();  // hoisted: was re-fetched per element in the loop below
        auto size = mem_layout.size;
        size_t b = size.batch[0];
        size_t f = size.feature[0];
        size_t x = size.spatial[0];
        size_t y = size.spatial[1];

        auto data = generate_random_4d<T>(b, f, y, x, min, max, k);
        mem_lock<T> ptr{mem, get_test_stream()};
        for (size_t bi = 0; bi < b; ++bi) {
            for (size_t fi = 0; fi < f; ++fi) {
                for (size_t yi = 0; yi < y; ++yi) {
                    for (size_t xi = 0; xi < x; ++xi) {
                        auto coords = tensor(batch(bi), feature(fi), spatial(xi, yi, 0, 0));
                        auto offset = mem_layout.get_linear_offset(coords);
                        ptr[offset] = data[bi][fi][yi][xi];
                    }
                }
            }
        }
    }

    // Type-dispatching front-end for fill_random_typed. Data types outside the
    // four handled here are deliberately left untouched (the parameter list
    // only uses these types).
    void fill_random(memory::ptr mem) {
        auto dt = mem->get_layout().data_type;
        switch (dt) {
        case data_types::f32:
            fill_random_typed<float>(mem, -127, 127, 2);
            break;
        case data_types::f16:
            fill_random_typed<FLOAT16>(mem, -127, 127, 2);
            break;
        case data_types::i8:
            fill_random_typed<int8_t>(mem, -127, 127, 1);
            break;
        case data_types::u8:
            fill_random_typed<uint8_t>(mem, 0, 255, 1);
            break;
        default:
            break;
        }
    }

    // Element-wise comparison of the reference output against the optimized
    // one. The two memories may use different layouts; each element is
    // addressed through its own layout's linear offset. Mismatches are
    // reported with their logical (b, f, y, x) coordinates.
    template <typename T>
    bool compare_outputs(const memory::ptr& out_ref, const memory::ptr& out_opt) {
        // by const& - copying a memory::ptr would only churn the refcount
        auto output_lay = out_ref->get_layout();
        auto opt_output_lay = out_opt->get_layout();
        size_t b = output_lay.size.batch[0];
        size_t f = output_lay.size.feature[0];
        size_t x = output_lay.size.spatial[0];
        size_t y = output_lay.size.spatial[1];
        cldnn::mem_lock<T> ref_ptr(out_ref, get_test_stream());
        cldnn::mem_lock<T> opt_ptr(out_opt, get_test_stream());
        auto ref_x_pitch = get_x_pitch(output_lay);
        auto opt_x_pitch = get_x_pitch(opt_output_lay);
        for (size_t bi = 0; bi < b; ++bi) {
            for (size_t fi = 0; fi < f; ++fi) {
                for (size_t yi = 0; yi < y; ++yi) {
                    auto ref_out_coords = tensor(batch(bi), feature(fi), spatial(0, yi, 0, 0));
                    auto ref_out_offset = output_lay.get_linear_offset(ref_out_coords);
                    auto opt_out_offset = opt_output_lay.get_linear_offset(ref_out_coords);
                    for (size_t xi = 0; xi < x; ++xi) {
                        auto ref_out_val = ref_ptr[ref_out_offset + xi * ref_x_pitch];
                        auto opt_out_val = opt_ptr[opt_out_offset + xi * opt_x_pitch];
                        // report coordinates so a failure in a 4-D sweep is diagnosable
                        EXPECT_EQ(ref_out_val, opt_out_val)
                            << "at b=" << bi << " f=" << fi << " y=" << yi << " x=" << xi;
                    }
                }
            }
        }
        return true;
    }

    // Builds and runs both networks on the same random input; compares outputs
    // when check_result is set.
    void execute_compare(const activation_random_test_params& params, bool check_result) {
        auto& engine = get_test_engine();

        auto in_layout = layout(params.input_type, format::bfyx, params.input_size);
        auto in_mem = engine.allocate_memory(in_layout);
        fill_random(in_mem);

        /// bfyx reference network (default kernel selection)
        cldnn::topology topo;
        topo.add(input_layout("in", in_layout));
        auto prim = activation("activation", "in", params.func_type);
        prim.additional_params = params.additional_params;
        topo.add(prim);

        auto build_opts = build_options();
        build_opts.set_option(build_option::outputs({"activation"}));

        network net(engine, topo, build_opts);
        net.set_input_data("in", in_mem);

        // first execution of ref
        auto result = net.execute();
        auto output = result.at("activation").get_memory();

        // blocked-format network: reorder into the format under test, run the
        // forced activation_ref kernel, then reorder back for comparison.
        cldnn::topology topo_opt;
        topo_opt.add(input_layout("in", in_layout));
        topo_opt.add(reorder("in_to_input_type", "in", params.input_format, params.input_type));
        auto prim_opt = activation("activation_blocked", "in_to_input_type", params.func_type);
        prim_opt.additional_params = params.additional_params;
        topo_opt.add(prim_opt);
        // force output format to input format.
        topo_opt.add(reorder("res_to_input_format", "activation_blocked", params.input_format, params.input_type));

        auto build_opts_opt = build_options();
        build_opts_opt.set_option(build_option::outputs({"activation_blocked", "res_to_input_format"}));
        // pin the implementation so the test exercises activation_ref on the
        // blocked layout rather than whatever the heuristic would pick
        build_opts_opt.set_option(build_option::force_implementations({{"activation_blocked", {params.input_format, "activation_ref"}}}));

        network net_opt(engine, topo_opt, build_opts_opt);

        // Use in_mem from ref network
        net_opt.set_input_data("in", in_mem);

        // first execution of opt
        auto result_opt = net_opt.execute();
        auto output_opt = result_opt.at("res_to_input_format").get_memory();

        if (check_result) {
            // Check data_types
            if (params.input_type == data_types::f32) {
                compare_outputs<float>(output, output_opt);
            } else if (params.input_type == data_types::f16) {
                compare_outputs<FLOAT16>(output, output_opt);
            } else if (params.input_type == data_types::i8) {
                compare_outputs<int8_t>(output, output_opt);
            } else if (params.input_type == data_types::u8) {
                compare_outputs<uint8_t>(output, output_opt);
            } else {
                FAIL() << "Not supported data type: " << static_cast<size_t>(params.input_type);
            }
        }
    }
};
// Runs the reference-vs-blocked comparison for each instantiated parameter set.
TEST_P(activation_random_test, random) {
    execute_compare(GetParam(), true);
}
// Covers the blocked layouts newly supported by the activation_ref kernel:
// b_fs_yx_fsv32, bs_fs_yx_bsv32_fsv32 and bs_fs_yx_bsv32_fsv16.
// The 32x32 cases match the block sizes exactly; the 16x16 cases leave
// batch/feature not a multiple of 32, exercising the kernel's bounds guard
// for the idle work items.
INSTANTIATE_TEST_SUITE_P(activation_blocked_tests,
activation_random_test,
testing::ValuesIn(
std::vector<activation_random_test_params>{
// input_type, input_format, input_size, func_type, additional_params, padding
{ data_types::i8, format::b_fs_yx_fsv32, { 1, 32, 5, 5}, activation_func::relu, {}, {}},
{ data_types::i8, format::bs_fs_yx_bsv32_fsv32, {32, 32, 5, 5}, activation_func::relu, {}, {}},
{ data_types::f16, format::bs_fs_yx_bsv32_fsv16, {32, 32, 5, 5}, activation_func::relu, {}, {}},
{ data_types::i8, format::bs_fs_yx_bsv32_fsv32, {16, 16, 5, 5}, activation_func::relu, {}, {}},
{ data_types::f16, format::bs_fs_yx_bsv32_fsv16, {16, 16, 5, 5}, activation_func::relu, {}, {}},
}
));