From d29648164aecaf3f6f6090854ba442565bc6203b Mon Sep 17 00:00:00 2001 From: Jade Cho Date: Wed, 6 Oct 2021 12:38:54 +0900 Subject: [PATCH] [GPU] Merge activation_ref kernel update. (#7767) + Support b_fs_yx_fsv32, bs_fs_yx_bsv32_fsv32 and bs_fs_yx_bsv32_fsv16 format. --- .../core/cl_kernels/activation_ref.cl | 74 ++++---- .../test_cases/activation_simple_gpu_test.cpp | 178 ++++++++++++++++++ 2 files changed, 220 insertions(+), 32 deletions(-) diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/activation_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/activation_ref.cl index 0aacaaa07e1..a490602414e 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/activation_ref.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/activation_ref.cl @@ -33,43 +33,53 @@ KERNEL(activation)( const unsigned x = get_global_id(0); const uint y = (uint)get_global_id(1) % OUTPUT_SIZE_Y; const uint z = (uint)get_global_id(1) / OUTPUT_SIZE_Y; -#if OUTPUT_BATCH_NUM == 1 - const unsigned feature = (uint)get_global_id(2); - const unsigned batch = 0; + #if OUTPUT_BATCH_NUM == 1 + const unsigned feature = (uint)get_global_id(2); + const unsigned batch = 0; + #else + const unsigned feature = (uint)get_global_id(2) % OUTPUT_FEATURE_NUM; + const unsigned batch = (uint)get_global_id(2) / OUTPUT_FEATURE_NUM; + #endif #else - const unsigned feature = (uint)get_global_id(2) % OUTPUT_FEATURE_NUM; - const unsigned batch = (uint)get_global_id(2) / OUTPUT_FEATURE_NUM; -#endif -#else -#if defined OUTPUT_LAYOUT_YXFB || defined OUTPUT_LAYOUT_B_FS_YX_FSV16 - const unsigned x = (uint)get_global_id(1); - const unsigned y = (uint)get_global_id(2); -#define z 0 -#if OUTPUT_BATCH_NUM == 1 - const unsigned feature = (uint)get_global_id(0); - const unsigned batch = 0; -#else - const unsigned feature = (uint)get_global_id(0) % OUTPUT_FEATURE_NUM; - const unsigned batch = (uint)get_global_id(0) / OUTPUT_FEATURE_NUM; -#endif -#else -#define z 0 - const unsigned x = (uint)get_global_id(0); - const unsigned y = (uint)get_global_id(1); -#if OUTPUT_BATCH_NUM == 1 - const unsigned feature = (uint)get_global_id(2); - const unsigned batch = 0; -#else - const unsigned feature = (uint)get_global_id(2) % OUTPUT_FEATURE_NUM; - const unsigned batch = (uint)get_global_id(2) / OUTPUT_FEATURE_NUM; -#endif -#endif + #if defined OUTPUT_LAYOUT_YXFB || defined OUTPUT_LAYOUT_B_FS_YX_FSV16 || defined OUTPUT_LAYOUT_B_FS_YX_FSV32 + const unsigned x = (uint)get_global_id(1); + const unsigned y = (uint)get_global_id(2); + #define z 0 + #if OUTPUT_BATCH_NUM == 1 + const unsigned feature = (uint)get_global_id(0); + const unsigned batch = 0; + #else + const unsigned feature = (uint)get_global_id(0) % OUTPUT_FEATURE_NUM; + const unsigned batch = (uint)get_global_id(0) / OUTPUT_FEATURE_NUM; + #endif + #elif defined OUTPUT_LAYOUT_BS_FS_YX_BSV32_FSV32 || defined OUTPUT_LAYOUT_BS_FS_YX_BSV32_FSV16 + const unsigned x = (uint)get_global_id(0) % OUTPUT_SIZE_X; + const unsigned y = (uint)get_global_id(0) / OUTPUT_SIZE_X; + const unsigned feature = (uint)get_global_id(1); + const unsigned batch = (uint)get_global_id(2); + #else + #define z 0 + const unsigned x = (uint)get_global_id(0); + const unsigned y = (uint)get_global_id(1); + #if OUTPUT_BATCH_NUM == 1 + const unsigned feature = (uint)get_global_id(2); + const unsigned batch = 0; + #else + const unsigned feature = (uint)get_global_id(2) % OUTPUT_FEATURE_NUM; + const unsigned batch = (uint)get_global_id(2) / OUTPUT_FEATURE_NUM; + #endif + #endif #endif -#if defined(OUTPUT_LAYOUT_B_FS_YX_FSV16) && OUTPUT_FEATURE_NUM % 16 != 0 - // b_fs_yx_fsv16 has dispatch features aligned to multiple of 16 +// GWS.feature and GWS.batch is aligned to 16. Otherwise, there are some idling WIs. +#if (defined(OUTPUT_LAYOUT_B_FS_YX_FSV16) || defined(OUTPUT_LAYOUT_B_FS_YX_FSV32)) \ + && OUTPUT_FEATURE_NUM % 16 != 0 if (feature >= OUTPUT_FEATURE_NUM) return; +#elif (defined(OUTPUT_LAYOUT_BS_FS_YX_BSV32_FSV16) || defined(OUTPUT_LAYOUT_BS_FS_YX_BSV32_FSV32)) \ + && (OUTPUT_FEATURE_NUM % 16 != 0 || OUTPUT_BATCH_NUM % 16 != 0) + if (batch >= OUTPUT_BATCH_NUM || feature >= OUTPUT_FEATURE_NUM) + return; #endif const unsigned src_index = GET_INDEX(INPUT,0,ORDER); diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/activation_simple_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/activation_simple_gpu_test.cpp index 5f54dabf7f4..509108f437f 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/activation_simple_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/activation_simple_gpu_test.cpp @@ -1459,3 +1459,181 @@ TEST(activation_f32_fw_gpu, b_fs_yx_fsv16_prelu) { EXPECT_EQ(expected[i], out_ptr[i]) << "at i=" << i; } } + +struct activation_random_test_params { + data_types input_type; + format::type input_format; + tensor input_size; + activation_func func_type; + activation_additional_params additional_params; + padding padd; +}; + +struct activation_random_test : testing::TestWithParam +{ + bool enable_profiling = false; + + size_t get_x_pitch(layout& layout) { + auto tensor_x0 = tensor(batch(0), feature(0), spatial(0, 0, 0, 0)); + auto tensor_x1 = tensor(batch(0), feature(0), spatial(1, 0, 0, 0)); + auto x0 = layout.get_linear_offset(tensor_x0); + auto x1 = layout.get_linear_offset(tensor_x1); + return (x1 - x0); + } + + template + void fill_random_typed(memory::ptr mem, int min, int max, int k) { + auto size = mem->get_layout().size; + size_t b = size.batch[0]; + size_t f = size.feature[0]; + size_t x = size.spatial[0]; + size_t y = size.spatial[1]; + + auto data = generate_random_4d(b, f, y, x, min, max, k); + mem_lock ptr{mem, get_test_stream()}; + for (size_t bi = 0; bi < b; ++bi) { + for (size_t fi = 0; fi < f; ++fi) { + for (size_t yi = 0; yi < y; ++yi) { + for (size_t xi = 0; xi < x; ++xi) { + auto coords = tensor(batch(bi), feature(fi), spatial(xi, yi, 0, 0)); + auto offset = mem->get_layout().get_linear_offset(coords); + ptr[offset] = data[bi][fi][yi][xi]; + } + } + } + } + } + + void fill_random(memory::ptr mem) { + auto dt = mem->get_layout().data_type; + switch (dt) { + case data_types::f32: + fill_random_typed(mem, -127, 127, 2); + break; + case data_types::f16: + fill_random_typed(mem, -127, 127, 2); + break; + case data_types::i8: + fill_random_typed(mem, -127, 127, 1); + break; + case data_types::u8: + fill_random_typed(mem, 0, 255, 1); + break; + default: + break; + } + } + + template + bool compare_outputs(const memory::ptr out_ref, const memory::ptr out_opt) { + auto output_lay = out_ref->get_layout(); + auto opt_output_lay = out_opt->get_layout(); + size_t b = output_lay.size.batch[0]; + size_t f = output_lay.size.feature[0]; + size_t x = output_lay.size.spatial[0]; + size_t y = output_lay.size.spatial[1]; + cldnn::mem_lock ref_ptr(out_ref, get_test_stream()); + cldnn::mem_lock opt_ptr(out_opt, get_test_stream()); + + auto ref_x_pitch = get_x_pitch(output_lay); + auto opt_x_pitch = get_x_pitch(opt_output_lay); + + for (size_t bi = 0; bi < b; ++bi) { + for (size_t fi = 0; fi < f; ++fi) { + for (size_t yi = 0; yi < y; ++yi) { + auto ref_out_coords = tensor(batch(bi), feature(fi), spatial(0, yi, 0, 0)); + auto ref_out_offset = output_lay.get_linear_offset(ref_out_coords); + auto opt_out_offset = opt_output_lay.get_linear_offset(ref_out_coords); + for (size_t xi = 0; xi < x; ++xi) { + auto ref_out_val = ref_ptr[ref_out_offset + xi * ref_x_pitch]; + auto opt_out_val = opt_ptr[opt_out_offset + xi * opt_x_pitch]; + EXPECT_EQ(ref_out_val, opt_out_val); + } + } + } + } + + return true; + } + + void execute_compare(const activation_random_test_params& params, bool check_result) { + auto& engine = get_test_engine(); + + auto in_layout = layout(params.input_type, format::bfyx, params.input_size); + auto in_mem = engine.allocate_memory(in_layout); + fill_random(in_mem); + + /// bfyx + cldnn::topology topo; + topo.add(input_layout("in", in_layout)); + auto prim = activation("activation", "in", params.func_type); + prim.additional_params = params.additional_params; + topo.add(prim); + + auto build_opts = build_options(); + build_opts.set_option(build_option::outputs({"activation"})); + + network net(engine, topo, build_opts); + net.set_input_data("in", in_mem); + + // first execution of ref + auto result = net.execute(); + auto output = result.at("activation").get_memory(); + + cldnn::topology topo_opt; + topo_opt.add(input_layout("in", in_layout)); + topo_opt.add(reorder("in_to_input_type", "in", params.input_format, params.input_type)); + auto prim_opt = activation("activation_blocked", "in_to_input_type", params.func_type); + prim_opt.additional_params = params.additional_params; + topo_opt.add(prim_opt); + // force output format to input format. + topo_opt.add(reorder("res_to_input_format", "activation_blocked", params.input_format, params.input_type)); + + auto build_opts_opt = build_options(); + build_opts_opt.set_option(build_option::outputs({"activation_blocked", "res_to_input_format"})); + auto activation_impl_desc = implementation_desc(); + activation_impl_desc.output_format = params.input_format; + build_opts_opt.set_option(build_option::force_implementations({{"activation_blocked", {params.input_format, "activation_ref"} }})); + + network net_opt(engine, topo_opt, build_opts_opt); + + // Use in_mem from ref network + net_opt.set_input_data("in", in_mem); + + // first execution of opt + auto result_opt = net_opt.execute(); + auto output_opt = result_opt.at("res_to_input_format").get_memory(); + + if (check_result == true) { + // Check data_types + if (params.input_type == data_types::f32) { + compare_outputs(output, output_opt); + } else if (params.input_type == data_types::f16) { + compare_outputs(output, output_opt); + } else if (params.input_type == data_types::i8) { + compare_outputs(output, output_opt); + } else if (params.input_type == data_types::u8) { + compare_outputs(output, output_opt); + } else { + FAIL() << "Not supported data type: " << static_cast(params.input_type); + } + } + } +}; + +TEST_P(activation_random_test, random) { + auto param = GetParam(); + execute_compare(param, true); +} + +INSTANTIATE_TEST_SUITE_P(activation_blocked_tests, + activation_random_test, + testing::ValuesIn( + std::vector{ + { data_types::i8, format::b_fs_yx_fsv32, { 1, 32, 5, 5}, activation_func::relu, {}, {}}, + { data_types::i8, format::bs_fs_yx_bsv32_fsv32, {32, 32, 5, 5}, activation_func::relu, {}, {}}, + { data_types::f16, format::bs_fs_yx_bsv32_fsv16, {32, 32, 5, 5}, activation_func::relu, {}, {}}, + { data_types::i8, format::bs_fs_yx_bsv32_fsv32, {16, 16, 5, 5}, activation_func::relu, {}, {}}, + { data_types::f16, format::bs_fs_yx_bsv32_fsv16, {16, 16, 5, 5}, activation_func::relu, {}, {}}, + } + ));