[GPU] Merge activation_ref kernel update. (#7767)

+ Support the b_fs_yx_fsv32, bs_fs_yx_bsv32_fsv32 and bs_fs_yx_bsv32_fsv16 formats.
This commit is contained in:
Jade Cho
2021-10-06 12:38:54 +09:00
committed by GitHub
parent dffe4a4251
commit d29648164a
2 changed files with 220 additions and 32 deletions

View File

@@ -33,43 +33,53 @@ KERNEL(activation)(
const unsigned x = get_global_id(0);
const uint y = (uint)get_global_id(1) % OUTPUT_SIZE_Y;
const uint z = (uint)get_global_id(1) / OUTPUT_SIZE_Y;
#if OUTPUT_BATCH_NUM == 1
const unsigned feature = (uint)get_global_id(2);
const unsigned batch = 0;
#if OUTPUT_BATCH_NUM == 1
const unsigned feature = (uint)get_global_id(2);
const unsigned batch = 0;
#else
const unsigned feature = (uint)get_global_id(2) % OUTPUT_FEATURE_NUM;
const unsigned batch = (uint)get_global_id(2) / OUTPUT_FEATURE_NUM;
#endif
#else
const unsigned feature = (uint)get_global_id(2) % OUTPUT_FEATURE_NUM;
const unsigned batch = (uint)get_global_id(2) / OUTPUT_FEATURE_NUM;
#endif
#else
#if defined OUTPUT_LAYOUT_YXFB || defined OUTPUT_LAYOUT_B_FS_YX_FSV16
const unsigned x = (uint)get_global_id(1);
const unsigned y = (uint)get_global_id(2);
#define z 0
#if OUTPUT_BATCH_NUM == 1
const unsigned feature = (uint)get_global_id(0);
const unsigned batch = 0;
#else
const unsigned feature = (uint)get_global_id(0) % OUTPUT_FEATURE_NUM;
const unsigned batch = (uint)get_global_id(0) / OUTPUT_FEATURE_NUM;
#endif
#else
#define z 0
const unsigned x = (uint)get_global_id(0);
const unsigned y = (uint)get_global_id(1);
#if OUTPUT_BATCH_NUM == 1
const unsigned feature = (uint)get_global_id(2);
const unsigned batch = 0;
#else
const unsigned feature = (uint)get_global_id(2) % OUTPUT_FEATURE_NUM;
const unsigned batch = (uint)get_global_id(2) / OUTPUT_FEATURE_NUM;
#endif
#endif
#if defined OUTPUT_LAYOUT_YXFB || defined OUTPUT_LAYOUT_B_FS_YX_FSV16 || defined OUTPUT_LAYOUT_B_FS_YX_FSV32
const unsigned x = (uint)get_global_id(1);
const unsigned y = (uint)get_global_id(2);
#define z 0
#if OUTPUT_BATCH_NUM == 1
const unsigned feature = (uint)get_global_id(0);
const unsigned batch = 0;
#else
const unsigned feature = (uint)get_global_id(0) % OUTPUT_FEATURE_NUM;
const unsigned batch = (uint)get_global_id(0) / OUTPUT_FEATURE_NUM;
#endif
#elif defined OUTPUT_LAYOUT_BS_FS_YX_BSV32_FSV32 || defined OUTPUT_LAYOUT_BS_FS_YX_BSV32_FSV16
const unsigned x = (uint)get_global_id(0) % OUTPUT_SIZE_X;
const unsigned y = (uint)get_global_id(0) / OUTPUT_SIZE_X;
const unsigned feature = (uint)get_global_id(1);
const unsigned batch = (uint)get_global_id(2);
#else
#define z 0
const unsigned x = (uint)get_global_id(0);
const unsigned y = (uint)get_global_id(1);
#if OUTPUT_BATCH_NUM == 1
const unsigned feature = (uint)get_global_id(2);
const unsigned batch = 0;
#else
const unsigned feature = (uint)get_global_id(2) % OUTPUT_FEATURE_NUM;
const unsigned batch = (uint)get_global_id(2) / OUTPUT_FEATURE_NUM;
#endif
#endif
#endif
#if defined(OUTPUT_LAYOUT_B_FS_YX_FSV16) && OUTPUT_FEATURE_NUM % 16 != 0
// b_fs_yx_fsv16 has dispatch features aligned to multiple of 16
// GWS.feature and GWS.batch is aligned to 16. Otherwise, there are some idling WIs.
#if (defined(OUTPUT_LAYOUT_B_FS_YX_FSV16) || defined(OUTPUT_LAYOUT_B_FS_YX_FSV32)) \
&& OUTPUT_FEATURE_NUM % 16 != 0
if (feature >= OUTPUT_FEATURE_NUM)
return;
#elif (defined(OUTPUT_LAYOUT_BS_FS_YX_BSV32_FSV16) || defined(OUTPUT_LAYOUT_BS_FS_YX_BSV32_FSV32)) \
&& (OUTPUT_FEATURE_NUM % 16 != 0 || OUTPUT_BATCH_NUM % 16 != 0)
if (batch >= OUTPUT_BATCH_NUM || feature >= OUTPUT_FEATURE_NUM)
return;
#endif
const unsigned src_index = GET_INDEX(INPUT,0,ORDER);

View File

@@ -1459,3 +1459,181 @@ TEST(activation_f32_fw_gpu, b_fs_yx_fsv16_prelu) {
EXPECT_EQ(expected[i], out_ptr[i]) << "at i=" << i;
}
}
// Parameter pack for activation_random_test.
// NOTE: field order matters - instances below are aggregate-initialized
// positionally (see INSTANTIATE_TEST_SUITE_P); do not reorder.
struct activation_random_test_params {
data_types input_type;                          // element type of input/output (f32/f16/i8/u8)
format::type input_format;                      // blocked layout under test (reference always runs in bfyx)
tensor input_size;                              // input dimensions; assumes (batch, feature, x, y) ctor order - TODO confirm
activation_func func_type;                      // activation function to apply
activation_additional_params additional_params; // extra args for funcs that need them (e.g. slope)
padding padd;                                   // output padding; unused by current instantiations
};
// Randomized cross-check of the activation primitive: runs the same input
// through a plain-bfyx reference network and through a network in the blocked
// format under test (with the activation_ref kernel forced), then compares
// outputs element-wise.
struct activation_random_test : testing::TestWithParam<activation_random_test_params>
{
    bool enable_profiling = false;

    // Element distance between two x-adjacent entries of `layout`.
    // Lets compare_outputs() resolve one linear offset per row and then step
    // along x, instead of computing a full offset for every element.
    size_t get_x_pitch(layout& layout) {
        auto tensor_x0 = tensor(batch(0), feature(0), spatial(0, 0, 0, 0));
        auto tensor_x1 = tensor(batch(0), feature(0), spatial(1, 0, 0, 0));
        auto x0 = layout.get_linear_offset(tensor_x0);
        auto x1 = layout.get_linear_offset(tensor_x1);
        return (x1 - x0);
    }

    // Fills `mem` with deterministic pseudo-random values in [min, max]
    // (`k` is forwarded to generate_random_4d and controls value granularity).
    template <typename T>
    void fill_random_typed(memory::ptr mem, int min, int max, int k) {
        auto mem_layout = mem->get_layout();  // hoisted: was re-fetched per element in the loop below
        auto size = mem_layout.size;
        size_t b = size.batch[0];
        size_t f = size.feature[0];
        size_t x = size.spatial[0];
        size_t y = size.spatial[1];

        auto data = generate_random_4d<T>(b, f, y, x, min, max, k);
        mem_lock<T> ptr{mem, get_test_stream()};
        for (size_t bi = 0; bi < b; ++bi) {
            for (size_t fi = 0; fi < f; ++fi) {
                for (size_t yi = 0; yi < y; ++yi) {
                    for (size_t xi = 0; xi < x; ++xi) {
                        auto coords = tensor(batch(bi), feature(fi), spatial(xi, yi, 0, 0));
                        auto offset = mem_layout.get_linear_offset(coords);
                        ptr[offset] = data[bi][fi][yi][xi];
                    }
                }
            }
        }
    }

    // Type-dispatching front-end for fill_random_typed. Data types outside the
    // four handled here are deliberately left untouched (the parameter list
    // only uses these types).
    void fill_random(memory::ptr mem) {
        auto dt = mem->get_layout().data_type;
        switch (dt) {
        case data_types::f32:
            fill_random_typed<float>(mem, -127, 127, 2);
            break;
        case data_types::f16:
            fill_random_typed<FLOAT16>(mem, -127, 127, 2);
            break;
        case data_types::i8:
            fill_random_typed<int8_t>(mem, -127, 127, 1);
            break;
        case data_types::u8:
            fill_random_typed<uint8_t>(mem, 0, 255, 1);
            break;
        default:
            break;
        }
    }

    // Element-wise comparison of the reference output against the optimized
    // one. The two memories may use different layouts; each element is
    // addressed through its own layout's linear offset. Mismatches are
    // reported with their logical (b, f, y, x) coordinates.
    template <typename T>
    bool compare_outputs(const memory::ptr& out_ref, const memory::ptr& out_opt) {
        // by const& - copying a memory::ptr would only churn the refcount
        auto output_lay = out_ref->get_layout();
        auto opt_output_lay = out_opt->get_layout();
        size_t b = output_lay.size.batch[0];
        size_t f = output_lay.size.feature[0];
        size_t x = output_lay.size.spatial[0];
        size_t y = output_lay.size.spatial[1];
        cldnn::mem_lock<T> ref_ptr(out_ref, get_test_stream());
        cldnn::mem_lock<T> opt_ptr(out_opt, get_test_stream());
        auto ref_x_pitch = get_x_pitch(output_lay);
        auto opt_x_pitch = get_x_pitch(opt_output_lay);
        for (size_t bi = 0; bi < b; ++bi) {
            for (size_t fi = 0; fi < f; ++fi) {
                for (size_t yi = 0; yi < y; ++yi) {
                    auto ref_out_coords = tensor(batch(bi), feature(fi), spatial(0, yi, 0, 0));
                    auto ref_out_offset = output_lay.get_linear_offset(ref_out_coords);
                    auto opt_out_offset = opt_output_lay.get_linear_offset(ref_out_coords);
                    for (size_t xi = 0; xi < x; ++xi) {
                        auto ref_out_val = ref_ptr[ref_out_offset + xi * ref_x_pitch];
                        auto opt_out_val = opt_ptr[opt_out_offset + xi * opt_x_pitch];
                        // report coordinates so a failure in a 4-D sweep is diagnosable
                        EXPECT_EQ(ref_out_val, opt_out_val)
                            << "at b=" << bi << " f=" << fi << " y=" << yi << " x=" << xi;
                    }
                }
            }
        }
        return true;
    }

    // Builds and runs both networks on the same random input; compares outputs
    // when check_result is set.
    void execute_compare(const activation_random_test_params& params, bool check_result) {
        auto& engine = get_test_engine();

        auto in_layout = layout(params.input_type, format::bfyx, params.input_size);
        auto in_mem = engine.allocate_memory(in_layout);
        fill_random(in_mem);

        /// bfyx reference network (default kernel selection)
        cldnn::topology topo;
        topo.add(input_layout("in", in_layout));
        auto prim = activation("activation", "in", params.func_type);
        prim.additional_params = params.additional_params;
        topo.add(prim);

        auto build_opts = build_options();
        build_opts.set_option(build_option::outputs({"activation"}));

        network net(engine, topo, build_opts);
        net.set_input_data("in", in_mem);

        // first execution of ref
        auto result = net.execute();
        auto output = result.at("activation").get_memory();

        // blocked-format network: reorder into the format under test, run the
        // forced activation_ref kernel, then reorder back for comparison.
        cldnn::topology topo_opt;
        topo_opt.add(input_layout("in", in_layout));
        topo_opt.add(reorder("in_to_input_type", "in", params.input_format, params.input_type));
        auto prim_opt = activation("activation_blocked", "in_to_input_type", params.func_type);
        prim_opt.additional_params = params.additional_params;
        topo_opt.add(prim_opt);
        // force output format to input format.
        topo_opt.add(reorder("res_to_input_format", "activation_blocked", params.input_format, params.input_type));

        auto build_opts_opt = build_options();
        build_opts_opt.set_option(build_option::outputs({"activation_blocked", "res_to_input_format"}));
        // pin the implementation so the test exercises activation_ref on the
        // blocked layout rather than whatever the heuristic would pick
        build_opts_opt.set_option(build_option::force_implementations({{"activation_blocked", {params.input_format, "activation_ref"}}}));

        network net_opt(engine, topo_opt, build_opts_opt);

        // Use in_mem from ref network
        net_opt.set_input_data("in", in_mem);

        // first execution of opt
        auto result_opt = net_opt.execute();
        auto output_opt = result_opt.at("res_to_input_format").get_memory();

        if (check_result) {
            // Check data_types
            if (params.input_type == data_types::f32) {
                compare_outputs<float>(output, output_opt);
            } else if (params.input_type == data_types::f16) {
                compare_outputs<FLOAT16>(output, output_opt);
            } else if (params.input_type == data_types::i8) {
                compare_outputs<int8_t>(output, output_opt);
            } else if (params.input_type == data_types::u8) {
                compare_outputs<uint8_t>(output, output_opt);
            } else {
                FAIL() << "Not supported data type: " << static_cast<size_t>(params.input_type);
            }
        }
    }
};
// Runs the reference-vs-blocked comparison for each instantiated parameter set.
TEST_P(activation_random_test, random) {
    execute_compare(GetParam(), true);
}
// Covers the blocked layouts newly supported by the activation_ref kernel:
// b_fs_yx_fsv32, bs_fs_yx_bsv32_fsv32 and bs_fs_yx_bsv32_fsv16.
// The 32x32 cases match the block sizes exactly; the 16x16 cases leave
// batch/feature not a multiple of 32, exercising the kernel's bounds guard
// for the idle work items.
INSTANTIATE_TEST_SUITE_P(activation_blocked_tests,
activation_random_test,
testing::ValuesIn(
std::vector<activation_random_test_params>{
// input_type, input_format, input_size, func_type, additional_params, padding
{ data_types::i8, format::b_fs_yx_fsv32, { 1, 32, 5, 5}, activation_func::relu, {}, {}},
{ data_types::i8, format::bs_fs_yx_bsv32_fsv32, {32, 32, 5, 5}, activation_func::relu, {}, {}},
{ data_types::f16, format::bs_fs_yx_bsv32_fsv16, {32, 32, 5, 5}, activation_func::relu, {}, {}},
{ data_types::i8, format::bs_fs_yx_bsv32_fsv32, {16, 16, 5, 5}, activation_func::relu, {}, {}},
{ data_types::f16, format::bs_fs_yx_bsv32_fsv16, {16, 16, 5, 5}, activation_func::relu, {}, {}},
}
));