[GPU] Merge activation_ref kernel update. (#7767)
+ Support b_fs_yx_fsv32, bs_fs_yx_bsv32_fsv32 and bs_fs_yx_bsv32_fsv16 formats.
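Editor's note: the formats named above are blocked layouts. b_fs_yx_fsv32 stores features in contiguous groups of 32, while bs_fs_yx_bsv32_fsv32 and bs_fs_yx_bsv32_fsv16 additionally block the batch dimension by 32. Below is a minimal sketch of the offset arithmetic such a layout implies; the helper name is hypothetical, and the kernel itself resolves offsets through clDNN's GET_INDEX/ORDER macros, so this illustrates the layout rather than the actual implementation.

#include <cstddef>

// Hypothetical helper, for illustration: linear offset of element (b, f, y, x)
// in a bs_fs_yx_bsv32_fsv32 buffer with F features, Y rows, X columns.
// Reading the layout name outer-to-inner: batch blocks, feature blocks, y, x,
// batch-within-block (bsv32), feature-within-block (fsv32, innermost).
size_t offset_bs_fs_yx_bsv32_fsv32(size_t b, size_t f, size_t y, size_t x,
                                   size_t F, size_t Y, size_t X) {
    const size_t BSV = 32, FSV = 32;
    const size_t f_blocks = (F + FSV - 1) / FSV;   // feature dim rounded up to 32
    size_t off = b / BSV;                          // outer batch block
    off = off * f_blocks + f / FSV;                // outer feature block
    off = (off * Y + y) * X + x;                   // spatial position
    off = off * BSV + b % BSV;                     // batch within block
    off = off * FSV + f % FSV;                     // feature within block
    return off;
}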
@@ -33,43 +33,53 @@ KERNEL(activation)(
     const unsigned x = get_global_id(0);
     const uint y = (uint)get_global_id(1) % OUTPUT_SIZE_Y;
     const uint z = (uint)get_global_id(1) / OUTPUT_SIZE_Y;
-#if OUTPUT_BATCH_NUM == 1
-    const unsigned feature = (uint)get_global_id(2);
-    const unsigned batch = 0;
+    #if OUTPUT_BATCH_NUM == 1
+        const unsigned feature = (uint)get_global_id(2);
+        const unsigned batch = 0;
-#else
-    const unsigned feature = (uint)get_global_id(2) % OUTPUT_FEATURE_NUM;
-    const unsigned batch = (uint)get_global_id(2) / OUTPUT_FEATURE_NUM;
-#endif
+    #else
+        const unsigned feature = (uint)get_global_id(2) % OUTPUT_FEATURE_NUM;
+        const unsigned batch = (uint)get_global_id(2) / OUTPUT_FEATURE_NUM;
+    #endif
 #else
-#if defined OUTPUT_LAYOUT_YXFB || defined OUTPUT_LAYOUT_B_FS_YX_FSV16
-    const unsigned x = (uint)get_global_id(1);
-    const unsigned y = (uint)get_global_id(2);
-#define z 0
-#if OUTPUT_BATCH_NUM == 1
-    const unsigned feature = (uint)get_global_id(0);
-    const unsigned batch = 0;
-#else
-    const unsigned feature = (uint)get_global_id(0) % OUTPUT_FEATURE_NUM;
-    const unsigned batch = (uint)get_global_id(0) / OUTPUT_FEATURE_NUM;
-#endif
-#else
-#define z 0
-    const unsigned x = (uint)get_global_id(0);
-    const unsigned y = (uint)get_global_id(1);
-#if OUTPUT_BATCH_NUM == 1
-    const unsigned feature = (uint)get_global_id(2);
-    const unsigned batch = 0;
-#else
-    const unsigned feature = (uint)get_global_id(2) % OUTPUT_FEATURE_NUM;
-    const unsigned batch = (uint)get_global_id(2) / OUTPUT_FEATURE_NUM;
-#endif
-#endif
+    #if defined OUTPUT_LAYOUT_YXFB || defined OUTPUT_LAYOUT_B_FS_YX_FSV16 || defined OUTPUT_LAYOUT_B_FS_YX_FSV32
+        const unsigned x = (uint)get_global_id(1);
+        const unsigned y = (uint)get_global_id(2);
+        #define z 0
+        #if OUTPUT_BATCH_NUM == 1
+            const unsigned feature = (uint)get_global_id(0);
+            const unsigned batch = 0;
+        #else
+            const unsigned feature = (uint)get_global_id(0) % OUTPUT_FEATURE_NUM;
+            const unsigned batch = (uint)get_global_id(0) / OUTPUT_FEATURE_NUM;
+        #endif
+    #elif defined OUTPUT_LAYOUT_BS_FS_YX_BSV32_FSV32 || defined OUTPUT_LAYOUT_BS_FS_YX_BSV32_FSV16
+        const unsigned x = (uint)get_global_id(0) % OUTPUT_SIZE_X;
+        const unsigned y = (uint)get_global_id(0) / OUTPUT_SIZE_X;
+        const unsigned feature = (uint)get_global_id(1);
+        const unsigned batch = (uint)get_global_id(2);
+    #else
+        #define z 0
+        const unsigned x = (uint)get_global_id(0);
+        const unsigned y = (uint)get_global_id(1);
+        #if OUTPUT_BATCH_NUM == 1
+            const unsigned feature = (uint)get_global_id(2);
+            const unsigned batch = 0;
+        #else
+            const unsigned feature = (uint)get_global_id(2) % OUTPUT_FEATURE_NUM;
+            const unsigned batch = (uint)get_global_id(2) / OUTPUT_FEATURE_NUM;
+        #endif
+    #endif
 #endif

-#if defined(OUTPUT_LAYOUT_B_FS_YX_FSV16) && OUTPUT_FEATURE_NUM % 16 != 0
-    // b_fs_yx_fsv16 has dispatch features aligned to multiple of 16
+#if (defined(OUTPUT_LAYOUT_B_FS_YX_FSV16) || defined(OUTPUT_LAYOUT_B_FS_YX_FSV32)) \
+    && OUTPUT_FEATURE_NUM % 16 != 0
+    // GWS.feature and GWS.batch is aligned to 16. Otherwise, there are some idling WIs.
     if (feature >= OUTPUT_FEATURE_NUM)
         return;
+#elif (defined(OUTPUT_LAYOUT_BS_FS_YX_BSV32_FSV16) || defined(OUTPUT_LAYOUT_BS_FS_YX_BSV32_FSV32)) \
+    && (OUTPUT_FEATURE_NUM % 16 != 0 || OUTPUT_BATCH_NUM % 16 != 0)
+    if (batch >= OUTPUT_BATCH_NUM || feature >= OUTPUT_FEATURE_NUM)
+        return;
 #endif

     const unsigned src_index = GET_INDEX(INPUT,0,ORDER);
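Editor's note on the new early-return guard: for these blocked formats the dispatch rounds the feature and batch global work sizes up to a multiple of 16, so the ND-range can contain work-items beyond the real tensor extent. A small sketch of that rounding follows, assuming the alignment the in-kernel comment describes; align_to is a stand-in, not the actual kernel-selector helper.

#include <cstddef>

// Stand-in for the dispatcher's rounding helper (illustrative only).
constexpr size_t align_to(size_t v, size_t a) { return (v + a - 1) / a * a; }

// Hypothetical GWS for a bs_fs_yx_bsv32_fsv16-style dispatch:
//   gws = { X * Y, align_to(FEATURE_NUM, 16), align_to(BATCH_NUM, 16) }
// With FEATURE_NUM == 20 the feature axis is padded to 32, so work-items
// with feature in [20, 32) are idle and must take the early return:
//   if (batch >= OUTPUT_BATCH_NUM || feature >= OUTPUT_FEATURE_NUM) return;
static_assert(align_to(20, 16) == 32, "20 features dispatch as 32 work-items");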
@@ -1459,3 +1459,181 @@ TEST(activation_f32_fw_gpu, b_fs_yx_fsv16_prelu) {
         EXPECT_EQ(expected[i], out_ptr[i]) << "at i=" << i;
     }
 }
+
+struct activation_random_test_params {
+    data_types input_type;
+    format::type input_format;
+    tensor input_size;
+    activation_func func_type;
+    activation_additional_params additional_params;
+    padding padd;
+};
+
+struct activation_random_test : testing::TestWithParam<activation_random_test_params>
+{
+    bool enable_profiling = false;
+
+    size_t get_x_pitch(layout& layout) {
+        auto tensor_x0 = tensor(batch(0), feature(0), spatial(0, 0, 0, 0));
+        auto tensor_x1 = tensor(batch(0), feature(0), spatial(1, 0, 0, 0));
+        auto x0 = layout.get_linear_offset(tensor_x0);
+        auto x1 = layout.get_linear_offset(tensor_x1);
+        return (x1 - x0);
+    }
+
+    template <typename T>
+    void fill_random_typed(memory::ptr mem, int min, int max, int k) {
+        auto size = mem->get_layout().size;
+        size_t b = size.batch[0];
+        size_t f = size.feature[0];
+        size_t x = size.spatial[0];
+        size_t y = size.spatial[1];
+
+        auto data = generate_random_4d<T>(b, f, y, x, min, max, k);
+        mem_lock<T> ptr{mem, get_test_stream()};
+        for (size_t bi = 0; bi < b; ++bi) {
+            for (size_t fi = 0; fi < f; ++fi) {
+                for (size_t yi = 0; yi < y; ++yi) {
+                    for (size_t xi = 0; xi < x; ++xi) {
+                        auto coords = tensor(batch(bi), feature(fi), spatial(xi, yi, 0, 0));
+                        auto offset = mem->get_layout().get_linear_offset(coords);
+                        ptr[offset] = data[bi][fi][yi][xi];
+                    }
+                }
+            }
+        }
+    }
+
+    void fill_random(memory::ptr mem) {
+        auto dt = mem->get_layout().data_type;
+        switch (dt) {
+        case data_types::f32:
+            fill_random_typed<float>(mem, -127, 127, 2);
+            break;
+        case data_types::f16:
+            fill_random_typed<FLOAT16>(mem, -127, 127, 2);
+            break;
+        case data_types::i8:
+            fill_random_typed<int8_t>(mem, -127, 127, 1);
+            break;
+        case data_types::u8:
+            fill_random_typed<uint8_t>(mem, 0, 255, 1);
+            break;
+        default:
+            break;
+        }
+    }
+
+    template <typename T>
+    bool compare_outputs(const memory::ptr out_ref, const memory::ptr out_opt) {
+        auto output_lay = out_ref->get_layout();
+        auto opt_output_lay = out_opt->get_layout();
+        size_t b = output_lay.size.batch[0];
+        size_t f = output_lay.size.feature[0];
+        size_t x = output_lay.size.spatial[0];
+        size_t y = output_lay.size.spatial[1];
+        cldnn::mem_lock<T> ref_ptr(out_ref, get_test_stream());
+        cldnn::mem_lock<T> opt_ptr(out_opt, get_test_stream());
+
+        auto ref_x_pitch = get_x_pitch(output_lay);
+        auto opt_x_pitch = get_x_pitch(opt_output_lay);
+
+        for (size_t bi = 0; bi < b; ++bi) {
+            for (size_t fi = 0; fi < f; ++fi) {
+                for (size_t yi = 0; yi < y; ++yi) {
+                    auto ref_out_coords = tensor(batch(bi), feature(fi), spatial(0, yi, 0, 0));
+                    auto ref_out_offset = output_lay.get_linear_offset(ref_out_coords);
+                    auto opt_out_offset = opt_output_lay.get_linear_offset(ref_out_coords);
+                    for (size_t xi = 0; xi < x; ++xi) {
+                        auto ref_out_val = ref_ptr[ref_out_offset + xi * ref_x_pitch];
+                        auto opt_out_val = opt_ptr[opt_out_offset + xi * opt_x_pitch];
+                        EXPECT_EQ(ref_out_val, opt_out_val);
+                    }
+                }
+            }
+        }
+
+        return true;
+    }
+
+    void execute_compare(const activation_random_test_params& params, bool check_result) {
+        auto& engine = get_test_engine();
+
+        auto in_layout = layout(params.input_type, format::bfyx, params.input_size);
+        auto in_mem = engine.allocate_memory(in_layout);
+        fill_random(in_mem);
+
+        /// bfyx
+        cldnn::topology topo;
+        topo.add(input_layout("in", in_layout));
+        auto prim = activation("activation", "in", params.func_type);
+        prim.additional_params = params.additional_params;
+        topo.add(prim);
+
+        auto build_opts = build_options();
+        build_opts.set_option(build_option::outputs({"activation"}));
+
+        network net(engine, topo, build_opts);
+        net.set_input_data("in", in_mem);
+
+        // first execution of ref
+        auto result = net.execute();
+        auto output = result.at("activation").get_memory();
+
+        cldnn::topology topo_opt;
+        topo_opt.add(input_layout("in", in_layout));
+        topo_opt.add(reorder("in_to_input_type", "in", params.input_format, params.input_type));
+        auto prim_opt = activation("activation_blocked", "in_to_input_type", params.func_type);
+        prim_opt.additional_params = params.additional_params;
+        topo_opt.add(prim_opt);
+        // force output format to input format.
+        topo_opt.add(reorder("res_to_input_format", "activation_blocked", params.input_format, params.input_type));
+
+        auto build_opts_opt = build_options();
+        build_opts_opt.set_option(build_option::outputs({"activation_blocked", "res_to_input_format"}));
+        auto activation_impl_desc = implementation_desc();
+        activation_impl_desc.output_format = params.input_format;
+        build_opts_opt.set_option(build_option::force_implementations({{"activation_blocked", {params.input_format, "activation_ref"}}}));
+
+        network net_opt(engine, topo_opt, build_opts_opt);
+
+        // Use in_mem from ref network
+        net_opt.set_input_data("in", in_mem);
+
+        // first execution of opt
+        auto result_opt = net_opt.execute();
+        auto output_opt = result_opt.at("res_to_input_format").get_memory();
+
+        if (check_result == true) {
+            // Check data_types
+            if (params.input_type == data_types::f32) {
+                compare_outputs<float>(output, output_opt);
+            } else if (params.input_type == data_types::f16) {
+                compare_outputs<FLOAT16>(output, output_opt);
+            } else if (params.input_type == data_types::i8) {
+                compare_outputs<int8_t>(output, output_opt);
+            } else if (params.input_type == data_types::u8) {
+                compare_outputs<uint8_t>(output, output_opt);
+            } else {
+                FAIL() << "Not supported data type: " << static_cast<size_t>(params.input_type);
+            }
+        }
+    }
+};
+
+TEST_P(activation_random_test, random) {
+    auto param = GetParam();
+    execute_compare(param, true);
+}
+
+INSTANTIATE_TEST_SUITE_P(activation_blocked_tests,
+                         activation_random_test,
+                         testing::ValuesIn(
+                             std::vector<activation_random_test_params>{
+                                 { data_types::i8,  format::b_fs_yx_fsv32,        { 1, 32, 5, 5}, activation_func::relu, {}, {}},
+                                 { data_types::i8,  format::bs_fs_yx_bsv32_fsv32, {32, 32, 5, 5}, activation_func::relu, {}, {}},
+                                 { data_types::f16, format::bs_fs_yx_bsv32_fsv16, {32, 32, 5, 5}, activation_func::relu, {}, {}},
+                                 { data_types::i8,  format::bs_fs_yx_bsv32_fsv32, {16, 16, 5, 5}, activation_func::relu, {}, {}},
+                                 { data_types::f16, format::bs_fs_yx_bsv32_fsv16, {16, 16, 5, 5}, activation_func::relu, {}, {}},
+                             }
+                         ));
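Editor's note on the test's comparison strategy: compare_outputs never reorders the blocked result back to bfyx. It resolves the offset of each row start with layout::get_linear_offset and then walks along x using get_x_pitch, which is valid because every layout under test keeps consecutive x elements at a fixed stride. The cross-check below ties this back to the offset sketch near the top of this page; it reuses the hypothetical offset_bs_fs_yx_bsv32_fsv32 helper and is illustrative only.

#include <cassert>
#include <cstddef>

// Defined in the sketch near the top of this page (hypothetical helper).
size_t offset_bs_fs_yx_bsv32_fsv32(size_t b, size_t f, size_t y, size_t x,
                                   size_t F, size_t Y, size_t X);

int main() {
    // For a {32, 32, 5, 5} tensor, stepping x by one advances the linear
    // offset by one full batch-block * feature-block: 32 * 32 elements.
    // This is exactly the stride compare_outputs derives via get_x_pitch.
    size_t pitch = offset_bs_fs_yx_bsv32_fsv32(0, 0, 0, 1, 32, 5, 5)
                 - offset_bs_fs_yx_bsv32_fsv32(0, 0, 0, 0, 32, 5, 5);
    assert(pitch == 32 * 32);
    return 0;
}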