[IE CLDNN] Optimize reduce fsv16 primitive when out_x=out_y=1 (#4261)

Author: Jade Cho, 2021-02-24 16:25:36 +09:00 (committed by GitHub)
parent 7420bb6cb0
commit 54a7ecb725
5 changed files with 280 additions and 10 deletions

View File

@@ -23,6 +23,9 @@
namespace kernel_selector {
static const size_t SIMD = 16;
static const size_t XY_OPT_F_LIMITS = 96;
static const size_t AXIS_Y = 2;
static const size_t AXIS_X = 3;
using NDims = std::vector<kernel_selector::Tensor::Dim>;
static size_t calc_read_offset(const reduce_params& params) {
@@ -36,6 +39,13 @@ static size_t calc_read_offset(const reduce_params& params) {
return read_offset;
}
static NDims get_input_dims(const reduce_params& params) {
auto input = params.inputs[0];
auto in_dims = input.GetDims();
std::reverse(in_dims.begin(), in_dims.end());
return in_dims;
}
static NDims calc_in_dims(const reduce_params& params) {
auto input = params.inputs[0];
auto in_dims = input.GetDims();
@@ -50,6 +60,36 @@ static NDims calc_in_dims(const reduce_params& params) {
return in_dims;
}
static bool is_xy_opt_supported(const ReduceMode& mode) {
switch(mode) {
case ReduceMode::MAX:
case ReduceMode::MIN:
case ReduceMode::MEAN:
case ReduceMode::SUM:
case ReduceMode::AND:
case ReduceMode::OR:
case ReduceMode::L1:
case ReduceMode::LOG_SUM_EXP:
return true;
// prod, sum_square, L2 and log_sum don't work with the reduce(x,y) optimization.
case ReduceMode::PROD:
case ReduceMode::SUM_SQUARE:
case ReduceMode::L2:
case ReduceMode::LOG_SUM:
default:
return false;
}
}
static bool can_opt_reduce_xy(const reduce_params& params) {
auto axes = params.reduceAxes;
auto input_dims = get_input_dims(params);
return is_xy_opt_supported(params.reduceMode) && axes.size() == 2 &&
std::find(axes.begin(), axes.end(), AXIS_Y) != std::end(axes) &&
std::find(axes.begin(), axes.end(), AXIS_X) != std::end(axes) &&
input_dims[1].v <= XY_OPT_F_LIMITS;
}
ParamsKey ReduceKernel_b_fs_yx_fsv16::GetSupportedKey() const {
ParamsKey k;
k.EnableInputDataType(Datatype::F16);
@@ -75,10 +115,19 @@ CommonDispatchData ReduceKernel_b_fs_yx_fsv16::SetDefault(const reduce_params& p
CommonDispatchData dispatchData;
auto in_dims = calc_in_dims(params);
if (can_opt_reduce_xy(params)) {
auto input_dims = get_input_dims(params);
dispatchData.gws = { 16,
std::min(CeilDiv(input_dims[2].v, SIMD), SIMD),
CeilDiv(in_dims[1].v, SIMD) * in_dims[0].v }; // F, B
dispatchData.lws = { 16, dispatchData.gws[1], 1 };
} else {
dispatchData.gws = { 16,
CeilDiv(in_dims[3].v, calc_read_offset(params)) * in_dims[2].v, // X, Y
CeilDiv(in_dims[1].v, SIMD) * in_dims[0].v }; // F, B
dispatchData.lws = { SIMD, 1, 1 };
}
return dispatchData;
}
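For reference, a minimal standalone sketch (not part of this commit) of the dispatch-size arithmetic picked by the new reduce(x,y) branch above, assuming CeilDiv is round-up integer division and using the 1x32x256x256 b_fs_yx_fsv16 shape exercised by the new tests; the helper and printed values are illustrative only.

#include <algorithm>
#include <cstddef>
#include <iostream>

// Assumed equivalent to kernel_selector's CeilDiv: round-up integer division.
static size_t CeilDiv(size_t a, size_t b) { return (a + b - 1) / b; }

int main() {
    const size_t SIMD = 16;
    const size_t B = 1, F = 32, Y = 256;  // X is folded inside each work item, so it does not enter GWS here

    const size_t gws_y  = std::min(CeilDiv(Y, SIMD), SIMD);  // number of partial Y blocks -> 16
    const size_t gws_fb = CeilDiv(F, SIMD) * B;               // feature slices times batch -> 2

    std::cout << "gws = {16, " << gws_y << ", " << gws_fb << "}\n";  // {16, 16, 2}
    std::cout << "lws = {16, " << gws_y << ", 1}\n";                 // {16, 16, 1}
    return 0;
}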
@@ -88,6 +137,19 @@ JitConstants ReduceKernel_b_fs_yx_fsv16::GetJitConstants(const reduce_params& pa
auto in_dims = calc_in_dims(params);
auto read_offset = calc_read_offset(params);
// Optimization of reduce(x,y) when feature depth is shallow.
// In this case, tile the input tensor and create partial results to generate more work items
if (can_opt_reduce_xy(params)) {
auto input_dims = get_input_dims(params);
auto num_block_y = std::min(CeilDiv(input_dims[2].v, SIMD), SIMD);
jit.AddConstant(MakeJitConstant("IS_REDUCE_XY", 1));
jit.AddConstant(MakeJitConstant("BLOCK_Y_NUM", num_block_y));
jit.AddConstant(MakeJitConstant("BLOCK_Y_SIZE", CeilDiv(input_dims[2].v, num_block_y)));
}
else {
jit.AddConstant(MakeJitConstant("IS_REDUCE_XY", 0));
}
// Universal output sizes for keep dims = true/false cases
jit.AddConstant(MakeJitConstant("COMMON_OUTPUT_SIZE_X", in_dims[3].v));
jit.AddConstant(MakeJitConstant("COMMON_OUTPUT_SIZE_Y", in_dims[2].v));

View File

@@ -160,9 +160,17 @@ KERNEL(reduce_fsv16)(
#endif
)
{
#if IS_REDUCE_XY
__local ACCUMULATOR_TYPE lg_storage[SIMD][BLOCK_Y_NUM];
const uint lid0 = (uint)get_local_id(0);
const uint lid1 = (uint)get_local_id(1);
const uint x = 0;
const uint y = 0;
#else
const uint xy = (uint)get_global_id(1) * READ_OFFSET;
const uint x = xy % ALIGN(COMMON_OUTPUT_SIZE_X, READ_OFFSET);
const uint y = xy / ALIGN(COMMON_OUTPUT_SIZE_X, READ_OFFSET);
#endif
const uint bf = (uint)get_global_id(2) * SIMD;
const uint b = bf / ALIGN(COMMON_OUTPUT_FEATURE_NUM, SIMD);
const uint f = bf % ALIGN(COMMON_OUTPUT_FEATURE_NUM, SIMD);
@@ -225,8 +233,13 @@ KERNEL(reduce_fsv16)(
#endif
#if REDUCE_Y
#if IS_REDUCE_XY
const uint y_out = (uint)get_local_id(1) * BLOCK_Y_SIZE;
const uint y_max_val = min((uint)(y_out + BLOCK_Y_SIZE), (uint)INPUT0_SIZE_Y);
#else
const uint y_out = 0;
const uint y_max_val = INPUT0_SIZE_Y;
#endif
#else
const uint y_out = SIZE_Y_IDX_COMP(linear_idx);
const uint y_max_val = y_out + 1;
@@ -290,10 +303,30 @@ uint offset = batch_out * input_batch_pitch + ((feature_out + FSV - 1) / FSV) *
offset += input_batch_pitch - ((((feature_max_val - feature_out) + FSV - 1) / FSV) * input_fs_pitch);
}
#if IS_REDUCE_XY
lg_storage[lid0][lid1] = acc;
barrier(CLK_LOCAL_MEM_FENCE);
if (lid1 != 0)
return;
#if REDUCE_SUM_SQUARE_MODE || REDUCE_L2_MODE || REDUCE_LOG_SUM_MODE || REDUCE_LOG_SUM_EXP_MODE
acc = INIT_VAL;
unroll_for (uint i = 0; i < BLOCK_Y_NUM; i++) {
acc += lg_storage[lid0][i];
}
#else
acc = lg_storage[lid0][0];
unroll_for (uint i = 1; i < BLOCK_Y_NUM; i++) {
acc = FUNC_CALL(apply_reduce)(acc, lg_storage[lid0][i]);
}
#endif
#endif
FINAL_ACCUMULATOR_TYPE final_acc;
acc = FUNC_CALL(sub_group_reduce)(acc);
final_acc = FUNC_CALL(final_reduce)(TO_FINAL_ACCUMULATOR_TYPE(acc));
OUTPUT_TYPE final_result;
ACTIVATION_TYPE reduce_result = TO_ACTIVATION_TYPE(final_acc);
#if HAS_FUSED_OPS
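To make the IS_REDUCE_XY control flow easier to follow, here is a host-side C++ model (not the kernel itself) of the two-stage reduction: each local-id-1 slot reduces its own Y block into lg_storage, and after the barrier the slot with lid1 == 0 folds the partials together. Max is used as the example combine; per the #if above, the SUM_SQUARE/L2/LOG_SUM/LOG_SUM_EXP modes add the partials instead. The X loop and the FSV addressing are omitted from this sketch.

#include <algorithm>
#include <cstddef>
#include <vector>

// Example combine operation; the real kernel dispatches on the reduce mode.
static float apply_reduce(float acc, float v) { return std::max(acc, v); }

// Reduce one Y column split into blocks of block_y_size rows, mimicking the
// lg_storage + barrier pattern of the IS_REDUCE_XY path.
static float reduce_y_column(const std::vector<float>& column, std::size_t block_y_size) {
    const std::size_t block_y_num = (column.size() + block_y_size - 1) / block_y_size;
    std::vector<float> lg_storage(block_y_num);               // stands in for the __local array

    for (std::size_t lid1 = 0; lid1 < block_y_num; ++lid1) {  // each work item reduces its own Y block
        const std::size_t y0 = lid1 * block_y_size;
        const std::size_t y1 = std::min(y0 + block_y_size, column.size());
        float acc = column[y0];
        for (std::size_t y = y0 + 1; y < y1; ++y)
            acc = apply_reduce(acc, column[y]);
        lg_storage[lid1] = acc;                                // partial result per block
    }

    // barrier(CLK_LOCAL_MEM_FENCE) in the kernel; only lid1 == 0 continues past it
    float acc = lg_storage[0];
    for (std::size_t i = 1; i < block_y_num; ++i)
        acc = apply_reduce(acc, lg_storage[i]);                // plus instead for sum_square/L2/log_sum/log_sum_exp
    return acc;
}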

View File

@@ -4551,7 +4551,7 @@ TEST_P(deconv_scale_actv_quant_u8_eltw_scale_actv_quant_i8, basic) {
reorder("out", "quant2", p.default_format, data_types::f32)
);
tolerance = 1.0f;
execute(p);
}
@@ -4599,7 +4599,7 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, deconv_scale_actv_quant_u8_eltw_scale_actv_
deconv_test_params{ CASE_DECONV_FP32_3D_4, 2, 9 },
deconv_test_params{ CASE_DECONV_FP32_3D_5, 2, 9 },
deconv_test_params{ CASE_DECONV_FP32_3D_6, 2, 9 },
// deconv_test_params{ CASE_DECONV_FP32_3D_7, 2, 9 },
deconv_test_params{ CASE_DECONV_FP32_3D_8, 2, 9 },
// deconv_test_params{ CASE_DECONV_FP32_3D_9, 6, 9 },

View File

@@ -443,6 +443,8 @@ protected:
cldnn::data_types output_dt;
bool force_output_dt;
static std::vector<std::tuple<cldnn::reduce_mode,double, double, double>> perf_data;
ReduceTestBase() {
this->batch_num = testing::get<0>(GetParam());
this->input_f = testing::get<1>(GetParam());
@@ -541,7 +543,11 @@ public:
std::cout << "Reference value at batch: " << bi << " output_f: " << fi
<< " y: " << yi << " x: " << xi << " = " << val_ref << " Val = " << val
<< std::endl;
EXPECT_TRUE(equal);
if (!equal)
break;
}
}
}
@@ -556,8 +562,7 @@ TEST_P(general_reduce_gpu_i8_f32, base) { execute(); }
class general_reduce_gpu_f32_f32 : public ReduceTestBase<data_types::f32, data_types::f32> {};
TEST_P(general_reduce_gpu_f32_f32, base) { execute(); }
INSTANTIATE_TEST_CASE_P(reduce_gpu_b_fs_yx_fsv16_i8_i8,
general_reduce_gpu_i8_i8,
::testing::Values(
TestParamType_general_reduce_gpu(2, 12, 1, 1, 3, 2, format::b_fs_yx_fsv16, reduce_mode::logical_or, {reduce::along_y, reduce::along_x}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::i8, true, data_types::i8),
@@ -1611,3 +1616,173 @@ TEST(reduce_gpu, common_bfwzyx_log_sum_exp_keepdims) {
EXPECT_TRUE(are_equal(ref_data[i], output_ptr[i]));
}
}
template <data_types InputT, data_types OutputT>
class ReduceXYWithBigTensorTestBase : public ::testing::TestWithParam<TestParamType_general_reduce_gpu> {
protected:
cldnn::engine engine = get_test_engine();
int batch_num, input_f, input_w, input_z, input_y, input_x;
cldnn::format input_format = format::any;
cldnn::reduce_mode reduce_mode;
std::vector<uint16_t> reduce_axis;
std::string kernel_name;
bool keep_dims;
cldnn::data_types input_dt;
cldnn::data_types output_dt;
bool force_output_dt;
static std::vector<std::tuple<cldnn::reduce_mode,double, double, double>> perf_data;
ReduceXYWithBigTensorTestBase() {
this->batch_num = testing::get<0>(GetParam());
this->input_f = testing::get<1>(GetParam());
this->input_w = testing::get<2>(GetParam());
this->input_z = testing::get<3>(GetParam());
this->input_y = testing::get<4>(GetParam());
this->input_x = testing::get<5>(GetParam());
this->input_format = testing::get<6>(GetParam());
this->reduce_mode = testing::get<7>(GetParam()); // not used
this->reduce_axis = testing::get<8>(GetParam());
this->kernel_name = testing::get<9>(GetParam());
this->keep_dims = testing::get<10>(GetParam());
this->input_dt = testing::get<11>(GetParam());
this->force_output_dt = testing::get<12>(GetParam());
this->output_dt = testing::get<13>(GetParam());
}
public:
void execute() {
int input_dim = static_cast<int>(input_format.dimension());
cldnn::format layout_format = input_format;
if (input_dim == 4)
layout_format = format::bfyx;
else if (input_dim == 5)
layout_format = format::bfzyx;
else
layout_format = format::bfwzyx;
using input_t = typename input_data_type<InputT>::type;
using output_t = typename output_data_type<OutputT>::type;
auto input_size = tensor(batch(batch_num), feature(input_f), spatial(input_x, input_y, input_z, input_w));
auto input_data = generate_random_6d<input_t>(batch_num, input_f, input_x, input_y, input_z, input_w, 1, 5, 9);
auto input_lay = layout(input_dt, layout_format, input_size);
auto input_mem = memory::allocate(engine, input_lay);
{
auto input_ptr = input_mem.pointer<input_t>();
for (int fi = 0; fi < input_f; fi++)
for (int wi = 0; wi < input_w; wi++)
for (int zi = 0; zi < input_z; zi++)
for (int yi = 0; yi < input_y; yi++)
for (int xi = 0; xi < input_x; xi++) {
for (int bi = 0; bi < batch_num; bi++) {
tensor coords = tensor(batch(bi), feature(fi), spatial(xi, yi, zi, wi));
size_t offset = input_lay.get_linear_offset(coords);
input_ptr[offset] = input_data[bi][fi][xi][yi][zi][wi];
}
}
}
std::vector<cldnn::reduce_mode> modes {
cldnn::reduce_mode::max,
cldnn::reduce_mode::min,
cldnn::reduce_mode::mean,
// reduce_mode::prod,
cldnn::reduce_mode::sum,
cldnn::reduce_mode::logical_and,
cldnn::reduce_mode::logical_or,
// reduce_mode::sum_square,
cldnn::reduce_mode::l1,
// reduce_mode::l2,
// reduce_mode::log_sum,
cldnn::reduce_mode::log_sum_exp
};
for (auto& target_mode : modes)
{
auto reference_result = reference_reduce(input_data, target_mode, reduce_axis, batch_num,
input_f, input_w, input_z, input_y,
input_x, input_dim, keep_dims);
topology topology;
auto red = reduce("reduce", "input", target_mode, reduce_axis, keep_dims);
if (force_output_dt) {
red.output_data_type = output_dt;
}
topology.add(input_layout("input", input_mem.get_layout()));
topology.add(red);
build_options options;
options.set_option(build_option::optimize_data(true));
implementation_desc reduce_impl = {input_format, kernel_name};
options.set_option(build_option::force_implementations({{"reduce", reduce_impl}}));
network network(engine, topology, options);
network.set_input_data("input", input_mem);
network.execute();
auto out_mem = network.get_output("reduce").get_memory();
auto out_ptr = out_mem.pointer<output_t>();
auto out_lay = out_mem.get_layout();
ASSERT_EQ(out_lay.size.sizes()[0], reference_result.size()); // b
ASSERT_EQ(out_lay.size.sizes()[1], reference_result[0].size()); // f
ASSERT_EQ(out_lay.size.spatial[3], reference_result[0][0].size()); // w
ASSERT_EQ(out_lay.size.spatial[2], reference_result[0][0][0].size()); // z
ASSERT_EQ(out_lay.size.spatial[1], reference_result[0][0][0][0].size()); // y
ASSERT_EQ(out_lay.size.spatial[0], reference_result[0][0][0][0][0].size()); // x
bool need_adjust_threshold = (typeid(output_t) == typeid(output_data_type<data_types::i8>::type));
for (size_t bi = 0; bi < reference_result.size(); bi++)
for (size_t fi = 0; fi < reference_result[0].size(); fi++)
for (size_t wi = 0; wi < reference_result[0][0].size(); wi++)
for (size_t zi = 0; zi < reference_result[0][0][0].size(); zi++)
for (size_t yi = 0; yi < reference_result[0][0][0][0].size(); yi++) {
for (size_t xi = 0; xi < reference_result[0][0][0][0][0].size(); xi++) {
tensor coords = tensor(batch(bi), feature(fi), spatial(xi, yi, zi, wi));
size_t offset = out_lay.get_linear_offset(coords);
auto val = out_ptr[offset];
auto val_ref = static_cast<output_t>(reference_result[bi][fi][wi][zi][yi][xi]);
bool equal = need_adjust_threshold ?
are_equal(val_ref, val, 1e-1f, 1.0f, 10.0f) : are_equal(val_ref, val, 1e-1f);
if (!equal)
std::cout << "Reduce mode: " << (int)target_mode << ", "
<< "Reference value at batch: " << bi << " output_f: " << fi
<< " y: " << yi << " x: " << xi << " = " << val_ref << " Val = " << val
<< std::endl;
EXPECT_TRUE(equal);
if (!equal)
break;
}
}
}
}
};
class general_reduce_gpu_xy_f32 : public ReduceXYWithBigTensorTestBase<data_types::f32, data_types::f32> {};
TEST_P(general_reduce_gpu_xy_f32, base) { execute(); }
class general_reduce_gpu_xy_i8 : public ReduceXYWithBigTensorTestBase<data_types::i8, data_types::i8> {};
TEST_P(general_reduce_gpu_xy_i8, base) { execute(); }
INSTANTIATE_TEST_CASE_P(reduce_gpu_b_fs_yx_fsv16_xy_f32,
general_reduce_gpu_xy_f32,
::testing::Values(
TestParamType_general_reduce_gpu(1, 32, 1, 1, 18, 18, format::b_fs_yx_fsv16, reduce_mode::max, {reduce::along_x, reduce::along_y}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::f32, true, data_types::f32),
TestParamType_general_reduce_gpu(1, 32, 1, 1, 256, 256, format::b_fs_yx_fsv16, reduce_mode::max, {reduce::along_x, reduce::along_y}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::f32, true, data_types::f32)
),
general_reduce_gpu::PrintToStringParamName);
INSTANTIATE_TEST_CASE_P(reduce_gpu_b_fs_yx_fsv16_xy_i8,
general_reduce_gpu_xy_i8,
::testing::Values(
TestParamType_general_reduce_gpu(1, 32, 1, 1, 18, 18, format::b_fs_yx_fsv16, reduce_mode::max, {reduce::along_x, reduce::along_y}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::i8, true, data_types::i8),
TestParamType_general_reduce_gpu(1, 32, 1, 1, 256, 256, format::b_fs_yx_fsv16, reduce_mode::max, {reduce::along_x, reduce::along_y}, "reduce_gpu_b_fs_yx_fsv16", false, data_types::i8, true, data_types::i8)
),
general_reduce_gpu::PrintToStringParamName);

View File

@@ -334,15 +334,15 @@ inline void check_exception_massage(const cldnn::engine& engine, cldnn::topology
// Default values:
// relative_error_threshold = 1e-3
// absolute_error_threshold = 1e-6
// absolute_error_limit = 1e-4
inline bool are_equal(
const float ref_item,
const float item,
const float relative_error_threshold = 1e-3,
const float absolute_error_threshold = 1e-6,
const float absolute_error_limit = 1e-4) {
if( fabs(item) < absolute_error_limit) {
if(fabs( item - ref_item ) > absolute_error_threshold) {
std::cout << "Ref val: " << ref_item << "\tSecond val: " << item << std::endl;
return false;