[IE CLDNN] Eltwise b_fs_zyx_fsv16 optimization (#5098)

This commit is contained in:
Roman Lyamin 2021-04-26 15:36:38 +03:00 committed by GitHub
parent 9f2a3d0edc
commit 16d7d01a27
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 47 additions and 13 deletions

View File

@ -28,7 +28,9 @@ ParamsKey EltwiseKernel_b_fs_yx_fsv16::GetSupportedKey() const {
k.EnableOutputDataType(Datatype::INT8);
k.EnableOutputDataType(Datatype::UINT8);
k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
k.EnableInputLayout(DataLayout::b_fs_zyx_fsv16);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv16);
k.EnableDifferentTypes();
k.EnableBatching();
k.EnableTensorPitches();
@ -93,17 +95,26 @@ JitConstants EltwiseKernel_b_fs_yx_fsv16::MakeLoadJitConstants(const eltwise_par
{
if (params.inputs[input.index].LogicalSize() == params.output.Feature().v &&
params.inputs[input.index].LogicalSize() == params.inputs[input.index].Feature().v) {
jit.AddConstant(MakeJitConstant(name,
"BLOCK_READN(INPUT" + std::to_string(input.index) + "_TYPE, 1, input" + std::to_string(input.index) +
", INPUT"+std::to_string(input.index)+"_GET_INDEX(b, f_block*16, y, x))"));
std::string block_read_str = "BLOCK_READN(INPUT" + std::to_string(input.index) + "_TYPE, " +
"1, " +
"input" + std::to_string(input.index) +
", INPUT" + std::to_string(input.index);
if (DataTensor::ChannelsCount(params.inputs[input_idx].GetLayout()) == 4) {
jit.AddConstant(MakeJitConstant(name, block_read_str + "_GET_INDEX(b, f_block*16, y, x))"));
} else {
jit.AddConstant(MakeJitConstant(name, block_read_str + "_GET_INDEX(b, f_block*16, z, y, x))"));
}
} else if (params.inputs[input.index].LogicalSize() == 1) {
jit.AddConstant(MakeJitConstant(name,
"input" + std::to_string(input.index) +
"[0]"));
} else {
const std::string idx_order = "INPUT" + std::to_string(input.index) + "_IDX_ORDER";
jit.AddConstant(MakeJitConstant(idx_order, "b, f_block*16, y, x"));
if (DataTensor::ChannelsCount(params.inputs[input_idx].GetLayout()) == 4) {
jit.AddConstant(MakeJitConstant(idx_order, "b, f_block*16, y, x"));
} else {
jit.AddConstant(MakeJitConstant(idx_order, "b, f_block*16, z, y, x"));
}
bool feature_broadcasting = (params.inputs[input_idx].Feature().v == 1 && params.output.Feature().v != 1);
const std::string block_read_str = "TO_TYPE(MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, BLOCK_SIZE), BLOCK_READN(INPUT" +
@ -194,7 +205,14 @@ JitConstants EltwiseKernel_b_fs_yx_fsv16::GetJitConstants(const eltwise_params&
if (!params.fused_ops.empty()) {
kernel_selector::Datatype input_dt = GetAccumulatorType(params);
FusedOpsConfiguration conf = {"", {"b", "f_block*16", "y", "x"}, "res", input_dt, blockSize};
std::vector<std::string> idx_order;
if (DataTensor::ChannelsCount(params.output.GetLayout()) == 4) {
idx_order = {"b", "f_block*16", "y", "x"};
} else if (DataTensor::ChannelsCount(params.output.GetLayout()) == 5) {
idx_order = {"b", "f_block*16", "z", "y", "x"};
}
FusedOpsConfiguration conf = {"", idx_order, "res", input_dt, blockSize};
conf.load_type = FusedOpsConfiguration::LoadType::LT_ALIGNED_READ;
conf.vec_axis = Tensor::DataChannelName::X;
@ -230,6 +248,7 @@ bool EltwiseKernel_b_fs_yx_fsv16::Validate(const Params& p, const optional_param
for (size_t i = 0; i < params.inputs.size(); i++) {
if ((params.inputs[i].GetLayout() != DataLayout::b_fs_yx_fsv16) &&
(params.inputs[i].GetLayout() != DataLayout::b_fs_zyx_fsv16) &&
!IsBroadcastingPossibleInput(params.inputs[i], params.output)) {
return false;
}
@ -277,7 +296,7 @@ EltwiseKernelBase::DispatchData EltwiseKernel_b_fs_yx_fsv16::SetDefault(const el
DispatchData dispatchData;
dispatchData.gws[0] = Align(params.output.Feature().v, 16);
dispatchData.gws[1] = CeilDiv(params.output.X().v, GetBlockSize(params)) * params.output.Y().v;
dispatchData.gws[1] = CeilDiv(params.output.X().v, GetBlockSize(params)) * params.output.Y().v * params.output.Z().v;
dispatchData.gws[2] = params.output.Batch().v;
dispatchData.lws[0] = 16;

View File

@ -34,22 +34,37 @@ KERNEL(eltwise_b_fs_yx_fsv16)(INPUTS_DECLS
#endif
)
{
const uint f_block = get_group_id(0);
const uint y = (uint)get_global_id(1) / BLOCKS_COUNT;
const uint x = ((uint)get_global_id(1) % BLOCKS_COUNT) * BLOCK_SIZE;
const uint b = get_global_id(2);
const uint f_block = (uint)get_group_id(0);
const uint zyx = (uint)get_global_id(1);
const uint b = (uint)get_global_id(2);
const uint x = (zyx % BLOCKS_COUNT) * BLOCK_SIZE;
#if OUTPUT_DIMS == 5
const uint zy = zyx / BLOCKS_COUNT;
const uint z = zy / OUTPUT_SIZE_Y;
const uint y = zy % OUTPUT_SIZE_Y;
#else
const uint z = 0;
const uint y = zyx / BLOCKS_COUNT;
#endif
// Output offset calculations:
const uint output_x_pitch = FEATURE_SLICE_SIZE;
const uint output_y_pitch = output_x_pitch * (OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X);
const uint output_y_pitch = output_x_pitch * (OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X);
#if OUTPUT_DIMS == 5
const uint output_z_pitch = output_y_pitch * (OUTPUT_PAD_BEFORE_SIZE_Y + OUTPUT_SIZE_Y + OUTPUT_PAD_AFTER_SIZE_Y);
const uint output_fs_pitch = output_z_pitch * (OUTPUT_PAD_BEFORE_SIZE_Z + OUTPUT_SIZE_Z + OUTPUT_PAD_AFTER_SIZE_Z);
#else
const uint output_z_pitch = 0;
const uint output_fs_pitch = output_y_pitch * (OUTPUT_PAD_BEFORE_SIZE_Y + OUTPUT_SIZE_Y + OUTPUT_PAD_AFTER_SIZE_Y);
#endif
const uint output_total_f_size = OUTPUT_PAD_BEFORE_FEATURE_NUM + OUTPUT_FEATURE_NUM + OUTPUT_PAD_AFTER_FEATURE_NUM;
const uint output_fs_pitch = output_y_pitch * (OUTPUT_PAD_BEFORE_SIZE_Y + OUTPUT_SIZE_Y + OUTPUT_PAD_AFTER_SIZE_Y);
const uint output_b_pitch = output_fs_pitch * ((output_total_f_size + FEATURE_SLICE_SIZE - 1) / FEATURE_SLICE_SIZE);
const uint output_fs_pad_before = OUTPUT_PAD_BEFORE_FEATURE_NUM / FEATURE_SLICE_SIZE;
const uint output_offset = b * output_b_pitch +
(f_block + output_fs_pad_before) * output_fs_pitch +
(z + OUTPUT_PAD_BEFORE_SIZE_Z) * output_z_pitch +
(y + OUTPUT_PAD_BEFORE_SIZE_Y) * output_y_pitch +
(x + OUTPUT_PAD_BEFORE_SIZE_X) * output_x_pitch;