[IE CLDNN] Eltwise b_fs_zyx_fsv16 optimization (#5098)
This commit is contained in:
parent
9f2a3d0edc
commit
16d7d01a27
@ -28,7 +28,9 @@ ParamsKey EltwiseKernel_b_fs_yx_fsv16::GetSupportedKey() const {
|
||||
k.EnableOutputDataType(Datatype::INT8);
|
||||
k.EnableOutputDataType(Datatype::UINT8);
|
||||
k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
|
||||
k.EnableInputLayout(DataLayout::b_fs_zyx_fsv16);
|
||||
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
|
||||
k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv16);
|
||||
k.EnableDifferentTypes();
|
||||
k.EnableBatching();
|
||||
k.EnableTensorPitches();
|
||||
@ -93,17 +95,26 @@ JitConstants EltwiseKernel_b_fs_yx_fsv16::MakeLoadJitConstants(const eltwise_par
|
||||
{
|
||||
if (params.inputs[input.index].LogicalSize() == params.output.Feature().v &&
|
||||
params.inputs[input.index].LogicalSize() == params.inputs[input.index].Feature().v) {
|
||||
jit.AddConstant(MakeJitConstant(name,
|
||||
"BLOCK_READN(INPUT" + std::to_string(input.index) + "_TYPE, 1, input" + std::to_string(input.index) +
|
||||
", INPUT"+std::to_string(input.index)+"_GET_INDEX(b, f_block*16, y, x))"));
|
||||
std::string block_read_str = "BLOCK_READN(INPUT" + std::to_string(input.index) + "_TYPE, " +
|
||||
"1, " +
|
||||
"input" + std::to_string(input.index) +
|
||||
", INPUT" + std::to_string(input.index);
|
||||
if (DataTensor::ChannelsCount(params.inputs[input_idx].GetLayout()) == 4) {
|
||||
jit.AddConstant(MakeJitConstant(name, block_read_str + "_GET_INDEX(b, f_block*16, y, x))"));
|
||||
} else {
|
||||
jit.AddConstant(MakeJitConstant(name, block_read_str + "_GET_INDEX(b, f_block*16, z, y, x))"));
|
||||
}
|
||||
} else if (params.inputs[input.index].LogicalSize() == 1) {
|
||||
jit.AddConstant(MakeJitConstant(name,
|
||||
"input" + std::to_string(input.index) +
|
||||
"[0]"));
|
||||
} else {
|
||||
const std::string idx_order = "INPUT" + std::to_string(input.index) + "_IDX_ORDER";
|
||||
jit.AddConstant(MakeJitConstant(idx_order, "b, f_block*16, y, x"));
|
||||
|
||||
if (DataTensor::ChannelsCount(params.inputs[input_idx].GetLayout()) == 4) {
|
||||
jit.AddConstant(MakeJitConstant(idx_order, "b, f_block*16, y, x"));
|
||||
} else {
|
||||
jit.AddConstant(MakeJitConstant(idx_order, "b, f_block*16, z, y, x"));
|
||||
}
|
||||
bool feature_broadcasting = (params.inputs[input_idx].Feature().v == 1 && params.output.Feature().v != 1);
|
||||
|
||||
const std::string block_read_str = "TO_TYPE(MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, BLOCK_SIZE), BLOCK_READN(INPUT" +
|
||||
@ -194,7 +205,14 @@ JitConstants EltwiseKernel_b_fs_yx_fsv16::GetJitConstants(const eltwise_params&
|
||||
if (!params.fused_ops.empty()) {
|
||||
kernel_selector::Datatype input_dt = GetAccumulatorType(params);
|
||||
|
||||
FusedOpsConfiguration conf = {"", {"b", "f_block*16", "y", "x"}, "res", input_dt, blockSize};
|
||||
std::vector<std::string> idx_order;
|
||||
if (DataTensor::ChannelsCount(params.output.GetLayout()) == 4) {
|
||||
idx_order = {"b", "f_block*16", "y", "x"};
|
||||
} else if (DataTensor::ChannelsCount(params.output.GetLayout()) == 5) {
|
||||
idx_order = {"b", "f_block*16", "z", "y", "x"};
|
||||
}
|
||||
|
||||
FusedOpsConfiguration conf = {"", idx_order, "res", input_dt, blockSize};
|
||||
conf.load_type = FusedOpsConfiguration::LoadType::LT_ALIGNED_READ;
|
||||
conf.vec_axis = Tensor::DataChannelName::X;
|
||||
|
||||
@ -230,6 +248,7 @@ bool EltwiseKernel_b_fs_yx_fsv16::Validate(const Params& p, const optional_param
|
||||
|
||||
for (size_t i = 0; i < params.inputs.size(); i++) {
|
||||
if ((params.inputs[i].GetLayout() != DataLayout::b_fs_yx_fsv16) &&
|
||||
(params.inputs[i].GetLayout() != DataLayout::b_fs_zyx_fsv16) &&
|
||||
!IsBroadcastingPossibleInput(params.inputs[i], params.output)) {
|
||||
return false;
|
||||
}
|
||||
@ -277,7 +296,7 @@ EltwiseKernelBase::DispatchData EltwiseKernel_b_fs_yx_fsv16::SetDefault(const el
|
||||
DispatchData dispatchData;
|
||||
|
||||
dispatchData.gws[0] = Align(params.output.Feature().v, 16);
|
||||
dispatchData.gws[1] = CeilDiv(params.output.X().v, GetBlockSize(params)) * params.output.Y().v;
|
||||
dispatchData.gws[1] = CeilDiv(params.output.X().v, GetBlockSize(params)) * params.output.Y().v * params.output.Z().v;
|
||||
dispatchData.gws[2] = params.output.Batch().v;
|
||||
|
||||
dispatchData.lws[0] = 16;
|
||||
|
@ -34,22 +34,37 @@ KERNEL(eltwise_b_fs_yx_fsv16)(INPUTS_DECLS
|
||||
#endif
|
||||
)
|
||||
{
|
||||
const uint f_block = get_group_id(0);
|
||||
const uint y = (uint)get_global_id(1) / BLOCKS_COUNT;
|
||||
const uint x = ((uint)get_global_id(1) % BLOCKS_COUNT) * BLOCK_SIZE;
|
||||
const uint b = get_global_id(2);
|
||||
const uint f_block = (uint)get_group_id(0);
|
||||
const uint zyx = (uint)get_global_id(1);
|
||||
const uint b = (uint)get_global_id(2);
|
||||
const uint x = (zyx % BLOCKS_COUNT) * BLOCK_SIZE;
|
||||
#if OUTPUT_DIMS == 5
|
||||
const uint zy = zyx / BLOCKS_COUNT;
|
||||
const uint z = zy / OUTPUT_SIZE_Y;
|
||||
const uint y = zy % OUTPUT_SIZE_Y;
|
||||
#else
|
||||
const uint z = 0;
|
||||
const uint y = zyx / BLOCKS_COUNT;
|
||||
#endif
|
||||
|
||||
// Output offset calculations:
|
||||
const uint output_x_pitch = FEATURE_SLICE_SIZE;
|
||||
const uint output_y_pitch = output_x_pitch * (OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X);
|
||||
const uint output_y_pitch = output_x_pitch * (OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X);
|
||||
#if OUTPUT_DIMS == 5
|
||||
const uint output_z_pitch = output_y_pitch * (OUTPUT_PAD_BEFORE_SIZE_Y + OUTPUT_SIZE_Y + OUTPUT_PAD_AFTER_SIZE_Y);
|
||||
const uint output_fs_pitch = output_z_pitch * (OUTPUT_PAD_BEFORE_SIZE_Z + OUTPUT_SIZE_Z + OUTPUT_PAD_AFTER_SIZE_Z);
|
||||
#else
|
||||
const uint output_z_pitch = 0;
|
||||
const uint output_fs_pitch = output_y_pitch * (OUTPUT_PAD_BEFORE_SIZE_Y + OUTPUT_SIZE_Y + OUTPUT_PAD_AFTER_SIZE_Y);
|
||||
#endif
|
||||
const uint output_total_f_size = OUTPUT_PAD_BEFORE_FEATURE_NUM + OUTPUT_FEATURE_NUM + OUTPUT_PAD_AFTER_FEATURE_NUM;
|
||||
const uint output_fs_pitch = output_y_pitch * (OUTPUT_PAD_BEFORE_SIZE_Y + OUTPUT_SIZE_Y + OUTPUT_PAD_AFTER_SIZE_Y);
|
||||
const uint output_b_pitch = output_fs_pitch * ((output_total_f_size + FEATURE_SLICE_SIZE - 1) / FEATURE_SLICE_SIZE);
|
||||
|
||||
const uint output_fs_pad_before = OUTPUT_PAD_BEFORE_FEATURE_NUM / FEATURE_SLICE_SIZE;
|
||||
|
||||
const uint output_offset = b * output_b_pitch +
|
||||
(f_block + output_fs_pad_before) * output_fs_pitch +
|
||||
(z + OUTPUT_PAD_BEFORE_SIZE_Z) * output_z_pitch +
|
||||
(y + OUTPUT_PAD_BEFORE_SIZE_Y) * output_y_pitch +
|
||||
(x + OUTPUT_PAD_BEFORE_SIZE_X) * output_x_pitch;
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user