[IE CLDNN] Eltwise b_fs_zyx_fsv16 optimization (#5098)

2021-04-26 15:36:38 +03:00 · 2021-04-26 15:36:38 +03:00 · 16d7d01a27
commit 16d7d01a27
parent 9f2a3d0edc
2 changed files with 47 additions and 13 deletions
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv16.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv16.cpp
@ -28,7 +28,9 @@ ParamsKey EltwiseKernel_b_fs_yx_fsv16::GetSupportedKey() const {
    k.EnableOutputDataType(Datatype::INT8);
    k.EnableOutputDataType(Datatype::UINT8);
    k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
+    k.EnableInputLayout(DataLayout::b_fs_zyx_fsv16);
    k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
+    k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv16);
    k.EnableDifferentTypes();
    k.EnableBatching();
    k.EnableTensorPitches();
@ -93,17 +95,26 @@ JitConstants EltwiseKernel_b_fs_yx_fsv16::MakeLoadJitConstants(const eltwise_par
                {
                    if (params.inputs[input.index].LogicalSize() == params.output.Feature().v &&
                        params.inputs[input.index].LogicalSize() == params.inputs[input.index].Feature().v) {
-                        jit.AddConstant(MakeJitConstant(name,
-                                                        "BLOCK_READN(INPUT" + std::to_string(input.index) + "_TYPE, 1, input" + std::to_string(input.index) +
-                                                        ", INPUT"+std::to_string(input.index)+"_GET_INDEX(b, f_block*16, y, x))"));
+                        std::string block_read_str = "BLOCK_READN(INPUT" + std::to_string(input.index) + "_TYPE, " +
+                                                     "1, " +
+                                                     "input" + std::to_string(input.index) +
+                                                     ", INPUT" + std::to_string(input.index);
+                        if (DataTensor::ChannelsCount(params.inputs[input_idx].GetLayout()) == 4) {
+                            jit.AddConstant(MakeJitConstant(name, block_read_str + "_GET_INDEX(b, f_block*16, y, x))"));
+                        } else {
+                            jit.AddConstant(MakeJitConstant(name, block_read_str + "_GET_INDEX(b, f_block*16, z, y, x))"));
+                        }
                    } else if (params.inputs[input.index].LogicalSize() == 1) {
                        jit.AddConstant(MakeJitConstant(name,
                                                        "input" + std::to_string(input.index) +
                                                        "[0]"));
                    } else {
                        const std::string idx_order = "INPUT" + std::to_string(input.index) + "_IDX_ORDER";
-                        jit.AddConstant(MakeJitConstant(idx_order, "b, f_block*16, y, x"));
-
+                        if (DataTensor::ChannelsCount(params.inputs[input_idx].GetLayout()) == 4) {
+                            jit.AddConstant(MakeJitConstant(idx_order, "b, f_block*16, y, x"));
+                        } else {
+                            jit.AddConstant(MakeJitConstant(idx_order, "b, f_block*16, z, y, x"));
+                        }
                        bool feature_broadcasting = (params.inputs[input_idx].Feature().v == 1 && params.output.Feature().v != 1);

                        const std::string block_read_str = "TO_TYPE(MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, BLOCK_SIZE), BLOCK_READN(INPUT" +
@ -194,7 +205,14 @@ JitConstants EltwiseKernel_b_fs_yx_fsv16::GetJitConstants(const eltwise_params&
    if (!params.fused_ops.empty()) {
        kernel_selector::Datatype input_dt = GetAccumulatorType(params);

-        FusedOpsConfiguration conf = {"", {"b", "f_block*16", "y", "x"}, "res", input_dt, blockSize};
+        std::vector<std::string> idx_order;
+        if (DataTensor::ChannelsCount(params.output.GetLayout()) == 4) {
+            idx_order = {"b", "f_block*16", "y", "x"};
+        } else if (DataTensor::ChannelsCount(params.output.GetLayout()) == 5) {
+            idx_order = {"b", "f_block*16", "z", "y", "x"};
+        }
+
+        FusedOpsConfiguration conf = {"", idx_order, "res", input_dt, blockSize};
        conf.load_type = FusedOpsConfiguration::LoadType::LT_ALIGNED_READ;
        conf.vec_axis = Tensor::DataChannelName::X;

@ -230,6 +248,7 @@ bool EltwiseKernel_b_fs_yx_fsv16::Validate(const Params& p, const optional_param

    for (size_t i = 0; i < params.inputs.size(); i++) {
        if ((params.inputs[i].GetLayout() != DataLayout::b_fs_yx_fsv16) &&
+            (params.inputs[i].GetLayout() != DataLayout::b_fs_zyx_fsv16) &&
            !IsBroadcastingPossibleInput(params.inputs[i], params.output)) {
            return false;
        }
@ -277,7 +296,7 @@ EltwiseKernelBase::DispatchData EltwiseKernel_b_fs_yx_fsv16::SetDefault(const el
    DispatchData dispatchData;

    dispatchData.gws[0] = Align(params.output.Feature().v, 16);
-    dispatchData.gws[1] = CeilDiv(params.output.X().v, GetBlockSize(params)) * params.output.Y().v;
+    dispatchData.gws[1] = CeilDiv(params.output.X().v, GetBlockSize(params)) * params.output.Y().v * params.output.Z().v;
    dispatchData.gws[2] = params.output.Batch().v;

    dispatchData.lws[0] = 16;
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/eltwise_b_fs_yx_fsv16.cl
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/eltwise_b_fs_yx_fsv16.cl
@ -34,22 +34,37 @@ KERNEL(eltwise_b_fs_yx_fsv16)(INPUTS_DECLS
 #endif
 )
 {
-    const uint f_block = get_group_id(0);
-    const uint y = (uint)get_global_id(1) / BLOCKS_COUNT;
-    const uint x = ((uint)get_global_id(1) % BLOCKS_COUNT) * BLOCK_SIZE;
-    const uint b = get_global_id(2);
+    const uint f_block = (uint)get_group_id(0);
+    const uint zyx = (uint)get_global_id(1);
+    const uint b = (uint)get_global_id(2);
+    const uint x = (zyx % BLOCKS_COUNT) * BLOCK_SIZE;
+#if OUTPUT_DIMS == 5
+    const uint zy = zyx / BLOCKS_COUNT;
+    const uint z = zy / OUTPUT_SIZE_Y;
+    const uint y = zy % OUTPUT_SIZE_Y;
+#else
+    const uint z = 0;
+    const uint y = zyx / BLOCKS_COUNT;
+#endif

    // Output offset calculations:
    const uint output_x_pitch = FEATURE_SLICE_SIZE;
-    const uint output_y_pitch = output_x_pitch * (OUTPUT_PAD_BEFORE_SIZE_X +  OUTPUT_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X);
+    const uint output_y_pitch = output_x_pitch * (OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X);
+#if OUTPUT_DIMS == 5
+    const uint output_z_pitch = output_y_pitch * (OUTPUT_PAD_BEFORE_SIZE_Y + OUTPUT_SIZE_Y + OUTPUT_PAD_AFTER_SIZE_Y);
+    const uint output_fs_pitch = output_z_pitch * (OUTPUT_PAD_BEFORE_SIZE_Z + OUTPUT_SIZE_Z + OUTPUT_PAD_AFTER_SIZE_Z);
+#else
+    const uint output_z_pitch = 0;
+    const uint output_fs_pitch = output_y_pitch * (OUTPUT_PAD_BEFORE_SIZE_Y + OUTPUT_SIZE_Y + OUTPUT_PAD_AFTER_SIZE_Y);
+#endif
    const uint output_total_f_size = OUTPUT_PAD_BEFORE_FEATURE_NUM + OUTPUT_FEATURE_NUM + OUTPUT_PAD_AFTER_FEATURE_NUM;
-    const uint output_fs_pitch = output_y_pitch * (OUTPUT_PAD_BEFORE_SIZE_Y +  OUTPUT_SIZE_Y + OUTPUT_PAD_AFTER_SIZE_Y);
    const uint output_b_pitch = output_fs_pitch * ((output_total_f_size + FEATURE_SLICE_SIZE - 1) / FEATURE_SLICE_SIZE);

    const uint output_fs_pad_before = OUTPUT_PAD_BEFORE_FEATURE_NUM / FEATURE_SLICE_SIZE;

    const uint output_offset = b * output_b_pitch +
                               (f_block + output_fs_pad_before) * output_fs_pitch +
+                               (z + OUTPUT_PAD_BEFORE_SIZE_Z) * output_z_pitch +
                               (y + OUTPUT_PAD_BEFORE_SIZE_Y) * output_y_pitch +
                               (x + OUTPUT_PAD_BEFORE_SIZE_X) * output_x_pitch;