[GPU] Fixed friendly name of out transpose, improve Pad performance (#8546)

* Fixed friendly names in post-processing nodes
* [GPU] Added fsv16 support for pad operation

Parent: 5352c2b370
Commit: 3b34f09a9b
@@ -6,6 +6,7 @@
 #include "ngraph/ops.hpp"
 #include "ngraph_ops/nms_ie_internal.hpp"
 #include "cldnn_itt.h"
+#include "cldnn/runtime/debug_configuration.hpp"
 
 using namespace InferenceEngine;
 using namespace InferenceEngine::details;
@@ -231,6 +232,12 @@ void Program::CreateSingleLayerPrimitive(cldnn::topology& topology, const std::s
     OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "Program::CreateSingleLayerPrimitive");
     InitProfileInfo(op->get_friendly_name(), op->get_type_name());
 
+    GPU_DEBUG_GET_INSTANCE(debug_config);
+    GPU_DEBUG_IF(debug_config->verbose >= 2) {
+        GPU_DEBUG_COUT << "Process " << "op::v" << op->get_type_info().version << "::" << op->get_type_name() << " operation "
+                       << "(friendly_name=" << op->get_friendly_name() << ")" << std::endl;
+    }
+
     bool is_created = false;
     const ngraph::NodeTypeInfo* op_type_info = &op->get_type_info();
     while (op_type_info != nullptr) {
@@ -251,8 +258,8 @@ void Program::CreateSingleLayerPrimitive(cldnn::topology& topology, const std::s
 
     if (!is_created) {
         IE_THROW() << "Operation: " << op->get_friendly_name()
-               << " of type " << op->get_type_name()
-               << "(op::v" << op->get_type_info().version << ") is not supported";
+                   << " of type " << op->get_type_name()
+                   << "(op::v" << op->get_type_info().version << ") is not supported";
     }
 }
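For context on the logging added above: GPU_DEBUG_GET_INSTANCE / GPU_DEBUG_IF / GPU_DEBUG_COUT come from the debug_configuration header included in this hunk and gate extra tracing behind a verbosity level read once at startup. A minimal standalone sketch of that pattern (not the plugin's actual implementation; the OV_GPU_Verbose environment-variable name is an assumption):

#include <cstdlib>
#include <iostream>
#include <string>

struct debug_configuration_sketch {
    int verbose = 0;  // 0 = silent, higher values enable more tracing

    static const debug_configuration_sketch* get_instance() {
        static const debug_configuration_sketch cfg = [] {
            debug_configuration_sketch c;
            if (const char* v = std::getenv("OV_GPU_Verbose"))  // env var name assumed
                c.verbose = std::stoi(v);
            return c;
        }();
        return &cfg;
    }
};

int main() {
    const auto* debug_config = debug_configuration_sketch::get_instance();
    // Mirrors GPU_DEBUG_IF(debug_config->verbose >= 2) { GPU_DEBUG_COUT << ...; }
    if (debug_config->verbose >= 2)
        std::cout << "Process op::v1::Pad operation (friendly_name=pad)" << std::endl;
    return 0;
}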
@@ -23,12 +23,16 @@ ParamsKey BorderKernelRef::GetSupportedKey() const {
     k.EnableInputLayout(DataLayout::byxf);
     k.EnableInputLayout(DataLayout::bfzyx);
     k.EnableInputLayout(DataLayout::bfwzyx);
+    k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
+    k.EnableInputLayout(DataLayout::b_fs_zyx_fsv16);
 
     k.EnableOutputLayout(DataLayout::bfyx);
     k.EnableOutputLayout(DataLayout::yxfb);
     k.EnableOutputLayout(DataLayout::byxf);
     k.EnableOutputLayout(DataLayout::bfzyx);
     k.EnableOutputLayout(DataLayout::bfwzyx);
+    k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
+    k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv16);
 
     k.EnableTensorOffset();
     k.EnableTensorPitches();
@@ -5,10 +5,35 @@
 #include "include/batch_headers/data_types.cl"
 #include "include/batch_headers/fetch_data.cl"
 
+inline uint FUNC(get_input_index)(uint b, uint f, uint w, uint z, uint y, uint x)
+{
+#if INPUT0_DIMS < 5
+    return INPUT0_GET_INDEX(b, f, y, x);
+#elif INPUT0_DIMS == 5
+    return INPUT0_GET_INDEX(b, f, z, y, x);
+#elif INPUT0_DIMS == 6
+    return INPUT0_GET_INDEX(b, f, w, z, y, x);
+#else
+#error [clDNN border_gpu_ref.cl]: input format - not supported
+#endif
+}
+
+inline uint FUNC(get_output_index)(uint b, uint f, uint w, uint z, uint y, uint x)
+{
+#if OUTPUT_DIMS < 5
+    return OUTPUT_GET_INDEX(b, f, y, x);
+#elif OUTPUT_DIMS == 5
+    return OUTPUT_GET_INDEX(b, f, z, y, x);
+#elif OUTPUT_DIMS == 6
+    return OUTPUT_GET_INDEX(b, f, w, z, y, x);
+#else
+#error [clDNN border_gpu_ref.cl]: output format - not supported
+#endif
+}
+
 KERNEL(border_gpu_ref)(
-    const __global UNIT_TYPE* input,
-    __global UNIT_TYPE* output)
+    const __global INPUT0_TYPE* input,
+    __global OUTPUT_TYPE* output)
 {
     // [CONSTEXPR]
     // Border sizes (left-top set and right-bottom set):
@@ -72,7 +97,7 @@ KERNEL(border_gpu_ref)(
     const uint out_w = out_yw / OUTPUT_SIZE_Y;
 
 #ifdef BORDER_TYPE_CONSTANT
-    UNIT_TYPE in_val = TO_UNIT_TYPE(BORDER_VALUE);
+    INPUT0_TYPE in_val = TO_INPUT0_TYPE(BORDER_VALUE);
 
     if (out_x >= blt_sx & out_x < in_lx &
         out_y >= blt_sy & out_y < in_ly &
@@ -88,7 +113,7 @@ KERNEL(border_gpu_ref)(
         const uint in_f = out_f - blt_sf;
         const uint in_b = out_b - blt_sb;
 
-        const uint in_pos = GET_DATA_INDEX_6D(INPUT0, in_b, in_f, in_w, in_z, in_y, in_x);
+        const uint in_pos = FUNC_CALL(get_input_index)(in_b, in_f, in_w, in_z, in_y, in_x);
         in_val = input[in_pos];
     }
 #elif defined BORDER_TYPE_EDGE
@@ -99,8 +124,8 @@ KERNEL(border_gpu_ref)(
     const uint in_f = (out_f >= blt_sf & out_f < in_lf) ? out_f - blt_sf : (out_f < blt_sf ? 0 : in_sf - 1);
     const uint in_b = (out_b >= blt_sb & out_b < in_lb) ? out_b - blt_sb : (out_b < blt_sb ? 0 : in_sb - 1);
 
-    const uint in_pos = GET_DATA_INDEX_6D(INPUT0, in_b, in_f, in_w, in_z, in_y, in_x);
-    UNIT_TYPE in_val = input[in_pos];
+    const uint in_pos = FUNC_CALL(get_input_index)(in_b, in_f, in_w, in_z, in_y, in_x);
+    INPUT0_TYPE in_val = input[in_pos];
 #elif defined BORDER_TYPE_MIRROR
     const uint in_x = (out_x >= blt_sx & out_x < in_lx) ? out_x - blt_sx : (out_x < blt_sx ? blt_sx - 1 - out_x : in_sx + in_lx - 1 - out_x);
     const uint in_y = (out_y >= blt_sy & out_y < in_ly) ? out_y - blt_sy : (out_y < blt_sy ? blt_sy - 1 - out_y : in_sy + in_ly - 1 - out_y);
@@ -109,8 +134,8 @@ KERNEL(border_gpu_ref)(
     const uint in_f = (out_f >= blt_sf & out_f < in_lf) ? out_f - blt_sf : (out_f < blt_sf ? blt_sf - 1 - out_f : in_sf + in_lf - 1 - out_f);
     const uint in_b = (out_b >= blt_sb & out_b < in_lb) ? out_b - blt_sb : (out_b < blt_sb ? blt_sb - 1 - out_b : in_sb + in_lb - 1 - out_b);
 
-    const uint in_pos = GET_DATA_INDEX_6D(INPUT0, in_b, in_f, in_w, in_z, in_y, in_x);
-    UNIT_TYPE in_val = input[in_pos];
+    const uint in_pos = FUNC_CALL(get_input_index)(in_b, in_f, in_w, in_z, in_y, in_x);
+    INPUT0_TYPE in_val = input[in_pos];
 #elif defined BORDER_TYPE_MIRROR_101
     const uint in_x = (out_x >= blt_sx & out_x < in_lx) ? out_x - blt_sx : (out_x < blt_sx ? blt_sx - out_x : in_sx + in_lx - 2 - out_x);
     const uint in_y = (out_y >= blt_sy & out_y < in_ly) ? out_y - blt_sy : (out_y < blt_sy ? blt_sy - out_y : in_sy + in_ly - 2 - out_y);
@@ -119,12 +144,12 @@ KERNEL(border_gpu_ref)(
     const uint in_f = (out_f >= blt_sf & out_f < in_lf) ? out_f - blt_sf : (out_f < blt_sf ? blt_sf - out_f : in_sf + in_lf - 2 - out_f);
     const uint in_b = (out_b >= blt_sb & out_b < in_lb) ? out_b - blt_sb : (out_b < blt_sb ? blt_sb - out_b : in_sb + in_lb - 2 - out_b);
 
-    const uint in_pos = GET_DATA_INDEX_6D(INPUT0, in_b, in_f, in_w, in_z, in_y, in_x);
-    UNIT_TYPE in_val = input[in_pos];
+    const uint in_pos = FUNC_CALL(get_input_index)(in_b, in_f, in_w, in_z, in_y, in_x);
+    INPUT0_TYPE in_val = input[in_pos];
 #else
 #error Unsupported border type.
 #endif
 
-    const uint out_pos = GET_DATA_INDEX_6D(OUTPUT, out_b, out_f, out_w, out_z, out_y, out_x);
+    const uint out_pos = FUNC_CALL(get_output_index)(out_b, out_f, out_w, out_z, out_y, out_x);
     output[out_pos] = in_val;
 }
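The main functional change in this kernel is replacing the generic GET_DATA_INDEX_6D linearization with the layout-aware INPUT0_GET_INDEX / OUTPUT_GET_INDEX macros, which is what makes blocked layouts such as b_fs_yx_fsv16 work: features are stored in groups of 16 as the innermost dimension, so a plain b-f-y-x offset would address the wrong element. A simplified host-side illustration of the difference (padding ignored; the helper below is illustrative, not the JIT-generated macro):

#include <cstddef>
#include <iostream>

// Plain bfyx linearization: x is innermost.
static std::size_t bfyx_offset(std::size_t b, std::size_t f, std::size_t y, std::size_t x,
                               std::size_t F, std::size_t Y, std::size_t X) {
    return ((b * F + f) * Y + y) * X + x;
}

// b_fs_yx_fsv16: features are split into blocks of 16 and the block index goes
// between b and y, while f % 16 becomes the innermost dimension.
static std::size_t b_fs_yx_fsv16_offset(std::size_t b, std::size_t f, std::size_t y, std::size_t x,
                                        std::size_t F, std::size_t Y, std::size_t X) {
    const std::size_t fsv = 16;
    const std::size_t f_blocks = (F + fsv - 1) / fsv;  // feature dim rounded up to blocks of 16
    return (((b * f_blocks + f / fsv) * Y + y) * X + x) * fsv + f % fsv;
}

int main() {
    // The same logical element (b=0, f=17, y=1, x=2) lands at different offsets.
    std::cout << bfyx_offset(0, 17, 1, 2, 32, 3, 4) << "\n";           // 210
    std::cout << b_fs_yx_fsv16_offset(0, 17, 1, 2, 32, 3, 4) << "\n";  // 289
    return 0;
}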
inference-engine/thirdparty/clDNN/src/border.cpp (vendored, 11 lines changed)
@@ -87,23 +87,12 @@ std::string border_inst::to_string(border_node const& node) {
 border_inst::typed_primitive_inst(network& network, border_node const& node) : parent(network, node) {
     auto input_layout = node.input().get_output_layout();
 
-    const auto input_format = input_layout.format;
     const auto& input_sizes = input_layout.size;
 
     auto lt_sizes = argument.left_top_sizes.sub(tensor(0));
     auto rb_sizes = argument.right_bottom_sizes.sub(tensor(0));
     auto b_type = argument.type;
 
-    CLDNN_ERROR_NOT_PROPER_FORMAT(node.id(),
-                                  "Input format",
-                                  input_format.value,
-                                  "supported border primitive input formats",
-                                  format::bfyx,
-                                  format::yxfb,
-                                  format::byxf,
-                                  format::bfzyx,
-                                  format::bfwzyx);
-
     tensor null_tensor = tensor(0);
 
     // Check if sizes of border are in proper range.
@@ -72,22 +72,36 @@ attach_border_impl::attach_border_impl() {
         std::make_tuple(data_types::f16, format::yxfb),
         std::make_tuple(data_types::i8, format::yxfb),
         std::make_tuple(data_types::u8, format::yxfb),
 
         std::make_tuple(data_types::f32, format::bfyx),
         std::make_tuple(data_types::f16, format::bfyx),
         std::make_tuple(data_types::i8, format::bfyx),
         std::make_tuple(data_types::u8, format::bfyx),
 
         std::make_tuple(data_types::f32, format::byxf),
         std::make_tuple(data_types::f16, format::byxf),
         std::make_tuple(data_types::i8, format::byxf),
         std::make_tuple(data_types::u8, format::byxf),
 
         std::make_tuple(data_types::f32, format::bfzyx),
         std::make_tuple(data_types::f16, format::bfzyx),
         std::make_tuple(data_types::i8, format::bfzyx),
         std::make_tuple(data_types::u8, format::bfzyx),
 
         std::make_tuple(data_types::f32, format::bfwzyx),
         std::make_tuple(data_types::f16, format::bfwzyx),
         std::make_tuple(data_types::i8, format::bfwzyx),
         std::make_tuple(data_types::u8, format::bfwzyx),
 
+        std::make_tuple(data_types::f32, format::b_fs_yx_fsv16),
+        std::make_tuple(data_types::f16, format::b_fs_yx_fsv16),
+        std::make_tuple(data_types::i8, format::b_fs_yx_fsv16),
+        std::make_tuple(data_types::u8, format::b_fs_yx_fsv16),
+
+        std::make_tuple(data_types::f32, format::b_fs_zyx_fsv16),
+        std::make_tuple(data_types::f16, format::b_fs_zyx_fsv16),
+        std::make_tuple(data_types::i8, format::b_fs_zyx_fsv16),
+        std::make_tuple(data_types::u8, format::b_fs_zyx_fsv16),
     });
 }
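The tuples above register the (data type, format) combinations the OCL border implementation accepts, now including the fsv16 variants. Conceptually the selection is a keyed lookup; a hypothetical condensed model of that idea (not the actual cldnn implementation_map API):

#include <set>
#include <string>
#include <utility>

enum class data_types { f32, f16, i8, u8 };

struct impl_registry_sketch {
    // One entry per supported (data type, format) pair.
    std::set<std::pair<data_types, std::string>> keys;

    void add(data_types dt, std::string fmt) { keys.emplace(dt, std::move(fmt)); }

    bool supported(data_types dt, const std::string& fmt) const {
        return keys.count({dt, fmt}) != 0;
    }
};

int main() {
    impl_registry_sketch border_impl;
    for (auto dt : {data_types::f32, data_types::f16, data_types::i8, data_types::u8}) {
        border_impl.add(dt, "b_fs_yx_fsv16");   // newly registered pairs in this commit
        border_impl.add(dt, "b_fs_zyx_fsv16");
    }
    // A border primitive on an fsv16 tensor can now be dispatched to this implementation.
    return border_impl.supported(data_types::f16, "b_fs_yx_fsv16") ? 0 : 1;
}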
@@ -47,6 +47,7 @@
 #include "lstm_gemm_inst.h"
 #include "mutable_data_inst.h"
 #include "pooling_inst.h"
+#include "border_inst.h"
 #include "primitive_inst.h"
 #include "prior_box_inst.h"
 #include "proposal_inst.h"
@@ -1295,6 +1296,7 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) {
         prim.type() != cldnn::input_layout::type_id() &&
         prim.type() != cldnn::softmax::type_id() &&
         prim.type() != cldnn::prior_box::type_id() &&
+        prim.type() != cldnn::border::type_id() &&
         prim.type() != cldnn::resample::type_id() &&
         prim.type() != cldnn::crop::type_id() &&
         prim.type() != cldnn::scale::type_id() &&
@@ -106,6 +106,84 @@ TEST(border_gpu, basic_yxfb_0x0x1x2_0x0x3x4_border_constant) {
     }
 }
 
+TEST(border_gpu, basic_fsv16_0x0x1x2_0x0x3x4_border_constant) {
+    //  Input (XY) : 4x3
+    //  Output (XY): 10x7
+
+    constexpr auto in_size_b = 1;
+    constexpr auto in_size_f = 1;
+    constexpr auto in_size_y = 3;
+    constexpr auto in_size_x = 4;
+
+    constexpr auto blt_size_b = 0;
+    constexpr auto blt_size_f = 0;
+    constexpr auto blt_size_y = 1;
+    constexpr auto blt_size_x = 2;
+
+    constexpr auto brb_size_b = 0;
+    constexpr auto brb_size_f = 0;
+    constexpr auto brb_size_y = 3;
+    constexpr auto brb_size_x = 4;
+
+    constexpr auto out_size_b = in_size_b + blt_size_b + brb_size_b;
+    constexpr auto out_size_f = in_size_f + blt_size_f + brb_size_f;
+    constexpr auto out_size_y = in_size_y + blt_size_y + brb_size_y;
+    constexpr auto out_size_x = in_size_x + blt_size_x + brb_size_x;
+
+    auto& engine = get_test_engine();
+    auto input = engine.allocate_memory({data_types::f32, format::yxfb, {in_size_b, in_size_f, in_size_x, in_size_y}});
+
+    topology topology;
+    topology.add(
+        input_layout("input", input->get_layout())
+    );
+    topology.add(
+        reorder("border_input", "input", cldnn::format::b_fs_yx_fsv16, cldnn::data_types::f32),
+        border("border", "border_input",
+               {blt_size_b, blt_size_f, blt_size_x, blt_size_y},
+               {brb_size_b, brb_size_f, brb_size_x, brb_size_y},
+               border_type::constant, 0.0f),
+        reorder("output", "border", cldnn::format::yxfb, cldnn::data_types::f32)
+    );
+
+    std::vector<float> input_data = {
+        1, -2, 3, -4,
+        5, 6, 7, 8,
+        -10, 12, 13, -13,
+    };
+    std::vector<float> out_data = {
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 1, -2, 3, -4, 0, 0, 0, 0,
+        0, 0, 5, 6, 7, 8, 0, 0, 0, 0,
+        0, 0, -10, 12, 13, -13, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    };
+    set_values(input, input_data);
+
+    cldnn::network network(engine, topology);
+    network.set_input_data("input", input);
+    auto outputs = network.execute();
+
+    auto output = outputs.at("output").get_memory();
+    cldnn::mem_lock<float> output_ptr(output, get_test_stream());
+
+    ASSERT_EQ(out_data.size(), static_cast<std::size_t>(out_size_b * out_size_f * out_size_y * out_size_x));
+
+    for (auto b = 0; b < out_size_b; ++b) {             // B
+        for (auto f = 0; f < out_size_f; ++f) {         // F
+            for (auto y = 0; y < out_size_y; ++y) {     // Y
+                for (auto x = 0; x < out_size_x; ++x) { // X
+                    auto output_off = ((y * out_size_x + x) * out_size_f + f) * out_size_b + b; // YXFB
+
+                    EXPECT_EQ(output_ptr[output_off], out_data[output_off]);
+                }
+            }
+        }
+    }
+}
+
 TEST(border_gpu, basic_bfzyx_0x0x1x01_0x0x0x0x3_border_constant) {
 
     constexpr auto in_size_b = 1;
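The verification loop in the new test reads the yxfb-reordered output with a hand-computed linear offset; restated on its own for clarity (y outermost, b innermost, matching output_off above):

#include <cassert>
#include <cstddef>

// Linear offset for a yxfb-ordered buffer of shape Y x X x F x B.
static std::size_t yxfb_offset(std::size_t b, std::size_t f, std::size_t y, std::size_t x,
                               std::size_t B, std::size_t F, std::size_t X) {
    return ((y * X + x) * F + f) * B + b;
}

int main() {
    // Matches output_off in the test for out_size_b = 1, out_size_f = 1, out_size_x = 10:
    // row 1, column 2 is the 13th element (index 12) of the flattened expected output.
    assert(yxfb_offset(0, 0, 1, 2, 1, 1, 10) == 12);
    return 0;
}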
@@ -542,10 +542,12 @@ std::shared_ptr<Function> PrePostProcessor::build(const std::shared_ptr<Function
         }
         // Apply post-processing
         node = result->get_input_source_output(0);
+        bool post_processing_applied = false;
         if (output->m_postprocess) {
             for (const auto& action : output->m_postprocess->actions()) {
                 auto action_result = action({node}, context);
                 node = std::get<0>(action_result);
+                post_processing_applied = true;
             }
         }
         // Implicit: Convert element type + layout to user's tensor implicitly
@@ -561,10 +563,18 @@ std::shared_ptr<Function> PrePostProcessor::build(const std::shared_ptr<Function
         for (const auto& action : implicit_steps.actions()) {
             auto action_result = action({node}, context);
             node = std::get<0>(action_result);
+            post_processing_applied = true;
         }
         node.get_node_shared_ptr()->set_friendly_name(
             result->get_input_source_output(0).get_node_shared_ptr()->get_friendly_name());
 
+        // Reset friendly name of the input node to avoid a name collision
+        // when there is at least one new node inserted by the post-processing steps.
+        // If no new nodes are inserted by post-processing, we need to preserve the friendly name of the input,
+        // as it is required for the old API to work correctly.
+        if (post_processing_applied)
+            result->get_input_source_output(0).get_node_shared_ptr()->set_friendly_name("");
+
         // Create result
         auto new_result = std::make_shared<ov::op::v0::Result>(node);
         new_result->set_friendly_name(result->get_friendly_name());
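The friendly-name handling above boils down to: the node that now feeds the Result inherits the original producer's friendly name, and the producer's own name is cleared only when post-processing actually inserted new nodes. A condensed, hypothetical restatement with a simplified stand-in type (not the real ov::Node / preprocess API):

#include <cassert>
#include <string>

struct Node { std::string friendly_name; };

void fix_output_names(Node& original_producer, Node& last_postproc_node,
                      bool post_processing_applied) {
    // The node now feeding the Result takes over the original producer's name,
    // so lookups by friendly name keep resolving to the "same" output.
    last_postproc_node.friendly_name = original_producer.friendly_name;
    // Clear the producer's name only if post-processing really inserted nodes;
    // otherwise producer and last node are the same node and the name must stay
    // for the old API to keep working.
    if (post_processing_applied)
        original_producer.friendly_name.clear();
}

int main() {
    Node producer{"conv_out"}, inserted{"convert"};
    fix_output_names(producer, inserted, /*post_processing_applied=*/true);
    assert(inserted.friendly_name == "conv_out" && producer.friendly_name.empty());
    return 0;
}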