From ebbf5e3f10ce3d5bc1b8f451c23b6d311e7502bc Mon Sep 17 00:00:00 2001
From: OlehKravchyshyn <86965088+OlehKravchyshyn@users.noreply.github.com>
Date: Tue, 1 Nov 2022 02:50:10 +0200
Subject: [PATCH] [GPU] add blocked format to experimental detectron detection
 output (#12561)

* [GPU] add blocked format to experimental detectron detection output

* [GPU] add this kernel in whitelist of program.cpp

Co-authored-by: ozhydkov-lohika <ozhydkov@lohika.com>
---
 ...xperimental_detectron_detection_output.cpp |   2 +-
 ...xperimental_detectron_detection_output.cpp |  15 ++-
 .../src/graph/impls/ocl/register.hpp          |   1 +
 src/plugins/intel_gpu/src/graph/program.cpp   |   4 +-
 ...rimental_detectron_detection_output_ref.cl |  62 +++++++++++
 .../ed_do/detection_output_kernel_ref.cpp     |  10 +-
 ...al_detectron_detection_output_gpu_test.cpp | 105 +++++++++++++-----
 7 files changed, 160 insertions(+), 39 deletions(-)
diff --git a/src/plugins/intel_gpu/src/graph/experimental_detectron_detection_output.cpp b/src/plugins/intel_gpu/src/graph/experimental_detectron_detection_output.cpp
index 2a1c390687e..daaa48fc4c5 100644
--- a/src/plugins/intel_gpu/src/graph/experimental_detectron_detection_output.cpp
+++ b/src/plugins/intel_gpu/src/graph/experimental_detectron_detection_output.cpp
@@ -20,7 +20,7 @@ layout experimental_detectron_detection_output_inst::calc_output_layout(
     const layout data_layout = impl_param.get_input_layout();
     auto desc = impl_param.typed_desc<experimental_detectron_detection_output>();
 
-    return layout(data_layout.data_type, format::bfyx, {static_cast<int>(desc->max_detections_per_image), 4, 1, 1});
+    return layout(data_layout.data_type, data_layout.format, {static_cast<int>(desc->max_detections_per_image), 4, 1, 1});
 }
 
 std::string experimental_detectron_detection_output_inst::to_string(
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/experimental_detectron_detection_output.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/experimental_detectron_detection_output.cpp
index 8a8cc45c065..b148716df33 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/experimental_detectron_detection_output.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/experimental_detectron_detection_output.cpp
@@ -70,10 +70,17 @@ public:
 
 namespace detail {
 attach_experimental_detectron_detection_output_impl::attach_experimental_detectron_detection_output_impl() {
-    implementation_map<experimental_detectron_detection_output>::add(
-        impl_types::ocl,
-        experimental_detectron_detection_output_impl::create,
-        {std::make_tuple(data_types::f16, format::bfyx), std::make_tuple(data_types::f32, format::bfyx)});
+  const std::vector<data_types> types {data_types::f16, data_types::f32};
+  const std::vector<format::type> formats = {format::bfyx,
+                        format::b_fs_yx_fsv16,
+                        format::b_fs_yx_fsv32,
+                        format::bs_fs_yx_bsv16_fsv16,
+                        format::bs_fs_yx_bsv32_fsv32,
+                        format::bs_fs_yx_bsv32_fsv16};
+
+  implementation_map<experimental_detectron_detection_output>::add(
+      impl_types::ocl, experimental_detectron_detection_output_impl::create,
+      types, formats);
 }
 }  // namespace detail
 }  // namespace ocl
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp
index a483a10ca16..ca6372ce61a 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp
@@ -25,6 +25,7 @@
 #include "intel_gpu/primitives/depth_to_space.hpp"
 #include "intel_gpu/primitives/detection_output.hpp"
 #include "intel_gpu/primitives/eltwise.hpp"
+#include "intel_gpu/primitives/experimental_detectron_detection_output.hpp"
 #include "intel_gpu/primitives/experimental_detectron_prior_grid_generator.hpp"
 #include "intel_gpu/primitives/experimental_detectron_roi_feature_extractor.hpp"
 #include "intel_gpu/primitives/experimental_detectron_topk_rois.hpp"
diff --git a/src/plugins/intel_gpu/src/graph/program.cpp b/src/plugins/intel_gpu/src/graph/program.cpp
index 93ef1c27b0a..a643ff7ed53 100644
--- a/src/plugins/intel_gpu/src/graph/program.cpp
+++ b/src/plugins/intel_gpu/src/graph/program.cpp
@@ -1451,7 +1451,8 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) {
             prim.type() != cldnn::generate_proposals::type_id() &&
             prim.type() != cldnn::reverse::type_id() &&
             prim.type() != cldnn::reorg_yolo::type_id() &&
-            prim.type() != cldnn::scatter_elements_update::type_id()) {
+            prim.type() != cldnn::scatter_elements_update::type_id() &&
+            prim.type() != cldnn::experimental_detectron_detection_output::type_id()) {
             can_use_fsv16 = false;
         }
 
@@ -1494,6 +1495,7 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) {
             prim.type() != cldnn::reverse::type_id() &&
             prim.type() != cldnn::reorg_yolo::type_id() &&
             prim.type() != cldnn::scatter_elements_update::type_id() &&
+            prim.type() != cldnn::experimental_detectron_detection_output::type_id() &&
             prim.type() != cldnn::deconvolution::type_id()) {
             can_use_bs_fs_yx_bsv16_fsv16 = false;
         }
diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/experimental_detectron_detection_output_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/experimental_detectron_detection_output_ref.cl
index 00eb0e71a5c..58dfdda19e7 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/experimental_detectron_detection_output_ref.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/experimental_detectron_detection_output_ref.cl
@@ -138,7 +138,15 @@ KERNEL(eddo_ref_stage_0)
     size_t class_idx = get_global_id(1);
 #endif
 
+#ifdef USE_BLOCKED_FORMAT
+    INPUT_TYPE4 box;
+    box[0] = boxes[INPUT0_GET_INDEX(roi_idx, 0, 0, 0)];
+    box[1] = boxes[INPUT0_GET_INDEX(roi_idx, 1, 0, 0)];
+    box[2] = boxes[INPUT0_GET_INDEX(roi_idx, 2, 0, 0)];
+    box[3] = boxes[INPUT0_GET_INDEX(roi_idx, 3, 0, 0)];
+#else
     INPUT_TYPE4 box = vload4(roi_idx, boxes);
+#endif
 
     if (any(islessequal(box.hi - box.lo, ZERO2))) {
         const int refined_offset = roi_count * class_idx + roi_idx;
@@ -152,7 +160,16 @@ KERNEL(eddo_ref_stage_0)
         // center location of box
         const INPUT_TYPE2 center = box.lo + HALF_ONE * box_size;
 
+#ifdef USE_BLOCKED_FORMAT
+        INPUT_TYPE4 delta;
+        delta[0] = deltas[INPUT1_GET_INDEX(roi_idx, class_idx * 4, 0, 0)];
+        delta[1] = deltas[INPUT1_GET_INDEX(roi_idx, class_idx * 4 + 1, 0, 0)];
+        delta[2] = deltas[INPUT1_GET_INDEX(roi_idx, class_idx * 4 + 2, 0, 0)];
+        delta[3] = deltas[INPUT1_GET_INDEX(roi_idx, class_idx * 4 + 3, 0, 0)];
+        delta  = delta / DELTA_WEIGHTS;
+#else
         const INPUT_TYPE4 delta = vload4(offset, deltas) / DELTA_WEIGHTS;
+#endif
 
         // new center location according to deltas (dx, dy)
         const INPUT_TYPE2 new_center = delta.lo * box_size + center;
@@ -164,7 +181,15 @@ KERNEL(eddo_ref_stage_0)
             (INPUT_TYPE4)(new_center - HALF_ONE * new_size, new_center + HALF_ONE * new_size - COORDINATE_OFFSET);
 
         // adjust new corner locations to be within the image region
+#ifdef USE_BLOCKED_FORMAT
+        INPUT_TYPE2 img_size;
+        size_t img_idx1 = INPUT3_GET_INDEX(0, 1, 0, 0);
+        size_t img_idx0 = INPUT3_GET_INDEX(0, 0, 0, 0);
+        img_size[0] = im_info[img_idx1];
+        img_size[1] = im_info[img_idx0];
+#else
         const INPUT_TYPE2 img_size = vload2(0, im_info).s10;
+#endif
         new_box = fmax(new_box, ZERO4);
 
         // recompute new width & height
@@ -173,7 +198,13 @@ KERNEL(eddo_ref_stage_0)
         const int refined_offset = roi_count * class_idx + roi_idx;
         vstore4(new_box, refined_offset, refined_boxes);
         refined_box_areas[refined_offset] = new_box_size.x * new_box_size.y;
+
+#ifdef USE_BLOCKED_FORMAT
+        const int scores_offset = INPUT2_GET_INDEX(roi_idx, class_idx, 0, 0);
+        refined_scores[refined_offset] = scores[scores_offset];
+#else
         refined_scores[refined_offset] = scores[offset];
+#endif
     }
 }
 
@@ -295,17 +326,48 @@ KERNEL(eddo_ref_stage_3)
  __global OUTPUT_TYPE* output_scores) {
     size_t i = get_global_id(0);
 
+#ifdef USE_BLOCKED_FORMAT
+    size_t idx0 = OUTPUT_GET_INDEX(i, 0, 0, 0);
+    size_t idx1 = OUTPUT_GET_INDEX(i, 1, 0, 0);
+    size_t idx2 = OUTPUT_GET_INDEX(i, 2, 0, 0);
+    size_t idx3 = OUTPUT_GET_INDEX(i, 3, 0, 0);
+
+    size_t idx_i4 = INPUT4_GET_INDEX(i, 0, 0, 0);
+    size_t idx_i5 = INPUT5_GET_INDEX(i, 0, 0, 0);
+#endif
     if (i < *detection_count) {
         OUTPUT_TYPE score = score_class_index_map[i].score;
         OUTPUT_INDICES_TYPE cls = score_class_index_map[i].class_idx;
         OUTPUT_INDICES_TYPE idx = score_class_index_map[i].box_idx;
+
+#ifdef USE_BLOCKED_FORMAT
+        INPUT_TYPE4 res = vload4(ROI_COUNT * cls + idx, refined_boxes);
+
+        output_boxes[idx0] = res[0];
+        output_boxes[idx1] = res[1];
+        output_boxes[idx2] = res[2];
+        output_boxes[idx3] = res[3];
+        output_scores[idx_i4] = score;
+        output_classes[idx_i5] = cls;
+#else
         vstore4(vload4(ROI_COUNT * cls + idx, refined_boxes), i, output_boxes);
         output_scores[i] = score;
         output_classes[i] = cls;
+#endif
     } else {
+
+#ifdef USE_BLOCKED_FORMAT
+        output_boxes[idx0] = ZERO;
+        output_boxes[idx1] = ZERO;
+        output_boxes[idx2] = ZERO;
+        output_boxes[idx3] = ZERO;
+        output_scores[idx_i4] = ZERO;
+        output_classes[idx_i5] = 0;
+#else
         vstore4(ZERO4, i, output_boxes);
         output_scores[i] = ZERO;
         output_classes[i] = 0;
+#endif
     }
 }
 
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/ed_do/detection_output_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/ed_do/detection_output_kernel_ref.cpp
index c74236012f0..6d5da7e8ef8 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/ed_do/detection_output_kernel_ref.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/ed_do/detection_output_kernel_ref.cpp
@@ -21,10 +21,12 @@ ParamsKey ExperimentalDetectronDetectionOutputKernelRef::GetSupportedKey() const
     k.EnableOutputDataType(Datatype::F32);
     k.EnableOutputDataType(Datatype::INT32);
     k.EnableOutputDataType(Datatype::INT64);
-    k.EnableInputLayout(DataLayout::bfyx);
-    k.EnableOutputLayout(DataLayout::bfyx);
+
+    k.EnableAllInputLayout();
+    k.EnableAllOutputLayout();
     k.EnableBatching();
     k.EnableDifferentTypes();
+    k.EnableTensorPitches();
     return k;
 }
 
@@ -82,7 +84,9 @@ JitConstants ExperimentalDetectronDetectionOutputKernelRef::GetJitConstants(
     if (params.class_agnostic_box_regression) {
         jit.AddConstant(MakeJitConstant("CLASS_AGNOSTIC_BOX_REGRESSION", true));
     }
-
+    if (!SimpleLayout(params.inputs[0].GetLayout())) {
+        jit.AddConstant(MakeJitConstant("USE_BLOCKED_FORMAT", true));
+    }
     return jit;
 }
 
diff --git a/src/plugins/intel_gpu/tests/test_cases/experimental_detectron_detection_output_gpu_test.cpp b/src/plugins/intel_gpu/tests/test_cases/experimental_detectron_detection_output_gpu_test.cpp
index 8f4fde1f11e..ac1bedcb433 100644
--- a/src/plugins/intel_gpu/tests/test_cases/experimental_detectron_detection_output_gpu_test.cpp
+++ b/src/plugins/intel_gpu/tests/test_cases/experimental_detectron_detection_output_gpu_test.cpp
@@ -57,13 +57,18 @@ struct ExperimentalDetectronDetectionOutputParams {
     std::vector<T> expected_scores;
 };
 
+template <typename T>
+using ExperimentalDetectronDetectionOutputParamsWithLayout =
+        std::tuple<ExperimentalDetectronDetectionOutputParams<T>, format::type>;
+
 template <typename T>
 struct experimental_detectron_detection_output_test
-    : public ::testing::TestWithParam<ExperimentalDetectronDetectionOutputParams<T>> {
+    : public ::testing::TestWithParam<ExperimentalDetectronDetectionOutputParamsWithLayout<T>> {
 public:
     void test() {
-        const ExperimentalDetectronDetectionOutputParams<T> param =
-            testing::TestWithParam<ExperimentalDetectronDetectionOutputParams<T>>::GetParam();
+        ExperimentalDetectronDetectionOutputParams<T> param;
+        format::type fmt;
+        std::tie(param, fmt) = this->GetParam();
         auto data_type = type_to_data_type<T>::value;
 
         auto& engine = get_test_engine();
@@ -87,32 +92,42 @@ public:
         const auto input_im_info = engine.allocate_memory({data_type, format::bfyx, tensor{batch(1), feature(3)}});
         set_values(input_im_info, param.im_info);
 
-        const primitive_id output_scores_id = "OutputScores";
-        auto output_scores =
-            engine.allocate_memory({data_type, format::bfyx, tensor{batch(param.max_detections_per_image)}});
-
-        const primitive_id output_classes_id = "OutputClasses";
-        auto output_classes =
-            engine.allocate_memory({data_types::i32, format::bfyx, tensor{batch(param.max_detections_per_image)}});
-
         topology topology;
 
         topology.add(input_layout(input_boxes_id, input_boxes->get_layout()));
         topology.add(input_layout(input_deltas_id, input_deltas->get_layout()));
         topology.add(input_layout(input_scores_id, input_scores->get_layout()));
         topology.add(input_layout(input_im_info_id, input_im_info->get_layout()));
-        topology.add(mutable_data(output_classes_id, output_classes));
-        topology.add(mutable_data(output_scores_id, output_scores));
+        
+        const primitive_id b_input_boxes_id = "BlockedInputBoxes";
+        const primitive_id b_input_deltas_id = "BlockedInputDeltas";
+        const primitive_id b_input_scores_id = "BlockedInputScores";
+        const primitive_id b_input_im_info_id = "BlockedInputImInfo";
 
-        const primitive_id eddo_id = "experimental_detectron_detection_output";
-        const auto eddo_primitive = experimental_detectron_detection_output{
-            eddo_id,
-            input_boxes_id,
-            input_deltas_id,
-            input_scores_id,
-            input_im_info_id,
-            output_classes_id,
-            output_scores_id,
+        topology.add(reorder(b_input_boxes_id, input_boxes_id, fmt, data_type));
+        topology.add(reorder(b_input_deltas_id, input_deltas_id, fmt, data_type));
+        topology.add(reorder(b_input_scores_id, input_scores_id, fmt, data_type));
+        topology.add(reorder(b_input_im_info_id, input_im_info_id, fmt, data_type));
+
+        const primitive_id b_output_scores_id = "BlockedOutputScores";
+        const primitive_id b_output_classes_id = "BlockedOutputClasses";
+        const layout output_scores_layout = {data_type, fmt, tensor{batch(param.max_detections_per_image)}};
+        auto b_output_scores = engine.allocate_memory(output_scores_layout);
+        const layout output_classes_layout = {data_types::i32, fmt, tensor{batch(param.max_detections_per_image)}};
+        auto b_output_classes = engine.allocate_memory(output_classes_layout);
+
+        topology.add(mutable_data(b_output_classes_id, b_output_classes));
+        topology.add(mutable_data(b_output_scores_id, b_output_scores));
+
+        const primitive_id b_eddo_id = "blocked_experimental_detectron_detection_output";
+        const auto b_eddo_primitive = experimental_detectron_detection_output{
+            b_eddo_id,
+            b_input_boxes_id,
+            b_input_deltas_id,
+            b_input_scores_id,
+            b_input_im_info_id,
+            b_output_classes_id,
+            b_output_scores_id,
             param.score_threshold,
             param.nms_threshold,
             param.num_classes,
@@ -123,7 +138,9 @@ public:
             param.deltas_weights,
         };
 
-        topology.add(eddo_primitive);
+        topology.add(b_eddo_primitive);
+        const primitive_id eddo_id = "experimental_detectron_detection_output";
+        topology.add(reorder(eddo_id, b_eddo_primitive /*b_eddo_id*/, format::bfyx, data_type));
 
         network network(engine, topology);
 
@@ -131,20 +148,34 @@ public:
         network.set_input_data(input_deltas_id, input_deltas);
         network.set_input_data(input_scores_id, input_scores);
         network.set_input_data(input_im_info_id, input_im_info);
-
         const auto outputs = network.execute();
 
         const auto output_boxes = outputs.at(eddo_id).get_memory();
-
         const cldnn::mem_lock<T> output_boxes_ptr(output_boxes, get_test_stream());
         ASSERT_EQ(output_boxes_ptr.size(), param.max_detections_per_image * 4);
 
-        const cldnn::mem_lock<int32_t> output_classes_ptr(output_classes, get_test_stream());
-        ASSERT_EQ(output_classes_ptr.size(), param.max_detections_per_image);
-
+        const primitive_id output_scores_id = "OutputScores";
+        cldnn::topology reorder_score_topology;
+        reorder_score_topology.add(input_layout(b_output_scores_id, output_scores_layout));
+        reorder_score_topology.add(reorder(output_scores_id, b_output_scores_id, format::bfyx, data_type));
+        cldnn::network reorder_score_net{engine, reorder_score_topology};
+        reorder_score_net.set_input_data(b_output_scores_id, b_output_scores);
+        const auto score_result = reorder_score_net.execute();
+        const auto output_scores = score_result.at(output_scores_id).get_memory();
         const cldnn::mem_lock<T> output_scores_ptr(output_scores, get_test_stream());
         ASSERT_EQ(output_scores_ptr.size(), param.max_detections_per_image);
 
+        const primitive_id output_classes_id = "OutputClasses";
+        cldnn::topology reorder_classes_topology;
+        reorder_classes_topology.add(input_layout(b_output_classes_id, output_classes_layout));
+        reorder_classes_topology.add(reorder(output_classes_id, b_output_classes_id, format::bfyx, data_types::i32));
+        cldnn::network reorder_classes_net{engine, reorder_classes_topology};
+        reorder_classes_net.set_input_data(b_output_classes_id, b_output_classes);
+        const auto classes_result = reorder_classes_net.execute();
+        const auto output_classes = classes_result.at(output_classes_id).get_memory();
+        const cldnn::mem_lock<int32_t> output_classes_ptr(output_classes, get_test_stream());
+        ASSERT_EQ(output_classes_ptr.size(), param.max_detections_per_image);
+
         const auto& expected_boxes = param.expected_boxes;
         const auto& expected_classes = param.expected_classes;
         const auto& expected_scores = param.expected_scores;
@@ -172,6 +203,14 @@ TEST_P(experimental_detectron_detection_output_test_f16, basic) {
     ASSERT_NO_FATAL_FAILURE(test());
 }
 
+const std::vector<format::type> layouts{
+        format::bfyx,
+        format::b_fs_yx_fsv16,
+        format::b_fs_yx_fsv32,
+        format::bs_fs_yx_bsv16_fsv16,
+        format::bs_fs_yx_bsv32_fsv16,
+        format::bs_fs_yx_bsv32_fsv32};
+
 template <typename T>
 std::vector<ExperimentalDetectronDetectionOutputParams<T>> getExperimentalDetectronDetectionOutputParams() {
     std::vector<ExperimentalDetectronDetectionOutputParams<T>> params = {
@@ -390,8 +429,14 @@ std::vector<ExperimentalDetectronDetectionOutputParams<T>> getExperimentalDetect
 
 INSTANTIATE_TEST_SUITE_P(experimental_detectron_detection_output_gpu_test,
                          experimental_detectron_detection_output_test_f32,
-                         ::testing::ValuesIn(getExperimentalDetectronDetectionOutputParams<float>()));
+                         testing::Combine(
+                             ::testing::ValuesIn(getExperimentalDetectronDetectionOutputParams<float>()),
+                             ::testing::ValuesIn(layouts)
+                         ));
 
 INSTANTIATE_TEST_SUITE_P(experimental_detectron_detection_output_gpu_test,
                          experimental_detectron_detection_output_test_f16,
-                         ::testing::ValuesIn(getExperimentalDetectronDetectionOutputParams<half_t>()));
+                         testing::Combine(
+                             ::testing::ValuesIn(getExperimentalDetectronDetectionOutputParams<half_t>()),
+                             ::testing::ValuesIn(layouts)
+                         ));