From c6c3e33a3a391d666f8e39482603ca22a9ad6b9f Mon Sep 17 00:00:00 2001
From: Andrew Kwangwoong Park
Date: Wed, 27 Oct 2021 17:51:21 +0900
Subject: [PATCH] [GPU] Fix VA OD sample pipeline test failure with batch=64
 (#8128)

* Fix VA OD sample pipeline test failure with batch=64

Signed-off-by: Andrew Kwangwoong Park

* Update cpu/gpu switching rule to respect the device's maximum local work-group size (LWS)

Signed-off-by: Andrew Kwangwoong Park

* Add Validate function to compare the batch size with device capabilities

Signed-off-by: Andrew Kwangwoong Park
---
 .../detection_output_kernel_ref.cpp           | 22 +++++++++++--
 .../detection_output_kernel_ref.h             |  1 +
 .../cl_kernels/detection_output_gpu_ref.cl    | 33 ++++++++++++-------
 .../thirdparty/clDNN/src/layout_optimizer.cpp |  7 ++--
 4 files changed, 47 insertions(+), 16 deletions(-)

diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_ref.cpp
index 67f599570a8..313976f56be 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_ref.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_ref.cpp
@@ -123,7 +123,7 @@ DetectionOutputKernelRef::DispatchData SetDefault(const detection_output_params&
             dispatchData.lws = {1, 1, 1};
         } else {
             dispatchData.gws = {input.Batch().v, 1, 1};
-            dispatchData.lws = {1, 1, 1};
+            dispatchData.lws = {input.Batch().v, 1, 1};
         }
     } else {
         dispatchData.gws = {1, 1, 1};
@@ -133,6 +133,21 @@ DetectionOutputKernelRef::DispatchData SetDefault(const detection_output_params&
     return dispatchData;
 }
 
+bool DetectionOutputKernelRef::Validate(const Params& p, const optional_params& o) const {
+    const detection_output_params& params = static_cast<const detection_output_params&>(p);
+
+    const auto input = params.inputs[0];
+    const auto batches = input.Batch().v;
+
+    const bool bSupportedBatch = batches <= params.engineInfo.maxWorkGroupSize;
+
+    if (!bSupportedBatch) {
+        return false;
+    }
+
+    return true;
+}
+
 void DetectionOutputKernelRef::SetKernelArguments(const detection_output_params& params, clKernelData& kernel, size_t idx) const {
     if (params.detectOutParams.decrease_label_id) {
         if (idx == 0) {
@@ -182,6 +197,9 @@ void DetectionOutputKernelRef::SetKernelArguments(const detection_output_params&
 KernelsData DetectionOutputKernelRef::GetKernelsData(const Params& params, const optional_params& options) const {
     assert(params.GetType() == KernelType::DETECTION_OUTPUT && options.GetType() == KernelType::DETECTION_OUTPUT);
 
+    if (!Validate(params, options))
+        return {};
+
     constexpr size_t kKernelsNum = 4;
     KernelData kd = KernelData::Default(params, kKernelsNum);
     const detection_output_params& detectOutParams = static_cast<const detection_output_params&>(params);
@@ -196,7 +214,7 @@ KernelsData DetectionOutputKernelRef::GetKernelsData(const Params& params, const
     constexpr size_t buffer_bytes = 10; // The size of struct Scores in detection_output_gpu_ref.cl
     size_t buffer_stride = num_prior_boxes * buffer_bytes;
     size_t buffer_size = num_of_images * num_classes * buffer_stride;
-    size_t num_scores_size = num_of_images * (num_classes + 1) * sizeof(int);
+    size_t num_scores_size = num_of_images * (num_classes + 2) * sizeof(int);
 
     kd.internalBufferSizes.push_back(buffer_size);
     if (detectOutParams.detectOutParams.decrease_label_id) {
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_ref.h
index 9f0cf5466c6..c8a9b47db1d 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_ref.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_ref.h
@@ -70,6 +70,7 @@ public:
 
 protected:
     virtual JitConstants GetJitConstants(const detection_output_params& params) const;
+    bool Validate(const Params& p, const optional_params& o) const override;
     void SetKernelArguments(const detection_output_params& params, clKernelData& kernel, size_t idx) const;
 };
 }  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/detection_output_gpu_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/detection_output_gpu_ref.cl
index 1341f975b80..0c5ae63c0bf 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/detection_output_gpu_ref.cl
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/detection_output_gpu_ref.cl
@@ -57,7 +57,7 @@
 // =================================================================================================================
 
 #define unroll_for __attribute__((opencl_unroll_hint)) for
-#define NUM_CLASSES_ACC (NUM_CLASSES + 1)
+#define NUM_CLASSES_ACC (NUM_CLASSES + 2)
 
 typedef struct __attribute__((__packed__)) {
     short classId;
@@ -164,11 +164,11 @@ inline int FUNC(get_accumulated_detections)(__global int* size_buf, int batch_id
     return acc_num;
 }
 
-inline int FUNC(get_start_idx)(__global int* size_buf, int batch_id) {
+inline int FUNC(get_start_idx)(__global int* size_buf, int batch_id, int offset) {
     int start_idx = 0;
     for (uint idx_batch = 0; idx_batch < batch_id; idx_batch++) {
-        const int num_det = size_buf[idx_batch * NUM_CLASSES_ACC + NUM_CLASSES];
+        const int num_det = size_buf[idx_batch * NUM_CLASSES_ACC + NUM_CLASSES + offset];
         start_idx += (num_det > KEEP_TOP_K ? KEEP_TOP_K: num_det);
     }
     return start_idx;
 }
@@ -658,16 +658,20 @@ KERNEL (detection_output_stage_final_caffe)(__global INPUT0_TYPE* input_location
                                             __global int *buffer1) {
     const int batchId = get_global_id(0);
 
-    __local int class_offset[NUM_CLASSES_ACC];
-
     const int total_det = FUNC_CALL(get_accumulated_detections)(buffer1, batchId);
     buffer1[batchId * NUM_CLASSES_ACC + NUM_CLASSES] = total_det;
+    // the total number of detections is also stored in the extra space of the buffer
+    // for the case where the number of detections is larger than keep_top_k
+    buffer1[batchId * NUM_CLASSES_ACC + NUM_CLASSES + 1] = total_det;
+
+    barrier(CLK_GLOBAL_MEM_FENCE);
 
     if (KEEP_TOP_K > -1 && total_det > KEEP_TOP_K) {
         __global SCORES_INFO *scoresList = (__global SCORES_INFO*)&buffer0[0];
         int num_det = 0;
         int scores_offset = (batchId * NUM_CLASSES * NUM_OF_PRIORS);
         int scores_size_offset = batchId * NUM_CLASSES_ACC;
+
         for (uint idx_class = 0; idx_class < NUM_CLASSES; idx_class++) {
             const int acc_num = buffer1[scores_size_offset + idx_class];
@@ -689,18 +693,22 @@ KERNEL (detection_output_stage_final_caffe)(__global INPUT0_TYPE* input_location
         }
 
         // calculate starting point of each class
-        class_offset[0] = 0;
+        // store the current number of detections for buffer reuse
+        int prev_offset = buffer1[scores_size_offset];
+        buffer1[scores_size_offset] = 0;
         for (int i = 1; i < NUM_CLASSES_ACC; ++i) {
-            class_offset[i] = class_offset[i - 1] + buffer1[scores_size_offset + i - 1];
+            int cur_offset = buffer1[scores_size_offset + i];
+            buffer1[scores_size_offset + i] = buffer1[scores_size_offset + i - 1] + prev_offset;
+            prev_offset = cur_offset;
         }
 
-        barrier(CLK_LOCAL_MEM_FENCE);
+        barrier(CLK_GLOBAL_MEM_FENCE);
 
-        const int startIdx = FUNC_CALL(get_start_idx)(buffer1, batchId);
+        const int startIdx = FUNC_CALL(get_start_idx)(buffer1, batchId, 1);
         for (uint idx_num_det = 0; idx_num_det < KEEP_TOP_K; idx_num_det++) {
             SCORES_INFO score_info;
             score_info = scoresList[scores_offset + idx_num_det];
-            const int idx = startIdx + class_offset[score_info.classId];
+            const int idx = startIdx + buffer1[scores_size_offset + score_info.classId];
             output[idx * OUTPUT_ROW_SIZE] = TO_OUTPUT_TYPE(batchId);
             output[idx * OUTPUT_ROW_SIZE + 1] = TO_OUTPUT_TYPE((DECREASE_LABEL_ID) ? score_info.classId - 1 : score_info.classId);
             output[idx * OUTPUT_ROW_SIZE + 2] = TO_OUTPUT_TYPE(score_info.score);
@@ -719,10 +727,11 @@ KERNEL (detection_output_stage_final_caffe)(__global INPUT0_TYPE* input_location
                 ymax = max(TO_INPUT0_TYPE(0.0), min(TO_INPUT0_TYPE(1.0), ymax));
             }
             vstore4((OUTPUT_TYPE4)(xmin, ymin, xmax, ymax), 0, output + idx * OUTPUT_ROW_SIZE + 3);
-            class_offset[score_info.classId]++;
+            // increase the starting point for the next detection in this class
+            buffer1[scores_size_offset + score_info.classId]++;
         }
     } else {
-        const int startIdx = FUNC_CALL(get_start_idx)(buffer1, batchId);
+        const int startIdx = FUNC_CALL(get_start_idx)(buffer1, batchId, 0);
         int outputIdx = 0;
         for (uint idx_class = 0; idx_class < NUM_CLASSES; idx_class++) {
             int scores_size_offset = batchId * NUM_CLASSES_ACC + idx_class;
diff --git a/inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp b/inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp
index 34f3c1498cf..49eb50e1def 100644
--- a/inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp
+++ b/inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp
@@ -1081,11 +1081,14 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format
     if (!_forcing_map.empty() && _forcing_map.count(node.id()) != 0) {
         preferred_impl = _forcing_map.at(node.id()).second;
     } else if (node.is_type<detection_output>()) {
+        const auto& program = node.get_program();
+        const auto& device_info = program.get_engine().get_device_info();
+        const size_t lws_max = device_info.max_work_group_size;
         auto& detection_output_node = node.as<detection_output>();
         auto confidence_layout = detection_output_node.confidence().get_output_layout();
         auto prim = detection_output_node.get_primitive();
-        if (confidence_layout.size.batch[0] >= 4 && prim->confidence_threshold >= 0.1 && prim->top_k <= 400 &&
-            prim->num_classes >= 16 && confidence_layout.size.feature[0] > 10000)
+        if (confidence_layout.size.batch[0] <= lws_max && confidence_layout.size.batch[0] >= 4 && prim->confidence_threshold >= 0.1 &&
+            prim->top_k <= 400 && prim->num_classes >= 16 && confidence_layout.size.feature[0] > 10000)
             preferred_impl = impl_types::ocl;
         else
             preferred_impl = impl_types::cpu;
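
The least obvious part of this change is the in-place exclusive prefix sum in detection_output_stage_final_caffe: since lws is now set to the batch size, all batch items land in one work-group, so the removed per-batch __local class_offset array is replaced by overwriting each batch's per-class counts in buffer1 with their own starting offsets, carrying only a single prev_offset scalar. The following is a minimal host-side C++ sketch of that transformation only; the names exclusive_prefix_sum_in_place and counts are illustrative and do not appear in the clDNN sources.

    #include <cstdio>
    #include <vector>

    // Turn per-class detection counts into exclusive starting offsets in place,
    // keeping only one scalar of carry state (mirrors the prev_offset/cur_offset
    // rotation added to detection_output_stage_final_caffe).
    static void exclusive_prefix_sum_in_place(std::vector<int>& counts) {
        int prev = counts[0];   // remember the value about to be overwritten
        counts[0] = 0;          // the first class starts writing at offset 0
        for (size_t i = 1; i < counts.size(); ++i) {
            int cur = counts[i];
            counts[i] = counts[i - 1] + prev;
            prev = cur;
        }
    }

    int main() {
        std::vector<int> counts = {3, 0, 5, 2};   // hypothetical per-class counts
        exclusive_prefix_sum_in_place(counts);
        for (int v : counts)
            std::printf("%d ", v);                // prints: 0 3 3 8
        std::printf("\n");
        return 0;
    }

In the kernel the resulting offsets are then consumed as buffer1[scores_size_offset + score_info.classId] and incremented after each detection is written out.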