[GPU] Fix VA OD sample pipeline test failure with batch=64 (#8128)

* Fix VA OD sample pipeline test failure with batch=64

Signed-off-by: Andrew Kwangwoong Park <andrew.kwangwoong.park@intel.com>

* Update cpu/gpu switching rule for maximum LWS of device capabilities

Signed-off-by: Andrew Kwangwoong Park <andrew.kwangwoong.park@intel.com>

* Add Validate function to compare batch size with device capabilities

Signed-off-by: Andrew Kwangwoong Park <andrew.kwangwoong.park@intel.com>
This commit is contained in:
Andrew Kwangwoong Park 2021-10-27 17:51:21 +09:00 committed by GitHub
parent d65e7d4d4f
commit c6c3e33a3a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 47 additions and 16 deletions

View File

@ -123,7 +123,7 @@ DetectionOutputKernelRef::DispatchData SetDefault(const detection_output_params&
dispatchData.lws = {1, 1, 1};
} else {
dispatchData.gws = {input.Batch().v, 1, 1};
dispatchData.lws = {1, 1, 1};
dispatchData.lws = {input.Batch().v, 1, 1};
}
} else {
dispatchData.gws = {1, 1, 1};
@ -133,6 +133,21 @@ DetectionOutputKernelRef::DispatchData SetDefault(const detection_output_params&
return dispatchData;
}
// Checks whether the detection-output parameters can be dispatched by this
// OCL kernel. The batched dispatch path launches one work-item per batch
// inside a single work-group (lws = {batch, 1, 1}), so the batch size must
// not exceed the device's maximum work-group size reported in engineInfo.
// Returns false to fall back to another implementation (e.g. CPU).
bool DetectionOutputKernelRef::Validate(const Params& p, const optional_params& o) const {
    const detection_output_params& params = static_cast<const detection_output_params&>(p);
    // Bind by reference: the tensor descriptor is non-trivial and need not be copied.
    const auto& input = params.inputs[0];
    return input.Batch().v <= params.engineInfo.maxWorkGroupSize;
}
void DetectionOutputKernelRef::SetKernelArguments(const detection_output_params& params, clKernelData& kernel, size_t idx) const {
if (params.detectOutParams.decrease_label_id) {
if (idx == 0) {
@ -182,6 +197,9 @@ void DetectionOutputKernelRef::SetKernelArguments(const detection_output_params&
KernelsData DetectionOutputKernelRef::GetKernelsData(const Params& params, const optional_params& options) const {
assert(params.GetType() == KernelType::DETECTION_OUTPUT && options.GetType() == KernelType::DETECTION_OUTPUT);
if (!Validate(params, options))
return {};
constexpr size_t kKernelsNum = 4;
KernelData kd = KernelData::Default<detection_output_params>(params, kKernelsNum);
const detection_output_params& detectOutParams = static_cast<const detection_output_params&>(params);
@ -196,7 +214,7 @@ KernelsData DetectionOutputKernelRef::GetKernelsData(const Params& params, const
constexpr size_t buffer_bytes = 10; // The size of struct Scores in detection_output_gpu_ref.cl
size_t buffer_stride = num_prior_boxes * buffer_bytes;
size_t buffer_size = num_of_images * num_classes * buffer_stride;
size_t num_scores_size = num_of_images * (num_classes + 1) * sizeof(int);
size_t num_scores_size = num_of_images * (num_classes + 2) * sizeof(int);
kd.internalBufferSizes.push_back(buffer_size);
if (detectOutParams.detectOutParams.decrease_label_id) {

View File

@ -70,6 +70,7 @@ public:
protected:
virtual JitConstants GetJitConstants(const detection_output_params& params) const;
bool Validate(const Params& p, const optional_params& o) const override;
void SetKernelArguments(const detection_output_params& params, clKernelData& kernel, size_t idx) const;
};
} // namespace kernel_selector

View File

@ -57,7 +57,7 @@
// =================================================================================================================
#define unroll_for __attribute__((opencl_unroll_hint)) for
#define NUM_CLASSES_ACC (NUM_CLASSES + 1)
#define NUM_CLASSES_ACC (NUM_CLASSES + 2)
typedef struct __attribute__((__packed__)) {
short classId;
@ -164,11 +164,11 @@ inline int FUNC(get_accumulated_detections)(__global int* size_buf, int batch_id
return acc_num;
}
inline int FUNC(get_start_idx)(__global int* size_buf, int batch_id) {
inline int FUNC(get_start_idx)(__global int* size_buf, int batch_id, int offset) {
int start_idx = 0;
for (uint idx_batch = 0; idx_batch < batch_id; idx_batch++)
{
const int num_det = size_buf[idx_batch * NUM_CLASSES_ACC + NUM_CLASSES];
const int num_det = size_buf[idx_batch * NUM_CLASSES_ACC + NUM_CLASSES + offset];
start_idx += (num_det > KEEP_TOP_K ? KEEP_TOP_K: num_det);
}
return start_idx;
@ -658,16 +658,20 @@ KERNEL (detection_output_stage_final_caffe)(__global INPUT0_TYPE* input_location
__global int *buffer1) {
const int batchId = get_global_id(0);
__local int class_offset[NUM_CLASSES_ACC];
const int total_det = FUNC_CALL(get_accumulated_detections)(buffer1, batchId);
buffer1[batchId * NUM_CLASSES_ACC + NUM_CLASSES] = total_det;
// The total number of detections is also stored in the extra slot of the buffer
// for the case where the number of detections exceeds keep_top_k
buffer1[batchId * NUM_CLASSES_ACC + NUM_CLASSES + 1] = total_det;
barrier(CLK_GLOBAL_MEM_FENCE);
if (KEEP_TOP_K > -1 && total_det > KEEP_TOP_K) {
__global SCORES_INFO *scoresList = (__global SCORES_INFO*)&buffer0[0];
int num_det = 0;
int scores_offset = (batchId * NUM_CLASSES * NUM_OF_PRIORS);
int scores_size_offset = batchId * NUM_CLASSES_ACC;
for (uint idx_class = 0; idx_class < NUM_CLASSES; idx_class++) {
const int acc_num = buffer1[scores_size_offset + idx_class];
@ -689,18 +693,22 @@ KERNEL (detection_output_stage_final_caffe)(__global INPUT0_TYPE* input_location
}
// calculate starting point of each class
class_offset[0] = 0;
// store the current number of detections for buffer reuse
int prev_offset = buffer1[scores_size_offset];
buffer1[scores_size_offset] = 0;
for (int i = 1; i < NUM_CLASSES_ACC; ++i) {
class_offset[i] = class_offset[i - 1] + buffer1[scores_size_offset + i - 1];
int cur_offset = buffer1[scores_size_offset + i];
buffer1[scores_size_offset + i] = buffer1[scores_size_offset + i - 1] + prev_offset;
prev_offset = cur_offset;
}
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_GLOBAL_MEM_FENCE);
const int startIdx = FUNC_CALL(get_start_idx)(buffer1, batchId);
const int startIdx = FUNC_CALL(get_start_idx)(buffer1, batchId, 1);
for (uint idx_num_det = 0; idx_num_det < KEEP_TOP_K; idx_num_det++) {
SCORES_INFO score_info;
score_info = scoresList[scores_offset + idx_num_det];
const int idx = startIdx + class_offset[score_info.classId];
const int idx = startIdx + buffer1[scores_size_offset + score_info.classId];
output[idx * OUTPUT_ROW_SIZE] = TO_OUTPUT_TYPE(batchId);
output[idx * OUTPUT_ROW_SIZE + 1] = TO_OUTPUT_TYPE((DECREASE_LABEL_ID) ? score_info.classId - 1 : score_info.classId);
output[idx * OUTPUT_ROW_SIZE + 2] = TO_OUTPUT_TYPE(score_info.score);
@ -719,10 +727,11 @@ KERNEL (detection_output_stage_final_caffe)(__global INPUT0_TYPE* input_location
ymax = max(TO_INPUT0_TYPE(0.0), min(TO_INPUT0_TYPE(1.0), ymax));
}
vstore4((OUTPUT_TYPE4)(xmin, ymin, xmax, ymax), 0, output + idx * OUTPUT_ROW_SIZE + 3);
class_offset[score_info.classId]++;
// advance the starting point for the next detection of this class
buffer1[scores_size_offset + score_info.classId]++;
}
} else {
const int startIdx = FUNC_CALL(get_start_idx)(buffer1, batchId);
const int startIdx = FUNC_CALL(get_start_idx)(buffer1, batchId, 0);
int outputIdx = 0;
for (uint idx_class = 0; idx_class < NUM_CLASSES; idx_class++) {
int scores_size_offset = batchId * NUM_CLASSES_ACC + idx_class;

View File

@ -1081,11 +1081,14 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format
if (!_forcing_map.empty() && _forcing_map.count(node.id()) != 0) {
preferred_impl = _forcing_map.at(node.id()).second;
} else if (node.is_type<detection_output>()) {
const auto& program = node.get_program();
const auto& device_info = program.get_engine().get_device_info();
const size_t lws_max = device_info.max_work_group_size;
auto& detection_output_node = node.as<detection_output>();
auto confidence_layout = detection_output_node.confidence().get_output_layout();
auto prim = detection_output_node.get_primitive();
if (confidence_layout.size.batch[0] >= 4 && prim->confidence_threshold >= 0.1 && prim->top_k <= 400 &&
prim->num_classes >= 16 && confidence_layout.size.feature[0] > 10000)
if (confidence_layout.size.batch[0] <= lws_max && confidence_layout.size.batch[0] >= 4 && prim->confidence_threshold >= 0.1 &&
prim->top_k <= 400 && prim->num_classes >= 16 && confidence_layout.size.feature[0] > 10000)
preferred_impl = impl_types::ocl;
else
preferred_impl = impl_types::cpu;