From c6c3e33a3a391d666f8e39482603ca22a9ad6b9f Mon Sep 17 00:00:00 2001
From: Andrew Kwangwoong Park
Date: Wed, 27 Oct 2021 17:51:21 +0900
Subject: [PATCH] [GPU] Fix VA OD sample pipeline test failure with batch=64
 (#8128)

* Fix VA OD sample pipeline test failure with batch=64

Signed-off-by: Andrew Kwangwoong Park

* Update cpu/gpu switching rule to respect the device's maximum local work-group size (LWS)

Signed-off-by: Andrew Kwangwoong Park

* Add Validate function to compare the batch size with device capabilities

Signed-off-by: Andrew Kwangwoong Park
---
 .../detection_output_kernel_ref.cpp           | 22 +++++++++++--
 .../detection_output_kernel_ref.h             |  1 +
 .../cl_kernels/detection_output_gpu_ref.cl    | 33 ++++++++++++-------
 .../thirdparty/clDNN/src/layout_optimizer.cpp |  7 ++--
 4 files changed, 47 insertions(+), 16 deletions(-)

diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_ref.cpp
index 67f599570a8..313976f56be 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_ref.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_ref.cpp
@@ -123,7 +123,7 @@ DetectionOutputKernelRef::DispatchData SetDefault(const detection_output_params&
             dispatchData.lws = {1, 1, 1};
         } else {
             dispatchData.gws = {input.Batch().v, 1, 1};
-            dispatchData.lws = {1, 1, 1};
+            dispatchData.lws = {input.Batch().v, 1, 1};
         }
     } else {
         dispatchData.gws = {1, 1, 1};
@@ -133,6 +133,21 @@ DetectionOutputKernelRef::DispatchData SetDefault(const detection_output_params&
     return dispatchData;
 }
 
+bool DetectionOutputKernelRef::Validate(const Params& p, const optional_params& o) const {
+    const detection_output_params& params = static_cast<const detection_output_params&>(p);
+
+    const auto input = params.inputs[0];
+    const auto batches = input.Batch().v;
+
+    const bool bSupportedBatch = batches <= params.engineInfo.maxWorkGroupSize;
+
+    if (!bSupportedBatch) {
+        return false;
+    }
+
+    return true;
+}
+
 void DetectionOutputKernelRef::SetKernelArguments(const detection_output_params& params, clKernelData& kernel, size_t idx) const {
     if (params.detectOutParams.decrease_label_id) {
         if (idx == 0) {
@@ -182,6 +197,9 @@ void DetectionOutputKernelRef::SetKernelArguments(const detection_output_params&
 KernelsData DetectionOutputKernelRef::GetKernelsData(const Params& params, const optional_params& options) const {
     assert(params.GetType() == KernelType::DETECTION_OUTPUT && options.GetType() == KernelType::DETECTION_OUTPUT);
 
+    if (!Validate(params, options))
+        return {};
+
     constexpr size_t kKernelsNum = 4;
     KernelData kd = KernelData::Default(params, kKernelsNum);
     const detection_output_params& detectOutParams = static_cast<const detection_output_params&>(params);
@@ -196,7 +214,7 @@ KernelsData DetectionOutputKernelRef::GetKernelsData(const Params& params, const
     constexpr size_t buffer_bytes = 10; // The size of struct Scores in detection_output_gpu_ref.cl
     size_t buffer_stride = num_prior_boxes * buffer_bytes;
     size_t buffer_size = num_of_images * num_classes * buffer_stride;
-    size_t num_scores_size = num_of_images * (num_classes + 1) * sizeof(int);
+    size_t num_scores_size = num_of_images * (num_classes + 2) * sizeof(int);
 
     kd.internalBufferSizes.push_back(buffer_size);
     if (detectOutParams.detectOutParams.decrease_label_id) {
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_ref.h
index 9f0cf5466c6..c8a9b47db1d 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_ref.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_ref.h
@@ -70,6 +70,7 @@ public:
 
 protected:
     virtual JitConstants GetJitConstants(const detection_output_params& params) const;
+    bool Validate(const Params& p, const optional_params& o) const override;
     void SetKernelArguments(const detection_output_params& params, clKernelData& kernel, size_t idx) const;
 };
 }  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/detection_output_gpu_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/detection_output_gpu_ref.cl
index 1341f975b80..0c5ae63c0bf 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/detection_output_gpu_ref.cl
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/detection_output_gpu_ref.cl
@@ -57,7 +57,7 @@
 // =================================================================================================================
 
 #define unroll_for __attribute__((opencl_unroll_hint)) for
-#define NUM_CLASSES_ACC (NUM_CLASSES + 1)
+#define NUM_CLASSES_ACC (NUM_CLASSES + 2)
 
 typedef struct __attribute__((__packed__)) {
     short classId;
@@ -164,11 +164,11 @@ inline int FUNC(get_accumulated_detections)(__global int* size_buf, int batch_id
     return acc_num;
 }
 
-inline int FUNC(get_start_idx)(__global int* size_buf, int batch_id) {
+inline int FUNC(get_start_idx)(__global int* size_buf, int batch_id, int offset) {
     int start_idx = 0;
     for (uint idx_batch = 0; idx_batch < batch_id; idx_batch++) {
-        const int num_det = size_buf[idx_batch * NUM_CLASSES_ACC + NUM_CLASSES];
+        const int num_det = size_buf[idx_batch * NUM_CLASSES_ACC + NUM_CLASSES + offset];
         start_idx += (num_det > KEEP_TOP_K ? KEEP_TOP_K: num_det);
     }
     return start_idx;
 }
@@ -658,16 +658,20 @@ KERNEL (detection_output_stage_final_caffe)(__global INPUT0_TYPE* input_location
                                             __global int *buffer1) {
     const int batchId = get_global_id(0);
 
-    __local int class_offset[NUM_CLASSES_ACC];
-
     const int total_det = FUNC_CALL(get_accumulated_detections)(buffer1, batchId);
     buffer1[batchId * NUM_CLASSES_ACC + NUM_CLASSES] = total_det;
+    // the total number of detections is also stored in the extra space of the buffer
+    // for the case where the number of detections is larger than keep_top_k
+    buffer1[batchId * NUM_CLASSES_ACC + NUM_CLASSES + 1] = total_det;
+
+    barrier(CLK_GLOBAL_MEM_FENCE);
 
     if (KEEP_TOP_K > -1 && total_det > KEEP_TOP_K) {
         __global SCORES_INFO *scoresList = (__global SCORES_INFO*)&buffer0[0];
         int num_det = 0;
         int scores_offset = (batchId * NUM_CLASSES * NUM_OF_PRIORS);
         int scores_size_offset = batchId * NUM_CLASSES_ACC;
+
         for (uint idx_class = 0; idx_class < NUM_CLASSES; idx_class++) {
             const int acc_num = buffer1[scores_size_offset + idx_class];
@@ -689,18 +693,22 @@ KERNEL (detection_output_stage_final_caffe)(__global INPUT0_TYPE* input_location
         }
 
         // calculate starting point of each class
-        class_offset[0] = 0;
+        // store the current number of detections for buffer reuse
+        int prev_offset = buffer1[scores_size_offset];
+        buffer1[scores_size_offset] = 0;
         for (int i = 1; i < NUM_CLASSES_ACC; ++i) {
-            class_offset[i] = class_offset[i - 1] + buffer1[scores_size_offset + i - 1];
+            int cur_offset = buffer1[scores_size_offset + i];
+            buffer1[scores_size_offset + i] = buffer1[scores_size_offset + i - 1] + prev_offset;
+            prev_offset = cur_offset;
         }
 
-        barrier(CLK_LOCAL_MEM_FENCE);
+        barrier(CLK_GLOBAL_MEM_FENCE);
 
-        const int startIdx = FUNC_CALL(get_start_idx)(buffer1, batchId);
+        const int startIdx = FUNC_CALL(get_start_idx)(buffer1, batchId, 1);
         for (uint idx_num_det = 0; idx_num_det < KEEP_TOP_K; idx_num_det++) {
             SCORES_INFO score_info;
             score_info = scoresList[scores_offset + idx_num_det];
-            const int idx = startIdx + class_offset[score_info.classId];
+            const int idx = startIdx + buffer1[scores_size_offset + score_info.classId];
             output[idx * OUTPUT_ROW_SIZE] = TO_OUTPUT_TYPE(batchId);
             output[idx * OUTPUT_ROW_SIZE + 1] = TO_OUTPUT_TYPE((DECREASE_LABEL_ID) ? score_info.classId - 1 : score_info.classId);
             output[idx * OUTPUT_ROW_SIZE + 2] = TO_OUTPUT_TYPE(score_info.score);
@@ -719,10 +727,11 @@ KERNEL (detection_output_stage_final_caffe)(__global INPUT0_TYPE* input_location
                 ymax = max(TO_INPUT0_TYPE(0.0), min(TO_INPUT0_TYPE(1.0), ymax));
             }
             vstore4((OUTPUT_TYPE4)(xmin, ymin, xmax, ymax), 0, output + idx * OUTPUT_ROW_SIZE + 3);
-            class_offset[score_info.classId]++;
+            // increase the starting point for the next detection in this class
+            buffer1[scores_size_offset + score_info.classId]++;
         }
     } else {
-        const int startIdx = FUNC_CALL(get_start_idx)(buffer1, batchId);
+        const int startIdx = FUNC_CALL(get_start_idx)(buffer1, batchId, 0);
         int outputIdx = 0;
         for (uint idx_class = 0; idx_class < NUM_CLASSES; idx_class++) {
             int scores_size_offset = batchId * NUM_CLASSES_ACC + idx_class;
diff --git a/inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp b/inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp
index 34f3c1498cf..49eb50e1def 100644
--- a/inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp
+++ b/inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp
@@ -1081,11 +1081,14 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format
     if (!_forcing_map.empty() && _forcing_map.count(node.id()) != 0) {
         preferred_impl = _forcing_map.at(node.id()).second;
     } else if (node.is_type<detection_output>()) {
+        const auto& program = node.get_program();
+        const auto& device_info = program.get_engine().get_device_info();
+        const size_t lws_max = device_info.max_work_group_size;
         auto& detection_output_node = node.as<detection_output>();
         auto confidence_layout = detection_output_node.confidence().get_output_layout();
         auto prim = detection_output_node.get_primitive();
-        if (confidence_layout.size.batch[0] >= 4 && prim->confidence_threshold >= 0.1 && prim->top_k <= 400 &&
-            prim->num_classes >= 16 && confidence_layout.size.feature[0] > 10000)
+        if (confidence_layout.size.batch[0] <= lws_max && confidence_layout.size.batch[0] >= 4 && prim->confidence_threshold >= 0.1 &&
+            prim->top_k <= 400 && prim->num_classes >= 16 && confidence_layout.size.feature[0] > 10000)
             preferred_impl = impl_types::ocl;
         else
             preferred_impl = impl_types::cpu;
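
The least obvious part of this change is the in-place exclusive prefix sum in detection_output_stage_final_caffe: since lws is now set to the batch size, all batch items land in one work-group, so the removed per-batch __local class_offset array is replaced by overwriting each batch's per-class counts in buffer1 with their own starting offsets, carrying only a single prev_offset scalar. The following is a minimal host-side C++ sketch of that transformation only; the names exclusive_prefix_sum_in_place and counts are illustrative and do not appear in the clDNN sources.

    #include <cstdio>
    #include <vector>

    // Turn per-class detection counts into exclusive starting offsets in place,
    // keeping only one scalar of carry state (mirrors the prev_offset/cur_offset
    // rotation added to detection_output_stage_final_caffe).
    static void exclusive_prefix_sum_in_place(std::vector<int>& counts) {
        int prev = counts[0];   // remember the value about to be overwritten
        counts[0] = 0;          // the first class starts writing at offset 0
        for (size_t i = 1; i < counts.size(); ++i) {
            int cur = counts[i];
            counts[i] = counts[i - 1] + prev;
            prev = cur;
        }
    }

    int main() {
        std::vector<int> counts = {3, 0, 5, 2};   // hypothetical per-class counts
        exclusive_prefix_sum_in_place(counts);
        for (int v : counts)
            std::printf("%d ", v);                // prints: 0 3 3 8
        std::printf("\n");
        return 0;
    }

In the kernel the resulting offsets are then consumed as buffer1[scores_size_offset + score_info.classId] and incremented after each detection is written out.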