diff --git a/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_ref.cpp index f788224ada4..0e4476ca142 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_ref.cpp @@ -261,6 +261,35 @@ KernelsData DetectionOutputKernelRef::GetKernelsData(const Params& params, const cldnnJit.AddConstant(MakeJitConstant("DO_STAGE_" + std::to_string(i) + "_MXNET", "true")); } else { if (detectOutParams.detectOutParams.top_k > 0) { + auto estimateRegPressure = [&]() { + // Assume that the kernel is compiled with SIMD16 instuctions + const size_t simd = 16; + const size_t reg_num = 128; + const size_t bytes_per_reg = 32; + const size_t max_reg_bytes = reg_num * bytes_per_reg; + + size_t bytes_used = 0; + const auto num_prior_boxes = detectOutParams.inputs[1].Feature().v / detectOutParams.detectOutParams.num_classes; + const auto top_k = std::min(detectOutParams.detectOutParams.top_k, (int32_t)num_prior_boxes); + + // Memory buffer for decoded_bboxes array + bytes_used += top_k * 4 * BytesPerElement(detectOutParams.inputs[0].GetDType()); + // Memory buffer for decoded_bbox_cur and decoded_bbox_kept arrays + bytes_used += 8 * BytesPerElement(detectOutParams.inputs[0].GetDType()); + // Memory for get_decoded_bbox function execution + bytes_used += (4 * BytesPerElement(detectOutParams.inputs[2].GetDType()) + 12 * 4); + // Memory for jaccardOverlap function execution + bytes_used += 5 * BytesPerElement(detectOutParams.inputs[0].GetDType()); + // Approximate amount of additional memory for local variables + bytes_used += 10 * 4; + bytes_used *= simd; + + return static_cast(bytes_used) / static_cast(max_reg_bytes); + }; + + if (estimateRegPressure() > 0.8) + cldnnJit.AddConstant(MakeJitConstant("USE_LOCAL_MEMORY", "true")); + cldnnJit.AddConstant(MakeJitConstant("DO_STAGE_" + std::to_string(i) + "_CAFFE_OPT", "true")); } else { cldnnJit.AddConstant(MakeJitConstant("DO_STAGE_" + std::to_string(i) + "_CAFFE", "true")); diff --git a/src/plugins/intel_gpu/src/kernel_selector/core/cl_kernels/detection_output_gpu_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/core/cl_kernels/detection_output_gpu_ref.cl index 1963ad626b5..3eaf7008e21 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/core/cl_kernels/detection_output_gpu_ref.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/core/cl_kernels/detection_output_gpu_ref.cl @@ -40,8 +40,8 @@ // from previous kernel and output using single work-group. // This kernel will produce the results of the final detections form the result of previous kernel. // If the total of detections per each batch is greater than KEEP_TOP_K, detections are sorted using -// iterative quick sort and it is stored as KEEP_TOP_K. Final detections contain information about -// filetered detection described with 7 elements [batch_id, class_id, confidence, x_1, y_1, x_2, y_2]. +// iterative quick sort and it is stored as KEEP_TOP_K. Final detections contain information about +// filtered detection described with 7 elements [batch_id, class_id, confidence, x_1, y_1, x_2, y_2]. // // ================================================================================================================= // Required jit constants: @@ -565,7 +565,11 @@ KERNEL (detection_output_stage_2_nms_caffe)(__global INPUT0_TYPE* input_location const int classId = get_global_id(1); const int loc_label = ((SHARE_LOCATION)? 0 : classId); const int scoresInfoIdx = batchId * NUM_CLASSES_ACC + classId; - INPUT0_TYPE decoded_bboxes[TOP_K * 4]; +#ifdef USE_LOCAL_MEMORY + __local INPUT0_TYPE decoded_bboxes[TOP_K * 4]; +#else + __private INPUT0_TYPE decoded_bboxes[TOP_K * 4]; +#endif __global SCORES_INFO *scoresList = (__global SCORES_INFO*)&buffer0[(batchId * NUM_CLASSES + classId) * BUFFER_STRIDE];