[GPU] Add estimation of required memory for CAFFE_OPT_2 stage of GPU Detection Output (#11001)

2022-03-21 12:06:07 +03:00 · 2022-03-21 12:06:07 +03:00 · 782ef6b42e
commit 782ef6b42e
parent b480a49d66
2 changed files with 36 additions and 3 deletions
--- a/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_ref.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/detection_output/detection_output_kernel_ref.cpp
@ -261,6 +261,35 @@ KernelsData DetectionOutputKernelRef::GetKernelsData(const Params& params, const
                cldnnJit.AddConstant(MakeJitConstant("DO_STAGE_" + std::to_string(i) + "_MXNET", "true"));
            } else {
                if (detectOutParams.detectOutParams.top_k > 0) {
+                    auto estimateRegPressure = [&]() {
+                        // Assume that the kernel is compiled with SIMD16 instuctions
+                        const size_t simd = 16;
+                        const size_t reg_num = 128;
+                        const size_t bytes_per_reg = 32;
+                        const size_t max_reg_bytes = reg_num * bytes_per_reg;
+
+                        size_t bytes_used = 0;
+                        const auto num_prior_boxes = detectOutParams.inputs[1].Feature().v / detectOutParams.detectOutParams.num_classes;
+                        const auto top_k = std::min(detectOutParams.detectOutParams.top_k, (int32_t)num_prior_boxes);
+
+                        // Memory buffer for decoded_bboxes array
+                        bytes_used += top_k * 4 * BytesPerElement(detectOutParams.inputs[0].GetDType());
+                        // Memory buffer for decoded_bbox_cur and decoded_bbox_kept arrays
+                        bytes_used += 8 * BytesPerElement(detectOutParams.inputs[0].GetDType());
+                        // Memory for get_decoded_bbox function execution
+                        bytes_used += (4 * BytesPerElement(detectOutParams.inputs[2].GetDType()) + 12 * 4);
+                        // Memory for jaccardOverlap function execution
+                        bytes_used += 5 * BytesPerElement(detectOutParams.inputs[0].GetDType());
+                        // Approximate amount of additional memory for local variables
+                        bytes_used += 10 * 4;
+                        bytes_used *= simd;
+
+                        return static_cast<float>(bytes_used) / static_cast<float>(max_reg_bytes);
+                    };
+
+                    if (estimateRegPressure() > 0.8)
+                        cldnnJit.AddConstant(MakeJitConstant("USE_LOCAL_MEMORY", "true"));
+
                    cldnnJit.AddConstant(MakeJitConstant("DO_STAGE_" + std::to_string(i) + "_CAFFE_OPT", "true"));
                } else {
                    cldnnJit.AddConstant(MakeJitConstant("DO_STAGE_" + std::to_string(i) + "_CAFFE", "true"));
--- a/src/plugins/intel_gpu/src/kernel_selector/core/cl_kernels/detection_output_gpu_ref.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/core/cl_kernels/detection_output_gpu_ref.cl
@ -40,8 +40,8 @@
 //   from previous kernel and output using single work-group.
 //     This kernel will produce the results of the final detections form the result of previous kernel.
 //     If the total of detections per each batch is greater than KEEP_TOP_K, detections are sorted using
-//     iterative quick sort and it is stored as KEEP_TOP_K. Final detections contain information about 
-//     filetered detection described with 7 elements [batch_id, class_id, confidence, x_1, y_1, x_2, y_2].
+//     iterative quick sort and it is stored as KEEP_TOP_K. Final detections contain information about
+//     filtered detection described with 7 elements [batch_id, class_id, confidence, x_1, y_1, x_2, y_2].
 //
 // =================================================================================================================
 // Required jit constants:
@ -565,7 +565,11 @@ KERNEL (detection_output_stage_2_nms_caffe)(__global INPUT0_TYPE* input_location
    const int classId = get_global_id(1);
    const int loc_label = ((SHARE_LOCATION)? 0 : classId);
    const int scoresInfoIdx = batchId * NUM_CLASSES_ACC + classId;
-    INPUT0_TYPE decoded_bboxes[TOP_K * 4];
+#ifdef USE_LOCAL_MEMORY
+    __local   INPUT0_TYPE decoded_bboxes[TOP_K * 4];
+#else
+    __private INPUT0_TYPE decoded_bboxes[TOP_K * 4];
+#endif

    __global SCORES_INFO *scoresList = (__global SCORES_INFO*)&buffer0[(batchId * NUM_CLASSES + classId) * BUFFER_STRIDE];