[GPU] Add estimation of required memory for CAFFE_OPT_2 stage of GPU Detection Output (#11001)

This commit is contained in:
Sergey Shlyapnikov 2022-03-21 12:06:07 +03:00 committed by GitHub
parent b480a49d66
commit 782ef6b42e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 36 additions and 3 deletions

View File

@ -261,6 +261,35 @@ KernelsData DetectionOutputKernelRef::GetKernelsData(const Params& params, const
cldnnJit.AddConstant(MakeJitConstant("DO_STAGE_" + std::to_string(i) + "_MXNET", "true"));
} else {
if (detectOutParams.detectOutParams.top_k > 0) {
auto estimateRegPressure = [&]() {
    // Estimates how much of the register budget the per-work-item data of the
    // CAFFE_OPT NMS stage would occupy. Returns the ratio of the estimated
    // byte count to the modelled register capacity as a float; a value above
    // 1.0 means the estimate exceeds that capacity.
    //
    // Model assumptions (heuristic, not queried from the device):
    //  - the kernel is compiled with SIMD16 instructions;
    //  - 128 registers of 32 bytes each are available.
    const size_t simd_width = 16;
    const size_t register_count = 128;
    const size_t register_size_bytes = 32;
    const size_t register_capacity_bytes = register_count * register_size_bytes;

    // top_k is clamped to the number of prior boxes derived from input 1's
    // feature dimension divided by the class count.
    const auto prior_box_count = detectOutParams.inputs[1].Feature().v / detectOutParams.detectOutParams.num_classes;
    const auto effective_top_k = std::min(detectOutParams.detectOutParams.top_k, (int32_t)prior_box_count);

    // Element sizes of the two inputs that contribute to the estimate.
    const auto input0_elem_bytes = BytesPerElement(detectOutParams.inputs[0].GetDType());
    const auto input2_elem_bytes = BytesPerElement(detectOutParams.inputs[2].GetDType());

    size_t per_lane_bytes = 0;
    // decoded_bboxes array: four coordinates per kept box
    per_lane_bytes += effective_top_k * 4 * input0_elem_bytes;
    // decoded_bbox_cur and decoded_bbox_kept arrays: two boxes of four coordinates
    per_lane_bytes += 8 * input0_elem_bytes;
    // get_decoded_bbox function execution
    per_lane_bytes += (4 * input2_elem_bytes + 12 * 4);
    // jaccardOverlap function execution
    per_lane_bytes += 5 * input0_elem_bytes;
    // Approximate amount of additional memory for local variables
    per_lane_bytes += 10 * 4;

    // Scale the per-lane estimate by the SIMD width before normalising.
    const size_t total_bytes = per_lane_bytes * simd_width;
    return static_cast<float>(total_bytes) / static_cast<float>(register_capacity_bytes);
};
if (estimateRegPressure() > 0.8)
cldnnJit.AddConstant(MakeJitConstant("USE_LOCAL_MEMORY", "true"));
cldnnJit.AddConstant(MakeJitConstant("DO_STAGE_" + std::to_string(i) + "_CAFFE_OPT", "true"));
} else {
cldnnJit.AddConstant(MakeJitConstant("DO_STAGE_" + std::to_string(i) + "_CAFFE", "true"));

View File

@ -40,8 +40,8 @@
// from previous kernel and output using single work-group.
// This kernel will produce the results of the final detections from the result of previous kernel.
// If the total of detections per each batch is greater than KEEP_TOP_K, detections are sorted using
// iterative quick sort and it is stored as KEEP_TOP_K. Final detections contain information about
// filetered detection described with 7 elements [batch_id, class_id, confidence, x_1, y_1, x_2, y_2].
// iterative quick sort and it is stored as KEEP_TOP_K. Final detections contain information about
// filtered detection described with 7 elements [batch_id, class_id, confidence, x_1, y_1, x_2, y_2].
//
// =================================================================================================================
// Required jit constants:
@ -565,7 +565,11 @@ KERNEL (detection_output_stage_2_nms_caffe)(__global INPUT0_TYPE* input_location
const int classId = get_global_id(1);
const int loc_label = ((SHARE_LOCATION)? 0 : classId);
const int scoresInfoIdx = batchId * NUM_CLASSES_ACC + classId;
INPUT0_TYPE decoded_bboxes[TOP_K * 4];
#ifdef USE_LOCAL_MEMORY
__local INPUT0_TYPE decoded_bboxes[TOP_K * 4];
#else
__private INPUT0_TYPE decoded_bboxes[TOP_K * 4];
#endif
__global SCORES_INFO *scoresList = (__global SCORES_INFO*)&buffer0[(batchId * NUM_CLASSES + classId) * BUFFER_STRIDE];