[GPU] Add estimation of required memory for CAFFE_OPT_2 stage of GPU Detection Output (#11001)
This commit is contained in:
parent
b480a49d66
commit
782ef6b42e
@ -261,6 +261,35 @@ KernelsData DetectionOutputKernelRef::GetKernelsData(const Params& params, const
|
||||
cldnnJit.AddConstant(MakeJitConstant("DO_STAGE_" + std::to_string(i) + "_MXNET", "true"));
|
||||
} else {
|
||||
if (detectOutParams.detectOutParams.top_k > 0) {
|
||||
auto estimateRegPressure = [&]() {
|
||||
// Assume that the kernel is compiled with SIMD16 instuctions
|
||||
const size_t simd = 16;
|
||||
const size_t reg_num = 128;
|
||||
const size_t bytes_per_reg = 32;
|
||||
const size_t max_reg_bytes = reg_num * bytes_per_reg;
|
||||
|
||||
size_t bytes_used = 0;
|
||||
const auto num_prior_boxes = detectOutParams.inputs[1].Feature().v / detectOutParams.detectOutParams.num_classes;
|
||||
const auto top_k = std::min(detectOutParams.detectOutParams.top_k, (int32_t)num_prior_boxes);
|
||||
|
||||
// Memory buffer for decoded_bboxes array
|
||||
bytes_used += top_k * 4 * BytesPerElement(detectOutParams.inputs[0].GetDType());
|
||||
// Memory buffer for decoded_bbox_cur and decoded_bbox_kept arrays
|
||||
bytes_used += 8 * BytesPerElement(detectOutParams.inputs[0].GetDType());
|
||||
// Memory for get_decoded_bbox function execution
|
||||
bytes_used += (4 * BytesPerElement(detectOutParams.inputs[2].GetDType()) + 12 * 4);
|
||||
// Memory for jaccardOverlap function execution
|
||||
bytes_used += 5 * BytesPerElement(detectOutParams.inputs[0].GetDType());
|
||||
// Approximate amount of additional memory for local variables
|
||||
bytes_used += 10 * 4;
|
||||
bytes_used *= simd;
|
||||
|
||||
return static_cast<float>(bytes_used) / static_cast<float>(max_reg_bytes);
|
||||
};
|
||||
|
||||
if (estimateRegPressure() > 0.8)
|
||||
cldnnJit.AddConstant(MakeJitConstant("USE_LOCAL_MEMORY", "true"));
|
||||
|
||||
cldnnJit.AddConstant(MakeJitConstant("DO_STAGE_" + std::to_string(i) + "_CAFFE_OPT", "true"));
|
||||
} else {
|
||||
cldnnJit.AddConstant(MakeJitConstant("DO_STAGE_" + std::to_string(i) + "_CAFFE", "true"));
|
||||
|
@ -40,8 +40,8 @@
|
||||
// from previous kernel and output using single work-group.
|
||||
// This kernel will produce the results of the final detections form the result of previous kernel.
|
||||
// If the total of detections per each batch is greater than KEEP_TOP_K, detections are sorted using
|
||||
// iterative quick sort and it is stored as KEEP_TOP_K. Final detections contain information about
|
||||
// filetered detection described with 7 elements [batch_id, class_id, confidence, x_1, y_1, x_2, y_2].
|
||||
// iterative quick sort and it is stored as KEEP_TOP_K. Final detections contain information about
|
||||
// filtered detection described with 7 elements [batch_id, class_id, confidence, x_1, y_1, x_2, y_2].
|
||||
//
|
||||
// =================================================================================================================
|
||||
// Required jit constants:
|
||||
@ -565,7 +565,11 @@ KERNEL (detection_output_stage_2_nms_caffe)(__global INPUT0_TYPE* input_location
|
||||
const int classId = get_global_id(1);
|
||||
const int loc_label = ((SHARE_LOCATION)? 0 : classId);
|
||||
const int scoresInfoIdx = batchId * NUM_CLASSES_ACC + classId;
|
||||
INPUT0_TYPE decoded_bboxes[TOP_K * 4];
|
||||
#ifdef USE_LOCAL_MEMORY
|
||||
__local INPUT0_TYPE decoded_bboxes[TOP_K * 4];
|
||||
#else
|
||||
__private INPUT0_TYPE decoded_bboxes[TOP_K * 4];
|
||||
#endif
|
||||
|
||||
__global SCORES_INFO *scoresList = (__global SCORES_INFO*)&buffer0[(batchId * NUM_CLASSES + classId) * BUFFER_STRIDE];
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user