[GPU] Update memory location to __local in GPU Detection Output (#11209)

* [GPU] Update memory location to __local in GPU Detection Output

* Replace hardcoded stack size with JIT constant
This commit is contained in:
Sergey Shlyapnikov
2022-03-31 13:13:58 +03:00
committed by GitHub
parent 23476c8eee
commit f2af1ef88a
2 changed files with 60 additions and 13 deletions

View File

@@ -212,6 +212,7 @@ KernelsData DetectionOutputKernelRef::GetKernelsData(const Params& params, const
auto num_prior_boxes = (loc_feature_num / (num_loc_classes * prior_box_size));
auto max_wg = detectOutParams.engineInfo.maxWorkGroupSize;
constexpr size_t stack_size = 100; // The size of stack for QuickSort
constexpr size_t buffer_bytes = 10; // The size of struct Scores in detection_output_gpu_ref.cl
size_t buffer_stride = num_prior_boxes * buffer_bytes;
size_t buffer_size = num_of_images * num_classes * buffer_stride;
@@ -229,6 +230,7 @@ KernelsData DetectionOutputKernelRef::GetKernelsData(const Params& params, const
auto cldnnJit = GetJitConstants(detectOutParams);
auto entryPoint = GetEntryPoint(kernelName, detectOutParams.layerID, params, options, i);
cldnnJit.AddConstant(MakeJitConstant("BUFFER_STRIDE", buffer_stride));
cldnnJit.AddConstant(MakeJitConstant("QUICK_SORT_STACK_SIZE", stack_size));
if (i == 0) {
if (detectOutParams.detectOutParams.decrease_label_id) {
cldnnJit.AddConstant(MakeJitConstant("DO_STAGE_" + std::to_string(i) + "_MXNET", "true"));
@@ -246,17 +248,25 @@ KernelsData DetectionOutputKernelRef::GetKernelsData(const Params& params, const
MakeJitConstant("NUM_PRIOR_BLOCKS", num_score_block)});
}
} else if (i == 1) {
if (detectOutParams.detectOutParams.decrease_label_id) {
if (detectOutParams.detectOutParams.decrease_label_id) {
// Always use local memory since LWS size is 1x1x16 (16 WI * 100 (stack size) * 4 (int size) = 6.25 KB of SLM memory)
cldnnJit.AddConstant(MakeJitConstant("USE_LOCAL_MEMORY_FOR_STACK", true));
cldnnJit.AddConstants({MakeJitConstant("DO_STAGE_" + std::to_string(i) + "_MXNET", "true"),
MakeJitConstant("LOCAL_WORK_NUM", dispatchData.lws[2]),
MakeJitConstant("PARTITION_STEP", GetPartitionStep(dispatchData.lws[2]))});
} else {
} else {
// Limit local memory usage for two buffers: __range [LWS1 * LWS2 * 2 * 4 (int size) bytes]
// stack [LWS1 * LWS2 * 100 (stack_size) * 4 (int size) bytes]
auto req_local_mem_size = dispatchData.lws[1] * dispatchData.lws[2] * 2 * 4 +
dispatchData.lws[1] * dispatchData.lws[2] * stack_size * 4;
if (req_local_mem_size < detectOutParams.engineInfo.maxLocalMemSize)
cldnnJit.AddConstant(MakeJitConstant("USE_LOCAL_MEMORY_FOR_STACK", true));
cldnnJit.AddConstants({MakeJitConstant("DO_STAGE_" + std::to_string(i) + "_CAFFE", "true"),
MakeJitConstant("LOCAL_CLASS_NUM", dispatchData.lws[1]),
MakeJitConstant("LOCAL_WORK_NUM", dispatchData.lws[2]),
MakeJitConstant("PARTITION_STEP", GetPartitionStep(dispatchData.lws[2]))});
}
} else if (i == 2) {
}
} else if (i == 2) {
if (detectOutParams.detectOutParams.decrease_label_id) {
cldnnJit.AddConstant(MakeJitConstant("DO_STAGE_" + std::to_string(i) + "_MXNET", "true"));
} else {
@@ -295,10 +305,16 @@ KernelsData DetectionOutputKernelRef::GetKernelsData(const Params& params, const
cldnnJit.AddConstant(MakeJitConstant("DO_STAGE_" + std::to_string(i) + "_CAFFE", "true"));
}
}
} else {
} else {
if (detectOutParams.detectOutParams.decrease_label_id) {
cldnnJit.AddConstant(MakeJitConstant("DO_STAGE_" + std::to_string(i) + "_MXNET", "true"));
// Always use local memory since LWS size is 1x1x1
cldnnJit.AddConstant(MakeJitConstant("USE_LOCAL_MEMORY_FOR_STACK", true));
} else {
// Limit local memory usage for stack buffer [LWS0 * 100 (stack_size) * 4 (int size) bytes]
auto req_local_mem_size = dispatchData.lws[0] * stack_size * 4;
if (req_local_mem_size < detectOutParams.engineInfo.maxLocalMemSize)
cldnnJit.AddConstant(MakeJitConstant("USE_LOCAL_MEMORY_FOR_STACK", true));
cldnnJit.AddConstants({MakeJitConstant("DO_STAGE_" + std::to_string(i) + "_CAFFE", "true"),
MakeJitConstant("LOCAL_BATCHES_NUM", dispatchData.lws[0])});
}

View File

@@ -109,11 +109,16 @@ inline void FUNC(bubbleSortIterative)(__global SCORES_INFO* arr, int l, int h) {
}
}
inline void FUNC(quickSortIterative)(__global SCORES_INFO* arr, int l, int h, bool use_custom_comp) {
inline void FUNC(quickSortIterative)(__global SCORES_INFO* arr,
int l, int h,
#ifdef USE_LOCAL_MEMORY_FOR_STACK
__local int* stack,
#endif
bool use_custom_comp) {
#ifndef USE_LOCAL_MEMORY_FOR_STACK
// Create an auxiliary stack
const int kStackSize = 100;
int stack[kStackSize];
int stack[QUICK_SORT_STACK_SIZE];
#endif
// initialize top of stack
int top = -1;
@@ -134,7 +139,7 @@ inline void FUNC(quickSortIterative)(__global SCORES_INFO* arr, int l, int h, bo
// If there are elements on left side of pivot,
// then push left side to stack
if (p - 1 > l) {
if (top >= (kStackSize - 1)) {
if (top >= (QUICK_SORT_STACK_SIZE - 1)) {
FUNC_CALL(bubbleSortIterative)(arr, l, p - 1);
} else {
stack[++top] = l;
@@ -145,7 +150,7 @@ inline void FUNC(quickSortIterative)(__global SCORES_INFO* arr, int l, int h, bo
// If there are elements on right side of pivot,
// then push right side to stack
if (p + 1 < h) {
if (top >= (kStackSize - 1)) {
if (top >= (QUICK_SORT_STACK_SIZE - 1)) {
FUNC_CALL(bubbleSortIterative)(arr, p + 1, h);
} else {
stack[++top] = p + 1;
@@ -415,6 +420,12 @@ KERNEL (detection_output_stage_1_sort_caffe)(__global uchar *buffer0,
const int localClassId = get_local_id(1);
__local int __range[LOCAL_CLASS_NUM][LOCAL_WORK_NUM * 2];
#ifdef USE_LOCAL_MEMORY_FOR_STACK
// Create an auxiliary stack for QuickSort
__local int stack[QUICK_SORT_STACK_SIZE * LOCAL_CLASS_NUM * LOCAL_WORK_NUM];
__local int *stack_pointer = stack + workItemId * QUICK_SORT_STACK_SIZE + localClassId * LOCAL_WORK_NUM * QUICK_SORT_STACK_SIZE;
#endif
const int scoresInfoNum = buffer1[batchId * NUM_CLASSES_ACC + classId];
__global SCORES_INFO *scoresList = (__global SCORES_INFO*)&buffer0[(batchId * NUM_CLASSES + classId) * BUFFER_STRIDE];
@@ -448,7 +459,11 @@ KERNEL (detection_output_stage_1_sort_caffe)(__global uchar *buffer0,
const int begin_id = __range[localClassId][first_id];
const int end_id = __range[localClassId][first_id + 1];
if (begin_id < end_id) {
#ifdef USE_LOCAL_MEMORY_FOR_STACK
FUNC_CALL(quickSortIterative)(scoresList, begin_id, end_id, stack_pointer, true);
#else
FUNC_CALL(quickSortIterative)(scoresList, begin_id, end_id, true);
#endif
}
barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);
@@ -470,6 +485,9 @@ KERNEL (detection_output_stage_1_sort_mxnet)(__global uchar *buffer0,
const int workItemId = get_global_id(2);
__local int __range[LOCAL_WORK_NUM * 2];
// Create an auxiliary stack for QuickSort
__local int stack[QUICK_SORT_STACK_SIZE];
const int scoresInfoNum = buffer2[batchId * NUM_CLASSES_ACC + NUM_CLASSES];
if (scoresInfoNum < 2)
return;
@@ -507,7 +525,7 @@ KERNEL (detection_output_stage_1_sort_mxnet)(__global uchar *buffer0,
const int begin_id = __range[first_id];
const int end_id = __range[first_id + 1];
if (begin_id < end_id) {
FUNC_CALL(quickSortIterative)(scoresList, begin_id, end_id, true);
FUNC_CALL(quickSortIterative)(scoresList, begin_id, end_id, stack, true);
}
barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);
@@ -666,6 +684,12 @@ KERNEL (detection_output_stage_final_caffe)(__global INPUT0_TYPE* input_location
__global int *buffer1) {
const int batchId = get_global_id(0);
#ifdef USE_LOCAL_MEMORY_FOR_STACK
// Create an auxiliary stack for QuickSort
__local int stack[QUICK_SORT_STACK_SIZE];
__local int *stack_pointer = stack + batchId * QUICK_SORT_STACK_SIZE;
#endif
const int total_det = FUNC_CALL(get_accumulated_detections)(buffer1, batchId);
buffer1[batchId * NUM_CLASSES_ACC + NUM_CLASSES] = total_det;
// the total number of detections is also stored in the extra space of buffer
@@ -692,7 +716,11 @@ KERNEL (detection_output_stage_final_caffe)(__global INPUT0_TYPE* input_location
buffer1[scores_size_offset + idx_class] = 0;
}
#ifdef USE_LOCAL_MEMORY_FOR_STACK
FUNC_CALL(quickSortIterative)(scoresList + scores_offset, 0, num_det - 1, stack_pointer, true);
#else
FUNC_CALL(quickSortIterative)(scoresList + scores_offset, 0, num_det - 1, true);
#endif
// recalculate valid items for each class
for (uint idx_num_det = 0; idx_num_det < KEEP_TOP_K; idx_num_det++) {
@@ -788,6 +816,9 @@ KERNEL (detection_output_stage_final_mxnet)(__global INPUT0_TYPE* input_location
__global uchar *buffer0,
__global uchar *buffer1,
__global int *buffer2) {
// Create an auxiliary stack for QuickSort
__local int stack[QUICK_SORT_STACK_SIZE];
for (uint idx_image = 0; idx_image < NUM_OF_IMAGES; idx_image++) {
__global SCORES_INFO *scoresList = (__global SCORES_INFO*)&buffer0[idx_image * BUFFER_STRIDE];
const int total_det = buffer2[idx_image * NUM_CLASSES_ACC + NUM_CLASSES];
@@ -805,7 +836,7 @@ KERNEL (detection_output_stage_final_mxnet)(__global INPUT0_TYPE* input_location
num_det += acc_num;
buffer2[scores_size_offset] = 0;
}
FUNC_CALL(quickSortIterative)(scoresList, 0, num_det - 1, true);
FUNC_CALL(quickSortIterative)(scoresList, 0, num_det - 1, stack, true);
for (uint idx_num_det = 0; idx_num_det < KEEP_TOP_K; idx_num_det++) {
int scores_size_offset = idx_image * NUM_CLASSES_ACC + (int)scoresList[idx_num_det].classId;