[GPU] Update memory location to __local in GPU Detection Output (#11209)
* [GPU] Update memory location to __local in GPU Detection Output * Replace hardcoded stack size with JIT constant
This commit is contained in:
committed by
GitHub
parent
23476c8eee
commit
f2af1ef88a
@@ -212,6 +212,7 @@ KernelsData DetectionOutputKernelRef::GetKernelsData(const Params& params, const
|
||||
auto num_prior_boxes = (loc_feature_num / (num_loc_classes * prior_box_size));
|
||||
auto max_wg = detectOutParams.engineInfo.maxWorkGroupSize;
|
||||
|
||||
constexpr size_t stack_size = 100; // The size of stack for QuickSort
|
||||
constexpr size_t buffer_bytes = 10; // The size of struct Scores in detection_output_gpu_ref.cl
|
||||
size_t buffer_stride = num_prior_boxes * buffer_bytes;
|
||||
size_t buffer_size = num_of_images * num_classes * buffer_stride;
|
||||
@@ -229,6 +230,7 @@ KernelsData DetectionOutputKernelRef::GetKernelsData(const Params& params, const
|
||||
auto cldnnJit = GetJitConstants(detectOutParams);
|
||||
auto entryPoint = GetEntryPoint(kernelName, detectOutParams.layerID, params, options, i);
|
||||
cldnnJit.AddConstant(MakeJitConstant("BUFFER_STRIDE", buffer_stride));
|
||||
cldnnJit.AddConstant(MakeJitConstant("QUICK_SORT_STACK_SIZE", stack_size));
|
||||
if (i == 0) {
|
||||
if (detectOutParams.detectOutParams.decrease_label_id) {
|
||||
cldnnJit.AddConstant(MakeJitConstant("DO_STAGE_" + std::to_string(i) + "_MXNET", "true"));
|
||||
@@ -246,17 +248,25 @@ KernelsData DetectionOutputKernelRef::GetKernelsData(const Params& params, const
|
||||
MakeJitConstant("NUM_PRIOR_BLOCKS", num_score_block)});
|
||||
}
|
||||
} else if (i == 1) {
|
||||
if (detectOutParams.detectOutParams.decrease_label_id) {
|
||||
if (detectOutParams.detectOutParams.decrease_label_id) {
|
||||
// Always use local memory since LWS size is 1x1x16 (16 WI * 100 (stack size) * 4 (int size) = 6.25 KB of SLM memory)
|
||||
cldnnJit.AddConstant(MakeJitConstant("USE_LOCAL_MEMORY_FOR_STACK", true));
|
||||
cldnnJit.AddConstants({MakeJitConstant("DO_STAGE_" + std::to_string(i) + "_MXNET", "true"),
|
||||
MakeJitConstant("LOCAL_WORK_NUM", dispatchData.lws[2]),
|
||||
MakeJitConstant("PARTITION_STEP", GetPartitionStep(dispatchData.lws[2]))});
|
||||
} else {
|
||||
} else {
|
||||
// Limit local memory usage for two buffers: __range [LWS1 * LWS2 * 2 * 4 (int size) bytes]
|
||||
// stack [LWS1 * LWS2 * 100 (stack_size) * 4 (int size) bytes]
|
||||
auto req_local_mem_size = dispatchData.lws[1] * dispatchData.lws[2] * 2 * 4 +
|
||||
dispatchData.lws[1] * dispatchData.lws[2] * stack_size * 4;
|
||||
if (req_local_mem_size < detectOutParams.engineInfo.maxLocalMemSize)
|
||||
cldnnJit.AddConstant(MakeJitConstant("USE_LOCAL_MEMORY_FOR_STACK", true));
|
||||
cldnnJit.AddConstants({MakeJitConstant("DO_STAGE_" + std::to_string(i) + "_CAFFE", "true"),
|
||||
MakeJitConstant("LOCAL_CLASS_NUM", dispatchData.lws[1]),
|
||||
MakeJitConstant("LOCAL_WORK_NUM", dispatchData.lws[2]),
|
||||
MakeJitConstant("PARTITION_STEP", GetPartitionStep(dispatchData.lws[2]))});
|
||||
}
|
||||
} else if (i == 2) {
|
||||
}
|
||||
} else if (i == 2) {
|
||||
if (detectOutParams.detectOutParams.decrease_label_id) {
|
||||
cldnnJit.AddConstant(MakeJitConstant("DO_STAGE_" + std::to_string(i) + "_MXNET", "true"));
|
||||
} else {
|
||||
@@ -295,10 +305,16 @@ KernelsData DetectionOutputKernelRef::GetKernelsData(const Params& params, const
|
||||
cldnnJit.AddConstant(MakeJitConstant("DO_STAGE_" + std::to_string(i) + "_CAFFE", "true"));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
} else {
|
||||
if (detectOutParams.detectOutParams.decrease_label_id) {
|
||||
cldnnJit.AddConstant(MakeJitConstant("DO_STAGE_" + std::to_string(i) + "_MXNET", "true"));
|
||||
// Always use local memory since LWS size is 1x1x1
|
||||
cldnnJit.AddConstant(MakeJitConstant("USE_LOCAL_MEMORY_FOR_STACK", true));
|
||||
} else {
|
||||
// Limit local memory usage for stack buffer [LWS0 * 100 (stack_size) * 4 (int size) bytes]
|
||||
auto req_local_mem_size = dispatchData.lws[0] * stack_size * 4;
|
||||
if (req_local_mem_size < detectOutParams.engineInfo.maxLocalMemSize)
|
||||
cldnnJit.AddConstant(MakeJitConstant("USE_LOCAL_MEMORY_FOR_STACK", true));
|
||||
cldnnJit.AddConstants({MakeJitConstant("DO_STAGE_" + std::to_string(i) + "_CAFFE", "true"),
|
||||
MakeJitConstant("LOCAL_BATCHES_NUM", dispatchData.lws[0])});
|
||||
}
|
||||
|
||||
@@ -109,11 +109,16 @@ inline void FUNC(bubbleSortIterative)(__global SCORES_INFO* arr, int l, int h) {
|
||||
}
|
||||
}
|
||||
|
||||
inline void FUNC(quickSortIterative)(__global SCORES_INFO* arr, int l, int h, bool use_custom_comp) {
|
||||
inline void FUNC(quickSortIterative)(__global SCORES_INFO* arr,
|
||||
int l, int h,
|
||||
#ifdef USE_LOCAL_MEMORY_FOR_STACK
|
||||
__local int* stack,
|
||||
#endif
|
||||
bool use_custom_comp) {
|
||||
#ifndef USE_LOCAL_MEMORY_FOR_STACK
|
||||
// Create an auxiliary stack
|
||||
const int kStackSize = 100;
|
||||
int stack[kStackSize];
|
||||
|
||||
int stack[QUICK_SORT_STACK_SIZE];
|
||||
#endif
|
||||
// initialize top of stack
|
||||
int top = -1;
|
||||
|
||||
@@ -134,7 +139,7 @@ inline void FUNC(quickSortIterative)(__global SCORES_INFO* arr, int l, int h, bo
|
||||
// If there are elements on left side of pivot,
|
||||
// then push left side to stack
|
||||
if (p - 1 > l) {
|
||||
if (top >= (kStackSize - 1)) {
|
||||
if (top >= (QUICK_SORT_STACK_SIZE - 1)) {
|
||||
FUNC_CALL(bubbleSortIterative)(arr, l, p - 1);
|
||||
} else {
|
||||
stack[++top] = l;
|
||||
@@ -145,7 +150,7 @@ inline void FUNC(quickSortIterative)(__global SCORES_INFO* arr, int l, int h, bo
|
||||
// If there are elements on right side of pivot,
|
||||
// then push right side to stack
|
||||
if (p + 1 < h) {
|
||||
if (top >= (kStackSize - 1)) {
|
||||
if (top >= (QUICK_SORT_STACK_SIZE - 1)) {
|
||||
FUNC_CALL(bubbleSortIterative)(arr, p + 1, h);
|
||||
} else {
|
||||
stack[++top] = p + 1;
|
||||
@@ -415,6 +420,12 @@ KERNEL (detection_output_stage_1_sort_caffe)(__global uchar *buffer0,
|
||||
const int localClassId = get_local_id(1);
|
||||
__local int __range[LOCAL_CLASS_NUM][LOCAL_WORK_NUM * 2];
|
||||
|
||||
#ifdef USE_LOCAL_MEMORY_FOR_STACK
|
||||
// Create an auxiliary stack for QuickSort
|
||||
__local int stack[QUICK_SORT_STACK_SIZE * LOCAL_CLASS_NUM * LOCAL_WORK_NUM];
|
||||
__local int *stack_pointer = stack + workItemId * QUICK_SORT_STACK_SIZE + localClassId * LOCAL_WORK_NUM * QUICK_SORT_STACK_SIZE;
|
||||
#endif
|
||||
|
||||
const int scoresInfoNum = buffer1[batchId * NUM_CLASSES_ACC + classId];
|
||||
|
||||
__global SCORES_INFO *scoresList = (__global SCORES_INFO*)&buffer0[(batchId * NUM_CLASSES + classId) * BUFFER_STRIDE];
|
||||
@@ -448,7 +459,11 @@ KERNEL (detection_output_stage_1_sort_caffe)(__global uchar *buffer0,
|
||||
const int begin_id = __range[localClassId][first_id];
|
||||
const int end_id = __range[localClassId][first_id + 1];
|
||||
if (begin_id < end_id) {
|
||||
#ifdef USE_LOCAL_MEMORY_FOR_STACK
|
||||
FUNC_CALL(quickSortIterative)(scoresList, begin_id, end_id, stack_pointer, true);
|
||||
#else
|
||||
FUNC_CALL(quickSortIterative)(scoresList, begin_id, end_id, true);
|
||||
#endif
|
||||
}
|
||||
barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
@@ -470,6 +485,9 @@ KERNEL (detection_output_stage_1_sort_mxnet)(__global uchar *buffer0,
|
||||
const int workItemId = get_global_id(2);
|
||||
__local int __range[LOCAL_WORK_NUM * 2];
|
||||
|
||||
// Create an auxiliary stack for QuickSort
|
||||
__local int stack[QUICK_SORT_STACK_SIZE];
|
||||
|
||||
const int scoresInfoNum = buffer2[batchId * NUM_CLASSES_ACC + NUM_CLASSES];
|
||||
if (scoresInfoNum < 2)
|
||||
return;
|
||||
@@ -507,7 +525,7 @@ KERNEL (detection_output_stage_1_sort_mxnet)(__global uchar *buffer0,
|
||||
const int begin_id = __range[first_id];
|
||||
const int end_id = __range[first_id + 1];
|
||||
if (begin_id < end_id) {
|
||||
FUNC_CALL(quickSortIterative)(scoresList, begin_id, end_id, true);
|
||||
FUNC_CALL(quickSortIterative)(scoresList, begin_id, end_id, stack, true);
|
||||
}
|
||||
barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
@@ -666,6 +684,12 @@ KERNEL (detection_output_stage_final_caffe)(__global INPUT0_TYPE* input_location
|
||||
__global int *buffer1) {
|
||||
const int batchId = get_global_id(0);
|
||||
|
||||
#ifdef USE_LOCAL_MEMORY_FOR_STACK
|
||||
// Create an auxiliary stack for QuickSort
|
||||
__local int stack[QUICK_SORT_STACK_SIZE];
|
||||
__local int *stack_pointer = stack + batchId * QUICK_SORT_STACK_SIZE;
|
||||
#endif
|
||||
|
||||
const int total_det = FUNC_CALL(get_accumulated_detections)(buffer1, batchId);
|
||||
buffer1[batchId * NUM_CLASSES_ACC + NUM_CLASSES] = total_det;
|
||||
// the total number of detections is also stored in the extra space of buffer
|
||||
@@ -692,7 +716,11 @@ KERNEL (detection_output_stage_final_caffe)(__global INPUT0_TYPE* input_location
|
||||
buffer1[scores_size_offset + idx_class] = 0;
|
||||
}
|
||||
|
||||
#ifdef USE_LOCAL_MEMORY_FOR_STACK
|
||||
FUNC_CALL(quickSortIterative)(scoresList + scores_offset, 0, num_det - 1, stack_pointer, true);
|
||||
#else
|
||||
FUNC_CALL(quickSortIterative)(scoresList + scores_offset, 0, num_det - 1, true);
|
||||
#endif
|
||||
|
||||
// recalculate valid items for each class
|
||||
for (uint idx_num_det = 0; idx_num_det < KEEP_TOP_K; idx_num_det++) {
|
||||
@@ -788,6 +816,9 @@ KERNEL (detection_output_stage_final_mxnet)(__global INPUT0_TYPE* input_location
|
||||
__global uchar *buffer0,
|
||||
__global uchar *buffer1,
|
||||
__global int *buffer2) {
|
||||
// Create an auxiliary stack for QuickSort
|
||||
__local int stack[QUICK_SORT_STACK_SIZE];
|
||||
|
||||
for (uint idx_image = 0; idx_image < NUM_OF_IMAGES; idx_image++) {
|
||||
__global SCORES_INFO *scoresList = (__global SCORES_INFO*)&buffer0[idx_image * BUFFER_STRIDE];
|
||||
const int total_det = buffer2[idx_image * NUM_CLASSES_ACC + NUM_CLASSES];
|
||||
@@ -805,7 +836,7 @@ KERNEL (detection_output_stage_final_mxnet)(__global INPUT0_TYPE* input_location
|
||||
num_det += acc_num;
|
||||
buffer2[scores_size_offset] = 0;
|
||||
}
|
||||
FUNC_CALL(quickSortIterative)(scoresList, 0, num_det - 1, true);
|
||||
FUNC_CALL(quickSortIterative)(scoresList, 0, num_det - 1, stack, true);
|
||||
|
||||
for (uint idx_num_det = 0; idx_num_det < KEEP_TOP_K; idx_num_det++) {
|
||||
int scores_size_offset = idx_image * NUM_CLASSES_ACC + (int)scoresList[idx_num_det].classId;
|
||||
|
||||
Reference in New Issue
Block a user