[GPU] Fix VA OD sample pipeline test failure with batch=64 (#8128)

* Fix VA OD sample pipeline test failure with batch=64

Signed-off-by: Andrew Kwangwoong Park <andrew.kwangwoong.park@intel.com>

* Update cpu/gpu switching rule for maximum LWS of device capabilities

Signed-off-by: Andrew Kwangwoong Park <andrew.kwangwoong.park@intel.com>

* Add Validate function to compare batch size with device capabilities

Signed-off-by: Andrew Kwangwoong Park <andrew.kwangwoong.park@intel.com>
This commit is contained in:
Andrew Kwangwoong Park 2021-10-27 17:51:21 +09:00 committed by GitHub
parent d65e7d4d4f
commit c6c3e33a3a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 47 additions and 16 deletions

View File

@ -123,7 +123,7 @@ DetectionOutputKernelRef::DispatchData SetDefault(const detection_output_params&
dispatchData.lws = {1, 1, 1};
} else {
dispatchData.gws = {input.Batch().v, 1, 1};
dispatchData.lws = {1, 1, 1};
dispatchData.lws = {input.Batch().v, 1, 1};
}
} else {
dispatchData.gws = {1, 1, 1};
@ -133,6 +133,21 @@ DetectionOutputKernelRef::DispatchData SetDefault(const detection_output_params&
return dispatchData;
}
// Checks whether the detection-output parameters can be dispatched by this
// OCL kernel. The batched dispatch path launches one work-item per batch
// inside a single work-group (lws = {batch, 1, 1}), so the batch size must
// not exceed the device's maximum work-group size reported in engineInfo.
// Returns false to fall back to another implementation (e.g. CPU).
bool DetectionOutputKernelRef::Validate(const Params& p, const optional_params& o) const {
    const detection_output_params& params = static_cast<const detection_output_params&>(p);
    // Bind by reference: the tensor descriptor is non-trivial and need not be copied.
    const auto& input = params.inputs[0];
    return input.Batch().v <= params.engineInfo.maxWorkGroupSize;
}
void DetectionOutputKernelRef::SetKernelArguments(const detection_output_params& params, clKernelData& kernel, size_t idx) const {
if (params.detectOutParams.decrease_label_id) {
if (idx == 0) {
@ -182,6 +197,9 @@ void DetectionOutputKernelRef::SetKernelArguments(const detection_output_params&
KernelsData DetectionOutputKernelRef::GetKernelsData(const Params& params, const optional_params& options) const {
assert(params.GetType() == KernelType::DETECTION_OUTPUT && options.GetType() == KernelType::DETECTION_OUTPUT);
if (!Validate(params, options))
return {};
constexpr size_t kKernelsNum = 4;
KernelData kd = KernelData::Default<detection_output_params>(params, kKernelsNum);
const detection_output_params& detectOutParams = static_cast<const detection_output_params&>(params);
@ -196,7 +214,7 @@ KernelsData DetectionOutputKernelRef::GetKernelsData(const Params& params, const
constexpr size_t buffer_bytes = 10; // The size of struct Scores in detection_output_gpu_ref.cl
size_t buffer_stride = num_prior_boxes * buffer_bytes;
size_t buffer_size = num_of_images * num_classes * buffer_stride;
size_t num_scores_size = num_of_images * (num_classes + 1) * sizeof(int);
size_t num_scores_size = num_of_images * (num_classes + 2) * sizeof(int);
kd.internalBufferSizes.push_back(buffer_size);
if (detectOutParams.detectOutParams.decrease_label_id) {

View File

@ -70,6 +70,7 @@ public:
protected:
virtual JitConstants GetJitConstants(const detection_output_params& params) const;
bool Validate(const Params& p, const optional_params& o) const override;
void SetKernelArguments(const detection_output_params& params, clKernelData& kernel, size_t idx) const;
};
} // namespace kernel_selector

View File

@ -57,7 +57,7 @@
// =================================================================================================================
#define unroll_for __attribute__((opencl_unroll_hint)) for
#define NUM_CLASSES_ACC (NUM_CLASSES + 1)
#define NUM_CLASSES_ACC (NUM_CLASSES + 2)
typedef struct __attribute__((__packed__)) {
short classId;
@ -164,11 +164,11 @@ inline int FUNC(get_accumulated_detections)(__global int* size_buf, int batch_id
return acc_num;
}
inline int FUNC(get_start_idx)(__global int* size_buf, int batch_id) {
inline int FUNC(get_start_idx)(__global int* size_buf, int batch_id, int offset) {
int start_idx = 0;
for (uint idx_batch = 0; idx_batch < batch_id; idx_batch++)
{
const int num_det = size_buf[idx_batch * NUM_CLASSES_ACC + NUM_CLASSES];
const int num_det = size_buf[idx_batch * NUM_CLASSES_ACC + NUM_CLASSES + offset];
start_idx += (num_det > KEEP_TOP_K ? KEEP_TOP_K: num_det);
}
return start_idx;
@ -658,16 +658,20 @@ KERNEL (detection_output_stage_final_caffe)(__global INPUT0_TYPE* input_location
__global int *buffer1) {
const int batchId = get_global_id(0);
__local int class_offset[NUM_CLASSES_ACC];
const int total_det = FUNC_CALL(get_accumulated_detections)(buffer1, batchId);
buffer1[batchId * NUM_CLASSES_ACC + NUM_CLASSES] = total_det;
// The total number of detections is also stored in the extra slot of the buffer
// for the case where the number of detections exceeds keep_top_k
buffer1[batchId * NUM_CLASSES_ACC + NUM_CLASSES + 1] = total_det;
barrier(CLK_GLOBAL_MEM_FENCE);
if (KEEP_TOP_K > -1 && total_det > KEEP_TOP_K) {
__global SCORES_INFO *scoresList = (__global SCORES_INFO*)&buffer0[0];
int num_det = 0;
int scores_offset = (batchId * NUM_CLASSES * NUM_OF_PRIORS);
int scores_size_offset = batchId * NUM_CLASSES_ACC;
for (uint idx_class = 0; idx_class < NUM_CLASSES; idx_class++) {
const int acc_num = buffer1[scores_size_offset + idx_class];
@ -689,18 +693,22 @@ KERNEL (detection_output_stage_final_caffe)(__global INPUT0_TYPE* input_location
}
// calculate starting point of each class
class_offset[0] = 0;
// store the current number of detections for buffer reuse
int prev_offset = buffer1[scores_size_offset];
buffer1[scores_size_offset] = 0;
for (int i = 1; i < NUM_CLASSES_ACC; ++i) {
class_offset[i] = class_offset[i - 1] + buffer1[scores_size_offset + i - 1];
int cur_offset = buffer1[scores_size_offset + i];
buffer1[scores_size_offset + i] = buffer1[scores_size_offset + i - 1] + prev_offset;
prev_offset = cur_offset;
}
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_GLOBAL_MEM_FENCE);
const int startIdx = FUNC_CALL(get_start_idx)(buffer1, batchId);
const int startIdx = FUNC_CALL(get_start_idx)(buffer1, batchId, 1);
for (uint idx_num_det = 0; idx_num_det < KEEP_TOP_K; idx_num_det++) {
SCORES_INFO score_info;
score_info = scoresList[scores_offset + idx_num_det];
const int idx = startIdx + class_offset[score_info.classId];
const int idx = startIdx + buffer1[scores_size_offset + score_info.classId];
output[idx * OUTPUT_ROW_SIZE] = TO_OUTPUT_TYPE(batchId);
output[idx * OUTPUT_ROW_SIZE + 1] = TO_OUTPUT_TYPE((DECREASE_LABEL_ID) ? score_info.classId - 1 : score_info.classId);
output[idx * OUTPUT_ROW_SIZE + 2] = TO_OUTPUT_TYPE(score_info.score);
@ -719,10 +727,11 @@ KERNEL (detection_output_stage_final_caffe)(__global INPUT0_TYPE* input_location
ymax = max(TO_INPUT0_TYPE(0.0), min(TO_INPUT0_TYPE(1.0), ymax));
}
vstore4((OUTPUT_TYPE4)(xmin, ymin, xmax, ymax), 0, output + idx * OUTPUT_ROW_SIZE + 3);
class_offset[score_info.classId]++;
// advance the starting point for the next detection of this class
buffer1[scores_size_offset + score_info.classId]++;
}
} else {
const int startIdx = FUNC_CALL(get_start_idx)(buffer1, batchId);
const int startIdx = FUNC_CALL(get_start_idx)(buffer1, batchId, 0);
int outputIdx = 0;
for (uint idx_class = 0; idx_class < NUM_CLASSES; idx_class++) {
int scores_size_offset = batchId * NUM_CLASSES_ACC + idx_class;

View File

@ -1081,11 +1081,14 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format
if (!_forcing_map.empty() && _forcing_map.count(node.id()) != 0) {
preferred_impl = _forcing_map.at(node.id()).second;
} else if (node.is_type<detection_output>()) {
const auto& program = node.get_program();
const auto& device_info = program.get_engine().get_device_info();
const size_t lws_max = device_info.max_work_group_size;
auto& detection_output_node = node.as<detection_output>();
auto confidence_layout = detection_output_node.confidence().get_output_layout();
auto prim = detection_output_node.get_primitive();
if (confidence_layout.size.batch[0] >= 4 && prim->confidence_threshold >= 0.1 && prim->top_k <= 400 &&
prim->num_classes >= 16 && confidence_layout.size.feature[0] > 10000)
if (confidence_layout.size.batch[0] <= lws_max && confidence_layout.size.batch[0] >= 4 && prim->confidence_threshold >= 0.1 &&
prim->top_k <= 400 && prim->num_classes >= 16 && confidence_layout.size.feature[0] > 10000)
preferred_impl = impl_types::ocl;
else
preferred_impl = impl_types::cpu;