[GPU] Fix VA OD sample pipeline test failure with batch=64 (#8128)
* Fix VA OD sample pipeline test failure with batch=64 Signed-off-by: Andrew Kwangwoong Park <andrew.kwangwoong.park@intel.com> * Update cpu/gpu switching rule for maximum LWS of device capabilities Signed-off-by: Andrew Kwangwoong Park <andrew.kwangwoong.park@intel.com> * Add Validate function to compare batch size with device capabilities Signed-off-by: Andrew Kwangwoong Park <andrew.kwangwoong.park@intel.com>
This commit is contained in:
parent
d65e7d4d4f
commit
c6c3e33a3a
@ -123,7 +123,7 @@ DetectionOutputKernelRef::DispatchData SetDefault(const detection_output_params&
|
||||
dispatchData.lws = {1, 1, 1};
|
||||
} else {
|
||||
dispatchData.gws = {input.Batch().v, 1, 1};
|
||||
dispatchData.lws = {1, 1, 1};
|
||||
dispatchData.lws = {input.Batch().v, 1, 1};
|
||||
}
|
||||
} else {
|
||||
dispatchData.gws = {1, 1, 1};
|
||||
@ -133,6 +133,21 @@ DetectionOutputKernelRef::DispatchData SetDefault(const detection_output_params&
|
||||
return dispatchData;
|
||||
}
|
||||
|
||||
bool DetectionOutputKernelRef::Validate(const Params& p, const optional_params& o) const {
|
||||
const detection_output_params& params = static_cast<const detection_output_params&>(p);
|
||||
|
||||
const auto input = params.inputs[0];
|
||||
const auto batches = input.Batch().v;
|
||||
|
||||
const bool bSupportedBatch = batches <= params.engineInfo.maxWorkGroupSize;
|
||||
|
||||
if (!bSupportedBatch) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void DetectionOutputKernelRef::SetKernelArguments(const detection_output_params& params, clKernelData& kernel, size_t idx) const {
|
||||
if (params.detectOutParams.decrease_label_id) {
|
||||
if (idx == 0) {
|
||||
@ -182,6 +197,9 @@ void DetectionOutputKernelRef::SetKernelArguments(const detection_output_params&
|
||||
KernelsData DetectionOutputKernelRef::GetKernelsData(const Params& params, const optional_params& options) const {
|
||||
assert(params.GetType() == KernelType::DETECTION_OUTPUT && options.GetType() == KernelType::DETECTION_OUTPUT);
|
||||
|
||||
if (!Validate(params, options))
|
||||
return {};
|
||||
|
||||
constexpr size_t kKernelsNum = 4;
|
||||
KernelData kd = KernelData::Default<detection_output_params>(params, kKernelsNum);
|
||||
const detection_output_params& detectOutParams = static_cast<const detection_output_params&>(params);
|
||||
@ -196,7 +214,7 @@ KernelsData DetectionOutputKernelRef::GetKernelsData(const Params& params, const
|
||||
constexpr size_t buffer_bytes = 10; // The size of struct Scores in detection_output_gpu_ref.cl
|
||||
size_t buffer_stride = num_prior_boxes * buffer_bytes;
|
||||
size_t buffer_size = num_of_images * num_classes * buffer_stride;
|
||||
size_t num_scores_size = num_of_images * (num_classes + 1) * sizeof(int);
|
||||
size_t num_scores_size = num_of_images * (num_classes + 2) * sizeof(int);
|
||||
|
||||
kd.internalBufferSizes.push_back(buffer_size);
|
||||
if (detectOutParams.detectOutParams.decrease_label_id) {
|
||||
|
@ -70,6 +70,7 @@ public:
|
||||
|
||||
protected:
|
||||
virtual JitConstants GetJitConstants(const detection_output_params& params) const;
|
||||
bool Validate(const Params& p, const optional_params& o) const override;
|
||||
void SetKernelArguments(const detection_output_params& params, clKernelData& kernel, size_t idx) const;
|
||||
};
|
||||
} // namespace kernel_selector
|
||||
|
@ -57,7 +57,7 @@
|
||||
// =================================================================================================================
|
||||
|
||||
#define unroll_for __attribute__((opencl_unroll_hint)) for
|
||||
#define NUM_CLASSES_ACC (NUM_CLASSES + 1)
|
||||
#define NUM_CLASSES_ACC (NUM_CLASSES + 2)
|
||||
|
||||
typedef struct __attribute__((__packed__)) {
|
||||
short classId;
|
||||
@ -164,11 +164,11 @@ inline int FUNC(get_accumulated_detections)(__global int* size_buf, int batch_id
|
||||
return acc_num;
|
||||
}
|
||||
|
||||
inline int FUNC(get_start_idx)(__global int* size_buf, int batch_id) {
|
||||
inline int FUNC(get_start_idx)(__global int* size_buf, int batch_id, int offset) {
|
||||
int start_idx = 0;
|
||||
for (uint idx_batch = 0; idx_batch < batch_id; idx_batch++)
|
||||
{
|
||||
const int num_det = size_buf[idx_batch * NUM_CLASSES_ACC + NUM_CLASSES];
|
||||
const int num_det = size_buf[idx_batch * NUM_CLASSES_ACC + NUM_CLASSES + offset];
|
||||
start_idx += (num_det > KEEP_TOP_K ? KEEP_TOP_K: num_det);
|
||||
}
|
||||
return start_idx;
|
||||
@ -658,16 +658,20 @@ KERNEL (detection_output_stage_final_caffe)(__global INPUT0_TYPE* input_location
|
||||
__global int *buffer1) {
|
||||
const int batchId = get_global_id(0);
|
||||
|
||||
__local int class_offset[NUM_CLASSES_ACC];
|
||||
|
||||
const int total_det = FUNC_CALL(get_accumulated_detections)(buffer1, batchId);
|
||||
buffer1[batchId * NUM_CLASSES_ACC + NUM_CLASSES] = total_det;
|
||||
// the total number of detections is also stored in the extra space of buffer
|
||||
// for case where the number of detections is larger than keep_top_k
|
||||
buffer1[batchId * NUM_CLASSES_ACC + NUM_CLASSES + 1] = total_det;
|
||||
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
|
||||
if (KEEP_TOP_K > -1 && total_det > KEEP_TOP_K) {
|
||||
__global SCORES_INFO *scoresList = (__global SCORES_INFO*)&buffer0[0];
|
||||
int num_det = 0;
|
||||
int scores_offset = (batchId * NUM_CLASSES * NUM_OF_PRIORS);
|
||||
int scores_size_offset = batchId * NUM_CLASSES_ACC;
|
||||
|
||||
for (uint idx_class = 0; idx_class < NUM_CLASSES; idx_class++) {
|
||||
const int acc_num = buffer1[scores_size_offset + idx_class];
|
||||
|
||||
@ -689,18 +693,22 @@ KERNEL (detection_output_stage_final_caffe)(__global INPUT0_TYPE* input_location
|
||||
}
|
||||
|
||||
// calculate starting point of each class
|
||||
class_offset[0] = 0;
|
||||
// store the current number of detections for buffer reuse
|
||||
int prev_offset = buffer1[scores_size_offset];
|
||||
buffer1[scores_size_offset] = 0;
|
||||
for (int i = 1; i < NUM_CLASSES_ACC; ++i) {
|
||||
class_offset[i] = class_offset[i - 1] + buffer1[scores_size_offset + i - 1];
|
||||
int cur_offset = buffer1[scores_size_offset + i];
|
||||
buffer1[scores_size_offset + i] = buffer1[scores_size_offset + i - 1] + prev_offset;
|
||||
prev_offset = cur_offset;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
|
||||
const int startIdx = FUNC_CALL(get_start_idx)(buffer1, batchId);
|
||||
const int startIdx = FUNC_CALL(get_start_idx)(buffer1, batchId, 1);
|
||||
for (uint idx_num_det = 0; idx_num_det < KEEP_TOP_K; idx_num_det++) {
|
||||
SCORES_INFO score_info;
|
||||
score_info = scoresList[scores_offset + idx_num_det];
|
||||
const int idx = startIdx + class_offset[score_info.classId];
|
||||
const int idx = startIdx + buffer1[scores_size_offset + score_info.classId];
|
||||
output[idx * OUTPUT_ROW_SIZE] = TO_OUTPUT_TYPE(batchId);
|
||||
output[idx * OUTPUT_ROW_SIZE + 1] = TO_OUTPUT_TYPE((DECREASE_LABEL_ID) ? score_info.classId - 1 : score_info.classId);
|
||||
output[idx * OUTPUT_ROW_SIZE + 2] = TO_OUTPUT_TYPE(score_info.score);
|
||||
@ -719,10 +727,11 @@ KERNEL (detection_output_stage_final_caffe)(__global INPUT0_TYPE* input_location
|
||||
ymax = max(TO_INPUT0_TYPE(0.0), min(TO_INPUT0_TYPE(1.0), ymax));
|
||||
}
|
||||
vstore4((OUTPUT_TYPE4)(xmin, ymin, xmax, ymax), 0, output + idx * OUTPUT_ROW_SIZE + 3);
|
||||
class_offset[score_info.classId]++;
|
||||
// increase starting point for the next detection in class
|
||||
buffer1[scores_size_offset + score_info.classId]++;
|
||||
}
|
||||
} else {
|
||||
const int startIdx = FUNC_CALL(get_start_idx)(buffer1, batchId);
|
||||
const int startIdx = FUNC_CALL(get_start_idx)(buffer1, batchId, 0);
|
||||
int outputIdx = 0;
|
||||
for (uint idx_class = 0; idx_class < NUM_CLASSES; idx_class++) {
|
||||
int scores_size_offset = batchId * NUM_CLASSES_ACC + idx_class;
|
||||
|
@ -1081,11 +1081,14 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format
|
||||
if (!_forcing_map.empty() && _forcing_map.count(node.id()) != 0) {
|
||||
preferred_impl = _forcing_map.at(node.id()).second;
|
||||
} else if (node.is_type<detection_output>()) {
|
||||
const auto& program = node.get_program();
|
||||
const auto& device_info = program.get_engine().get_device_info();
|
||||
const size_t lws_max = device_info.max_work_group_size;
|
||||
auto& detection_output_node = node.as<detection_output>();
|
||||
auto confidence_layout = detection_output_node.confidence().get_output_layout();
|
||||
auto prim = detection_output_node.get_primitive();
|
||||
if (confidence_layout.size.batch[0] >= 4 && prim->confidence_threshold >= 0.1 && prim->top_k <= 400 &&
|
||||
prim->num_classes >= 16 && confidence_layout.size.feature[0] > 10000)
|
||||
if (confidence_layout.size.batch[0] <= lws_max && confidence_layout.size.batch[0] >= 4 && prim->confidence_threshold >= 0.1 &&
|
||||
prim->top_k <= 400 && prim->num_classes >= 16 && confidence_layout.size.feature[0] > 10000)
|
||||
preferred_impl = impl_types::ocl;
|
||||
else
|
||||
preferred_impl = impl_types::cpu;
|
||||
|
Loading…
Reference in New Issue
Block a user