[GPU] add blocked format to experimental detectron detection output (#12561)
* [GPU] add blocked format to experimental detectron detection output * [GPU] add this kernel in whitelist of program.cpp Co-authored-by: ozhydkov-lohika <ozhydkov@lohika.com>
This commit is contained in:
@@ -20,7 +20,7 @@ layout experimental_detectron_detection_output_inst::calc_output_layout(
|
||||
const layout data_layout = impl_param.get_input_layout();
|
||||
auto desc = impl_param.typed_desc<experimental_detectron_detection_output>();
|
||||
|
||||
return layout(data_layout.data_type, format::bfyx, {static_cast<int>(desc->max_detections_per_image), 4, 1, 1});
|
||||
return layout(data_layout.data_type, data_layout.format, {static_cast<int>(desc->max_detections_per_image), 4, 1, 1});
|
||||
}
|
||||
|
||||
std::string experimental_detectron_detection_output_inst::to_string(
|
||||
|
||||
@@ -70,10 +70,17 @@ public:
|
||||
|
||||
namespace detail {
|
||||
attach_experimental_detectron_detection_output_impl::attach_experimental_detectron_detection_output_impl() {
|
||||
implementation_map<experimental_detectron_detection_output>::add(
|
||||
impl_types::ocl,
|
||||
experimental_detectron_detection_output_impl::create,
|
||||
{std::make_tuple(data_types::f16, format::bfyx), std::make_tuple(data_types::f32, format::bfyx)});
|
||||
const std::vector<data_types> types {data_types::f16, data_types::f32};
|
||||
const std::vector<format::type> formats = {format::bfyx,
|
||||
format::b_fs_yx_fsv16,
|
||||
format::b_fs_yx_fsv32,
|
||||
format::bs_fs_yx_bsv16_fsv16,
|
||||
format::bs_fs_yx_bsv32_fsv32,
|
||||
format::bs_fs_yx_bsv32_fsv16};
|
||||
|
||||
implementation_map<experimental_detectron_detection_output>::add(
|
||||
impl_types::ocl, experimental_detectron_detection_output_impl::create,
|
||||
types, formats);
|
||||
}
|
||||
} // namespace detail
|
||||
} // namespace ocl
|
||||
|
||||
@@ -25,6 +25,7 @@
|
||||
#include "intel_gpu/primitives/depth_to_space.hpp"
|
||||
#include "intel_gpu/primitives/detection_output.hpp"
|
||||
#include "intel_gpu/primitives/eltwise.hpp"
|
||||
#include "intel_gpu/primitives/experimental_detectron_detection_output.hpp"
|
||||
#include "intel_gpu/primitives/experimental_detectron_prior_grid_generator.hpp"
|
||||
#include "intel_gpu/primitives/experimental_detectron_roi_feature_extractor.hpp"
|
||||
#include "intel_gpu/primitives/experimental_detectron_topk_rois.hpp"
|
||||
|
||||
@@ -1451,7 +1451,8 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) {
|
||||
prim.type() != cldnn::generate_proposals::type_id() &&
|
||||
prim.type() != cldnn::reverse::type_id() &&
|
||||
prim.type() != cldnn::reorg_yolo::type_id() &&
|
||||
prim.type() != cldnn::scatter_elements_update::type_id()) {
|
||||
prim.type() != cldnn::scatter_elements_update::type_id() &&
|
||||
prim.type() != cldnn::experimental_detectron_detection_output::type_id()) {
|
||||
can_use_fsv16 = false;
|
||||
}
|
||||
|
||||
@@ -1494,6 +1495,7 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) {
|
||||
prim.type() != cldnn::reverse::type_id() &&
|
||||
prim.type() != cldnn::reorg_yolo::type_id() &&
|
||||
prim.type() != cldnn::scatter_elements_update::type_id() &&
|
||||
prim.type() != cldnn::experimental_detectron_detection_output::type_id() &&
|
||||
prim.type() != cldnn::deconvolution::type_id()) {
|
||||
can_use_bs_fs_yx_bsv16_fsv16 = false;
|
||||
}
|
||||
|
||||
@@ -138,7 +138,15 @@ KERNEL(eddo_ref_stage_0)
|
||||
size_t class_idx = get_global_id(1);
|
||||
#endif
|
||||
|
||||
#ifdef USE_BLOCKED_FORMAT
|
||||
INPUT_TYPE4 box;
|
||||
box[0] = boxes[INPUT0_GET_INDEX(roi_idx, 0, 0, 0)];
|
||||
box[1] = boxes[INPUT0_GET_INDEX(roi_idx, 1, 0, 0)];
|
||||
box[2] = boxes[INPUT0_GET_INDEX(roi_idx, 2, 0, 0)];
|
||||
box[3] = boxes[INPUT0_GET_INDEX(roi_idx, 3, 0, 0)];
|
||||
#else
|
||||
INPUT_TYPE4 box = vload4(roi_idx, boxes);
|
||||
#endif
|
||||
|
||||
if (any(islessequal(box.hi - box.lo, ZERO2))) {
|
||||
const int refined_offset = roi_count * class_idx + roi_idx;
|
||||
@@ -152,7 +160,16 @@ KERNEL(eddo_ref_stage_0)
|
||||
// center location of box
|
||||
const INPUT_TYPE2 center = box.lo + HALF_ONE * box_size;
|
||||
|
||||
#ifdef USE_BLOCKED_FORMAT
|
||||
INPUT_TYPE4 delta;
|
||||
delta[0] = deltas[INPUT1_GET_INDEX(roi_idx, class_idx * 4, 0, 0)];
|
||||
delta[1] = deltas[INPUT1_GET_INDEX(roi_idx, class_idx * 4 + 1, 0, 0)];
|
||||
delta[2] = deltas[INPUT1_GET_INDEX(roi_idx, class_idx * 4 + 2, 0, 0)];
|
||||
delta[3] = deltas[INPUT1_GET_INDEX(roi_idx, class_idx * 4 + 3, 0, 0)];
|
||||
delta = delta / DELTA_WEIGHTS;
|
||||
#else
|
||||
const INPUT_TYPE4 delta = vload4(offset, deltas) / DELTA_WEIGHTS;
|
||||
#endif
|
||||
|
||||
// new center location according to deltas (dx, dy)
|
||||
const INPUT_TYPE2 new_center = delta.lo * box_size + center;
|
||||
@@ -164,7 +181,15 @@ KERNEL(eddo_ref_stage_0)
|
||||
(INPUT_TYPE4)(new_center - HALF_ONE * new_size, new_center + HALF_ONE * new_size - COORDINATE_OFFSET);
|
||||
|
||||
// adjust new corner locations to be within the image region
|
||||
#ifdef USE_BLOCKED_FORMAT
|
||||
INPUT_TYPE2 img_size;
|
||||
size_t img_idx1 = INPUT3_GET_INDEX(0, 1, 0, 0);
|
||||
size_t img_idx0 = INPUT3_GET_INDEX(0, 0, 0, 0);
|
||||
img_size[0] = im_info[img_idx1];
|
||||
img_size[1] = im_info[img_idx0];
|
||||
#else
|
||||
const INPUT_TYPE2 img_size = vload2(0, im_info).s10;
|
||||
#endif
|
||||
new_box = fmax(new_box, ZERO4);
|
||||
|
||||
// recompute new width & height
|
||||
@@ -173,7 +198,13 @@ KERNEL(eddo_ref_stage_0)
|
||||
const int refined_offset = roi_count * class_idx + roi_idx;
|
||||
vstore4(new_box, refined_offset, refined_boxes);
|
||||
refined_box_areas[refined_offset] = new_box_size.x * new_box_size.y;
|
||||
|
||||
#ifdef USE_BLOCKED_FORMAT
|
||||
const int scores_offset = INPUT2_GET_INDEX(roi_idx, class_idx, 0, 0);
|
||||
refined_scores[refined_offset] = scores[scores_offset];
|
||||
#else
|
||||
refined_scores[refined_offset] = scores[offset];
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
@@ -295,17 +326,48 @@ KERNEL(eddo_ref_stage_3)
|
||||
__global OUTPUT_TYPE* output_scores) {
|
||||
size_t i = get_global_id(0);
|
||||
|
||||
#ifdef USE_BLOCKED_FORMAT
|
||||
size_t idx0 = OUTPUT_GET_INDEX(i, 0, 0, 0);
|
||||
size_t idx1 = OUTPUT_GET_INDEX(i, 1, 0, 0);
|
||||
size_t idx2 = OUTPUT_GET_INDEX(i, 2, 0, 0);
|
||||
size_t idx3 = OUTPUT_GET_INDEX(i, 3, 0, 0);
|
||||
|
||||
size_t idx_i4 = INPUT4_GET_INDEX(i, 0, 0, 0);
|
||||
size_t idx_i5 = INPUT5_GET_INDEX(i, 0, 0, 0);
|
||||
#endif
|
||||
if (i < *detection_count) {
|
||||
OUTPUT_TYPE score = score_class_index_map[i].score;
|
||||
OUTPUT_INDICES_TYPE cls = score_class_index_map[i].class_idx;
|
||||
OUTPUT_INDICES_TYPE idx = score_class_index_map[i].box_idx;
|
||||
|
||||
#ifdef USE_BLOCKED_FORMAT
|
||||
INPUT_TYPE4 res = vload4(ROI_COUNT * cls + idx, refined_boxes);
|
||||
|
||||
output_boxes[idx0] = res[0];
|
||||
output_boxes[idx1] = res[1];
|
||||
output_boxes[idx2] = res[2];
|
||||
output_boxes[idx3] = res[3];
|
||||
output_scores[idx_i4] = score;
|
||||
output_classes[idx_i5] = cls;
|
||||
#else
|
||||
vstore4(vload4(ROI_COUNT * cls + idx, refined_boxes), i, output_boxes);
|
||||
output_scores[i] = score;
|
||||
output_classes[i] = cls;
|
||||
#endif
|
||||
} else {
|
||||
|
||||
#ifdef USE_BLOCKED_FORMAT
|
||||
output_boxes[idx0] = ZERO;
|
||||
output_boxes[idx1] = ZERO;
|
||||
output_boxes[idx2] = ZERO;
|
||||
output_boxes[idx3] = ZERO;
|
||||
output_scores[idx_i4] = ZERO;
|
||||
output_classes[idx_i5] = 0;
|
||||
#else
|
||||
vstore4(ZERO4, i, output_boxes);
|
||||
output_scores[i] = ZERO;
|
||||
output_classes[i] = 0;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -21,10 +21,12 @@ ParamsKey ExperimentalDetectronDetectionOutputKernelRef::GetSupportedKey() const
|
||||
k.EnableOutputDataType(Datatype::F32);
|
||||
k.EnableOutputDataType(Datatype::INT32);
|
||||
k.EnableOutputDataType(Datatype::INT64);
|
||||
k.EnableInputLayout(DataLayout::bfyx);
|
||||
k.EnableOutputLayout(DataLayout::bfyx);
|
||||
|
||||
k.EnableAllInputLayout();
|
||||
k.EnableAllOutputLayout();
|
||||
k.EnableBatching();
|
||||
k.EnableDifferentTypes();
|
||||
k.EnableTensorPitches();
|
||||
return k;
|
||||
}
|
||||
|
||||
@@ -82,7 +84,9 @@ JitConstants ExperimentalDetectronDetectionOutputKernelRef::GetJitConstants(
|
||||
if (params.class_agnostic_box_regression) {
|
||||
jit.AddConstant(MakeJitConstant("CLASS_AGNOSTIC_BOX_REGRESSION", true));
|
||||
}
|
||||
|
||||
if (!SimpleLayout(params.inputs[0].GetLayout())) {
|
||||
jit.AddConstant(MakeJitConstant("USE_BLOCKED_FORMAT", true));
|
||||
}
|
||||
return jit;
|
||||
}
|
||||
|
||||
|
||||
@@ -57,13 +57,18 @@ struct ExperimentalDetectronDetectionOutputParams {
|
||||
std::vector<T> expected_scores;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
using ExperimentalDetectronDetectionOutputParamsWithLayout =
|
||||
std::tuple<ExperimentalDetectronDetectionOutputParams<T>, format::type>;
|
||||
|
||||
template <typename T>
|
||||
struct experimental_detectron_detection_output_test
|
||||
: public ::testing::TestWithParam<ExperimentalDetectronDetectionOutputParams<T>> {
|
||||
: public ::testing::TestWithParam<ExperimentalDetectronDetectionOutputParamsWithLayout<T>> {
|
||||
public:
|
||||
void test() {
|
||||
const ExperimentalDetectronDetectionOutputParams<T> param =
|
||||
testing::TestWithParam<ExperimentalDetectronDetectionOutputParams<T>>::GetParam();
|
||||
ExperimentalDetectronDetectionOutputParams<T> param;
|
||||
format::type fmt;
|
||||
std::tie(param, fmt) = this->GetParam();
|
||||
auto data_type = type_to_data_type<T>::value;
|
||||
|
||||
auto& engine = get_test_engine();
|
||||
@@ -87,32 +92,42 @@ public:
|
||||
const auto input_im_info = engine.allocate_memory({data_type, format::bfyx, tensor{batch(1), feature(3)}});
|
||||
set_values(input_im_info, param.im_info);
|
||||
|
||||
const primitive_id output_scores_id = "OutputScores";
|
||||
auto output_scores =
|
||||
engine.allocate_memory({data_type, format::bfyx, tensor{batch(param.max_detections_per_image)}});
|
||||
|
||||
const primitive_id output_classes_id = "OutputClasses";
|
||||
auto output_classes =
|
||||
engine.allocate_memory({data_types::i32, format::bfyx, tensor{batch(param.max_detections_per_image)}});
|
||||
|
||||
topology topology;
|
||||
|
||||
topology.add(input_layout(input_boxes_id, input_boxes->get_layout()));
|
||||
topology.add(input_layout(input_deltas_id, input_deltas->get_layout()));
|
||||
topology.add(input_layout(input_scores_id, input_scores->get_layout()));
|
||||
topology.add(input_layout(input_im_info_id, input_im_info->get_layout()));
|
||||
topology.add(mutable_data(output_classes_id, output_classes));
|
||||
topology.add(mutable_data(output_scores_id, output_scores));
|
||||
|
||||
const primitive_id b_input_boxes_id = "BlockedInputBoxes";
|
||||
const primitive_id b_input_deltas_id = "BlockedInputDeltas";
|
||||
const primitive_id b_input_scores_id = "BlockedInputScores";
|
||||
const primitive_id b_input_im_info_id = "BlockedInputImInfo";
|
||||
|
||||
const primitive_id eddo_id = "experimental_detectron_detection_output";
|
||||
const auto eddo_primitive = experimental_detectron_detection_output{
|
||||
eddo_id,
|
||||
input_boxes_id,
|
||||
input_deltas_id,
|
||||
input_scores_id,
|
||||
input_im_info_id,
|
||||
output_classes_id,
|
||||
output_scores_id,
|
||||
topology.add(reorder(b_input_boxes_id, input_boxes_id, fmt, data_type));
|
||||
topology.add(reorder(b_input_deltas_id, input_deltas_id, fmt, data_type));
|
||||
topology.add(reorder(b_input_scores_id, input_scores_id, fmt, data_type));
|
||||
topology.add(reorder(b_input_im_info_id, input_im_info_id, fmt, data_type));
|
||||
|
||||
const primitive_id b_output_scores_id = "BlockedOutputScores";
|
||||
const primitive_id b_output_classes_id = "BlockedOutputClasses";
|
||||
const layout output_scores_layout = {data_type, fmt, tensor{batch(param.max_detections_per_image)}};
|
||||
auto b_output_scores = engine.allocate_memory(output_scores_layout);
|
||||
const layout output_classes_layout = {data_types::i32, fmt, tensor{batch(param.max_detections_per_image)}};
|
||||
auto b_output_classes = engine.allocate_memory(output_classes_layout);
|
||||
|
||||
topology.add(mutable_data(b_output_classes_id, b_output_classes));
|
||||
topology.add(mutable_data(b_output_scores_id, b_output_scores));
|
||||
|
||||
const primitive_id b_eddo_id = "blocked_experimental_detectron_detection_output";
|
||||
const auto b_eddo_primitive = experimental_detectron_detection_output{
|
||||
b_eddo_id,
|
||||
b_input_boxes_id,
|
||||
b_input_deltas_id,
|
||||
b_input_scores_id,
|
||||
b_input_im_info_id,
|
||||
b_output_classes_id,
|
||||
b_output_scores_id,
|
||||
param.score_threshold,
|
||||
param.nms_threshold,
|
||||
param.num_classes,
|
||||
@@ -123,7 +138,9 @@ public:
|
||||
param.deltas_weights,
|
||||
};
|
||||
|
||||
topology.add(eddo_primitive);
|
||||
topology.add(b_eddo_primitive);
|
||||
const primitive_id eddo_id = "experimental_detectron_detection_output";
|
||||
topology.add(reorder(eddo_id, b_eddo_primitive /*b_eddo_id*/, format::bfyx, data_type));
|
||||
|
||||
network network(engine, topology);
|
||||
|
||||
@@ -131,20 +148,34 @@ public:
|
||||
network.set_input_data(input_deltas_id, input_deltas);
|
||||
network.set_input_data(input_scores_id, input_scores);
|
||||
network.set_input_data(input_im_info_id, input_im_info);
|
||||
|
||||
const auto outputs = network.execute();
|
||||
|
||||
const auto output_boxes = outputs.at(eddo_id).get_memory();
|
||||
|
||||
const cldnn::mem_lock<T> output_boxes_ptr(output_boxes, get_test_stream());
|
||||
ASSERT_EQ(output_boxes_ptr.size(), param.max_detections_per_image * 4);
|
||||
|
||||
const cldnn::mem_lock<int32_t> output_classes_ptr(output_classes, get_test_stream());
|
||||
ASSERT_EQ(output_classes_ptr.size(), param.max_detections_per_image);
|
||||
|
||||
const primitive_id output_scores_id = "OutputScores";
|
||||
cldnn::topology reorder_score_topology;
|
||||
reorder_score_topology.add(input_layout(b_output_scores_id, output_scores_layout));
|
||||
reorder_score_topology.add(reorder(output_scores_id, b_output_scores_id, format::bfyx, data_type));
|
||||
cldnn::network reorder_score_net{engine, reorder_score_topology};
|
||||
reorder_score_net.set_input_data(b_output_scores_id, b_output_scores);
|
||||
const auto score_result = reorder_score_net.execute();
|
||||
const auto output_scores = score_result.at(output_scores_id).get_memory();
|
||||
const cldnn::mem_lock<T> output_scores_ptr(output_scores, get_test_stream());
|
||||
ASSERT_EQ(output_scores_ptr.size(), param.max_detections_per_image);
|
||||
|
||||
const primitive_id output_classes_id = "OutputClasses";
|
||||
cldnn::topology reorder_classes_topology;
|
||||
reorder_classes_topology.add(input_layout(b_output_classes_id, output_classes_layout));
|
||||
reorder_classes_topology.add(reorder(output_classes_id, b_output_classes_id, format::bfyx, data_types::i32));
|
||||
cldnn::network reorder_classes_net{engine, reorder_classes_topology};
|
||||
reorder_classes_net.set_input_data(b_output_classes_id, b_output_classes);
|
||||
const auto classes_result = reorder_classes_net.execute();
|
||||
const auto output_classes = classes_result.at(output_classes_id).get_memory();
|
||||
const cldnn::mem_lock<int32_t> output_classes_ptr(output_classes, get_test_stream());
|
||||
ASSERT_EQ(output_classes_ptr.size(), param.max_detections_per_image);
|
||||
|
||||
const auto& expected_boxes = param.expected_boxes;
|
||||
const auto& expected_classes = param.expected_classes;
|
||||
const auto& expected_scores = param.expected_scores;
|
||||
@@ -172,6 +203,14 @@ TEST_P(experimental_detectron_detection_output_test_f16, basic) {
|
||||
ASSERT_NO_FATAL_FAILURE(test());
|
||||
}
|
||||
|
||||
const std::vector<format::type> layouts{
|
||||
format::bfyx,
|
||||
format::b_fs_yx_fsv16,
|
||||
format::b_fs_yx_fsv32,
|
||||
format::bs_fs_yx_bsv16_fsv16,
|
||||
format::bs_fs_yx_bsv32_fsv16,
|
||||
format::bs_fs_yx_bsv32_fsv32};
|
||||
|
||||
template <typename T>
|
||||
std::vector<ExperimentalDetectronDetectionOutputParams<T>> getExperimentalDetectronDetectionOutputParams() {
|
||||
std::vector<ExperimentalDetectronDetectionOutputParams<T>> params = {
|
||||
@@ -390,8 +429,14 @@ std::vector<ExperimentalDetectronDetectionOutputParams<T>> getExperimentalDetect
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(experimental_detectron_detection_output_gpu_test,
|
||||
experimental_detectron_detection_output_test_f32,
|
||||
::testing::ValuesIn(getExperimentalDetectronDetectionOutputParams<float>()));
|
||||
testing::Combine(
|
||||
::testing::ValuesIn(getExperimentalDetectronDetectionOutputParams<float>()),
|
||||
::testing::ValuesIn(layouts)
|
||||
));
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(experimental_detectron_detection_output_gpu_test,
|
||||
experimental_detectron_detection_output_test_f16,
|
||||
::testing::ValuesIn(getExperimentalDetectronDetectionOutputParams<half_t>()));
|
||||
testing::Combine(
|
||||
::testing::ValuesIn(getExperimentalDetectronDetectionOutputParams<half_t>()),
|
||||
::testing::ValuesIn(layouts)
|
||||
));
|
||||
|
||||
Reference in New Issue
Block a user