[GPU] add blocked format to experimental detectron detection output (#12561)

* [GPU] add blocked format to experimental detectron detection output

* [GPU] add this kernel in whitelist of program.cpp

Co-authored-by: ozhydkov-lohika <ozhydkov@lohika.com>
This commit is contained in:
OlehKravchyshyn
2022-11-01 02:50:10 +02:00
committed by GitHub
parent 4e519eb110
commit ebbf5e3f10
7 changed files with 160 additions and 39 deletions

View File

@@ -20,7 +20,7 @@ layout experimental_detectron_detection_output_inst::calc_output_layout(
const layout data_layout = impl_param.get_input_layout();
auto desc = impl_param.typed_desc<experimental_detectron_detection_output>();
return layout(data_layout.data_type, format::bfyx, {static_cast<int>(desc->max_detections_per_image), 4, 1, 1});
return layout(data_layout.data_type, data_layout.format, {static_cast<int>(desc->max_detections_per_image), 4, 1, 1});
}
std::string experimental_detectron_detection_output_inst::to_string(

View File

@@ -70,10 +70,17 @@ public:
namespace detail {
attach_experimental_detectron_detection_output_impl::attach_experimental_detectron_detection_output_impl() {
implementation_map<experimental_detectron_detection_output>::add(
impl_types::ocl,
experimental_detectron_detection_output_impl::create,
{std::make_tuple(data_types::f16, format::bfyx), std::make_tuple(data_types::f32, format::bfyx)});
const std::vector<data_types> types {data_types::f16, data_types::f32};
const std::vector<format::type> formats = {format::bfyx,
format::b_fs_yx_fsv16,
format::b_fs_yx_fsv32,
format::bs_fs_yx_bsv16_fsv16,
format::bs_fs_yx_bsv32_fsv32,
format::bs_fs_yx_bsv32_fsv16};
implementation_map<experimental_detectron_detection_output>::add(
impl_types::ocl, experimental_detectron_detection_output_impl::create,
types, formats);
}
} // namespace detail
} // namespace ocl

View File

@@ -25,6 +25,7 @@
#include "intel_gpu/primitives/depth_to_space.hpp"
#include "intel_gpu/primitives/detection_output.hpp"
#include "intel_gpu/primitives/eltwise.hpp"
#include "intel_gpu/primitives/experimental_detectron_detection_output.hpp"
#include "intel_gpu/primitives/experimental_detectron_prior_grid_generator.hpp"
#include "intel_gpu/primitives/experimental_detectron_roi_feature_extractor.hpp"
#include "intel_gpu/primitives/experimental_detectron_topk_rois.hpp"

View File

@@ -1451,7 +1451,8 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) {
prim.type() != cldnn::generate_proposals::type_id() &&
prim.type() != cldnn::reverse::type_id() &&
prim.type() != cldnn::reorg_yolo::type_id() &&
prim.type() != cldnn::scatter_elements_update::type_id()) {
prim.type() != cldnn::scatter_elements_update::type_id() &&
prim.type() != cldnn::experimental_detectron_detection_output::type_id()) {
can_use_fsv16 = false;
}
@@ -1494,6 +1495,7 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) {
prim.type() != cldnn::reverse::type_id() &&
prim.type() != cldnn::reorg_yolo::type_id() &&
prim.type() != cldnn::scatter_elements_update::type_id() &&
prim.type() != cldnn::experimental_detectron_detection_output::type_id() &&
prim.type() != cldnn::deconvolution::type_id()) {
can_use_bs_fs_yx_bsv16_fsv16 = false;
}

View File

@@ -138,7 +138,15 @@ KERNEL(eddo_ref_stage_0)
size_t class_idx = get_global_id(1);
#endif
#ifdef USE_BLOCKED_FORMAT
INPUT_TYPE4 box;
box[0] = boxes[INPUT0_GET_INDEX(roi_idx, 0, 0, 0)];
box[1] = boxes[INPUT0_GET_INDEX(roi_idx, 1, 0, 0)];
box[2] = boxes[INPUT0_GET_INDEX(roi_idx, 2, 0, 0)];
box[3] = boxes[INPUT0_GET_INDEX(roi_idx, 3, 0, 0)];
#else
INPUT_TYPE4 box = vload4(roi_idx, boxes);
#endif
if (any(islessequal(box.hi - box.lo, ZERO2))) {
const int refined_offset = roi_count * class_idx + roi_idx;
@@ -152,7 +160,16 @@ KERNEL(eddo_ref_stage_0)
// center location of box
const INPUT_TYPE2 center = box.lo + HALF_ONE * box_size;
#ifdef USE_BLOCKED_FORMAT
INPUT_TYPE4 delta;
delta[0] = deltas[INPUT1_GET_INDEX(roi_idx, class_idx * 4, 0, 0)];
delta[1] = deltas[INPUT1_GET_INDEX(roi_idx, class_idx * 4 + 1, 0, 0)];
delta[2] = deltas[INPUT1_GET_INDEX(roi_idx, class_idx * 4 + 2, 0, 0)];
delta[3] = deltas[INPUT1_GET_INDEX(roi_idx, class_idx * 4 + 3, 0, 0)];
delta = delta / DELTA_WEIGHTS;
#else
const INPUT_TYPE4 delta = vload4(offset, deltas) / DELTA_WEIGHTS;
#endif
// new center location according to deltas (dx, dy)
const INPUT_TYPE2 new_center = delta.lo * box_size + center;
@@ -164,7 +181,15 @@ KERNEL(eddo_ref_stage_0)
(INPUT_TYPE4)(new_center - HALF_ONE * new_size, new_center + HALF_ONE * new_size - COORDINATE_OFFSET);
// adjust new corner locations to be within the image region
#ifdef USE_BLOCKED_FORMAT
INPUT_TYPE2 img_size;
size_t img_idx1 = INPUT3_GET_INDEX(0, 1, 0, 0);
size_t img_idx0 = INPUT3_GET_INDEX(0, 0, 0, 0);
img_size[0] = im_info[img_idx1];
img_size[1] = im_info[img_idx0];
#else
const INPUT_TYPE2 img_size = vload2(0, im_info).s10;
#endif
new_box = fmax(new_box, ZERO4);
// recompute new width & height
@@ -173,7 +198,13 @@ KERNEL(eddo_ref_stage_0)
const int refined_offset = roi_count * class_idx + roi_idx;
vstore4(new_box, refined_offset, refined_boxes);
refined_box_areas[refined_offset] = new_box_size.x * new_box_size.y;
#ifdef USE_BLOCKED_FORMAT
const int scores_offset = INPUT2_GET_INDEX(roi_idx, class_idx, 0, 0);
refined_scores[refined_offset] = scores[scores_offset];
#else
refined_scores[refined_offset] = scores[offset];
#endif
}
}
@@ -295,17 +326,48 @@ KERNEL(eddo_ref_stage_3)
__global OUTPUT_TYPE* output_scores) {
size_t i = get_global_id(0);
#ifdef USE_BLOCKED_FORMAT
size_t idx0 = OUTPUT_GET_INDEX(i, 0, 0, 0);
size_t idx1 = OUTPUT_GET_INDEX(i, 1, 0, 0);
size_t idx2 = OUTPUT_GET_INDEX(i, 2, 0, 0);
size_t idx3 = OUTPUT_GET_INDEX(i, 3, 0, 0);
size_t idx_i4 = INPUT4_GET_INDEX(i, 0, 0, 0);
size_t idx_i5 = INPUT5_GET_INDEX(i, 0, 0, 0);
#endif
if (i < *detection_count) {
OUTPUT_TYPE score = score_class_index_map[i].score;
OUTPUT_INDICES_TYPE cls = score_class_index_map[i].class_idx;
OUTPUT_INDICES_TYPE idx = score_class_index_map[i].box_idx;
#ifdef USE_BLOCKED_FORMAT
INPUT_TYPE4 res = vload4(ROI_COUNT * cls + idx, refined_boxes);
output_boxes[idx0] = res[0];
output_boxes[idx1] = res[1];
output_boxes[idx2] = res[2];
output_boxes[idx3] = res[3];
output_scores[idx_i4] = score;
output_classes[idx_i5] = cls;
#else
vstore4(vload4(ROI_COUNT * cls + idx, refined_boxes), i, output_boxes);
output_scores[i] = score;
output_classes[i] = cls;
#endif
} else {
#ifdef USE_BLOCKED_FORMAT
output_boxes[idx0] = ZERO;
output_boxes[idx1] = ZERO;
output_boxes[idx2] = ZERO;
output_boxes[idx3] = ZERO;
output_scores[idx_i4] = ZERO;
output_classes[idx_i5] = 0;
#else
vstore4(ZERO4, i, output_boxes);
output_scores[i] = ZERO;
output_classes[i] = 0;
#endif
}
}

View File

@@ -21,10 +21,12 @@ ParamsKey ExperimentalDetectronDetectionOutputKernelRef::GetSupportedKey() const
k.EnableOutputDataType(Datatype::F32);
k.EnableOutputDataType(Datatype::INT32);
k.EnableOutputDataType(Datatype::INT64);
k.EnableInputLayout(DataLayout::bfyx);
k.EnableOutputLayout(DataLayout::bfyx);
k.EnableAllInputLayout();
k.EnableAllOutputLayout();
k.EnableBatching();
k.EnableDifferentTypes();
k.EnableTensorPitches();
return k;
}
@@ -82,7 +84,9 @@ JitConstants ExperimentalDetectronDetectionOutputKernelRef::GetJitConstants(
if (params.class_agnostic_box_regression) {
jit.AddConstant(MakeJitConstant("CLASS_AGNOSTIC_BOX_REGRESSION", true));
}
if (!SimpleLayout(params.inputs[0].GetLayout())) {
jit.AddConstant(MakeJitConstant("USE_BLOCKED_FORMAT", true));
}
return jit;
}

View File

@@ -57,13 +57,18 @@ struct ExperimentalDetectronDetectionOutputParams {
std::vector<T> expected_scores;
};
template <typename T>
using ExperimentalDetectronDetectionOutputParamsWithLayout =
std::tuple<ExperimentalDetectronDetectionOutputParams<T>, format::type>;
template <typename T>
struct experimental_detectron_detection_output_test
: public ::testing::TestWithParam<ExperimentalDetectronDetectionOutputParams<T>> {
: public ::testing::TestWithParam<ExperimentalDetectronDetectionOutputParamsWithLayout<T>> {
public:
void test() {
const ExperimentalDetectronDetectionOutputParams<T> param =
testing::TestWithParam<ExperimentalDetectronDetectionOutputParams<T>>::GetParam();
ExperimentalDetectronDetectionOutputParams<T> param;
format::type fmt;
std::tie(param, fmt) = this->GetParam();
auto data_type = type_to_data_type<T>::value;
auto& engine = get_test_engine();
@@ -87,32 +92,42 @@ public:
const auto input_im_info = engine.allocate_memory({data_type, format::bfyx, tensor{batch(1), feature(3)}});
set_values(input_im_info, param.im_info);
const primitive_id output_scores_id = "OutputScores";
auto output_scores =
engine.allocate_memory({data_type, format::bfyx, tensor{batch(param.max_detections_per_image)}});
const primitive_id output_classes_id = "OutputClasses";
auto output_classes =
engine.allocate_memory({data_types::i32, format::bfyx, tensor{batch(param.max_detections_per_image)}});
topology topology;
topology.add(input_layout(input_boxes_id, input_boxes->get_layout()));
topology.add(input_layout(input_deltas_id, input_deltas->get_layout()));
topology.add(input_layout(input_scores_id, input_scores->get_layout()));
topology.add(input_layout(input_im_info_id, input_im_info->get_layout()));
topology.add(mutable_data(output_classes_id, output_classes));
topology.add(mutable_data(output_scores_id, output_scores));
const primitive_id b_input_boxes_id = "BlockedInputBoxes";
const primitive_id b_input_deltas_id = "BlockedInputDeltas";
const primitive_id b_input_scores_id = "BlockedInputScores";
const primitive_id b_input_im_info_id = "BlockedInputImInfo";
const primitive_id eddo_id = "experimental_detectron_detection_output";
const auto eddo_primitive = experimental_detectron_detection_output{
eddo_id,
input_boxes_id,
input_deltas_id,
input_scores_id,
input_im_info_id,
output_classes_id,
output_scores_id,
topology.add(reorder(b_input_boxes_id, input_boxes_id, fmt, data_type));
topology.add(reorder(b_input_deltas_id, input_deltas_id, fmt, data_type));
topology.add(reorder(b_input_scores_id, input_scores_id, fmt, data_type));
topology.add(reorder(b_input_im_info_id, input_im_info_id, fmt, data_type));
const primitive_id b_output_scores_id = "BlockedOutputScores";
const primitive_id b_output_classes_id = "BlockedOutputClasses";
const layout output_scores_layout = {data_type, fmt, tensor{batch(param.max_detections_per_image)}};
auto b_output_scores = engine.allocate_memory(output_scores_layout);
const layout output_classes_layout = {data_types::i32, fmt, tensor{batch(param.max_detections_per_image)}};
auto b_output_classes = engine.allocate_memory(output_classes_layout);
topology.add(mutable_data(b_output_classes_id, b_output_classes));
topology.add(mutable_data(b_output_scores_id, b_output_scores));
const primitive_id b_eddo_id = "blocked_experimental_detectron_detection_output";
const auto b_eddo_primitive = experimental_detectron_detection_output{
b_eddo_id,
b_input_boxes_id,
b_input_deltas_id,
b_input_scores_id,
b_input_im_info_id,
b_output_classes_id,
b_output_scores_id,
param.score_threshold,
param.nms_threshold,
param.num_classes,
@@ -123,7 +138,9 @@ public:
param.deltas_weights,
};
topology.add(eddo_primitive);
topology.add(b_eddo_primitive);
const primitive_id eddo_id = "experimental_detectron_detection_output";
topology.add(reorder(eddo_id, b_eddo_primitive /*b_eddo_id*/, format::bfyx, data_type));
network network(engine, topology);
@@ -131,20 +148,34 @@ public:
network.set_input_data(input_deltas_id, input_deltas);
network.set_input_data(input_scores_id, input_scores);
network.set_input_data(input_im_info_id, input_im_info);
const auto outputs = network.execute();
const auto output_boxes = outputs.at(eddo_id).get_memory();
const cldnn::mem_lock<T> output_boxes_ptr(output_boxes, get_test_stream());
ASSERT_EQ(output_boxes_ptr.size(), param.max_detections_per_image * 4);
const cldnn::mem_lock<int32_t> output_classes_ptr(output_classes, get_test_stream());
ASSERT_EQ(output_classes_ptr.size(), param.max_detections_per_image);
const primitive_id output_scores_id = "OutputScores";
cldnn::topology reorder_score_topology;
reorder_score_topology.add(input_layout(b_output_scores_id, output_scores_layout));
reorder_score_topology.add(reorder(output_scores_id, b_output_scores_id, format::bfyx, data_type));
cldnn::network reorder_score_net{engine, reorder_score_topology};
reorder_score_net.set_input_data(b_output_scores_id, b_output_scores);
const auto score_result = reorder_score_net.execute();
const auto output_scores = score_result.at(output_scores_id).get_memory();
const cldnn::mem_lock<T> output_scores_ptr(output_scores, get_test_stream());
ASSERT_EQ(output_scores_ptr.size(), param.max_detections_per_image);
const primitive_id output_classes_id = "OutputClasses";
cldnn::topology reorder_classes_topology;
reorder_classes_topology.add(input_layout(b_output_classes_id, output_classes_layout));
reorder_classes_topology.add(reorder(output_classes_id, b_output_classes_id, format::bfyx, data_types::i32));
cldnn::network reorder_classes_net{engine, reorder_classes_topology};
reorder_classes_net.set_input_data(b_output_classes_id, b_output_classes);
const auto classes_result = reorder_classes_net.execute();
const auto output_classes = classes_result.at(output_classes_id).get_memory();
const cldnn::mem_lock<int32_t> output_classes_ptr(output_classes, get_test_stream());
ASSERT_EQ(output_classes_ptr.size(), param.max_detections_per_image);
const auto& expected_boxes = param.expected_boxes;
const auto& expected_classes = param.expected_classes;
const auto& expected_scores = param.expected_scores;
@@ -172,6 +203,14 @@ TEST_P(experimental_detectron_detection_output_test_f16, basic) {
ASSERT_NO_FATAL_FAILURE(test());
}
const std::vector<format::type> layouts{
format::bfyx,
format::b_fs_yx_fsv16,
format::b_fs_yx_fsv32,
format::bs_fs_yx_bsv16_fsv16,
format::bs_fs_yx_bsv32_fsv16,
format::bs_fs_yx_bsv32_fsv32};
template <typename T>
std::vector<ExperimentalDetectronDetectionOutputParams<T>> getExperimentalDetectronDetectionOutputParams() {
std::vector<ExperimentalDetectronDetectionOutputParams<T>> params = {
@@ -390,8 +429,14 @@ std::vector<ExperimentalDetectronDetectionOutputParams<T>> getExperimentalDetect
INSTANTIATE_TEST_SUITE_P(experimental_detectron_detection_output_gpu_test,
experimental_detectron_detection_output_test_f32,
::testing::ValuesIn(getExperimentalDetectronDetectionOutputParams<float>()));
testing::Combine(
::testing::ValuesIn(getExperimentalDetectronDetectionOutputParams<float>()),
::testing::ValuesIn(layouts)
));
INSTANTIATE_TEST_SUITE_P(experimental_detectron_detection_output_gpu_test,
experimental_detectron_detection_output_test_f16,
::testing::ValuesIn(getExperimentalDetectronDetectionOutputParams<half_t>()));
testing::Combine(
::testing::ValuesIn(getExperimentalDetectronDetectionOutputParams<half_t>()),
::testing::ValuesIn(layouts)
));