From ebbf5e3f10ce3d5bc1b8f451c23b6d311e7502bc Mon Sep 17 00:00:00 2001 From: OlehKravchyshyn <86965088+OlehKravchyshyn@users.noreply.github.com> Date: Tue, 1 Nov 2022 02:50:10 +0200 Subject: [PATCH] [GPU] add blocked format to experimental detectron detection output (#12561) * [GPU] add blocked format to experimental detectron detection output * [GPU] add this kernel in whitelist of program.cpp Co-authored-by: ozhydkov-lohika --- ...xperimental_detectron_detection_output.cpp | 2 +- ...xperimental_detectron_detection_output.cpp | 15 ++- .../src/graph/impls/ocl/register.hpp | 1 + src/plugins/intel_gpu/src/graph/program.cpp | 4 +- ...rimental_detectron_detection_output_ref.cl | 62 +++++++++++ .../ed_do/detection_output_kernel_ref.cpp | 10 +- ...al_detectron_detection_output_gpu_test.cpp | 105 +++++++++++++----- 7 files changed, 160 insertions(+), 39 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/experimental_detectron_detection_output.cpp b/src/plugins/intel_gpu/src/graph/experimental_detectron_detection_output.cpp index 2a1c390687e..daaa48fc4c5 100644 --- a/src/plugins/intel_gpu/src/graph/experimental_detectron_detection_output.cpp +++ b/src/plugins/intel_gpu/src/graph/experimental_detectron_detection_output.cpp @@ -20,7 +20,7 @@ layout experimental_detectron_detection_output_inst::calc_output_layout( const layout data_layout = impl_param.get_input_layout(); auto desc = impl_param.typed_desc(); - return layout(data_layout.data_type, format::bfyx, {static_cast(desc->max_detections_per_image), 4, 1, 1}); + return layout(data_layout.data_type, data_layout.format, {static_cast(desc->max_detections_per_image), 4, 1, 1}); } std::string experimental_detectron_detection_output_inst::to_string( diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/experimental_detectron_detection_output.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/experimental_detectron_detection_output.cpp index 8a8cc45c065..b148716df33 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/experimental_detectron_detection_output.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/experimental_detectron_detection_output.cpp @@ -70,10 +70,17 @@ public: namespace detail { attach_experimental_detectron_detection_output_impl::attach_experimental_detectron_detection_output_impl() { - implementation_map::add( - impl_types::ocl, - experimental_detectron_detection_output_impl::create, - {std::make_tuple(data_types::f16, format::bfyx), std::make_tuple(data_types::f32, format::bfyx)}); + const std::vector types {data_types::f16, data_types::f32}; + const std::vector formats = {format::bfyx, + format::b_fs_yx_fsv16, + format::b_fs_yx_fsv32, + format::bs_fs_yx_bsv16_fsv16, + format::bs_fs_yx_bsv32_fsv32, + format::bs_fs_yx_bsv32_fsv16}; + + implementation_map::add( + impl_types::ocl, experimental_detectron_detection_output_impl::create, + types, formats); } } // namespace detail } // namespace ocl diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp index a483a10ca16..ca6372ce61a 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp @@ -25,6 +25,7 @@ #include "intel_gpu/primitives/depth_to_space.hpp" #include "intel_gpu/primitives/detection_output.hpp" #include "intel_gpu/primitives/eltwise.hpp" +#include "intel_gpu/primitives/experimental_detectron_detection_output.hpp" #include "intel_gpu/primitives/experimental_detectron_prior_grid_generator.hpp" #include "intel_gpu/primitives/experimental_detectron_roi_feature_extractor.hpp" #include "intel_gpu/primitives/experimental_detectron_topk_rois.hpp" diff --git a/src/plugins/intel_gpu/src/graph/program.cpp b/src/plugins/intel_gpu/src/graph/program.cpp index 93ef1c27b0a..a643ff7ed53 100644 --- a/src/plugins/intel_gpu/src/graph/program.cpp +++ b/src/plugins/intel_gpu/src/graph/program.cpp @@ -1451,7 +1451,8 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) { prim.type() != cldnn::generate_proposals::type_id() && prim.type() != cldnn::reverse::type_id() && prim.type() != cldnn::reorg_yolo::type_id() && - prim.type() != cldnn::scatter_elements_update::type_id()) { + prim.type() != cldnn::scatter_elements_update::type_id() && + prim.type() != cldnn::experimental_detectron_detection_output::type_id()) { can_use_fsv16 = false; } @@ -1494,6 +1495,7 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) { prim.type() != cldnn::reverse::type_id() && prim.type() != cldnn::reorg_yolo::type_id() && prim.type() != cldnn::scatter_elements_update::type_id() && + prim.type() != cldnn::experimental_detectron_detection_output::type_id() && prim.type() != cldnn::deconvolution::type_id()) { can_use_bs_fs_yx_bsv16_fsv16 = false; } diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/experimental_detectron_detection_output_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/experimental_detectron_detection_output_ref.cl index 00eb0e71a5c..58dfdda19e7 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/experimental_detectron_detection_output_ref.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/experimental_detectron_detection_output_ref.cl @@ -138,7 +138,15 @@ KERNEL(eddo_ref_stage_0) size_t class_idx = get_global_id(1); #endif +#ifdef USE_BLOCKED_FORMAT + INPUT_TYPE4 box; + box[0] = boxes[INPUT0_GET_INDEX(roi_idx, 0, 0, 0)]; + box[1] = boxes[INPUT0_GET_INDEX(roi_idx, 1, 0, 0)]; + box[2] = boxes[INPUT0_GET_INDEX(roi_idx, 2, 0, 0)]; + box[3] = boxes[INPUT0_GET_INDEX(roi_idx, 3, 0, 0)]; +#else INPUT_TYPE4 box = vload4(roi_idx, boxes); +#endif if (any(islessequal(box.hi - box.lo, ZERO2))) { const int refined_offset = roi_count * class_idx + roi_idx; @@ -152,7 +160,16 @@ KERNEL(eddo_ref_stage_0) // center location of box const INPUT_TYPE2 center = box.lo + HALF_ONE * box_size; +#ifdef USE_BLOCKED_FORMAT + INPUT_TYPE4 delta; + delta[0] = deltas[INPUT1_GET_INDEX(roi_idx, class_idx * 4, 0, 0)]; + delta[1] = deltas[INPUT1_GET_INDEX(roi_idx, class_idx * 4 + 1, 0, 0)]; + delta[2] = deltas[INPUT1_GET_INDEX(roi_idx, class_idx * 4 + 2, 0, 0)]; + delta[3] = deltas[INPUT1_GET_INDEX(roi_idx, class_idx * 4 + 3, 0, 0)]; + delta = delta / DELTA_WEIGHTS; +#else const INPUT_TYPE4 delta = vload4(offset, deltas) / DELTA_WEIGHTS; +#endif // new center location according to deltas (dx, dy) const INPUT_TYPE2 new_center = delta.lo * box_size + center; @@ -164,7 +181,15 @@ KERNEL(eddo_ref_stage_0) (INPUT_TYPE4)(new_center - HALF_ONE * new_size, new_center + HALF_ONE * new_size - COORDINATE_OFFSET); // adjust new corner locations to be within the image region +#ifdef USE_BLOCKED_FORMAT + INPUT_TYPE2 img_size; + size_t img_idx1 = INPUT3_GET_INDEX(0, 1, 0, 0); + size_t img_idx0 = INPUT3_GET_INDEX(0, 0, 0, 0); + img_size[0] = im_info[img_idx1]; + img_size[1] = im_info[img_idx0]; +#else const INPUT_TYPE2 img_size = vload2(0, im_info).s10; +#endif new_box = fmax(new_box, ZERO4); // recompute new width & height @@ -173,7 +198,13 @@ KERNEL(eddo_ref_stage_0) const int refined_offset = roi_count * class_idx + roi_idx; vstore4(new_box, refined_offset, refined_boxes); refined_box_areas[refined_offset] = new_box_size.x * new_box_size.y; + +#ifdef USE_BLOCKED_FORMAT + const int scores_offset = INPUT2_GET_INDEX(roi_idx, class_idx, 0, 0); + refined_scores[refined_offset] = scores[scores_offset]; +#else refined_scores[refined_offset] = scores[offset]; +#endif } } @@ -295,17 +326,48 @@ KERNEL(eddo_ref_stage_3) __global OUTPUT_TYPE* output_scores) { size_t i = get_global_id(0); +#ifdef USE_BLOCKED_FORMAT + size_t idx0 = OUTPUT_GET_INDEX(i, 0, 0, 0); + size_t idx1 = OUTPUT_GET_INDEX(i, 1, 0, 0); + size_t idx2 = OUTPUT_GET_INDEX(i, 2, 0, 0); + size_t idx3 = OUTPUT_GET_INDEX(i, 3, 0, 0); + + size_t idx_i4 = INPUT4_GET_INDEX(i, 0, 0, 0); + size_t idx_i5 = INPUT5_GET_INDEX(i, 0, 0, 0); +#endif if (i < *detection_count) { OUTPUT_TYPE score = score_class_index_map[i].score; OUTPUT_INDICES_TYPE cls = score_class_index_map[i].class_idx; OUTPUT_INDICES_TYPE idx = score_class_index_map[i].box_idx; + +#ifdef USE_BLOCKED_FORMAT + INPUT_TYPE4 res = vload4(ROI_COUNT * cls + idx, refined_boxes); + + output_boxes[idx0] = res[0]; + output_boxes[idx1] = res[1]; + output_boxes[idx2] = res[2]; + output_boxes[idx3] = res[3]; + output_scores[idx_i4] = score; + output_classes[idx_i5] = cls; +#else vstore4(vload4(ROI_COUNT * cls + idx, refined_boxes), i, output_boxes); output_scores[i] = score; output_classes[i] = cls; +#endif } else { + +#ifdef USE_BLOCKED_FORMAT + output_boxes[idx0] = ZERO; + output_boxes[idx1] = ZERO; + output_boxes[idx2] = ZERO; + output_boxes[idx3] = ZERO; + output_scores[idx_i4] = ZERO; + output_classes[idx_i5] = 0; +#else vstore4(ZERO4, i, output_boxes); output_scores[i] = ZERO; output_classes[i] = 0; +#endif } } diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/ed_do/detection_output_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/ed_do/detection_output_kernel_ref.cpp index c74236012f0..6d5da7e8ef8 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/ed_do/detection_output_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/ed_do/detection_output_kernel_ref.cpp @@ -21,10 +21,12 @@ ParamsKey ExperimentalDetectronDetectionOutputKernelRef::GetSupportedKey() const k.EnableOutputDataType(Datatype::F32); k.EnableOutputDataType(Datatype::INT32); k.EnableOutputDataType(Datatype::INT64); - k.EnableInputLayout(DataLayout::bfyx); - k.EnableOutputLayout(DataLayout::bfyx); + + k.EnableAllInputLayout(); + k.EnableAllOutputLayout(); k.EnableBatching(); k.EnableDifferentTypes(); + k.EnableTensorPitches(); return k; } @@ -82,7 +84,9 @@ JitConstants ExperimentalDetectronDetectionOutputKernelRef::GetJitConstants( if (params.class_agnostic_box_regression) { jit.AddConstant(MakeJitConstant("CLASS_AGNOSTIC_BOX_REGRESSION", true)); } - + if (!SimpleLayout(params.inputs[0].GetLayout())) { + jit.AddConstant(MakeJitConstant("USE_BLOCKED_FORMAT", true)); + } return jit; } diff --git a/src/plugins/intel_gpu/tests/test_cases/experimental_detectron_detection_output_gpu_test.cpp b/src/plugins/intel_gpu/tests/test_cases/experimental_detectron_detection_output_gpu_test.cpp index 8f4fde1f11e..ac1bedcb433 100644 --- a/src/plugins/intel_gpu/tests/test_cases/experimental_detectron_detection_output_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/test_cases/experimental_detectron_detection_output_gpu_test.cpp @@ -57,13 +57,18 @@ struct ExperimentalDetectronDetectionOutputParams { std::vector expected_scores; }; +template +using ExperimentalDetectronDetectionOutputParamsWithLayout = + std::tuple, format::type>; + template struct experimental_detectron_detection_output_test - : public ::testing::TestWithParam> { + : public ::testing::TestWithParam> { public: void test() { - const ExperimentalDetectronDetectionOutputParams param = - testing::TestWithParam>::GetParam(); + ExperimentalDetectronDetectionOutputParams param; + format::type fmt; + std::tie(param, fmt) = this->GetParam(); auto data_type = type_to_data_type::value; auto& engine = get_test_engine(); @@ -87,32 +92,42 @@ public: const auto input_im_info = engine.allocate_memory({data_type, format::bfyx, tensor{batch(1), feature(3)}}); set_values(input_im_info, param.im_info); - const primitive_id output_scores_id = "OutputScores"; - auto output_scores = - engine.allocate_memory({data_type, format::bfyx, tensor{batch(param.max_detections_per_image)}}); - - const primitive_id output_classes_id = "OutputClasses"; - auto output_classes = - engine.allocate_memory({data_types::i32, format::bfyx, tensor{batch(param.max_detections_per_image)}}); - topology topology; topology.add(input_layout(input_boxes_id, input_boxes->get_layout())); topology.add(input_layout(input_deltas_id, input_deltas->get_layout())); topology.add(input_layout(input_scores_id, input_scores->get_layout())); topology.add(input_layout(input_im_info_id, input_im_info->get_layout())); - topology.add(mutable_data(output_classes_id, output_classes)); - topology.add(mutable_data(output_scores_id, output_scores)); + + const primitive_id b_input_boxes_id = "BlockedInputBoxes"; + const primitive_id b_input_deltas_id = "BlockedInputDeltas"; + const primitive_id b_input_scores_id = "BlockedInputScores"; + const primitive_id b_input_im_info_id = "BlockedInputImInfo"; - const primitive_id eddo_id = "experimental_detectron_detection_output"; - const auto eddo_primitive = experimental_detectron_detection_output{ - eddo_id, - input_boxes_id, - input_deltas_id, - input_scores_id, - input_im_info_id, - output_classes_id, - output_scores_id, + topology.add(reorder(b_input_boxes_id, input_boxes_id, fmt, data_type)); + topology.add(reorder(b_input_deltas_id, input_deltas_id, fmt, data_type)); + topology.add(reorder(b_input_scores_id, input_scores_id, fmt, data_type)); + topology.add(reorder(b_input_im_info_id, input_im_info_id, fmt, data_type)); + + const primitive_id b_output_scores_id = "BlockedOutputScores"; + const primitive_id b_output_classes_id = "BlockedOutputClasses"; + const layout output_scores_layout = {data_type, fmt, tensor{batch(param.max_detections_per_image)}}; + auto b_output_scores = engine.allocate_memory(output_scores_layout); + const layout output_classes_layout = {data_types::i32, fmt, tensor{batch(param.max_detections_per_image)}}; + auto b_output_classes = engine.allocate_memory(output_classes_layout); + + topology.add(mutable_data(b_output_classes_id, b_output_classes)); + topology.add(mutable_data(b_output_scores_id, b_output_scores)); + + const primitive_id b_eddo_id = "blocked_experimental_detectron_detection_output"; + const auto b_eddo_primitive = experimental_detectron_detection_output{ + b_eddo_id, + b_input_boxes_id, + b_input_deltas_id, + b_input_scores_id, + b_input_im_info_id, + b_output_classes_id, + b_output_scores_id, param.score_threshold, param.nms_threshold, param.num_classes, @@ -123,7 +138,9 @@ public: param.deltas_weights, }; - topology.add(eddo_primitive); + topology.add(b_eddo_primitive); + const primitive_id eddo_id = "experimental_detectron_detection_output"; + topology.add(reorder(eddo_id, b_eddo_primitive /*b_eddo_id*/, format::bfyx, data_type)); network network(engine, topology); @@ -131,20 +148,34 @@ public: network.set_input_data(input_deltas_id, input_deltas); network.set_input_data(input_scores_id, input_scores); network.set_input_data(input_im_info_id, input_im_info); - const auto outputs = network.execute(); const auto output_boxes = outputs.at(eddo_id).get_memory(); - const cldnn::mem_lock output_boxes_ptr(output_boxes, get_test_stream()); ASSERT_EQ(output_boxes_ptr.size(), param.max_detections_per_image * 4); - const cldnn::mem_lock output_classes_ptr(output_classes, get_test_stream()); - ASSERT_EQ(output_classes_ptr.size(), param.max_detections_per_image); - + const primitive_id output_scores_id = "OutputScores"; + cldnn::topology reorder_score_topology; + reorder_score_topology.add(input_layout(b_output_scores_id, output_scores_layout)); + reorder_score_topology.add(reorder(output_scores_id, b_output_scores_id, format::bfyx, data_type)); + cldnn::network reorder_score_net{engine, reorder_score_topology}; + reorder_score_net.set_input_data(b_output_scores_id, b_output_scores); + const auto score_result = reorder_score_net.execute(); + const auto output_scores = score_result.at(output_scores_id).get_memory(); const cldnn::mem_lock output_scores_ptr(output_scores, get_test_stream()); ASSERT_EQ(output_scores_ptr.size(), param.max_detections_per_image); + const primitive_id output_classes_id = "OutputClasses"; + cldnn::topology reorder_classes_topology; + reorder_classes_topology.add(input_layout(b_output_classes_id, output_classes_layout)); + reorder_classes_topology.add(reorder(output_classes_id, b_output_classes_id, format::bfyx, data_types::i32)); + cldnn::network reorder_classes_net{engine, reorder_classes_topology}; + reorder_classes_net.set_input_data(b_output_classes_id, b_output_classes); + const auto classes_result = reorder_classes_net.execute(); + const auto output_classes = classes_result.at(output_classes_id).get_memory(); + const cldnn::mem_lock output_classes_ptr(output_classes, get_test_stream()); + ASSERT_EQ(output_classes_ptr.size(), param.max_detections_per_image); + const auto& expected_boxes = param.expected_boxes; const auto& expected_classes = param.expected_classes; const auto& expected_scores = param.expected_scores; @@ -172,6 +203,14 @@ TEST_P(experimental_detectron_detection_output_test_f16, basic) { ASSERT_NO_FATAL_FAILURE(test()); } +const std::vector layouts{ + format::bfyx, + format::b_fs_yx_fsv16, + format::b_fs_yx_fsv32, + format::bs_fs_yx_bsv16_fsv16, + format::bs_fs_yx_bsv32_fsv16, + format::bs_fs_yx_bsv32_fsv32}; + template std::vector> getExperimentalDetectronDetectionOutputParams() { std::vector> params = { @@ -390,8 +429,14 @@ std::vector> getExperimentalDetect INSTANTIATE_TEST_SUITE_P(experimental_detectron_detection_output_gpu_test, experimental_detectron_detection_output_test_f32, - ::testing::ValuesIn(getExperimentalDetectronDetectionOutputParams())); + testing::Combine( + ::testing::ValuesIn(getExperimentalDetectronDetectionOutputParams()), + ::testing::ValuesIn(layouts) + )); INSTANTIATE_TEST_SUITE_P(experimental_detectron_detection_output_gpu_test, experimental_detectron_detection_output_test_f16, - ::testing::ValuesIn(getExperimentalDetectronDetectionOutputParams())); + testing::Combine( + ::testing::ValuesIn(getExperimentalDetectronDetectionOutputParams()), + ::testing::ValuesIn(layouts) + ));