From c6528ee4ea1dec39c76df18a4c92fd194636b817 Mon Sep 17 00:00:00 2001 From: Konstantin Beluchenko Date: Wed, 2 Nov 2022 16:16:57 +0200 Subject: [PATCH] [GPU] Matrix NMS (#13137) --- .../convert_matrix_nms_to_matrix_nms_ie.cpp | 5 +- .../intel_gpu/plugin/primitives_list.hpp | 1 + .../intel_gpu/primitives/matrix_nms.hpp | 154 ++++ .../src/graph/impls/ocl/matrix_nms.cpp | 117 +++ .../src/graph/impls/ocl/register.cpp | 1 + .../src/graph/impls/ocl/register.hpp | 1 + .../src/graph/include/matrix_nms_inst.h | 43 ++ .../intel_gpu/src/graph/matrix_nms.cpp | 88 +++ src/plugins/intel_gpu/src/graph/program.cpp | 3 + .../cl_kernels/matrix_nms_ref.cl | 333 +++++++++ .../src/kernel_selector/common_types.h | 1 + .../kernel_selector_common.cpp | 1 + .../matrix_nms/matrix_nms_kernel_ref.cpp | 196 +++++ .../matrix_nms/matrix_nms_kernel_ref.h | 67 ++ .../matrix_nms/matrix_nms_kernel_selector.cpp | 18 + .../matrix_nms/matrix_nms_kernel_selector.h | 21 + .../intel_gpu/src/plugin/ops/matrix_nms.cpp | 83 +++ .../src/plugin/transformations_pipeline.cpp | 2 + .../tests/test_cases/matrix_nms_gpu_test.cpp | 673 ++++++++++++++++++ .../single_layer_tests/matrix_nms.cpp | 48 ++ .../src/single_layer/matrix_nms.cpp | 5 +- 21 files changed, 1858 insertions(+), 3 deletions(-) create mode 100644 src/plugins/intel_gpu/include/intel_gpu/primitives/matrix_nms.hpp create mode 100644 src/plugins/intel_gpu/src/graph/impls/ocl/matrix_nms.cpp create mode 100644 src/plugins/intel_gpu/src/graph/include/matrix_nms_inst.h create mode 100644 src/plugins/intel_gpu/src/graph/matrix_nms.cpp create mode 100644 src/plugins/intel_gpu/src/kernel_selector/cl_kernels/matrix_nms_ref.cl create mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/matrix_nms/matrix_nms_kernel_ref.cpp create mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/matrix_nms/matrix_nms_kernel_ref.h create mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/matrix_nms/matrix_nms_kernel_selector.cpp create mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/matrix_nms/matrix_nms_kernel_selector.h create mode 100644 src/plugins/intel_gpu/src/plugin/ops/matrix_nms.cpp create mode 100644 src/plugins/intel_gpu/tests/test_cases/matrix_nms_gpu_test.cpp create mode 100644 src/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/matrix_nms.cpp diff --git a/src/common/transformations/src/transformations/op_conversions/convert_matrix_nms_to_matrix_nms_ie.cpp b/src/common/transformations/src/transformations/op_conversions/convert_matrix_nms_to_matrix_nms_ie.cpp index a84dd3886af..9f0d98f2143 100644 --- a/src/common/transformations/src/transformations/op_conversions/convert_matrix_nms_to_matrix_nms_ie.cpp +++ b/src/common/transformations/src/transformations/op_conversions/convert_matrix_nms_to_matrix_nms_ie.cpp @@ -14,6 +14,7 @@ #include "itt.hpp" #include "ngraph_ops/nms_static_shape_ie.hpp" +#include "transformations/utils/utils.hpp" ngraph::pass::ConvertMatrixNmsToMatrixNmsIE::ConvertMatrixNmsToMatrixNmsIE(bool force_i32_output_type) { MATCHER_SCOPE(ConvertMatrixNmsToMatrixNmsIE); @@ -46,13 +47,13 @@ ngraph::pass::ConvertMatrixNmsToMatrixNmsIE::ConvertMatrixNmsToMatrixNmsIE(bool if (nms->output(1).get_element_type() != output_1.get_element_type()) { output_1 = std::make_shared(output_1, nms->output(1).get_element_type()); - output_1.get_node_shared_ptr()->set_friendly_name(nms->get_friendly_name() + "/convert.1"); + output_1.get_node_shared_ptr()->set_friendly_name(op::util::create_ie_output_name(nms->output(1))); new_ops.emplace_back(output_1.get_node_shared_ptr()); } if (nms->output(2).get_element_type() != output_2.get_element_type()) { output_2 = std::make_shared(output_2, nms->output(2).get_element_type()); - output_2.get_node_shared_ptr()->set_friendly_name(nms->get_friendly_name() + "/convert.2"); + output_2.get_node_shared_ptr()->set_friendly_name(op::util::create_ie_output_name(nms->output(2))); new_ops.emplace_back(output_2.get_node_shared_ptr()); } diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp index f2a7c54b292..6e1fa01dae3 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp @@ -247,3 +247,4 @@ REGISTER_FACTORY(v9, Eye); // --------------------------- Supported internal ops --------------------------- // REGISTER_FACTORY(internal, NonMaxSuppressionIEInternal); REGISTER_FACTORY(internal, GenerateProposalsIEInternal); +REGISTER_FACTORY(internal, NmsStaticShapeIE8); diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/matrix_nms.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/matrix_nms.hpp new file mode 100644 index 00000000000..d74d8a62295 --- /dev/null +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/matrix_nms.hpp @@ -0,0 +1,154 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#pragma once +#include + +#include "ngraph/op/matrix_nms.hpp" +#include "primitive.hpp" + +namespace cldnn { +/// @addtogroup cpp_api C++ API +/// @{ +/// @addtogroup cpp_topology Network Topology +/// @{ +/// @addtogroup cpp_primitives Primitives +/// @{ + +/// @brief Performs matrix nms of input boxes and returns indices of selected boxes. +struct matrix_nms : public primitive_base { + CLDNN_DECLARE_PRIMITIVE(matrix_nms) + + enum decay_function { gaussian, linear }; + + enum sort_result_type { + class_id, // sort selected boxes by class id (ascending) in each batch element + score, // sort selected boxes by score (descending) in each batch element + none // do not guarantee the order in each batch element + }; + + /// \brief Structure that specifies attributes of the operation + struct attributes { + // specifies order of output elements + sort_result_type sort_type = sort_result_type::none; + // specifies whenever it is necessary to sort selected boxes across batches or not + bool sort_result_across_batch = false; + // specifies minimum score to consider box for the processing + float score_threshold = 0.0f; + // specifies maximum number of boxes to be selected per class, -1 meaning to + // keep all boxes + int nms_top_k = -1; + // specifies maximum number of boxes to be selected per batch element, -1 + // meaning to keep all boxes + int keep_top_k = -1; + // specifies the background class id, -1 meaning to keep all classes + int background_class = -1; + // specifies decay function used to decay scores + decay_function decay = decay_function::linear; + // specifies gaussian_sigma parameter for gaussian decay_function + float gaussian_sigma = 2.0f; + // specifies threshold to filter out boxes with low confidence score after + // decaying + float post_threshold = 0.0f; + // specifies whether boxes are normalized or not + bool normalized = true; + + attributes() {} + + attributes(const ngraph::op::v8::MatrixNms::Attributes& attrs) + : attributes(from(attrs.sort_result_type), + attrs.sort_result_across_batch, + attrs.score_threshold, + attrs.nms_top_k, + attrs.keep_top_k, + attrs.background_class, + from(attrs.decay_function), + attrs.gaussian_sigma, + attrs.post_threshold, + attrs.normalized) {} + + attributes(sort_result_type sort_type, + bool sort_result_across_batch, + float score_threshold, + int nms_top_k, + int keep_top_k, + int background_class, + decay_function decay, + float gaussian_sigma, + float post_threshold, + bool normalized) + : sort_type(sort_type), + sort_result_across_batch(sort_result_across_batch), + score_threshold(score_threshold), + nms_top_k(nms_top_k), + keep_top_k(keep_top_k), + background_class(background_class), + decay(decay), + gaussian_sigma(gaussian_sigma), + post_threshold(post_threshold), + normalized(normalized) {} + }; + + /// @brief Constructs matrix_nms primitive. + /// @param id This primitive id. + /// @param boxes primitive id. + /// @param scores primitive id. + /// @param second_output primitive id. + /// @param third_output primitive id. + /// @param attrs attributes. + matrix_nms(const primitive_id& id, + const primitive_id& boxes, + const primitive_id& scores, + const primitive_id& second_output, + const primitive_id& third_output, + const matrix_nms::attributes& attrs) + : primitive_base(id, {boxes, scores, second_output, third_output}), + attribs(attrs) {} + + /// @brief Constructs matrix_nms primitive. + /// @param id This primitive id. + /// @param boxes primitive id. + /// @param scores primitive id. + /// @param second_output primitive id. + /// @param third_output primitive id. + /// @param attrs operation attributes. + matrix_nms(const primitive_id& id, + const primitive_id& boxes, + const primitive_id& scores, + const primitive_id& second_output, + const primitive_id& third_output, + const ngraph::op::v8::MatrixNms::Attributes& attrs) + : primitive_base(id, {boxes, scores, second_output, third_output}), + attribs(attrs) {} + + attributes attribs; + +private: + static cldnn::matrix_nms::decay_function from(ngraph::op::v8::MatrixNms::DecayFunction decay) { + switch (decay) { + case ngraph::op::v8::MatrixNms::DecayFunction::GAUSSIAN: + return cldnn::matrix_nms::decay_function::gaussian; + case ngraph::op::v8::MatrixNms::DecayFunction::LINEAR: + default: + return cldnn::matrix_nms::decay_function::linear; + } + } + + static cldnn::matrix_nms::sort_result_type from(ngraph::op::v8::MatrixNms::SortResultType type) { + switch (type) { + case ngraph::op::v8::MatrixNms::SortResultType::CLASSID: + return cldnn::matrix_nms::sort_result_type::class_id; + case ngraph::op::v8::MatrixNms::SortResultType::SCORE: + return cldnn::matrix_nms::sort_result_type::score; + case ngraph::op::v8::MatrixNms::SortResultType::NONE: + default: + return cldnn::matrix_nms::sort_result_type::none; + } + } +}; +/// @} +/// @} +/// @} +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/matrix_nms.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/matrix_nms.cpp new file mode 100644 index 00000000000..23d1b42aa4a --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/matrix_nms.cpp @@ -0,0 +1,117 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "data_inst.h" +#include "impls/implementation_map.hpp" +#include "kernel_selector_helper.h" +#include "matrix_nms/matrix_nms_kernel_ref.h" +#include "matrix_nms/matrix_nms_kernel_selector.h" +#include "matrix_nms_inst.h" +#include "primitive_base.hpp" + +namespace cldnn { +namespace ocl { + +namespace { +kernel_selector::matrix_nms_params::decay_function from(matrix_nms::decay_function decay) { + switch (decay) { + case matrix_nms::decay_function::gaussian: + return kernel_selector::matrix_nms_params::decay_function::GAUSSIAN; + default: + case matrix_nms::decay_function::linear: + return kernel_selector::matrix_nms_params::decay_function::LINEAR; + } +} + +kernel_selector::matrix_nms_params::sort_result_type from(matrix_nms::sort_result_type type) { + switch (type) { + case matrix_nms::sort_result_type::class_id: + return kernel_selector::matrix_nms_params::sort_result_type::CLASS_ID; + case matrix_nms::sort_result_type::score: + return kernel_selector::matrix_nms_params::sort_result_type::SCORE; + default: + case matrix_nms::sort_result_type::none: + return kernel_selector::matrix_nms_params::sort_result_type::NONE; + } +} +} // namespace + +struct matrix_nms_impl : typed_primitive_impl_ocl { + using parent = typed_primitive_impl_ocl; + using parent::parent; + + std::unique_ptr clone() const override { + return make_unique(*this); + } + +protected: + kernel_arguments_data get_arguments(matrix_nms_inst& instance, int32_t) const override { + kernel_arguments_data args; + args.inputs = {instance.input_boxes_mem(), + instance.input_scores_mem(), + instance.input_selected_boxes_mem(), + instance.input_valid_outputs_mem()}; + args.outputs = {instance.output_memory_ptr()}; + + return args; + } + +public: + static primitive_impl* create(const matrix_nms_node& node, const kernel_impl_params& impl_param) { + auto params = get_default_params(impl_param); + auto optional_params = + get_default_optional_params(node.get_program()); + + const auto& scores_layout = impl_param.get_input_layout(1); + const auto& second_output_layout = impl_param.get_input_layout(2); + const auto& third_output_layout = impl_param.get_input_layout(3); + + params.inputs.push_back(convert_data_tensor(scores_layout)); + params.inputs.push_back(convert_data_tensor(second_output_layout)); + params.inputs.push_back(convert_data_tensor(third_output_layout)); + + const auto& primitive = node.get_primitive(); + params.sort_type = from(primitive->attribs.sort_type); + params.sort_result_across_batch = primitive->attribs.sort_result_across_batch; + params.score_threshold = primitive->attribs.score_threshold; + params.nms_top_k = primitive->attribs.nms_top_k; + params.keep_top_k = primitive->attribs.keep_top_k; + params.background_class = primitive->attribs.background_class; + params.decay = from(primitive->attribs.decay); + params.gaussian_sigma = primitive->attribs.gaussian_sigma; + params.post_threshold = primitive->attribs.post_threshold; + params.normalized = primitive->attribs.normalized; + + auto& kernel_selector = kernel_selector::matrix_nms_kernel_selector::Instance(); + auto best_kernels = kernel_selector.GetBestKernels(params, optional_params); + + CLDNN_ERROR_BOOL(node.id(), + "Best_kernel.empty()", + best_kernels.empty(), + "Cannot find a proper kernel with this nodeuments"); + + auto matrix_nms_node = new matrix_nms_impl(node, best_kernels[0]); + + return matrix_nms_node; + } +}; + +namespace detail { + +attach_matrix_nms_impl::attach_matrix_nms_impl() { + auto types = {data_types::f16, data_types::f32, data_types::i32}; + + auto formats = {format::bfyx, + format::b_fs_yx_fsv16, + format::b_fs_yx_fsv32, + format::bs_fs_yx_bsv16_fsv16, + format::bs_fs_yx_bsv32_fsv16, + format::bs_fs_yx_bsv32_fsv32}; + + implementation_map::add(impl_types::ocl, matrix_nms_impl::create, types, formats); +} + +} // namespace detail +} // namespace ocl +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp index 06b5fd780b1..7061b8e284d 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp @@ -50,6 +50,7 @@ void register_implementations() { REGISTER_OCL(mutable_data); REGISTER_OCL(mvn); REGISTER_OCL(non_max_suppression); + REGISTER_OCL(matrix_nms); REGISTER_OCL(normalize); REGISTER_OCL(one_hot); REGISTER_OCL(permute); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp index ca6372ce61a..98f5cab6380 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp @@ -131,6 +131,7 @@ REGISTER_OCL(max_unpooling); REGISTER_OCL(mutable_data); REGISTER_OCL(mvn); REGISTER_OCL(non_max_suppression); +REGISTER_OCL(matrix_nms); REGISTER_OCL(normalize); REGISTER_OCL(one_hot); REGISTER_OCL(permute); diff --git a/src/plugins/intel_gpu/src/graph/include/matrix_nms_inst.h b/src/plugins/intel_gpu/src/graph/include/matrix_nms_inst.h new file mode 100644 index 00000000000..15c400e282a --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/include/matrix_nms_inst.h @@ -0,0 +1,43 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#pragma once +#include +#include + +#include "intel_gpu/primitives/matrix_nms.hpp" +#include "primitive_inst.h" + +namespace cldnn { + +using matrix_nms_node = typed_program_node; + +template <> +class typed_primitive_inst : public typed_primitive_inst_base { + using parent = typed_primitive_inst_base; + +public: + typed_primitive_inst(network& network, const matrix_nms_node& node) : parent(network, node) {} + + static layout calc_output_layout(const matrix_nms_node& node, const kernel_impl_params& impl_param); + static std::string to_string(const matrix_nms_node& node); + + memory::ptr input_boxes_mem() const { + return dep_memory_ptr(0); + } + memory::ptr input_scores_mem() const { + return dep_memory_ptr(1); + } + memory::ptr input_selected_boxes_mem() const { + return dep_memory_ptr(2); + } + memory::ptr input_valid_outputs_mem() const { + return dep_memory_ptr(3); + } +}; + +using matrix_nms_inst = typed_primitive_inst; + +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/matrix_nms.cpp b/src/plugins/intel_gpu/src/graph/matrix_nms.cpp new file mode 100644 index 00000000000..e7154e28d33 --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/matrix_nms.cpp @@ -0,0 +1,88 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include +#include + +#include "matrix_nms_inst.h" +#include "openvino/core/enum_names.hpp" +#include "primitive_type_base.h" + +namespace cldnn { +primitive_type_id matrix_nms::type_id() { + static primitive_type_base instance; + return &instance; +} + +layout matrix_nms_inst::calc_output_layout(const matrix_nms_node& node, const kernel_impl_params& impl_param) { + const auto primitive = impl_param.typed_desc(); + const auto boxes_layout = impl_param.get_input_layout(0); + const auto scores_layout = impl_param.get_input_layout(1); + + const auto batches_num = boxes_layout.batch(); + auto classes_num = scores_layout.feature(); + const auto boxes_num = boxes_layout.feature(); + + if (primitive->attribs.background_class >= 0 && primitive->attribs.background_class < classes_num) + classes_num = std::max(1, classes_num - 1); + + int max_output_boxes_per_class{boxes_num}; + if (primitive->attribs.nms_top_k >= 0) + max_output_boxes_per_class = std::min(max_output_boxes_per_class, primitive->attribs.nms_top_k); + + auto max_output_boxes_per_batch = max_output_boxes_per_class * classes_num; + if (primitive->attribs.keep_top_k >= 0) + max_output_boxes_per_batch = std::min(max_output_boxes_per_batch, primitive->attribs.keep_top_k); + + auto output_num = max_output_boxes_per_batch * batches_num; + + // BOX_DATA: class_id, box_score, xmin, ymin, xmax, ymax + constexpr size_t BOX_DATA{6}; + return layout(boxes_layout.data_type, boxes_layout.format, {output_num, BOX_DATA, 1, 1}); +} + +std::string matrix_nms_inst::to_string(const matrix_nms_node& node) { + json_composite matrix_nms_info; + matrix_nms_info.add("boxes id", node.input().id()); + matrix_nms_info.add("scores id", node.get_dependency(1).id()); + matrix_nms_info.add("sort_result_type", ov::as_string(node.get_primitive()->attribs.sort_type)); + matrix_nms_info.add("decay_function", ov::as_string(node.get_primitive()->attribs.decay)); + matrix_nms_info.add("sort_result_across_batch", node.get_primitive()->attribs.sort_result_across_batch); + matrix_nms_info.add("score_threshold", node.get_primitive()->attribs.score_threshold); + matrix_nms_info.add("nms_top_k", node.get_primitive()->attribs.nms_top_k); + matrix_nms_info.add("keep_top_k", node.get_primitive()->attribs.keep_top_k); + matrix_nms_info.add("background_class", node.get_primitive()->attribs.background_class); + matrix_nms_info.add("gaussian_sigma", node.get_primitive()->attribs.gaussian_sigma); + matrix_nms_info.add("post_threshold", node.get_primitive()->attribs.post_threshold); + matrix_nms_info.add("normalized", node.get_primitive()->attribs.normalized); + + auto node_info = node.desc_to_json(); + node_info->add("matrix_nms info", matrix_nms_info); + std::stringstream primitive_description; + node_info->dump(primitive_description); + return primitive_description.str(); +} + +} // namespace cldnn + +namespace ov { +template <> +EnumNames& EnumNames::get() { + static auto enum_names = EnumNames( + "decay_function", + {{"gaussian", matrix_nms::decay_function::gaussian}, {"linear", matrix_nms::decay_function::linear}}); + return enum_names; +} + +template <> +EnumNames& EnumNames::get() { + static auto enum_names = + EnumNames("sort_result_type", + {{"class_id", matrix_nms::sort_result_type::class_id}, + {"score", matrix_nms::sort_result_type::score}, + {"none", matrix_nms::sort_result_type::none}}); + return enum_names; +} +} // namespace ov diff --git a/src/plugins/intel_gpu/src/graph/program.cpp b/src/plugins/intel_gpu/src/graph/program.cpp index f706c00f26d..96cca47fc09 100644 --- a/src/plugins/intel_gpu/src/graph/program.cpp +++ b/src/plugins/intel_gpu/src/graph/program.cpp @@ -20,6 +20,7 @@ #include "sliding_window_utils.hpp" #include "program_helpers.h" +#include "matrix_nms_inst.h" #include "roi_pooling_inst.h" #include "reorg_yolo_inst.h" #include "eltwise_inst.h" @@ -1442,6 +1443,7 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) { prim.type() != cldnn::ctc_loss::type_id() && prim.type() != cldnn::non_max_suppression::type_id() && prim.type() != cldnn::roi_align::type_id() && + prim.type() != cldnn::matrix_nms::type_id() && prim.type() != cldnn::adaptive_pooling::type_id() && prim.type() != cldnn::bucketize::type_id() && prim.type() != cldnn::roll::type_id() && @@ -1487,6 +1489,7 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) { prim.type() != cldnn::ctc_loss::type_id() && prim.type() != cldnn::non_max_suppression::type_id() && prim.type() != cldnn::roi_align::type_id() && + prim.type() != cldnn::matrix_nms::type_id() && prim.type() != cldnn::adaptive_pooling::type_id() && prim.type() != cldnn::bucketize::type_id() && prim.type() != cldnn::roll::type_id() && diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/matrix_nms_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/matrix_nms_ref.cl new file mode 100644 index 00000000000..9f186d3b036 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/matrix_nms_ref.cl @@ -0,0 +1,333 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "include/batch_headers/common.cl" +#include "include/batch_headers/data_types.cl" + +#define NUM_BATCHES INPUT0_BATCH_NUM +#define NUM_BOXES INPUT0_FEATURE_NUM +#define NUM_CLASSES INPUT1_FEATURE_NUM + +typedef struct { + int batch_idx; + int class_idx; + int box_idx; + INPUT1_TYPE score; +} FUNC(BoxInfo); + +#define BOX_INFO FUNC(BoxInfo) + +#define unroll_for __attribute__((opencl_unroll_hint)) for + +inline INPUT1_TYPE FUNC(decay_gaussian)(INPUT1_TYPE iou, INPUT1_TYPE max_iou) { + return exp((max_iou * max_iou - iou * iou) * GAUSSIAN_SIGMA); +} + +inline INPUT1_TYPE FUNC(decay_linear)(INPUT1_TYPE iou, INPUT1_TYPE max_iou) { + return (INPUT1_VAL_ONE - iou) / (INPUT1_VAL_ONE - max_iou + TINY); +} + +inline void FUNC(swap)(int* a, int* b) { + int temp = *a; + *a = *b; + *b = temp; +} + +inline void FUNC(sortIterative)(const __global INPUT1_TYPE* scores, + const int batchId, + const int classId, + int* indices, + const int size) { + for (int i = 1; i <= size; i++) { + bool swapped = false; + for (int j = 0; j < size - i; j++) { + const INPUT1_TYPE score_curr = scores[INPUT1_GET_INDEX(batchId, classId, 0, indices[j])]; + const INPUT1_TYPE score_next = scores[INPUT1_GET_INDEX(batchId, classId, 0, indices[j + 1])]; + if (score_curr < score_next) { + FUNC_CALL(swap)(&indices[j], &indices[j + 1]); + swapped = true; + } + } + + if (!swapped) + break; + } +} + +inline void FUNC(swap_boxes)(__global BOX_INFO* a, __global BOX_INFO* b) { + BOX_INFO temp = *a; + *a = *b; + *b = temp; +} + +inline void FUNC(sortIterativeBoxes)(__global BOX_INFO* boxes, int l, int h) { + for (int i = 1; i < h - l; i++) { + bool swapped = false; + for (int j = l; j < h - i; j++) { + if ((boxes[j].score < boxes[j + 1].score) || + (boxes[j].score == boxes[j + 1].score && boxes[j].class_idx > boxes[j + 1].class_idx) || + (boxes[j].score == boxes[j + 1].score && boxes[j].class_idx == boxes[j + 1].class_idx && + boxes[j].box_idx > boxes[j + 1].box_idx)) { + FUNC_CALL(swap_boxes)(&boxes[j], &boxes[j + 1]); + swapped = true; + } + } + + if (!swapped) + break; + } +} + +inline void FUNC(sortIterativeBoxesAcrossBatches)(__global BOX_INFO* boxes) { + const int size = NUM_BATCHES * NUM_CLASSES * MAX_BOXES_PER_CLASS; + for (int i = 1; i < size; i++) { + bool swapped = false; + for (int j = 0; j < size - i; j++) { + __global BOX_INFO* l = boxes + j; + __global BOX_INFO* r = boxes + j + 1; +// sort by score +#if SORT_TYPE == 1 + if ((l->score < r->score) || (l->score == r->score && l->batch_idx > r->batch_idx) || + (l->score == r->score && l->batch_idx == r->batch_idx && l->class_idx > r->class_idx) || + (l->score == r->score && l->batch_idx == r->batch_idx && l->class_idx == r->class_idx && + l->box_idx > r->box_idx)) { + FUNC_CALL(swap_boxes)(l, r); + swapped = true; + } +// sort by class id +#elif SORT_TYPE == 0 + if (r->score != INPUT1_VAL_ZERO && + ((l->score == INPUT1_VAL_ZERO) || // case with empty buffer + (l->class_idx > r->class_idx) || (l->class_idx == r->class_idx && l->batch_idx > r->batch_idx) || + (l->class_idx == r->class_idx && l->batch_idx == r->batch_idx && l->score < r->score) || + (l->class_idx == r->class_idx && l->batch_idx == r->batch_idx && l->score == r->score && + l->box_idx > r->box_idx))) { + FUNC_CALL(swap_boxes)(l, r); + swapped = true; + } +#endif + } + + if (!swapped) + break; + } +} + +inline COORD_TYPE_4 FUNC(getBoxCoords)(const __global INPUT0_TYPE* boxes, const short batch, const ushort box_idx) { + COORD_TYPE_4 coords = (COORD_TYPE_4)(boxes[INPUT0_GET_INDEX(batch, box_idx, 0, 0)], + boxes[INPUT0_GET_INDEX(batch, box_idx, 0, 1)], + boxes[INPUT0_GET_INDEX(batch, box_idx, 0, 2)], + boxes[INPUT0_GET_INDEX(batch, box_idx, 0, 3)]); + + // uncomment when flipped coordinates will be fixed in reference impl + /* + const INPUT0_TYPE x1 = min(coords[0], coords[2]); + const INPUT0_TYPE x2 = max(coords[0], coords[2]); + const INPUT0_TYPE y1 = min(coords[1], coords[3]); + const INPUT0_TYPE y2 = max(coords[1], coords[3]); + coords[0] = x1; + coords[1] = y1; + coords[2] = x2; + coords[3] = y2; + */ + + return coords; +} + +inline INPUT0_TYPE FUNC(area)(const INPUT0_TYPE w, const INPUT0_TYPE h) { + return (w + NORM) * (h + NORM); +} + +inline INPUT0_TYPE FUNC(areaBox)(const COORD_TYPE_4 box) { + if (box[2] < box[0] || box[3] < box[1]) + return INPUT0_VAL_ZERO; + return FUNC_CALL(area)(box[3] - box[1], box[2] - box[0]); +} + +inline INPUT0_TYPE FUNC(intersectionOverUnion)(const COORD_TYPE_4 box1, const COORD_TYPE_4 box2) { + if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || box2[3] < box1[1]) + return INPUT0_VAL_ZERO; + + const INPUT0_TYPE area = FUNC_CALL(areaBox)(box1); + const INPUT0_TYPE areaBox = FUNC_CALL(areaBox)(box2); + + const INPUT0_TYPE intersection_xmin = max(box1[0], box2[0]); + const INPUT0_TYPE intersection_ymin = max(box1[1], box2[1]); + const INPUT0_TYPE intersection_xmax = min(box1[2], box2[2]); + const INPUT0_TYPE intersection_ymax = min(box1[3], box2[3]); + + const INPUT0_TYPE intersection_area = + FUNC_CALL(area)(intersection_xmax - intersection_xmin, intersection_ymax - intersection_ymin); + const INPUT0_TYPE union_area = area + areaBox - intersection_area; + + return intersection_area / union_area; +} + +#ifdef MATRIX_NMS_STAGE_0 +KERNEL(matrix_nms_ref_stage_0) +(const __global INPUT0_TYPE* input_boxes, + const __global INPUT1_TYPE* input_scores, + __global uchar* buffer0, + __global int* selected_boxes_num) { + const int batchId = get_global_id(0); + const int classId = get_global_id(1); + + if (classId == BACKGROUND_CLASS) + return; + + int sorted_score_indices[NUM_BOXES]; + + for (int i = 0; i < NUM_BOXES; ++i) + sorted_score_indices[i] = i; + + int valid_boxes_num = 0; + for (int i = 0; i < NUM_BOXES; i++) { + if (input_scores[INPUT1_GET_INDEX(batchId, classId, 0, i)] > SCORE_THRESHOLD) + ++valid_boxes_num; + } + + // TODO: consider faster sorting algorithm + FUNC_CALL(sortIterative)(input_scores, batchId, classId, sorted_score_indices, NUM_BOXES); + + valid_boxes_num = min(valid_boxes_num, MAX_BOXES_PER_CLASS); + + const int matrix_size = MAX_BOXES_PER_CLASS < 3 ? 1 : (MAX_BOXES_PER_CLASS * (MAX_BOXES_PER_CLASS - 1)) >> 1; + INPUT1_TYPE iou_matrix[matrix_size]; + INPUT1_TYPE iou_max[MAX_BOXES_PER_CLASS]; + + iou_max[0] = INPUT1_VAL_ZERO; + for (int i = 1; i < valid_boxes_num; ++i) { + INPUT1_TYPE max_iou = INPUT1_VAL_ZERO; + const COORD_TYPE_4 box_i = FUNC_CALL(getBoxCoords)(input_boxes, batchId, sorted_score_indices[i]); + for (int j = 0; j < i; ++j) { + const COORD_TYPE_4 box_j = FUNC_CALL(getBoxCoords)(input_boxes, batchId, sorted_score_indices[j]); + const INPUT1_TYPE iou = FUNC_CALL(intersectionOverUnion)(box_i, box_j); + + max_iou = max(iou, max_iou); + iou_matrix[i * (i - 1) / 2 + j] = iou; + } + iou_max[i] = max_iou; + } + + const INPUT1_TYPE first_score = input_scores[INPUT1_GET_INDEX(batchId, classId, 0, sorted_score_indices[0])]; + + __global BOX_INFO* box_info = (__global BOX_INFO*)buffer0; + box_info = &box_info[batchId * NUM_CLASSES * MAX_BOXES_PER_CLASS + classId * MAX_BOXES_PER_CLASS]; + + int box_info_counter = 0; + if (first_score > POST_THRESHOLD && valid_boxes_num > 0) { + box_info[box_info_counter].class_idx = classId; + box_info[box_info_counter].score = first_score; + box_info[box_info_counter].box_idx = sorted_score_indices[0]; + box_info[box_info_counter].batch_idx = batchId; + ++box_info_counter; + } + + for (int i = 1; i < valid_boxes_num; ++i) { + INPUT1_TYPE min_decay = INPUT1_VAL_ONE; + for (int j = 0; j < i; ++j) { + INPUT1_TYPE iou = iou_matrix[i * (i - 1) / 2 + j]; + INPUT1_TYPE decay = + DECAY_FUNC == 0 ? FUNC_CALL(decay_gaussian)(iou, iou_max[j]) : FUNC_CALL(decay_linear)(iou, iou_max[j]); + min_decay = min(min_decay, decay); + } + + INPUT1_TYPE ds = min_decay * input_scores[INPUT1_GET_INDEX(batchId, classId, 0, sorted_score_indices[i])]; + + if (ds <= POST_THRESHOLD) + continue; + + box_info[box_info_counter].batch_idx = batchId; + box_info[box_info_counter].class_idx = classId; + box_info[box_info_counter].box_idx = sorted_score_indices[i]; + box_info[box_info_counter].score = ds; + ++box_info_counter; + } + + selected_boxes_num[batchId * NUM_CLASSES + classId] = box_info_counter; +} +#endif /* MATRIX_NMS_STAGE_0 */ + +#ifdef MATRIX_NMS_STAGE_1 +KERNEL(matrix_nms_ref_stage_1) +(__global INPUT3_TYPE* valid_outputs, __global uchar* buffer0, __global int* selected_boxes_num) { + const int batchId = get_global_id(0); + + __global BOX_INFO* box_info = (__global BOX_INFO*)buffer0; + + const int first_idx = batchId * NUM_CLASSES * MAX_BOXES_PER_CLASS; + const int last_idx = first_idx + NUM_CLASSES * MAX_BOXES_PER_CLASS; + + // TODO: consider faster sorting algorithm + FUNC_CALL(sortIterativeBoxes)(box_info, first_idx, last_idx); + + for (int i = 0; i < NUM_CLASSES; ++i) { + if (i == BACKGROUND_CLASS) + continue; + + valid_outputs[INPUT3_GET_INDEX(batchId, 0, 0, 0)] += selected_boxes_num[batchId * NUM_CLASSES + i]; + } +} +#endif /* MATRIX_NMS_STAGE_1 */ + +#ifdef MATRIX_NMS_STAGE_2 +KERNEL(matrix_nms_ref_stage_2) +(const __global INPUT0_TYPE* input_boxes, + __global OUTPUT_TYPE* output, + __global INPUT2_TYPE* selected_indices, + __global INPUT3_TYPE* valid_outputs, + __global uchar* buffer0) { + __global BOX_INFO* box_info = (__global BOX_INFO*)buffer0; + + // TODO: consider faster sorting algorithm + // and index sorting instead of data sorting +#if SORT_RESULT_ACROSS_BATCH == 1 && SORT_TYPE != 2 + FUNC_CALL(sortIterativeBoxesAcrossBatches)(box_info); +#endif + + int output_idx = 0; + int box_info_idx = 0; + for (int i = 0; i < NUM_BATCHES; ++i) { + if (KEEP_TOP_K != -1 && KEEP_TOP_K < valid_outputs[INPUT3_GET_INDEX(i, 0, 0, 0)]) + valid_outputs[INPUT3_GET_INDEX(i, 0, 0, 0)] = KEEP_TOP_K; + +#if SORT_RESULT_ACROSS_BATCH == 0 + box_info_idx = i * NUM_CLASSES * MAX_BOXES_PER_CLASS; +#endif + + unroll_for(int j = 0; j < valid_outputs[INPUT3_GET_INDEX(i, 0, 0, 0)]; ++j) { + output[OUTPUT_GET_INDEX(output_idx, 0, 0, 0)] = box_info[box_info_idx].class_idx; + output[OUTPUT_GET_INDEX(output_idx, 1, 0, 0)] = box_info[box_info_idx].score; + output[OUTPUT_GET_INDEX(output_idx, 2, 0, 0)] = + input_boxes[INPUT0_GET_INDEX(box_info[box_info_idx].batch_idx, box_info[box_info_idx].box_idx, 0, 0)]; + output[OUTPUT_GET_INDEX(output_idx, 3, 0, 0)] = + input_boxes[INPUT0_GET_INDEX(box_info[box_info_idx].batch_idx, box_info[box_info_idx].box_idx, 0, 1)]; + output[OUTPUT_GET_INDEX(output_idx, 4, 0, 0)] = + input_boxes[INPUT0_GET_INDEX(box_info[box_info_idx].batch_idx, box_info[box_info_idx].box_idx, 0, 2)]; + output[OUTPUT_GET_INDEX(output_idx, 5, 0, 0)] = + input_boxes[INPUT0_GET_INDEX(box_info[box_info_idx].batch_idx, box_info[box_info_idx].box_idx, 0, 3)]; + + selected_indices[INPUT2_GET_INDEX(output_idx, 0, 0, 0)] = + box_info[box_info_idx].batch_idx * NUM_BOXES + box_info[box_info_idx].box_idx; + + ++output_idx; + ++box_info_idx; + } + + // Paddings + while (output_idx < (i + 1) * MAX_BOXES_PER_BATCH) { + unroll_for(int j = 0; j < 6; ++j) { + output[OUTPUT_GET_INDEX(output_idx, j, 0, 0)] = -OUTPUT_VAL_ONE; + } + selected_indices[INPUT2_GET_INDEX(output_idx, 0, 0, 0)] = -INPUT2_VAL_ONE; + ++output_idx; + } + } +} +#endif /* MATRIX_NMS_STAGE_2 */ + +#undef NUM_BATCHES +#undef NUM_BOXES +#undef NUM_CLASSES +#undef BOX_INFO diff --git a/src/plugins/intel_gpu/src/kernel_selector/common_types.h b/src/plugins/intel_gpu/src/kernel_selector/common_types.h index 6287d834264..0a76ed0ed16 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/common_types.h +++ b/src/plugins/intel_gpu/src/kernel_selector/common_types.h @@ -90,6 +90,7 @@ enum class KernelType { CONVERT_COLOR, RANDOM_UNIFORM, ADAPTIVE_POOLING, + MATRIX_NMS, REVERSE, PRIOR_BOX, EYE, diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.cpp index 8632e0a1db7..170200261da 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.cpp @@ -175,6 +175,7 @@ std::string toString(KernelType kt) { return "BINARY_CONVOLUTION"; case KernelType::NON_MAX_SUPPRESSION: return "NON_MAX_SUPPRESSION"; + case KernelType::MATRIX_NMS: return "MATRIX_NMS"; default: return ""; } } diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/matrix_nms/matrix_nms_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/matrix_nms/matrix_nms_kernel_ref.cpp new file mode 100644 index 00000000000..df45a8b815a --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/matrix_nms/matrix_nms_kernel_ref.cpp @@ -0,0 +1,196 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "matrix_nms_kernel_ref.h" + +#include + +#include "kernel_selector_utils.h" + +namespace kernel_selector { + +ParamsKey MatrixNmsKernelRef::GetSupportedKey() const { + ParamsKey k; + k.EnableInputDataType(Datatype::F16); + k.EnableInputDataType(Datatype::F32); + k.EnableInputDataType(Datatype::INT32); + + k.EnableOutputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::F32); + + k.EnableDifferentTypes(); + k.EnableAllInputLayout(); + k.EnableAllOutputLayout(); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBatching(); + return k; +} + +namespace { + +MatrixNmsKernelRef::DispatchData SetDefault(const matrix_nms_params& params, size_t idx) { + MatrixNmsKernelRef::DispatchData dispatch_data; + + const auto& input_scores = params.inputs[1]; + if (idx == 0) { + dispatch_data.gws = {input_scores.Batch().v, input_scores.Feature().v, 1}; + dispatch_data.lws = GetOptimalLocalWorkGroupSizes(dispatch_data.gws, params.engineInfo); + } else if (idx == 1) { + dispatch_data.gws = {input_scores.Batch().v, 1, 1}; + dispatch_data.lws = GetOptimalLocalWorkGroupSizes(dispatch_data.gws, params.engineInfo); + } else { + dispatch_data.gws = {1, 1, 1}; + dispatch_data.lws = {1, 1, 1}; + } + + return dispatch_data; +} + +std::tuple GetMaxBoxes(const matrix_nms_params& params) { + const int classes_num = params.inputs[1].Feature().v; + const int boxes_num = params.inputs[0].Feature().v; + + int max_boxes_per_class{boxes_num}; + if (params.nms_top_k >= 0) + max_boxes_per_class = std::min(max_boxes_per_class, params.nms_top_k); + + auto classes_num_adj = classes_num; + if (params.background_class >= 0 && params.background_class < classes_num) + classes_num_adj = std::max(1, classes_num - 1); + + auto max_boxes_per_batch = max_boxes_per_class * classes_num_adj; + if (params.keep_top_k >= 0) + max_boxes_per_batch = std::min(max_boxes_per_batch, params.keep_top_k); + + return {max_boxes_per_class, max_boxes_per_batch}; +} +} // anonymous namespace + +KernelsData MatrixNmsKernelRef::GetKernelsData(const Params& params, const optional_params& options) const { + if (!Validate(params, options)) { + return {}; + } + + constexpr size_t kernels_num{3}; + KernelData kernel_data = KernelData::Default(params, kernels_num); + const matrix_nms_params& new_params = dynamic_cast(*kernel_data.params.get()); + + constexpr size_t BOX_INFO_SIZE{16}; + + const int batches_num = new_params.inputs[1].Batch().v; + const int classes_num = new_params.inputs[1].Feature().v; + + int max_boxes_per_class, max_boxes_per_batch; + std::tie(max_boxes_per_class, max_boxes_per_batch) = GetMaxBoxes(new_params); + + const size_t box_info_num = batches_num * classes_num * max_boxes_per_class; + + const size_t box_info_buffer_size = box_info_num * BOX_INFO_SIZE; + const size_t sel_boxes_num_buffer_size = batches_num * classes_num * sizeof(int); + + kernel_data.internalBufferSizes.push_back(box_info_buffer_size); + kernel_data.internalBufferSizes.push_back(sel_boxes_num_buffer_size); + kernel_data.internalBufferDataType = Datatype::F32; + + for (size_t i{}; i < kernels_num; ++i) { + auto entry_point = GetEntryPoint(kernelName, new_params.layerID, params, options, i); + auto jit_constants = GetJitConstants(new_params); + jit_constants.AddConstant(MakeJitConstant("MATRIX_NMS_STAGE_" + std::to_string(i), "true")); + + jit_constants.AddConstant(MakeJitConstant("MAX_BOXES_PER_CLASS", max_boxes_per_class)); + jit_constants.AddConstant(MakeJitConstant("MAX_BOXES_PER_BATCH", max_boxes_per_batch)); + auto jit = CreateJit(kernelName, jit_constants, entry_point); + + DispatchData dispatch_data = SetDefault(new_params, i); + auto& kernel = kernel_data.kernels[i]; + KernelBase::CheckDispatchData(kernelName, dispatch_data, params.engineInfo.maxWorkGroupSize); + kernel.params.workGroups.global = dispatch_data.gws; + kernel.params.workGroups.local = dispatch_data.lws; + kernel.code.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo); + + SetKernelArguments(new_params, kernel, i); + } + + return {kernel_data}; +} + +float MatrixNmsKernelRef::GetKernelsPriority(const Params& params, const optional_params& options) const { + return FORCE_PRIORITY_9; +} + +bool MatrixNmsKernelRef::Validate(const Params& p, const optional_params& o) const { + if (p.GetType() != KernelType::MATRIX_NMS || o.GetType() != KernelType::MATRIX_NMS) { + return false; + } + + const matrix_nms_params& params = static_cast(p); + // inputs: boxes, scores, second output, third output + if (params.inputs.size() != 4) + return false; + + return true; +} + +JitConstants MatrixNmsKernelRef::GetJitConstants(const matrix_nms_params& params) const { + JitConstants jit = MakeBaseParamsJitConstants(params); + + const auto& boxes = params.inputs[0]; + switch (boxes.GetDType()) { + case Datatype::F32: + jit.AddConstant(MakeJitConstant("COORD_TYPE_4", "float4")); + jit.AddConstant(MakeJitConstant("TINY", "1e-10f")); + break; + + case Datatype::F16: + jit.AddConstant(MakeJitConstant("COORD_TYPE_4", "half4")); + jit.AddConstant(MakeJitConstant("TINY", "1e-7h")); + break; + break; + + default: + throw std::invalid_argument("Matrix NMS boxes type should be one of F32 or F16."); + } + + jit.AddConstant(MakeJitConstant("SORT_TYPE", params.sort_type)); + jit.AddConstant(MakeJitConstant("SORT_RESULT_ACROSS_BATCH", params.sort_result_across_batch)); + jit.AddConstant(MakeJitConstant("SCORE_THRESHOLD", params.score_threshold)); + jit.AddConstant(MakeJitConstant("KEEP_TOP_K", params.keep_top_k)); + jit.AddConstant(MakeJitConstant("BACKGROUND_CLASS", params.background_class)); + jit.AddConstant(MakeJitConstant("DECAY_FUNC", params.decay)); + jit.AddConstant(MakeJitConstant("GAUSSIAN_SIGMA", params.gaussian_sigma)); + jit.AddConstant(MakeJitConstant("POST_THRESHOLD", params.post_threshold)); + jit.AddConstant(MakeJitConstant("NORM", params.normalized ? "INPUT0_VAL_ZERO" : "INPUT0_VAL_ONE")); + return jit; +} + +void MatrixNmsKernelRef::SetKernelArguments(const matrix_nms_params& params, clKernelData& kernel, size_t idx) const { + switch (idx) { + case 0: + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 0}); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 1}); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0}); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1}); + break; + + case 1: + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 3}); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0}); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1}); + break; + + case 2: + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 0}); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::OUTPUT, 0}); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 2}); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 3}); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0}); + break; + + default: + throw std::invalid_argument("Matrix NMS has 3 kernels. valid index is 0 ~ 2."); + } +} + +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/matrix_nms/matrix_nms_kernel_ref.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/matrix_nms/matrix_nms_kernel_ref.h new file mode 100644 index 00000000000..236ac5f6ffe --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/matrix_nms/matrix_nms_kernel_ref.h @@ -0,0 +1,67 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "kernel_base_opencl.h" +#include "kernel_selector_params.h" + +namespace kernel_selector { + +struct matrix_nms_params : public base_params { + matrix_nms_params() : base_params(KernelType::MATRIX_NMS) {} + + enum decay_function { GAUSSIAN, LINEAR }; + + enum sort_result_type { + CLASS_ID, // sort selected boxes by class id (ascending) in each batch element + SCORE, // sort selected boxes by score (descending) in each batch element + NONE // do not guarantee the order in each batch element + }; + + // specifies order of output elements + sort_result_type sort_type = sort_result_type::NONE; + // specifies whenever it is necessary to sort selected boxes across batches or not + bool sort_result_across_batch = false; + // specifies minimum score to consider box for the processing + float score_threshold = 0.0f; + // specifies maximum number of boxes to be selected per class, -1 meaning to + // keep all boxes + int nms_top_k = -1; + // specifies maximum number of boxes to be selected per batch element, -1 + // meaning to keep all boxes + int keep_top_k = -1; + // specifies the background class id, -1 meaning to keep all classes + int background_class = -1; + // specifies decay function used to decay scores + decay_function decay = decay_function::LINEAR; + // specifies gaussian_sigma parameter for gaussian decay_function + float gaussian_sigma = 2.0f; + // specifies threshold to filter out boxes with low confidence score after + // decaying + float post_threshold = 0.0f; + // specifies whether boxes are normalized or not + bool normalized = true; +}; + +struct matrix_nms_optional_params : optional_params { + matrix_nms_optional_params() : optional_params(KernelType::MATRIX_NMS) {} +}; + +class MatrixNmsKernelRef : public KernelBaseOpenCL { +public: + MatrixNmsKernelRef() : KernelBaseOpenCL("matrix_nms_ref") {} + + using DispatchData = CommonDispatchData; + KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; + KernelsPriority GetKernelsPriority(const Params& params, const optional_params& options) const override; + ParamsKey GetSupportedKey() const override; + +protected: + JitConstants GetJitConstants(const matrix_nms_params& params) const; + bool Validate(const Params& p, const optional_params& o) const override; + void SetKernelArguments(const matrix_nms_params& params, clKernelData& kernel, size_t idx) const; +}; + +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/matrix_nms/matrix_nms_kernel_selector.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/matrix_nms/matrix_nms_kernel_selector.cpp new file mode 100644 index 00000000000..b3adceb017a --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/matrix_nms/matrix_nms_kernel_selector.cpp @@ -0,0 +1,18 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "matrix_nms_kernel_selector.h" + +#include "matrix_nms_kernel_ref.h" + +namespace kernel_selector { + +matrix_nms_kernel_selector::matrix_nms_kernel_selector() { + Attach(); +} + +KernelsData matrix_nms_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const { + return GetNaiveBestKernel(params, options, KernelType::MATRIX_NMS); +} +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/matrix_nms/matrix_nms_kernel_selector.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/matrix_nms/matrix_nms_kernel_selector.h new file mode 100644 index 00000000000..271a34207ad --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/matrix_nms/matrix_nms_kernel_selector.h @@ -0,0 +1,21 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "kernel_selector.h" + +namespace kernel_selector { +class matrix_nms_kernel_selector : public kernel_selector_base { +public: + static matrix_nms_kernel_selector& Instance() { + static matrix_nms_kernel_selector instance_; + return instance_; + } + + matrix_nms_kernel_selector(); + + KernelsData GetBestKernels(const Params& params, const optional_params& options) const override; +}; +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/plugin/ops/matrix_nms.cpp b/src/plugins/intel_gpu/src/plugin/ops/matrix_nms.cpp new file mode 100644 index 00000000000..c539e6ca08e --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/ops/matrix_nms.cpp @@ -0,0 +1,83 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include "intel_gpu/primitives/matrix_nms.hpp" + +#include +#include + +#include "intel_gpu/plugin/common_utils.hpp" +#include "intel_gpu/plugin/program.hpp" +#include "intel_gpu/primitives/mutable_data.hpp" +#include "ngraph_ops/nms_static_shape_ie.hpp" + +namespace ngraph { +namespace op { +namespace internal { +using NmsStaticShapeIE8 = ngraph::op::internal::NmsStaticShapeIE; +} +} // namespace op +} // namespace ngraph + +namespace ov { +namespace intel_gpu { + +namespace { +void CreateNmsStaticShapeIE8Op(Program& p, const std::shared_ptr& op) { + validate_inputs_count(op, {2}); + auto inputPrimitives = p.GetInputPrimitiveIDs(op); + + std::vector shared_memory; + + auto outputIndices = op->get_output_shape(0)[0]; + cldnn::layout mutableLayoutFirst = cldnn::layout(cldnn::element_type_to_data_type(ngraph::element::i32), + cldnn::format::bfyx, + cldnn::tensor(static_cast(outputIndices), 1, 1, 1)); + + shared_memory.emplace_back(p.GetEngine().allocate_memory(mutableLayoutFirst)); + + cldnn::primitive_id matrix_nms_mutable_id_w_first = layer_type_name_ID(op) + "_md_write_first"; + auto matrix_nms_mutable_prim_first = cldnn::mutable_data(matrix_nms_mutable_id_w_first, shared_memory.back()); + p.add_primitive(*op, matrix_nms_mutable_prim_first); + inputPrimitives.push_back(matrix_nms_mutable_id_w_first); + + auto batches_num = op->get_output_shape(2)[0]; + cldnn::layout mutableLayoutSecond = cldnn::layout(cldnn::element_type_to_data_type(ngraph::element::i32), + cldnn::format::bfyx, + cldnn::tensor(static_cast(batches_num), 1, 1, 1)); + + shared_memory.emplace_back(p.GetEngine().allocate_memory(mutableLayoutSecond)); + + cldnn::primitive_id matrix_nms_mutable_id_w_second = layer_type_name_ID(op) + "_md_write_second"; + auto matrix_nms_mutable_prim_second = cldnn::mutable_data(matrix_nms_mutable_id_w_second, shared_memory.back()); + p.add_primitive(*op, matrix_nms_mutable_prim_second); + inputPrimitives.push_back(matrix_nms_mutable_id_w_second); + + auto matrixNmsLayerName = layer_type_name_ID(op) + ".out0"; + + auto prim = cldnn::matrix_nms(matrixNmsLayerName, + inputPrimitives[0], + inputPrimitives[1], + inputPrimitives[inputPrimitives.size() - 2], + inputPrimitives[inputPrimitives.size() - 1], + op->get_attrs()); + + p.add_primitive(*op, prim); + + cldnn::primitive_id matrix_nms_id_r_first = layer_type_name_ID(op) + ".out1"; + auto matrix_nms_mutable_prim_r_first = + cldnn::mutable_data(matrix_nms_id_r_first, {matrixNmsLayerName}, shared_memory.front()); + p.add_primitive(*op, matrix_nms_mutable_prim_r_first); + + cldnn::primitive_id matrix_nms_id_r_second = layer_type_name_ID(op) + ".out2"; + auto matrix_nms_mutable_prim_r_second = + cldnn::mutable_data(matrix_nms_id_r_second, {matrixNmsLayerName}, shared_memory.back()); + p.add_primitive(*op, matrix_nms_mutable_prim_r_second); +} + +} // anonymous namespace + +REGISTER_FACTORY_IMPL(internal, NmsStaticShapeIE8); + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 23930ab618d..e88028a2882 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -70,6 +70,7 @@ #include #include #include +#include #include #include #include @@ -165,6 +166,7 @@ void TransformationsPipeline::apply(std::shared_ptr func) { manager.register_pass(); manager.register_pass(); manager.register_pass(); + manager.register_pass(); manager.register_pass(); manager.register_pass(); diff --git a/src/plugins/intel_gpu/tests/test_cases/matrix_nms_gpu_test.cpp b/src/plugins/intel_gpu/tests/test_cases/matrix_nms_gpu_test.cpp new file mode 100644 index 00000000000..58f82204cf0 --- /dev/null +++ b/src/plugins/intel_gpu/tests/test_cases/matrix_nms_gpu_test.cpp @@ -0,0 +1,673 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include +#include +#include + +#include "test_utils.h" + +using namespace cldnn; +using namespace tests; + +namespace { + +#define PAD -1.0 +#define PADI -1 +#define THRESHOLD 1e-3f + +template +std::vector convert(const std::vector& v) { + return {v.begin(), v.end()}; +} + +struct matrix_nms_test_inputs { + int num_butches; + int num_boxes; + int num_classes; + int num_selected_boxes; + bool sort_result_across_batch; + float score_threshold; + int nms_top_k; + int keep_top_k; + int background_class; + float gaussian_sigma; + float post_threshold; + bool normalized; + std::vector boxes_values; + std::vector scores_values; + std::vector expected_output; + std::vector expected_selected_boxes; + std::vector expected_valid_outputs; + matrix_nms::sort_result_type sort_result_type; + matrix_nms::decay_function decay_function; + std::string test_name; +}; + +using matrix_nms_test_params = std::tuple; + +template +struct matrix_nms_gpu_test : public testing::TestWithParam { +public: + void test() { + format::type blocked_format; + matrix_nms_test_inputs test_inputs; + std::tie(test_inputs, blocked_format) = testing::TestWithParam::GetParam(); + + const auto data_type = type_to_data_type::value; + const auto plain_format = format::bfyx; + + auto& engine = get_test_engine(); + + auto boxes = engine.allocate_memory( + {data_type, plain_format, tensor{test_inputs.num_butches, test_inputs.num_boxes, 1, 4}}); + auto scores = engine.allocate_memory( + {data_type, + plain_format, + tensor{test_inputs.num_butches, test_inputs.num_classes, 1, test_inputs.num_boxes}}); + + auto selected_boxes = + engine.allocate_memory({data_types::i32, plain_format, tensor{test_inputs.num_selected_boxes, 1, 1, 1}}); + auto valid_outputs = + engine.allocate_memory({data_types::i32, plain_format, tensor{test_inputs.num_butches, 1, 1, 1}}); + + set_values(boxes, convert(test_inputs.boxes_values)); + set_values(scores, convert(test_inputs.scores_values)); + + const matrix_nms::attributes attrs(test_inputs.sort_result_type, + test_inputs.sort_result_across_batch, + test_inputs.score_threshold, + test_inputs.nms_top_k, + test_inputs.keep_top_k, + test_inputs.background_class, + test_inputs.decay_function, + test_inputs.gaussian_sigma, + test_inputs.post_threshold, + test_inputs.normalized); + + topology topology; + topology.add(input_layout("boxes", boxes->get_layout())); + topology.add(input_layout("scores", scores->get_layout())); + topology.add(mutable_data("selected_boxes", selected_boxes)); + topology.add(mutable_data("valid_outputs", valid_outputs)); + + topology.add(reorder("reordered_boxes", "boxes", blocked_format, data_type)); + topology.add(reorder("reordered_scores", "scores", blocked_format, data_type)); + + topology.add(matrix_nms("reordered_matrix_nms", + "reordered_boxes", + "reordered_scores", + "selected_boxes", + "valid_outputs", + attrs)); + topology.add(reorder("matrix_nms", "reordered_matrix_nms", plain_format, data_type)); + + network network(engine, topology); + network.set_input_data("boxes", boxes); + network.set_input_data("scores", scores); + + auto outputs = network.execute(); + + auto output = outputs.at("matrix_nms").get_memory(); + cldnn::mem_lock output_ptr(output, get_test_stream()); + + cldnn::mem_lock selected_boxes_ptr(selected_boxes, get_test_stream()); + cldnn::mem_lock valid_outputs_ptr(valid_outputs, get_test_stream()); + + const auto expected_output = convert(test_inputs.expected_output); + ASSERT_EQ(expected_output.size(), output_ptr.size()); + for (size_t i = 0; i < expected_output.size(); ++i) { + EXPECT_NEAR(expected_output[i], output_ptr[i], THRESHOLD); + } + + ASSERT_EQ(test_inputs.expected_selected_boxes.size(), selected_boxes_ptr.size()); + for (size_t i = 0; i < test_inputs.expected_selected_boxes.size(); ++i) { + EXPECT_EQ(test_inputs.expected_selected_boxes[i], selected_boxes_ptr[i]); + } + + ASSERT_EQ(test_inputs.expected_valid_outputs.size(), valid_outputs_ptr.size()); + for (size_t i = 0; i < test_inputs.expected_valid_outputs.size(); ++i) { + EXPECT_EQ(test_inputs.expected_valid_outputs[i], valid_outputs_ptr[i]); + } + } + + static std::string PrintToStringParamName(const testing::TestParamInfo& info) { + auto& test_inputs = std::get<0>(info.param); + std::ostringstream result; + + auto sort_res_type_str = + test_inputs.sort_result_type == matrix_nms::sort_result_type::score + ? "score" + : test_inputs.sort_result_type == matrix_nms::sort_result_type::class_id ? "class_id" : "none"; + auto decay_function_str = + test_inputs.decay_function == matrix_nms::decay_function::linear + ? "linear" + : test_inputs.decay_function == matrix_nms::decay_function::gaussian ? "gaussian" : "none"; + + result << "SortResultAcrossBatch=" << bool_to_str(test_inputs.sort_result_across_batch) << "_"; + result << "ScoreThreshold=" << test_inputs.score_threshold << "_"; + result << "NmsTopK=" << test_inputs.nms_top_k << "_"; + result << "KeepTopK=" << test_inputs.keep_top_k << "_"; + result << "BackgroundClass=" << test_inputs.background_class << "_"; + result << "GaussianSigma=" << test_inputs.gaussian_sigma << "_"; + result << "PostThreshold=" << test_inputs.post_threshold << "_"; + result << "Normalized=" << bool_to_str(test_inputs.normalized) << "_"; + result << "sort_result_type=" << sort_res_type_str << "_"; + result << "decay_function=" << decay_function_str << "_"; + result << "Format=" << fmt_to_str(std::get<1>(info.param)); + + if (!test_inputs.test_name.empty()) + result << "_TN=" << test_inputs.test_name; + + return result.str(); + } +}; + +matrix_nms_test_inputs get_matrix_nms_smoke_inputs() { + return {1, // num_butches + 6, // num_boxes + 2, // num_classes + 3, // num_selected_boxes + false, // sort_result_across_bch + 0.0f, // score_threshold + 3, // nms_top_k + -1, // keep_top_k + 0, // background_class + 2.0f, // gaussian_sigma + 0.0f, // post_threshold + true, // normalized + std::vector{0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, 0.0, -0.1, 1.0, 0.9, // boxes + 0.0, 10.0, 1.0, 11.0, 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0}, + std::vector{0.9, 0.75, 0.6, 0.95, 0.5, 0.3, 0.95, 0.75, 0.6, 0.80, 0.5, 0.3}, // scores + std::vector{1.00, // expected_output + 0.95, + 0.00, + 0.00, + 1.00, + 1.00, + 1.00, + 0.8, + 0.00, + 10.00, + 1.00, + 11.00, + 1.00, + 0.13636364, + 0.0, + 0.1, + 1.0, + 1.1}, + std::vector{0, 3, 1}, // expected_selected_boxes + std::vector{3}, // expected_valid_output + matrix_nms::sort_result_type::score, // sort_result_type + matrix_nms::decay_function::linear, // decay_function + "smoke"}; +} + +matrix_nms_test_inputs get_matrix_nms_gaussian_inputs() { + return {1, // num_butches + 6, // num_boxes + 2, // num_classes + 3, // num_selected_boxes + false, // sort_result_across_bch + 0.0f, // score_threshold + 3, // nms_top_k + -1, // keep_top_k + 0, // background_class + 2.0f, // gaussian_sigma + 0.0f, // post_threshold + true, // normalized + std::vector{0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, 0.0, -0.1, 1.0, 0.9, // boxes + 0.0, 10.0, 1.0, 11.0, 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0}, + std::vector{0.9, 0.75, 0.6, 0.95, 0.5, 0.3, 0.95, 0.75, 0.6, 0.80, 0.5, 0.3}, // scores + std::vector{1.00, // expected_output + 0.95, + 0.00, + 0.00, + 1.00, + 1.00, + 1.00, + 0.8, + 0.00, + 10.00, + 1.00, + 11.00, + 1.00, + 0.1966116, + 0.0, + 0.1, + 1.0, + 1.1}, + std::vector{0, 3, 1}, // expected_selected_boxes + std::vector{3}, // expected_valid_output + matrix_nms::sort_result_type::score, // sort_result_type + matrix_nms::decay_function::gaussian, // decay_function + "gaussian"}; +} + +matrix_nms_test_inputs get_matrix_nms_two_batches_two_classes_inputs() { + return {2, // num_butches + 6, // num_boxes + 2, // num_classes + 6, // num_selected_boxes + false, // sort_result_across_bch + 0.0f, // score_threshold + 3, // nms_top_k + -1, // keep_top_k + 0, // background_class + 2.0f, // gaussian_sigma + 0.0f, // post_threshold + true, // normalized + std::vector{0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, 0.0, -0.1, 1.0, 0.9, // boxes + 0.0, 10.0, 1.0, 11.0, 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0, + 0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, 0.0, -0.1, 1.0, 0.9, + 0.0, 10.0, 1.0, 11.0, 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0}, + std::vector{0.9, 0.75, 0.6, 0.95, 0.5, 0.3, 0.95, 0.75, 0.6, 0.80, 0.5, 0.3, // scores + 0.9, 0.75, 0.6, 0.95, 0.5, 0.3, 0.95, 0.75, 0.6, 0.80, 0.5, 0.3}, + std::vector{1.00, 0.95, 0.00, 0.00, 1.00, 1.00, // expected_output + 1.00, 0.8, 0.00, 10.00, 1.00, 11.00, 1.00, 0.13636364, 0.0, 0.1, + 1.0, 1.1, 1.00, 0.95, 0.00, 0.00, 1.00, 1.00, 1.00, 0.8, + 0.00, 10.00, 1.00, 11.00, 1.00, 0.13636364, 0.0, 0.1, 1.0, 1.1}, + std::vector{0, 3, 1, 6, 9, 7}, // expected_selected_boxes + std::vector{3, 3}, // expected_valid_output + matrix_nms::sort_result_type::score, // sort_result_type + matrix_nms::decay_function::linear, // decay_function + "two_batches_two_classes"}; +} + +matrix_nms_test_inputs get_matrix_nms_two_batches_two_classes_by_score_cross_batch_inputs() { + return {2, // num_butches + 6, // num_boxes + 2, // num_classes + 12, // num_selected_boxes + true, // sort_result_across_bch + 0.0f, // score_threshold + 3, // nms_top_k + -1, // keep_top_k + -1, // background_class + 2.0f, // gaussian_sigma + 0.5f, // post_threshold + true, // normalized + std::vector{0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, 0.0, -0.1, 1.0, 0.9, // boxes + 0.0, 10.0, 1.0, 11.0, 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0, + 0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, 0.0, -0.1, 1.0, 0.9, + 0.0, 10.0, 1.0, 11.0, 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0}, + std::vector{0.9, 0.75, 0.6, 0.95, 0.5, 0.3, 0.95, 0.75, 0.6, 0.80, 0.5, 0.3, // scores + 0.9, 0.75, 0.6, 0.95, 0.5, 0.3, 0.95, 0.75, 0.6, 0.80, 0.5, 0.3}, + std::vector{0.00, 0.95, 0.00, 10.00, 1.00, 11.00, // expected_output + 1.00, 0.95, 0.00, 0.00, 1.00, 1.00, 0.00, 0.95, 0.00, 10.00, 1.00, 11.00, 1.00, 0.95, + 0.00, 0.00, 1.00, 1.00, PAD, PAD, PAD, PAD, PAD, PAD, PAD, PAD, PAD, PAD, + PAD, PAD, 0.00, 0.90, 0.00, 0.00, 1.00, 1.00, 0.00, 0.90, 0.00, 0.00, 1.00, 1.00, + 1.00, 0.80, 0.00, 10.00, 1.00, 11.00, 1.00, 0.80, 0.00, 10.00, 1.00, 11.00, PAD, PAD, + PAD, PAD, PAD, PAD, PAD, PAD, PAD, PAD, PAD, PAD}, + std::vector{3, 0, 9, 6, PADI, PADI, 0, 6, 3, 9, PADI, PADI}, // expected_selected_boxes + std::vector{4, 4}, // expected_valid_output + matrix_nms::sort_result_type::score, // sort_result_type + matrix_nms::decay_function::linear, // decay_function + "two_batches_two_classes_by_score_cross_batch"}; +} + +matrix_nms_test_inputs get_matrix_nms_two_batches_two_classes_by_classid_cross_batch_inputs() { + return {2, // num_butches + 6, // num_boxes + 2, // num_classes + 12, // num_selected_boxes + true, // sort_result_across_bch + 0.0f, // score_threshold + 3, // nms_top_k + -1, // keep_top_k + -1, // background_class + 2.0f, // gaussian_sigma + 0.5f, // post_threshold + true, // normalized + std::vector{0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, 0.0, -0.1, 1.0, 0.9, // boxes + 0.0, 10.0, 1.0, 11.0, 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0, + 0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, 0.0, -0.1, 1.0, 0.9, + 0.0, 10.0, 1.0, 11.0, 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0}, + std::vector{0.9, 0.75, 0.6, 0.95, 0.5, 0.3, 0.95, 0.75, 0.6, 0.80, 0.5, 0.3, // scores + 0.9, 0.75, 0.6, 0.95, 0.5, 0.3, 0.95, 0.75, 0.6, 0.80, 0.5, 0.3}, + std::vector{0.00, 0.95, 0.00, 10.00, 1.00, 11.00, // expected_output + 0.00, 0.90, 0.00, 0.00, 1.00, 1.00, 0.00, 0.95, 0.00, 10.00, 1.00, 11.00, 0.00, 0.90, + 0.00, 0.00, 1.00, 1.00, PAD, PAD, PAD, PAD, PAD, PAD, PAD, PAD, PAD, PAD, + PAD, PAD, 1.00, 0.95, 0.00, 0.00, 1.00, 1.00, 1.00, 0.80, 0.00, 10.00, 1.00, 11.00, + 1.00, 0.95, 0.00, 0.00, 1.00, 1.00, 1.00, 0.80, 0.00, 10.00, 1.00, 11.00, PAD, PAD, + PAD, PAD, PAD, PAD, PAD, PAD, PAD, PAD, PAD, PAD}, + std::vector{3, 0, 9, 6, PADI, PADI, 0, 3, 6, 9, PADI, PADI}, // expected_selected_boxes + std::vector{4, 4}, // expected_valid_output + matrix_nms::sort_result_type::class_id, // sort_result_type + matrix_nms::decay_function::linear, // decay_function + "matrix_nms_two_batches_two_classes_by_classid_cross_batch"}; +} + +matrix_nms_test_inputs get_matrix_nms_by_keep_top_k_inputs() { + return {2, // num_butches + 6, // num_boxes + 2, // num_classes + 6, // num_selected_boxes + false, // sort_result_across_bch + 0.0f, // score_threshold + 3, // nms_top_k + 3, // keep_top_k + 0, // background_class + 2.0f, // gaussian_sigma + 0.0f, // post_threshold + true, // normalized + std::vector{0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, 0.0, -0.1, 1.0, 0.9, // boxes + 0.0, 10.0, 1.0, 11.0, 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0, + 0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, 0.0, -0.1, 1.0, 0.9, + 0.0, 10.0, 1.0, 11.0, 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0}, + std::vector{0.9, 0.75, 0.6, 0.95, 0.5, 0.3, 0.95, 0.75, 0.6, 0.80, 0.5, 0.3, // scores + 0.9, 0.75, 0.6, 0.95, 0.5, 0.3, 0.95, 0.75, 0.6, 0.80, 0.5, 0.3}, + std::vector{1.00, 0.95, 0.00, 0.00, 1.00, 1.00, // expected_output + 1.00, 0.8, 0.00, 10.00, 1.00, 11.00, 1.00, 0.13636364, 0.0, 0.1, + 1.0, 1.1, 1.00, 0.95, 0.00, 0.00, 1.00, 1.00, 1.00, 0.8, + 0.00, 10.00, 1.00, 11.00, 1.00, 0.13636364, 0.0, 0.1, 1.0, 1.1}, + std::vector{0, 3, 1, 6, 9, 7}, // expected_selected_boxes + std::vector{3, 3}, // expected_valid_output + matrix_nms::sort_result_type::class_id, // sort_result_type + matrix_nms::decay_function::linear, // decay_function + "matrix_nms_by_keep_top_k"}; +} + +matrix_nms_test_inputs get_matrix_nms_background_inputs() { + return {1, // num_butches + 6, // num_boxes + 2, // num_classes + 6, // num_selected_boxes + false, // sort_result_across_bch + 0.0f, // score_threshold + 3, // nms_top_k + -1, // keep_top_k + -1, // background_class + 2.0f, // gaussian_sigma + 0.0f, // post_threshold + true, // normalized + std::vector{0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, 0.0, -0.1, 1.0, 0.9, // boxes + 0.0, 10.0, 1.0, 11.0, 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0}, + std::vector{0.9, 0.75, 0.6, 0.95, 0.5, 0.3, 0.95, 0.75, 0.6, 0.80, 0.5, 0.3}, // scores + std::vector{0.00, 0.95, 0.0, 10.0, 1.0, 11.0, // expected_output + 1.00, 0.95, 0.0, 0.0, 1.0, 1.0, 0.00, 0.9, 0.0, 0.0, + 1.0, 1.0, 1.00, 0.8, 0.0, 10.0, 1.0, 11.0, 0.00, 0.13636364, + 0.0, 0.1, 1.0, 1.1, 1.00, 0.13636364, 0.0, 0.1, 1.0, 1.1}, + std::vector{3, 0, 0, 3, 1, 1}, // expected_selected_boxes + std::vector{6}, // expected_valid_output + matrix_nms::sort_result_type::score, // sort_result_type + matrix_nms::decay_function::linear, // decay_function + "matrix_nms_background"}; +} + +matrix_nms_test_inputs get_matrix_nms_flipped_coordinates_inputs() { + return {1, // num_butches + 6, // num_boxes + 1, // num_classes + 3, // num_selected_boxes + false, // sort_result_across_bch + 0.0f, // score_threshold + 3, // nms_top_k + -1, // keep_top_k + -1, // background_class + 2.0f, // gaussian_sigma + 0.0f, // post_threshold + true, // normalized + std::vector{1.0, 1.0, 0.0, 0.0, 0.0, 0.1, 1.0, 1.1, 0.0, 0.9, 1.0, -0.1, // boxes + 0.0, 10.0, 1.0, 11.0, 1.0, 10.1, 0.0, 11.1, 1.0, 101.0, 0.0, 100.0}, + std::vector{0.9, 0.75, 0.6, 0.95, 0.5, 0.3}, // scores + std::vector{0.00, + 0.95, + 0.0, + 10.0, + 1.0, + 11.0, // expected_output + 0.00, + 0.9, + 1.0, + 1.0, + 0.0, + 0.0, + 0.00, + 0.75, + 0.0, + 0.1, + 1.0, + 1.1}, + std::vector{3, 0, 1}, // expected_selected_boxes + std::vector{3}, // expected_valid_output + matrix_nms::sort_result_type::score, // sort_result_type + matrix_nms::decay_function::linear, // decay_function + "flipped_coordinates"}; +} + +matrix_nms_test_inputs get_matrix_nms_post_threshold_inputs() { + return {1, // num_butches + 6, // num_boxes + 1, // num_classes + 3, // num_selected_boxes + false, // sort_result_across_bch + 0.0f, // score_threshold + 3, // nms_top_k + -1, // keep_top_k + -1, // background_class + 2.0f, // gaussian_sigma + 0.8f, // post_threshold + true, // normalized + std::vector{0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, 0.0, -0.1, 1.0, 0.9, // boxes + 0.0, 10.0, 1.0, 11.0, 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0}, + std::vector{0.9, 0.75, 0.6, 0.95, 0.5, 0.3}, // scores + std::vector{0.00, + 0.95, + 0.00, + 10.00, + 1.00, + 11.00, // expected_output + 0.00, + 0.9, + 0.00, + 0.00, + 1.00, + 1.00, + PAD, + PAD, + PAD, + PAD, + PAD, + PAD}, + std::vector{3, 0, PADI}, // expected_selected_boxes + std::vector{2}, // expected_valid_output + matrix_nms::sort_result_type::score, // sort_result_type + matrix_nms::decay_function::linear, // decay_function + "post_threshold"}; +} + +matrix_nms_test_inputs get_matrix_nms_identical_boxes_inputs() { + return {1, // num_butches + 10, // num_boxes + 1, // num_classes + 3, // num_selected_boxes + false, // sort_result_across_bch + 0.0f, // score_threshold + 3, // nms_top_k + -1, // keep_top_k + -1, // background_class + 2.0f, // gaussian_sigma + 0.3f, // post_threshold + true, // normalized + std::vector{0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, // boxes + 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, + 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0}, + std::vector{0.4, 0.01, 0.2, 0.09, 0.15, 0.05, 0.02, 0.03, 0.05, 0.0}, // scores + std::vector{0.00, + 0.40, + 0.00, + 0.00, + 1.00, + 1.00, // expected_output + PAD, + PAD, + PAD, + PAD, + PAD, + PAD, + PAD, + PAD, + PAD, + PAD, + PAD, + PAD}, + std::vector{0, PADI, PADI}, // expected_selected_boxes + std::vector{1}, // expected_valid_output + matrix_nms::sort_result_type::score, // sort_result_type + matrix_nms::decay_function::linear, // decay_function + "identical_boxes"}; +}; + +matrix_nms_test_inputs get_matrix_nms_top_k_inputs() { + return {1, // num_butches + 6, // num_boxes + 1, // num_classes + 2, // num_selected_boxes + false, // sort_result_across_bch + 0.0f, // score_threshold + 2, // nms_top_k + -1, // keep_top_k + -1, // background_class + 2.0f, // gaussian_sigma + 0.0f, // post_threshold + true, // normalized + std::vector{0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, 0.0, -0.1, 1.0, 0.9, // boxes + 0.0, 10.0, 1.0, 11.0, 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0}, + std::vector{0.9, 0.75, 0.6, 0.95, 0.5, 0.3}, // scores + std::vector{0.00, + 0.95, + 0.00, + 10.00, + 1.00, + 11.00, // expected_output + 0.00, + 0.90, + 0.00, + 0.00, + 1.00, + 1.00}, + std::vector{3, 0}, // expected_selected_boxes + std::vector{2}, // expected_valid_output + matrix_nms::sort_result_type::score, // sort_result_type + matrix_nms::decay_function::linear, // decay_function + "matrix_nms_nms_top_k"}; +} + +matrix_nms_test_inputs get_matrix_nms_single_box_inputs() { + return {1, // num_butches + 1, // num_boxes + 1, // num_classes + 1, // num_selected_boxes + false, // sort_result_across_bch + 0.0f, // score_threshold + 3, // nms_top_k + -1, // keep_top_k + -1, // background_class + 2.0f, // gaussian_sigma + 0.0f, // post_threshold + true, // normalized + std::vector{0.0, 0.0, 1.0, 1.0}, // boxes + std::vector{0.9}, // scores + std::vector{0.00, 0.90, 0.00, 0.00, 1.00, 1.00}, // expected_output + std::vector{0}, // expected_selected_boxes + std::vector{1}, // expected_valid_output + matrix_nms::sort_result_type::score, // sort_result_type + matrix_nms::decay_function::linear, // decay_function + "matrix_nms_single_box"}; +} + +matrix_nms_test_inputs get_matrix_nms_no_output_inputs() { + return {1, // num_butches + 6, // num_boxes + 1, // num_classes + 3, // num_selected_boxes + false, // sort_result_across_bch + 2.0f, // score_threshold + 3, // nms_top_k + -1, // keep_top_k + -1, // background_class + 2.0f, // gaussian_sigma + 0.0f, // post_threshold + true, // normalized + std::vector{0.0, 0.0, 1.0, 1.0, 0.0, 0.1, 1.0, 1.1, 0.0, -0.1, 1.0, 0.9, // boxes + 0.0, 10.0, 1.0, 11.0, 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0}, + std::vector{0.9, 0.75, 0.6, 0.95, 0.5, 0.3}, // scores + std::vector{PAD, + PAD, + PAD, + PAD, + PAD, + PAD, // expected_output + PAD, + PAD, + PAD, + PAD, + PAD, + PAD, + PAD, + PAD, + PAD, + PAD, + PAD, + PAD}, + std::vector{PADI, PADI, PADI}, // expected_selected_boxes + std::vector{0}, // expected_valid_output + matrix_nms::sort_result_type::score, // sort_result_type + matrix_nms::decay_function::linear, // decay_function + "matrix_nms_no_output"}; +} + +const std::vector layout_formats = {format::bfyx, + format::b_fs_yx_fsv16, + format::b_fs_yx_fsv32, + format::bs_fs_yx_bsv16_fsv16, + format::bs_fs_yx_bsv32_fsv32, + format::bs_fs_yx_bsv32_fsv16}; + +#define INSTANTIATE_MATRIX_NMS_TEST_SUITE(input_type, func) \ + using matrix_nms_gpu_test_##input_type##func = matrix_nms_gpu_test; \ + TEST_P(matrix_nms_gpu_test_##input_type##func, test) { \ + test(); \ + } \ + INSTANTIATE_TEST_SUITE_P(matrix_nms_test_##input_type##func, \ + matrix_nms_gpu_test_##input_type##func, \ + testing::Combine(testing::Values(func()), testing::ValuesIn(layout_formats)), \ + matrix_nms_gpu_test_##input_type##func::PrintToStringParamName); + +INSTANTIATE_MATRIX_NMS_TEST_SUITE(float, get_matrix_nms_smoke_inputs) +INSTANTIATE_MATRIX_NMS_TEST_SUITE(float, get_matrix_nms_gaussian_inputs) +INSTANTIATE_MATRIX_NMS_TEST_SUITE(float, get_matrix_nms_two_batches_two_classes_inputs) +INSTANTIATE_MATRIX_NMS_TEST_SUITE(float, get_matrix_nms_two_batches_two_classes_by_classid_cross_batch_inputs) +INSTANTIATE_MATRIX_NMS_TEST_SUITE(float, get_matrix_nms_two_batches_two_classes_by_score_cross_batch_inputs) +INSTANTIATE_MATRIX_NMS_TEST_SUITE(float, get_matrix_nms_by_keep_top_k_inputs) +INSTANTIATE_MATRIX_NMS_TEST_SUITE(float, get_matrix_nms_background_inputs) +INSTANTIATE_MATRIX_NMS_TEST_SUITE(float, get_matrix_nms_flipped_coordinates_inputs) +INSTANTIATE_MATRIX_NMS_TEST_SUITE(float, get_matrix_nms_post_threshold_inputs) +INSTANTIATE_MATRIX_NMS_TEST_SUITE(float, get_matrix_nms_identical_boxes_inputs) +INSTANTIATE_MATRIX_NMS_TEST_SUITE(float, get_matrix_nms_top_k_inputs) +INSTANTIATE_MATRIX_NMS_TEST_SUITE(float, get_matrix_nms_single_box_inputs) +INSTANTIATE_MATRIX_NMS_TEST_SUITE(float, get_matrix_nms_no_output_inputs) + +INSTANTIATE_MATRIX_NMS_TEST_SUITE(FLOAT16, get_matrix_nms_smoke_inputs) +INSTANTIATE_MATRIX_NMS_TEST_SUITE(FLOAT16, get_matrix_nms_gaussian_inputs) +INSTANTIATE_MATRIX_NMS_TEST_SUITE(FLOAT16, get_matrix_nms_two_batches_two_classes_inputs) +INSTANTIATE_MATRIX_NMS_TEST_SUITE(FLOAT16, get_matrix_nms_by_keep_top_k_inputs) +INSTANTIATE_MATRIX_NMS_TEST_SUITE(FLOAT16, get_matrix_nms_two_batches_two_classes_by_classid_cross_batch_inputs) +INSTANTIATE_MATRIX_NMS_TEST_SUITE(FLOAT16, get_matrix_nms_two_batches_two_classes_by_score_cross_batch_inputs) +INSTANTIATE_MATRIX_NMS_TEST_SUITE(FLOAT16, get_matrix_nms_background_inputs) +INSTANTIATE_MATRIX_NMS_TEST_SUITE(FLOAT16, get_matrix_nms_flipped_coordinates_inputs) +INSTANTIATE_MATRIX_NMS_TEST_SUITE(FLOAT16, get_matrix_nms_post_threshold_inputs) +INSTANTIATE_MATRIX_NMS_TEST_SUITE(FLOAT16, get_matrix_nms_identical_boxes_inputs) +INSTANTIATE_MATRIX_NMS_TEST_SUITE(FLOAT16, get_matrix_nms_top_k_inputs) +INSTANTIATE_MATRIX_NMS_TEST_SUITE(FLOAT16, get_matrix_nms_single_box_inputs) +INSTANTIATE_MATRIX_NMS_TEST_SUITE(FLOAT16, get_matrix_nms_no_output_inputs) + +#undef INSTANTIATE_MATRIX_NMS_TEST_SUITE + +} // namespace diff --git a/src/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/matrix_nms.cpp b/src/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/matrix_nms.cpp new file mode 100644 index 00000000000..fd615903646 --- /dev/null +++ b/src/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/matrix_nms.cpp @@ -0,0 +1,48 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "single_layer_tests/matrix_nms.hpp" + +#include +#include + +#include "common_test_utils/test_constants.hpp" + +using namespace ov::test::subgraph; +using namespace InferenceEngine; +using namespace ngraph; + +const std::vector> inStaticShapeParams = {{{3, 100, 4}, {3, 1, 100}}, + {{1, 10, 4}, {1, 100, 10}}}; + +const auto inputPrecisions = InputPrecisions{ov::element::f32, ov::element::i32, ov::element::f32}; + +const std::vector sortResultType = {op::v8::MatrixNms::SortResultType::CLASSID, + op::v8::MatrixNms::SortResultType::SCORE, + op::v8::MatrixNms::SortResultType::NONE}; +const std::vector outType = {element::i32, element::i64}; +const std::vector topKParams = {TopKParams{-1, 5}, TopKParams{100, -1}}; +const std::vector thresholdParams = {ThresholdParams{0.0f, 2.0f, 0.0f}, + ThresholdParams{0.1f, 1.5f, 0.2f}}; +const std::vector backgroudClass = {-1, 1}; +const std::vector normalized = {true, false}; +const std::vector decayFunction = {op::v8::MatrixNms::DecayFunction::GAUSSIAN, + op::v8::MatrixNms::DecayFunction::LINEAR}; + +const auto nmsParamsStatic = + ::testing::Combine(::testing::ValuesIn(ov::test::static_shapes_to_test_representation(inStaticShapeParams)), + ::testing::Values(inputPrecisions), + ::testing::ValuesIn(sortResultType), + ::testing::ValuesIn(outType), + ::testing::ValuesIn(topKParams), + ::testing::ValuesIn(thresholdParams), + ::testing::ValuesIn(backgroudClass), + ::testing::ValuesIn(normalized), + ::testing::ValuesIn(decayFunction), + ::testing::Values(CommonTestUtils::DEVICE_GPU)); + +INSTANTIATE_TEST_SUITE_P(smoke_MatrixNmsLayerTest_static, + MatrixNmsLayerTest, + nmsParamsStatic, + MatrixNmsLayerTest::getTestCaseName); diff --git a/src/tests/functional/shared_test_classes/src/single_layer/matrix_nms.cpp b/src/tests/functional/shared_test_classes/src/single_layer/matrix_nms.cpp index 3c1e2c418e9..6d8f3c79de0 100644 --- a/src/tests/functional/shared_test_classes/src/single_layer/matrix_nms.cpp +++ b/src/tests/functional/shared_test_classes/src/single_layer/matrix_nms.cpp @@ -325,7 +325,10 @@ void MatrixNmsLayerTest::SetUp() { const auto paramOuts = ngraph::helpers::convert2OutputVector(ngraph::helpers::castOps2Nodes(params)); auto nms = std::make_shared(paramOuts[0], paramOuts[1], m_attrs); - if (!m_outStaticShape) { + + if (targetDevice == CommonTestUtils::DEVICE_GPU) { + function = std::make_shared(nms, params, "MatrixNMS"); + } else if (!m_outStaticShape) { auto result = std::make_shared(nms); function = std::make_shared(result, params, "MatrixNMS"); } else {