From 57571d36e6c9717d5f73dfb54cbe5b8ff4fa8361 Mon Sep 17 00:00:00 2001 From: Nikolay Shchegolev Date: Tue, 31 Oct 2023 16:10:52 +0400 Subject: [PATCH] [CPU] NMSRotated operation implementation. (#20410) --- .../sort/NMSRotated_13.md | 4 +- .../src/transformations/convert_precision.cpp | 47 + src/plugins/intel_cpu/src/cpu_types.cpp | 1 + src/plugins/intel_cpu/src/node.cpp | 37 +- src/plugins/intel_cpu/src/node.h | 1 + .../nodes/kernels/x64/non_max_suppression.cpp | 465 +++++ .../nodes/kernels/x64/non_max_suppression.hpp | 152 ++ .../src/nodes/non_max_suppression.cpp | 1695 ++++++++--------- .../intel_cpu/src/nodes/non_max_suppression.h | 172 +- .../skip_tests_config.cpp | 2 + .../instances/common/nms_rotated.cpp | 95 + .../non_max_suppression.cpp | 30 +- .../include/single_op_tests/nms_rotated.hpp | 15 + .../single_op/nms_rotated.hpp | 47 + .../src/single_op/nms_rotated.cpp | 207 ++ .../skip_configs/CPU/expected_failures_OP.csv | 1 - 16 files changed, 1906 insertions(+), 1065 deletions(-) create mode 100644 src/plugins/intel_cpu/src/nodes/kernels/x64/non_max_suppression.cpp create mode 100644 src/plugins/intel_cpu/src/nodes/kernels/x64/non_max_suppression.hpp create mode 100644 src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/common/nms_rotated.cpp create mode 100644 src/tests/functional/plugin/shared/include/single_op_tests/nms_rotated.hpp create mode 100644 src/tests/functional/shared_test_classes/include/shared_test_classes/single_op/nms_rotated.hpp create mode 100644 src/tests/functional/shared_test_classes/src/single_op/nms_rotated.cpp diff --git a/docs/articles_en/documentation/openvino_ir/operation_sets/operations_specifications/sort/NMSRotated_13.md b/docs/articles_en/documentation/openvino_ir/operation_sets/operations_specifications/sort/NMSRotated_13.md index 5ae29954802..964f9bdb522 100644 --- a/docs/articles_en/documentation/openvino_ir/operation_sets/operations_specifications/sort/NMSRotated_13.md +++ 
b/docs/articles_en/documentation/openvino_ir/operation_sets/operations_specifications/sort/NMSRotated_13.md @@ -27,13 +27,13 @@ The general algorithm is described below: Here ``func(rotated_iou(b_i, b)) = 1 if rotated_iou(b_i, b) <= iou_threshold else 0``. -Having two bouding boxes ``B1`` and ``B2`` the following steps are performed to calculate ``rotated_iou(B1, B2)``: +Having two bounding boxes ``B1`` and ``B2`` the following steps are performed to calculate ``rotated_iou(B1, B2)``: 1. Calculate rotated vertices, (x, y) coordinates of the 4 corners of each box transformed by the corresponding angle in radians according to the direction specified by the *clockwise* attribute. 2. Find all intersection points between edges of ``B1`` and ``B2``. Add them to the ``intersection_points``. 3. Find all corners of ``B1`` within area of ``B2``, and all corners of ``B2`` within area of ``B1``. Add them to the ``intersection_points``. 4. Calculate ``intersection_area`` of the polygon described by ``intersection_points`` (see Shoelace formula). -5. Calculate ``union_area`` (the common area of ``B1`` and ``B2``), `union_area = (B1_area + B2_area) - intersection_area`. +5. Calculate ``union_area`` (the common area of ``B1`` and ``B2``), `union_area = B1_area + B2_area`. 6. Return intersection over union ``rotated_iou = intersection_area / (union_area - intersection_area)``. 
diff --git a/src/common/transformations/src/transformations/convert_precision.cpp b/src/common/transformations/src/transformations/convert_precision.cpp index a1e9dd7a820..4fd52934dd4 100644 --- a/src/common/transformations/src/transformations/convert_precision.cpp +++ b/src/common/transformations/src/transformations/convert_precision.cpp @@ -49,6 +49,7 @@ bool fuse_type_to_nms3(const std::shared_ptr& node, const precisions_m bool fuse_type_to_nms4(const std::shared_ptr& node, const precisions_map& precisions); bool fuse_type_to_nms5(const std::shared_ptr& node, const precisions_map& precisions); bool fuse_type_to_nms9(const std::shared_ptr& node, const precisions_map& precisions); +bool fuse_type_to_nms_rotated(const std::shared_ptr& node, const precisions_map& precisions); bool fuse_type_to_matrix_nms(const std::shared_ptr& node, const precisions_map& precisions); bool fuse_type_to_multiclass_nms(const std::shared_ptr& node, const precisions_map& precisions); bool fuse_type_to_generate_proposals(const std::shared_ptr& node, const precisions_map& precisions); @@ -383,6 +384,7 @@ bool ov::pass::ConvertPrecision::run_on_model(const std::shared_ptr& {opset4::NonMaxSuppression::get_type_info_static(), fuse_type_to_nms4}, {opset5::NonMaxSuppression::get_type_info_static(), fuse_type_to_nms5}, {opset9::NonMaxSuppression::get_type_info_static(), fuse_type_to_nms9}, + {op::v13::NMSRotated::get_type_info_static(), fuse_type_to_nms_rotated}, {opset8::MatrixNms::get_type_info_static(), fuse_type_to_matrix_nms}, {opset8::MulticlassNms::get_type_info_static(), fuse_type_to_multiclass_nms}, {opset9::MulticlassNms::get_type_info_static(), fuse_type_to_multiclass_nms}, @@ -691,6 +693,51 @@ bool fuse_type_to_nms9(const std::shared_ptr& node, const precisions_m return res; } +bool fuse_type_to_nms_rotated(const std::shared_ptr& node, const precisions_map& precisions) { + auto nms = ov::as_type_ptr(node); + if (!nms) { + return false; + } + + bool res = false; + auto it = 
precisions.find(node->get_output_element_type(0)); + if (it != precisions.end()) { + const auto& to = it->second; + if (to == ov::element::i32 || to == ov::element::i64) { + nms->set_output_type_attr(to); + res = true; + if (precisions.count(node->get_output_element_type(1)) == 0) { + return res; + } + } + } + + auto type_relaxed = std::dynamic_pointer_cast(node); + ov::element::TypeVector output_types; + for (size_t i = 0; i < node->get_output_size(); i++) { + it = precisions.find(node->get_output_element_type(i)); + if (it == precisions.end()) { + output_types.push_back(node->get_output_element_type(i)); + continue; + } + const auto& to = it->second; + if (type_relaxed) { + type_relaxed->set_overridden_output_type(to, i); + res = true; + } + output_types.push_back(to); + } + + if (!type_relaxed) { + auto relaxed_op = + std::make_shared>(*nms, ov::element::TypeVector{}, output_types); + replace_node(node, relaxed_op); + res = true; + } + + return res; +} + namespace { bool update_type(size_t idx, diff --git a/src/plugins/intel_cpu/src/cpu_types.cpp b/src/plugins/intel_cpu/src/cpu_types.cpp index 139685f5882..56cdbe32a2d 100644 --- a/src/plugins/intel_cpu/src/cpu_types.cpp +++ b/src/plugins/intel_cpu/src/cpu_types.cpp @@ -201,6 +201,7 @@ static const TypeToNameMap& get_type_to_name_tbl() { { "ExtractImagePatches", Type::ExtractImagePatches}, { "NonMaxSuppression", Type::NonMaxSuppression}, { "NonMaxSuppressionIEInternal", Type::NonMaxSuppression}, + { "NMSRotated", Type::NonMaxSuppression}, { "MatrixNms", Type::MatrixNms}, { "MulticlassNms", Type::MulticlassNms}, { "MulticlassNmsIEInternal", Type::MulticlassNms}, diff --git a/src/plugins/intel_cpu/src/node.cpp b/src/plugins/intel_cpu/src/node.cpp index ab02ae44dd6..c36815ee048 100644 --- a/src/plugins/intel_cpu/src/node.cpp +++ b/src/plugins/intel_cpu/src/node.cpp @@ -615,26 +615,31 @@ bool Node::outputShapeDataDependency() const { void Node::redefineOutputMemory(const std::vector &newOutputShapes) { if 
(newOutputShapes.size() != outputShapes.size()) { - IE_THROW() << "Number shapes mismatch with real outputs number for node with name: " << getName(); + THROW_CPU_NODE_ERR("has shapes number mismatch with real outputs number."); } - for (size_t i = 0; i < outputShapes.size(); i++) { - const auto edges = getChildEdgesAtPort(i); + for (size_t i = 0lu; i < outputShapes.size(); i++) { + redefineOutputMemory(i, newOutputShapes[i]); + } +} - // avoid 0D shape incompatible - auto newOutputShape = newOutputShapes[i]; - if (newOutputShape.empty()) { - newOutputShape.push_back(1); - } +void Node::redefineOutputMemory(const size_t port, const VectorDims& new_output_shape) { + const auto edges = getChildEdgesAtPort(port); - const auto &currDesc = edges[0]->getMemory().getDesc(); - if (currDesc.getShape().isStatic() && currDesc.getShape().getStaticDims() == newOutputShape) - continue; + // avoid 0D shape incompatible + auto new_shape = new_output_shape; + if (new_shape.empty()) { + new_shape.push_back(1); + } - const bool hasZeroDims = std::count(std::begin(newOutputShape), std::end(newOutputShape), 0) > 0; - const auto memDesc = getBaseMemDescAtOutputPort(i)->cloneWithNewDims(newOutputShape, hasZeroDims); - for (size_t j = 0; j < edges.size(); j++) { - edges[j]->getMemoryPtr()->redefineDesc(memDesc); - } + const auto& curr_desc = edges[0]->getMemory().getDesc(); + if (curr_desc.getShape().isStatic() && curr_desc.getShape().getStaticDims() == new_shape) { + return; + } + + const bool has_zero_dims = std::count(std::begin(new_shape), std::end(new_shape), 0lu) > 0; + const auto mem_desc = getBaseMemDescAtOutputPort(port)->cloneWithNewDims(new_shape, has_zero_dims); + for (size_t j = 0lu; j < edges.size(); j++) { + edges[j]->getMemoryPtr()->redefineDesc(mem_desc); } } diff --git a/src/plugins/intel_cpu/src/node.h b/src/plugins/intel_cpu/src/node.h index 864c08a95b0..4b6fa3a87f7 100644 --- a/src/plugins/intel_cpu/src/node.h +++ b/src/plugins/intel_cpu/src/node.h @@ -366,6 +366,7 @@ 
public: void updateDynamicParams(); void executeDynamic(dnnl::stream strm); virtual void redefineOutputMemory(const std::vector &newShapes); + void redefineOutputMemory(const size_t port, const VectorDims& new_output_shape); bool outputShapeDataDependency() const; virtual void initSupportedPrimitiveDescriptors(); diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/non_max_suppression.cpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/non_max_suppression.cpp new file mode 100644 index 00000000000..f9c665ec9c5 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/non_max_suppression.cpp @@ -0,0 +1,465 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "non_max_suppression.hpp" +#include "utils/general_utils.h" + +using namespace InferenceEngine; +using namespace dnnl::impl::cpu; + +#define GET_OFF(field) offsetof(NmsCallArgs, field) + +namespace ov { +namespace intel_cpu { +namespace kernel { + +template +void NonMaxSuppression::generate() { + load_vector_emitter.reset(new jit_load_emitter(this, isa, Precision::FP32, Precision::FP32, vector_step)); + load_scalar_emitter.reset(new jit_load_emitter(this, isa, Precision::FP32, Precision::FP32, scalar_step)); + + exp_injector.reset(new x64::jit_uni_eltwise_injector_f32(this, dnnl::impl::alg_kind::eltwise_exp, 0.f, 0.f, 1.f)); + + this->preamble(); + + uni_vpxor(vmm_zero, vmm_zero, vmm_zero); + + load_pool_gpr_idxs = {static_cast(reg_load_store_mask.getIdx()), static_cast(reg_load_table.getIdx())}; + store_pool_gpr_idxs = {static_cast(reg_load_store_mask.getIdx())}; + store_pool_vec_idxs = {static_cast(vmm_zero.getIdx())}; + + mov(reg_boxes_coord0, ptr[reg_params + GET_OFF(selected_boxes_coord[0])]); + mov(reg_boxes_coord1, ptr[reg_params + GET_OFF(selected_boxes_coord[0]) + 1 * sizeof(size_t)]); + mov(reg_boxes_coord2, ptr[reg_params + GET_OFF(selected_boxes_coord[0]) + 2 * sizeof(size_t)]); + mov(reg_boxes_coord3, ptr[reg_params + 
GET_OFF(selected_boxes_coord[0]) + 3 * sizeof(size_t)]); + mov(reg_candidate_box, ptr[reg_params + GET_OFF(candidate_box)]); + mov(reg_candidate_status, ptr[reg_params + GET_OFF(candidate_status)]); + mov(reg_boxes_num, ptr[reg_params + GET_OFF(selected_boxes_num)]); + mov(reg_iou_threshold, ptr[reg_params + GET_OFF(iou_threshold)]); + // soft + mov(reg_score_threshold, ptr[reg_params + GET_OFF(score_threshold)]); + mov(reg_score, ptr[reg_params + GET_OFF(score)]); + mov(reg_scale, ptr[reg_params + GET_OFF(scale)]); + + // could use rcx(reg_table) and rdi(reg_temp) now as abi parse finished + mov(reg_table, l_table_constant); + if (x64::mayiuse(x64::avx512_core)) { + kmovw(k_mask_one, word[reg_table + vlen]); + } + uni_vbroadcastss(vmm_iou_threshold, ptr[reg_iou_threshold]); + uni_vbroadcastss(vmm_score_threshold, ptr[reg_score_threshold]); + + uni_vbroadcastss(vmm_candidate_coord0, ptr[reg_candidate_box]); + uni_vbroadcastss(vmm_candidate_coord1, ptr[reg_candidate_box + 1 * sizeof(float)]); + uni_vbroadcastss(vmm_candidate_coord2, ptr[reg_candidate_box + 2 * sizeof(float)]); + uni_vbroadcastss(vmm_candidate_coord3, ptr[reg_candidate_box + 3 * sizeof(float)]); + + if (m_jcp.box_encode_type == NMSBoxEncodeType::CORNER) { + // box format: y1, x1, y2, x2 + uni_vminps(vmm_temp1, vmm_candidate_coord0, vmm_candidate_coord2); + uni_vmaxps(vmm_temp2, vmm_candidate_coord0, vmm_candidate_coord2); + uni_vmovups(vmm_candidate_coord0, vmm_temp1); + uni_vmovups(vmm_candidate_coord2, vmm_temp2); + + uni_vminps(vmm_temp1, vmm_candidate_coord1, vmm_candidate_coord3); + uni_vmaxps(vmm_temp2, vmm_candidate_coord1, vmm_candidate_coord3); + uni_vmovups(vmm_candidate_coord1, vmm_temp1); + uni_vmovups(vmm_candidate_coord3, vmm_temp2); + } else { + // box format: x_center, y_center, width, height --> y1, x1, y2, x2 + uni_vmulps(vmm_temp1, vmm_candidate_coord2, ptr[reg_table]); // width/2 + uni_vmulps(vmm_temp2, vmm_candidate_coord3, ptr[reg_table]); // height/2 + + uni_vaddps(vmm_temp3, 
vmm_candidate_coord0, vmm_temp1); // x_center + width/2 + uni_vmovups(vmm_candidate_coord3, vmm_temp3); + + uni_vaddps(vmm_temp3, vmm_candidate_coord1, vmm_temp2); // y_center + height/2 + uni_vmovups(vmm_candidate_coord2, vmm_temp3); + + uni_vsubps(vmm_temp3, vmm_candidate_coord0, vmm_temp1); // x_center - width/2 + uni_vsubps(vmm_temp4, vmm_candidate_coord1, vmm_temp2); // y_center - height/2 + + uni_vmovups(vmm_candidate_coord1, vmm_temp3); + uni_vmovups(vmm_candidate_coord0, vmm_temp4); + } + + // check from last to first + imul(reg_temp_64, reg_boxes_num, sizeof(float)); + add(reg_boxes_coord0, reg_temp_64); // y1 + add(reg_boxes_coord1, reg_temp_64); // x1 + add(reg_boxes_coord2, reg_temp_64); // y2 + add(reg_boxes_coord3, reg_temp_64); // x2 + + Xbyak::Label hard_nms_label; + Xbyak::Label nms_end_label; + + mov(reg_temp_32, ptr[reg_scale]); + test(reg_temp_32, reg_temp_32); + jz(hard_nms_label, T_NEAR); + + soft_nms(); + + jmp(nms_end_label, T_NEAR); + + L(hard_nms_label); + + hard_nms(); + + L(nms_end_label); + + this->postamble(); + + load_vector_emitter->emit_data(); + load_scalar_emitter->emit_data(); + + prepare_table(); + exp_injector->prepare_table(); +} + + +template +void NonMaxSuppression::hard_nms() { + Xbyak::Label main_loop_label_hard; + Xbyak::Label main_loop_end_label_hard; + Xbyak::Label tail_loop_label_hard; + Xbyak::Label terminate_label_hard; + L(main_loop_label_hard); + { + cmp(reg_boxes_num, vector_step); + jl(main_loop_end_label_hard, T_NEAR); + + sub(reg_boxes_coord0, vector_step * sizeof(float)); + sub(reg_boxes_coord1, vector_step * sizeof(float)); + sub(reg_boxes_coord2, vector_step * sizeof(float)); + sub(reg_boxes_coord3, vector_step * sizeof(float)); + + // iou result is in vmm_temp3 + iou(vector_step); + + sub(reg_boxes_num, vector_step); + + suppressed_by_iou(false); + + // if zero continue, else set result to suppressed and terminate + jz(main_loop_label_hard, T_NEAR); + + uni_vpextrd(ptr[reg_candidate_status], 
Xbyak::Xmm(vmm_zero.getIdx()), 0); + + jmp(terminate_label_hard, T_NEAR); + } + L(main_loop_end_label_hard); + + L(tail_loop_label_hard); + { + cmp(reg_boxes_num, 1); + jl(terminate_label_hard, T_NEAR); + + sub(reg_boxes_coord0, scalar_step * sizeof(float)); + sub(reg_boxes_coord1, scalar_step * sizeof(float)); + sub(reg_boxes_coord2, scalar_step * sizeof(float)); + sub(reg_boxes_coord3, scalar_step * sizeof(float)); + + // iou result is in vmm_temp3 + iou(scalar_step); + + sub(reg_boxes_num, scalar_step); + + suppressed_by_iou(true); + + jz(tail_loop_label_hard, T_NEAR); + + uni_vpextrd(ptr[reg_candidate_status], Xbyak::Xmm(vmm_zero.getIdx()), 0); + + jmp(terminate_label_hard, T_NEAR); + } + + L(terminate_label_hard); +} + +template +void NonMaxSuppression::soft_nms() { + uni_vbroadcastss(vmm_scale, ptr[reg_scale]); + + Xbyak::Label main_loop_label; + Xbyak::Label main_loop_end_label; + Xbyak::Label tail_loop_label; + Xbyak::Label terminate_label; + + Xbyak::Label main_loop_label_soft; + Xbyak::Label tail_loop_label_soft; + L(main_loop_label); + { + cmp(reg_boxes_num, vector_step); + jl(main_loop_end_label, T_NEAR); + + sub(reg_boxes_coord0, vector_step * sizeof(float)); + sub(reg_boxes_coord1, vector_step * sizeof(float)); + sub(reg_boxes_coord2, vector_step * sizeof(float)); + sub(reg_boxes_coord3, vector_step * sizeof(float)); + + // result(iou and weight) is in vmm_temp3 + iou(vector_step); + sub(reg_boxes_num, vector_step); + + // soft suppressed by iou_threshold + if (m_jcp.is_soft_suppressed_by_iou) { + suppressed_by_iou(false); + + // if zero continue soft suppression, else set result to suppressed and terminate + jz(main_loop_label_soft, T_NEAR); + + uni_vpextrd(ptr[reg_candidate_status], Xbyak::Xmm(vmm_zero.getIdx()), 0); + + jmp(terminate_label, T_NEAR); + + L(main_loop_label_soft); + } + + // weight: std::exp(scale * iou * iou) + soft_coeff(); + + // vector weights multiply + horizontal_mul(); + + uni_vbroadcastss(vmm_temp1, ptr[reg_score]); + + // new 
score in vmm3[0] + uni_vmulps(vmm_temp3, vmm_temp3, vmm_temp1); + // store new score + uni_vmovss(ptr[reg_score], vmm_temp3); + + // cmpps(_CMP_LE_OS) if new score is less than or equal to score_threshold + suppressed_by_score(); + + jz(main_loop_label, T_NEAR); + + uni_vpextrd(ptr[reg_candidate_status], Xbyak::Xmm(vmm_zero.getIdx()), 0); + + jmp(terminate_label, T_NEAR); + } + L(main_loop_end_label); + + L(tail_loop_label); + { + cmp(reg_boxes_num, 1); + jl(terminate_label, T_NEAR); + + sub(reg_boxes_coord0, scalar_step * sizeof(float)); + sub(reg_boxes_coord1, scalar_step * sizeof(float)); + sub(reg_boxes_coord2, scalar_step * sizeof(float)); + sub(reg_boxes_coord3, scalar_step * sizeof(float)); + + iou(scalar_step); + sub(reg_boxes_num, scalar_step); + + // soft suppressed by iou_threshold + if (m_jcp.is_soft_suppressed_by_iou) { + suppressed_by_iou(true); + + jz(tail_loop_label_soft, T_NEAR); + + uni_vpextrd(ptr[reg_candidate_status], Xbyak::Xmm(vmm_zero.getIdx()), 0); + + jmp(terminate_label, T_NEAR); + + L(tail_loop_label_soft); + } + + soft_coeff(); + + uni_vbroadcastss(vmm_temp1, ptr[reg_score]); + + // vmm3[0] is valid, no horizontal mul needed. + uni_vmulps(vmm_temp3, vmm_temp3, vmm_temp1); + + uni_vmovss(ptr[reg_score], vmm_temp3); + + // cmpps(_CMP_LE_OS) if new score is less than or equal to score_threshold + suppressed_by_score(); + + jz(tail_loop_label, T_NEAR); + + uni_vpextrd(ptr[reg_candidate_status], Xbyak::Xmm(vmm_zero.getIdx()), 0); + + jmp(terminate_label, T_NEAR); + } + + L(terminate_label); +} + +template +void NonMaxSuppression::suppressed_by_iou(bool is_scalar) { + if (x64::mayiuse(x64::avx512_core)) { + vcmpps(k_mask, vmm_temp3, vmm_iou_threshold, 0x0D); // _CMP_GE_OS. 
vcmpps w/ kmask only on V5 + if (is_scalar) + kandw(k_mask, k_mask, k_mask_one); + kortestw(k_mask, k_mask); // bitwise check if all zero + } else if (x64::mayiuse(x64::avx)) { + // vex instructions with xmm on avx and ymm on avx2 + vcmpps(vmm_temp4, vmm_temp3, vmm_iou_threshold, 0x0D); // xmm and ymm only on V1. + if (is_scalar) { + uni_vpextrd(reg_temp_32, Xbyak::Xmm(vmm_temp4.getIdx()), 0); + test(reg_temp_32, reg_temp_32); + } else { + uni_vtestps(vmm_temp4, vmm_temp4); // vtestps: sign bit check if all zeros, ymm and xmm only on V1, N/A on V5 + } + } else { + // pure sse path, make sure don't spoil vmm_temp3, which may used in after soft-suppression + uni_vmovups(vmm_temp4, vmm_temp3); + cmpps(vmm_temp4, vmm_iou_threshold, 0x07); // order compare, 0 for at least one is NaN + + uni_vmovups(vmm_temp2, vmm_temp3); + cmpps(vmm_temp2, vmm_iou_threshold, 0x05); // _CMP_GE_US on sse, no direct _CMP_GE_OS supported. + + uni_vandps(vmm_temp4, vmm_temp4, vmm_temp2); + if (is_scalar) { + uni_vpextrd(reg_temp_32, Xbyak::Xmm(vmm_temp4.getIdx()), 0); + test(reg_temp_32, reg_temp_32); + } else { + uni_vtestps(vmm_temp4, vmm_temp4); // ptest: bitwise check if all zeros, on sse41 + } + } +} + +template +void NonMaxSuppression::suppressed_by_score() { + if (x64::mayiuse(x64::avx512_core)) { + vcmpps(k_mask, vmm_temp3, vmm_score_threshold, 0x02); // vcmpps w/ kmask only on V5, w/o kmask version N/A on V5 + kandw(k_mask, k_mask, k_mask_one); + kortestw(k_mask, k_mask); // bitwise check if all zero + } else if (x64::mayiuse(x64::avx)) { + vcmpps(vmm_temp4, vmm_temp3, vmm_score_threshold, 0x02); + uni_vpextrd(reg_temp_32, Xbyak::Xmm(vmm_temp4.getIdx()), 0); + test(reg_temp_32, reg_temp_32); + } else { + cmpps(vmm_temp3, vmm_score_threshold, 0x02); // _CMP_LE_OS on sse + uni_vpextrd(reg_temp_32, Xbyak::Xmm(vmm_temp3.getIdx()), 0); + test(reg_temp_32, reg_temp_32); + } +} + +template +void NonMaxSuppression::iou(int ele_num) { + auto load = [&](Xbyak::Reg64 reg_src, Vmm vmm_dst) { + 
if (ele_num != scalar_step && ele_num != vector_step) + OPENVINO_THROW("NMS JIT implementation supports load emitter with only element count scalar_step or vector_step! Get: ", ele_num); + + const auto& load_emitter = ele_num == 1 ? load_scalar_emitter : load_vector_emitter; + load_emitter->emit_code({static_cast(reg_src.getIdx())}, {static_cast(vmm_dst.getIdx())}, + {}, {load_pool_gpr_idxs}); + }; + load(reg_boxes_coord0, vmm_boxes_coord0); + load(reg_boxes_coord1, vmm_boxes_coord1); + load(reg_boxes_coord2, vmm_boxes_coord2); + load(reg_boxes_coord3, vmm_boxes_coord3); + + if (m_jcp.box_encode_type == NMSBoxEncodeType::CORNER) { + // box format: y1, x1, y2, x2 + uni_vminps(vmm_temp1, vmm_boxes_coord0, vmm_boxes_coord2); + uni_vmaxps(vmm_temp2, vmm_boxes_coord0, vmm_boxes_coord2); + uni_vmovups(vmm_boxes_coord0, vmm_temp1); + uni_vmovups(vmm_boxes_coord2, vmm_temp2); + + uni_vminps(vmm_temp1, vmm_boxes_coord1, vmm_boxes_coord3); + uni_vmaxps(vmm_temp2, vmm_boxes_coord1, vmm_boxes_coord3); + uni_vmovups(vmm_boxes_coord1, vmm_temp1); + uni_vmovups(vmm_boxes_coord3, vmm_temp2); + } else { + // box format: x_center, y_center, width, height --> y1, x1, y2, x2 + uni_vmulps(vmm_temp1, vmm_boxes_coord2, ptr[reg_table]); // width/2 + uni_vmulps(vmm_temp2, vmm_boxes_coord3, ptr[reg_table]); // height/2 + + uni_vaddps(vmm_temp3, vmm_boxes_coord0, vmm_temp1); // x_center + width/2 + uni_vmovups(vmm_boxes_coord3, vmm_temp3); + + uni_vaddps(vmm_temp3, vmm_boxes_coord1, vmm_temp2); // y_center + height/2 + uni_vmovups(vmm_boxes_coord2, vmm_temp3); + + uni_vsubps(vmm_temp3, vmm_boxes_coord0, vmm_temp1); // x_center - width/2 + uni_vsubps(vmm_temp4, vmm_boxes_coord1, vmm_temp2); // y_center - height/2 + + uni_vmovups(vmm_boxes_coord1, vmm_temp3); + uni_vmovups(vmm_boxes_coord0, vmm_temp4); + } + + uni_vsubps(vmm_temp1, vmm_boxes_coord2, vmm_boxes_coord0); + uni_vsubps(vmm_temp2, vmm_boxes_coord3, vmm_boxes_coord1); + uni_vmulps(vmm_temp1, vmm_temp1, vmm_temp2); // boxes area + + 
uni_vsubps(vmm_temp2, vmm_candidate_coord2, vmm_candidate_coord0); + uni_vsubps(vmm_temp3, vmm_candidate_coord3, vmm_candidate_coord1); + uni_vmulps(vmm_temp2, vmm_temp2, vmm_temp3); // candidate(bc) area // candidate area calculate once and check if 0 + + uni_vaddps(vmm_temp1, vmm_temp1, vmm_temp2); // areaI + areaJ to free vmm_temp2 + + // y of intersection + uni_vminps(vmm_temp3, vmm_boxes_coord2, vmm_candidate_coord2); // min(Ymax) + uni_vmaxps(vmm_temp4, vmm_boxes_coord0, vmm_candidate_coord0); // max(Ymin) + uni_vsubps(vmm_temp3, vmm_temp3, vmm_temp4); // min(Ymax) - max(Ymin) + uni_vmaxps(vmm_temp3, vmm_temp3, vmm_zero); + + // x of intersection + uni_vminps(vmm_temp4, vmm_boxes_coord3, vmm_candidate_coord3); // min(Xmax) + uni_vmaxps(vmm_temp2, vmm_boxes_coord1, vmm_candidate_coord1); // max(Xmin) + uni_vsubps(vmm_temp4, vmm_temp4, vmm_temp2); // min(Xmax) - max(Xmin) + uni_vmaxps(vmm_temp4, vmm_temp4, vmm_zero); + + // intersection_area + uni_vmulps(vmm_temp3, vmm_temp3, vmm_temp4); + + // iou: intersection_area / (areaI + areaJ - intersection_area); + uni_vsubps(vmm_temp1, vmm_temp1, vmm_temp3); + uni_vdivps(vmm_temp3, vmm_temp3, vmm_temp1); +} + +// std::exp(scale * iou * iou) +template +void NonMaxSuppression::soft_coeff() { + uni_vmulps(vmm_temp3, vmm_temp3, vmm_temp3); + uni_vmulps(vmm_temp3, vmm_temp3, vmm_scale); + exp_injector->compute_vector_range(vmm_temp3.getIdx(), vmm_temp3.getIdx() + 1); +} + +template +void NonMaxSuppression::horizontal_mul_xmm(const Xbyak::Xmm &xmm_weight, const Xbyak::Xmm &xmm_aux) { + uni_vmovshdup(xmm_aux, xmm_weight); // weight:1,2,3,4; aux:2,2,4,4 + uni_vmulps(xmm_weight, xmm_weight, xmm_aux); // weight:1*2,2*2,3*4,4*4 + uni_vmovhlps(xmm_aux, xmm_aux, xmm_weight); // aux:3*4,4*4,4,4 + uni_vmulps(xmm_weight, xmm_weight, xmm_aux); // weight:1*2*3*4,... 
+} + +// horizontal mul for vmm_weight(Vmm(3)), temp1 and temp2 as aux +template +inline void NonMaxSuppression::horizontal_mul() { + Xbyak::Xmm xmm_weight = Xbyak::Xmm(vmm_temp3.getIdx()); + Xbyak::Xmm xmm_temp1 = Xbyak::Xmm(vmm_temp1.getIdx()); + Xbyak::Xmm xmm_temp2 = Xbyak::Xmm(vmm_temp2.getIdx()); + if (isa == x64::sse41) { + horizontal_mul_xmm(xmm_weight, xmm_temp1); + } else if (isa == x64::avx2) { + Xbyak::Ymm ymm_weight = Xbyak::Ymm(vmm_temp3.getIdx()); + vextractf128(xmm_temp1, ymm_weight, 0); + vextractf128(xmm_temp2, ymm_weight, 1); + uni_vmulps(xmm_weight, xmm_temp1, xmm_temp2); + horizontal_mul_xmm(xmm_weight, xmm_temp1); + } else { + Xbyak::Zmm zmm_weight = Xbyak::Zmm(vmm_temp3.getIdx()); + vextractf32x4(xmm_temp1, zmm_weight, 0); + vextractf32x4(xmm_temp2, zmm_weight, 1); + uni_vmulps(xmm_temp1, xmm_temp1, xmm_temp2); + vextractf32x4(xmm_temp2, zmm_weight, 2); + vextractf32x4(xmm_weight, zmm_weight, 3); + uni_vmulps(xmm_weight, xmm_weight, xmm_temp2); + uni_vmulps(xmm_weight, xmm_weight, xmm_temp1); + horizontal_mul_xmm(xmm_weight, xmm_temp1); + } +} + +template class NonMaxSuppression; +template class NonMaxSuppression; +template class NonMaxSuppression; + +} // namespace kernel +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/kernels/x64/non_max_suppression.hpp b/src/plugins/intel_cpu/src/nodes/kernels/x64/non_max_suppression.hpp new file mode 100644 index 00000000000..859f687db8d --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/kernels/x64/non_max_suppression.hpp @@ -0,0 +1,152 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "jit_kernel_base.hpp" + +#if defined(OPENVINO_ARCH_X86_64) +#include "emitters/x64/jit_load_store_emitters.hpp" +#include "cpu/x64/injectors/jit_uni_eltwise_injector.hpp" +#endif // OPENVINO_ARCH_X86_64 + +namespace ov { +namespace intel_cpu { + +enum class NMSBoxEncodeType { + CORNER, + CENTER +}; + +#if 
defined(OPENVINO_ARCH_X86_64) + +namespace kernel { + +struct NmsCompileParams { + NMSBoxEncodeType box_encode_type; + bool is_soft_suppressed_by_iou; +}; + +struct NmsCallArgs { + const void* selected_boxes_coord[4]; + size_t selected_boxes_num; + const void* candidate_box; + const void* iou_threshold; + void* candidate_status; + // for soft suppression, score *= exp(scale * iou * iou); + const void* score_threshold; + const void* scale; + void* score; +}; + + +template +class NonMaxSuppression : public JitKernel { +public: + DECLARE_CPU_JIT_AUX_FUNCTIONS(NonMaxSuppression) + + explicit NonMaxSuppression(const NmsCompileParams& jcp) : JitKernel(jit_name(), jcp, isa) {} + + void generate() override; + +private: + using Vmm = typename dnnl::impl::utils::conditional3::type; + uint32_t vlen = dnnl::impl::cpu::x64::cpu_isa_traits::vlen; + const int vector_step = vlen / sizeof(float); + const int scalar_step = 1; + + Xbyak::Reg64 reg_boxes_coord0 = r8; + Xbyak::Reg64 reg_boxes_coord1 = r9; + Xbyak::Reg64 reg_boxes_coord2 = r10; + Xbyak::Reg64 reg_boxes_coord3 = r11; + Xbyak::Reg64 reg_candidate_box = r12; + Xbyak::Reg64 reg_candidate_status = r13; + Xbyak::Reg64 reg_boxes_num = r14; + Xbyak::Reg64 reg_iou_threshold = r15; + // more for soft + Xbyak::Reg64 reg_score_threshold = rdx; + Xbyak::Reg64 reg_score = rbp; + Xbyak::Reg64 reg_scale = rsi; + + Xbyak::Reg64 reg_load_table = rax; + Xbyak::Reg64 reg_load_store_mask = rbx; + + // reuse + Xbyak::Label l_table_constant; + Xbyak::Reg64 reg_table = rcx; + Xbyak::Reg64 reg_temp_64 = rdi; + Xbyak::Reg32 reg_temp_32 = edi; + + const Xbyak::Reg64 reg_params = Xbyak::Reg64(dnnl::impl::cpu::x64::abi_param_regs[0]); + + std::unique_ptr load_vector_emitter = nullptr; + std::unique_ptr load_scalar_emitter = nullptr; + + std::vector store_pool_gpr_idxs; + std::vector store_pool_vec_idxs; + std::vector load_pool_gpr_idxs; + + Vmm vmm_boxes_coord0 = Vmm(1); + Vmm vmm_boxes_coord1 = Vmm(2); + Vmm vmm_boxes_coord2 = Vmm(3); + Vmm 
vmm_boxes_coord3 = Vmm(4); + Vmm vmm_candidate_coord0 = Vmm(5); + Vmm vmm_candidate_coord1 = Vmm(6); + Vmm vmm_candidate_coord2 = Vmm(7); + Vmm vmm_candidate_coord3 = Vmm(8); + Vmm vmm_temp1 = Vmm(9); + Vmm vmm_temp2 = Vmm(10); + Vmm vmm_temp3 = Vmm(11); + Vmm vmm_temp4 = Vmm(12); + + Vmm vmm_iou_threshold = Vmm(13); + Vmm vmm_zero = Vmm(15); + + // soft + Vmm vmm_score_threshold = Vmm(14); + Vmm vmm_scale = Vmm(0); + + Xbyak::Opmask k_mask = Xbyak::Opmask(7); + Xbyak::Opmask k_mask_one = Xbyak::Opmask(6); + + std::shared_ptr> exp_injector; + + inline void hard_nms(); + + inline void soft_nms(); + + inline void suppressed_by_iou(bool is_scalar); + + inline void suppressed_by_score(); + + inline void iou(int ele_num); + + inline void soft_coeff(); + + inline void horizontal_mul_xmm(const Xbyak::Xmm& xmm_weight, const Xbyak::Xmm& xmm_aux); + + inline void horizontal_mul(); + + inline void prepare_table() { + auto broadcast_d = [&](int val) { + for (size_t d = 0; d < vlen / sizeof(int); ++d) { + dd(val); + } + }; + + align(64); + L(l_table_constant); + broadcast_d(0x3f000000); // 0.5f + dw(0x0001); + } +}; + +} // namespace kernel + +#endif // OPENVINO_ARCH_X86_64 + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/non_max_suppression.cpp b/src/plugins/intel_cpu/src/nodes/non_max_suppression.cpp index d2a46ac97da..79112a3afa3 100644 --- a/src/plugins/intel_cpu/src/nodes/non_max_suppression.cpp +++ b/src/plugins/intel_cpu/src/nodes/non_max_suppression.cpp @@ -1,571 +1,41 @@ // Copyright (C) 2018-2023 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // +// Copyright (c) Facebook, Inc. and its affiliates. 
+// The implementation for rotated boxes intersection is based on the code from: +// https://github.com/facebookresearch/detectron2/blob/v0.6/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_utils.h +// -#include -#include -#include -#include -#include -#include #include "non_max_suppression.h" -#include "ie_parallel.hpp" -#include -#include -#include "utils/general_utils.h" -#include "cpu/x64/jit_generator.hpp" -#include "emitters/x64/jit_load_store_emitters.hpp" -#include -#include +#include "ie_parallel.hpp" +#include "utils/general_utils.h" +#include "shape_inference/shape_inference_internal_dyn.hpp" +#include "openvino/op/nms_rotated.hpp" +#include "openvino/op/non_max_suppression.hpp" +#include "ov_ops/nms_ie_internal.hpp" + +#include using namespace InferenceEngine; -using namespace dnnl; -using namespace dnnl::impl; -using namespace dnnl::impl::cpu::x64; -using namespace dnnl::impl::utils; -using namespace Xbyak; - -#define GET_OFF(field) offsetof(jit_nms_args, field) namespace ov { namespace intel_cpu { namespace node { -#if defined(OPENVINO_ARCH_X86_64) -template -struct jit_uni_nms_kernel_f32 : public jit_uni_nms_kernel, public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_nms_kernel_f32) - - explicit jit_uni_nms_kernel_f32(jit_nms_config_params jcp_) : jit_uni_nms_kernel(jcp_), jit_generator(jit_name()) {} - - void create_ker() override { - jit_generator::create_kernel(); - ker_ = (decltype(ker_))jit_ker(); - } - - void generate() override { - load_vector_emitter.reset(new jit_load_emitter(this, isa, Precision::FP32, Precision::FP32, vector_step)); - load_scalar_emitter.reset(new jit_load_emitter(this, isa, Precision::FP32, Precision::FP32, scalar_step)); - - exp_injector.reset(new jit_uni_eltwise_injector_f32(this, dnnl::impl::alg_kind::eltwise_exp, 0.f, 0.f, 1.0f)); - - this->preamble(); - - uni_vpxor(vmm_zero, vmm_zero, vmm_zero); - - load_pool_gpr_idxs = {static_cast(reg_load_store_mask.getIdx()), 
static_cast(reg_load_table.getIdx())}; - store_pool_gpr_idxs = {static_cast(reg_load_store_mask.getIdx())}; - store_pool_vec_idxs = {static_cast(vmm_zero.getIdx())}; - - mov(reg_boxes_coord0, ptr[reg_params + GET_OFF(selected_boxes_coord[0])]); - mov(reg_boxes_coord1, ptr[reg_params + GET_OFF(selected_boxes_coord[0]) + 1 * sizeof(size_t)]); - mov(reg_boxes_coord2, ptr[reg_params + GET_OFF(selected_boxes_coord[0]) + 2 * sizeof(size_t)]); - mov(reg_boxes_coord3, ptr[reg_params + GET_OFF(selected_boxes_coord[0]) + 3 * sizeof(size_t)]); - mov(reg_candidate_box, ptr[reg_params + GET_OFF(candidate_box)]); - mov(reg_candidate_status, ptr[reg_params + GET_OFF(candidate_status)]); - mov(reg_boxes_num, ptr[reg_params + GET_OFF(selected_boxes_num)]); - mov(reg_iou_threshold, ptr[reg_params + GET_OFF(iou_threshold)]); - // soft - mov(reg_score_threshold, ptr[reg_params + GET_OFF(score_threshold)]); - mov(reg_score, ptr[reg_params + GET_OFF(score)]); - mov(reg_scale, ptr[reg_params + GET_OFF(scale)]); - - // could use rcx(reg_table) and rdi(reg_temp) now as abi parse finished - mov(reg_table, l_table_constant); - if (mayiuse(cpu::x64::avx512_core)) { - kmovw(k_mask_one, word[reg_table + vlen]); - } - uni_vbroadcastss(vmm_iou_threshold, ptr[reg_iou_threshold]); - uni_vbroadcastss(vmm_score_threshold, ptr[reg_score_threshold]); - - uni_vbroadcastss(vmm_candidate_coord0, ptr[reg_candidate_box]); - uni_vbroadcastss(vmm_candidate_coord1, ptr[reg_candidate_box + 1 * sizeof(float)]); - uni_vbroadcastss(vmm_candidate_coord2, ptr[reg_candidate_box + 2 * sizeof(float)]); - uni_vbroadcastss(vmm_candidate_coord3, ptr[reg_candidate_box + 3 * sizeof(float)]); - - if (jcp.box_encode_type == NMSBoxEncodeType::CORNER) { - // box format: y1, x1, y2, x2 - uni_vminps(vmm_temp1, vmm_candidate_coord0, vmm_candidate_coord2); - uni_vmaxps(vmm_temp2, vmm_candidate_coord0, vmm_candidate_coord2); - uni_vmovups(vmm_candidate_coord0, vmm_temp1); - uni_vmovups(vmm_candidate_coord2, vmm_temp2); - - 
uni_vminps(vmm_temp1, vmm_candidate_coord1, vmm_candidate_coord3); - uni_vmaxps(vmm_temp2, vmm_candidate_coord1, vmm_candidate_coord3); - uni_vmovups(vmm_candidate_coord1, vmm_temp1); - uni_vmovups(vmm_candidate_coord3, vmm_temp2); - } else { - // box format: x_center, y_center, width, height --> y1, x1, y2, x2 - uni_vmulps(vmm_temp1, vmm_candidate_coord2, ptr[reg_table]); // width/2 - uni_vmulps(vmm_temp2, vmm_candidate_coord3, ptr[reg_table]); // height/2 - - uni_vaddps(vmm_temp3, vmm_candidate_coord0, vmm_temp1); // x_center + width/2 - uni_vmovups(vmm_candidate_coord3, vmm_temp3); - - uni_vaddps(vmm_temp3, vmm_candidate_coord1, vmm_temp2); // y_center + height/2 - uni_vmovups(vmm_candidate_coord2, vmm_temp3); - - uni_vsubps(vmm_temp3, vmm_candidate_coord0, vmm_temp1); // x_center - width/2 - uni_vsubps(vmm_temp4, vmm_candidate_coord1, vmm_temp2); // y_center - height/2 - - uni_vmovups(vmm_candidate_coord1, vmm_temp3); - uni_vmovups(vmm_candidate_coord0, vmm_temp4); - } - - // check from last to first - imul(reg_temp_64, reg_boxes_num, sizeof(float)); - add(reg_boxes_coord0, reg_temp_64); // y1 - add(reg_boxes_coord1, reg_temp_64); // x1 - add(reg_boxes_coord2, reg_temp_64); // y2 - add(reg_boxes_coord3, reg_temp_64); // x2 - - Xbyak::Label hard_nms_label; - Xbyak::Label nms_end_label; - - mov(reg_temp_32, ptr[reg_scale]); - test(reg_temp_32, reg_temp_32); - jz(hard_nms_label, T_NEAR); - - soft_nms(); - - jmp(nms_end_label, T_NEAR); - - L(hard_nms_label); - - hard_nms(); - - L(nms_end_label); - - this->postamble(); - - load_vector_emitter->emit_data(); - load_scalar_emitter->emit_data(); - - prepare_table(); - exp_injector->prepare_table(); - } - -private: - using Vmm = typename conditional3::type; - uint32_t vlen = cpu_isa_traits::vlen; - const int vector_step = vlen / sizeof(float); - const int scalar_step = 1; - - Xbyak::Reg64 reg_boxes_coord0 = r8; - Xbyak::Reg64 reg_boxes_coord1 = r9; - Xbyak::Reg64 reg_boxes_coord2 = r10; - Xbyak::Reg64 reg_boxes_coord3 = 
r11; - Xbyak::Reg64 reg_candidate_box = r12; - Xbyak::Reg64 reg_candidate_status = r13; - Xbyak::Reg64 reg_boxes_num = r14; - Xbyak::Reg64 reg_iou_threshold = r15; - // more for soft - Xbyak::Reg64 reg_score_threshold = rdx; - Xbyak::Reg64 reg_score = rbp; - Xbyak::Reg64 reg_scale = rsi; - - Xbyak::Reg64 reg_load_table = rax; - Xbyak::Reg64 reg_load_store_mask = rbx; - - // reuse - Xbyak::Label l_table_constant; - Xbyak::Reg64 reg_table = rcx; - Xbyak::Reg64 reg_temp_64 = rdi; - Xbyak::Reg32 reg_temp_32 = edi; - - Xbyak::Reg64 reg_params = abi_param1; - - std::unique_ptr load_vector_emitter = nullptr; - std::unique_ptr load_scalar_emitter = nullptr; - - std::vector store_pool_gpr_idxs; - std::vector store_pool_vec_idxs; - std::vector load_pool_gpr_idxs; - - Vmm vmm_boxes_coord0 = Vmm(1); - Vmm vmm_boxes_coord1 = Vmm(2); - Vmm vmm_boxes_coord2 = Vmm(3); - Vmm vmm_boxes_coord3 = Vmm(4); - Vmm vmm_candidate_coord0 = Vmm(5); - Vmm vmm_candidate_coord1 = Vmm(6); - Vmm vmm_candidate_coord2 = Vmm(7); - Vmm vmm_candidate_coord3 = Vmm(8); - Vmm vmm_temp1 = Vmm(9); - Vmm vmm_temp2 = Vmm(10); - Vmm vmm_temp3 = Vmm(11); - Vmm vmm_temp4 = Vmm(12); - - Vmm vmm_iou_threshold = Vmm(13); - Vmm vmm_zero = Vmm(15); - - // soft - Vmm vmm_score_threshold = Vmm(14); - Vmm vmm_scale = Vmm(0); - - Xbyak::Opmask k_mask = Xbyak::Opmask(7); - Xbyak::Opmask k_mask_one = Xbyak::Opmask(6); - - std::shared_ptr> exp_injector; - - inline void hard_nms() { - Xbyak::Label main_loop_label_hard; - Xbyak::Label main_loop_end_label_hard; - Xbyak::Label tail_loop_label_hard; - Xbyak::Label terminate_label_hard; - L(main_loop_label_hard); - { - cmp(reg_boxes_num, vector_step); - jl(main_loop_end_label_hard, T_NEAR); - - sub(reg_boxes_coord0, vector_step * sizeof(float)); - sub(reg_boxes_coord1, vector_step * sizeof(float)); - sub(reg_boxes_coord2, vector_step * sizeof(float)); - sub(reg_boxes_coord3, vector_step * sizeof(float)); - - // iou result is in vmm_temp3 - iou(vector_step); - - sub(reg_boxes_num, 
vector_step); - - suppressed_by_iou(false); - - // if zero continue, else set result to suppressed and terminate - jz(main_loop_label_hard, T_NEAR); - - uni_vpextrd(ptr[reg_candidate_status], Xmm(vmm_zero.getIdx()), 0); - - jmp(terminate_label_hard, T_NEAR); - } - L(main_loop_end_label_hard); - - L(tail_loop_label_hard); - { - cmp(reg_boxes_num, 1); - jl(terminate_label_hard, T_NEAR); - - sub(reg_boxes_coord0, scalar_step * sizeof(float)); - sub(reg_boxes_coord1, scalar_step * sizeof(float)); - sub(reg_boxes_coord2, scalar_step * sizeof(float)); - sub(reg_boxes_coord3, scalar_step * sizeof(float)); - - // iou result is in vmm_temp3 - iou(scalar_step); - - sub(reg_boxes_num, scalar_step); - - suppressed_by_iou(true); - - jz(tail_loop_label_hard, T_NEAR); - - uni_vpextrd(ptr[reg_candidate_status], Xmm(vmm_zero.getIdx()), 0); - - jmp(terminate_label_hard, T_NEAR); - } - - L(terminate_label_hard); - } - - inline void soft_nms() { - uni_vbroadcastss(vmm_scale, ptr[reg_scale]); - - Xbyak::Label main_loop_label; - Xbyak::Label main_loop_end_label; - Xbyak::Label tail_loop_label; - Xbyak::Label terminate_label; - - Xbyak::Label main_loop_label_soft; - Xbyak::Label tail_loop_label_soft; - L(main_loop_label); - { - cmp(reg_boxes_num, vector_step); - jl(main_loop_end_label, T_NEAR); - - sub(reg_boxes_coord0, vector_step * sizeof(float)); - sub(reg_boxes_coord1, vector_step * sizeof(float)); - sub(reg_boxes_coord2, vector_step * sizeof(float)); - sub(reg_boxes_coord3, vector_step * sizeof(float)); - - // result(iou and weight) is in vmm_temp3 - iou(vector_step); - sub(reg_boxes_num, vector_step); - - // soft suppressed by iou_threshold - if (jcp.is_soft_suppressed_by_iou) { - suppressed_by_iou(false); - - // if zero continue soft suppression, else set result to suppressed and terminate - jz(main_loop_label_soft, T_NEAR); - - uni_vpextrd(ptr[reg_candidate_status], Xmm(vmm_zero.getIdx()), 0); - - jmp(terminate_label, T_NEAR); - - L(main_loop_label_soft); - } - - // weight: 
std::exp(scale * iou * iou) - soft_coeff(); - - // vector weights multiply - horizontal_mul(); - - uni_vbroadcastss(vmm_temp1, ptr[reg_score]); - - // new score in vmm3[0] - uni_vmulps(vmm_temp3, vmm_temp3, vmm_temp1); - // store new score - uni_vmovss(ptr[reg_score], vmm_temp3); - - // cmpps(_CMP_LE_OS) if new score is less or equal than score_threshold - suppressed_by_score(); - - jz(main_loop_label, T_NEAR); - - uni_vpextrd(ptr[reg_candidate_status], Xmm(vmm_zero.getIdx()), 0); - - jmp(terminate_label, T_NEAR); - } - L(main_loop_end_label); - - L(tail_loop_label); - { - cmp(reg_boxes_num, 1); - jl(terminate_label, T_NEAR); - - sub(reg_boxes_coord0, scalar_step * sizeof(float)); - sub(reg_boxes_coord1, scalar_step * sizeof(float)); - sub(reg_boxes_coord2, scalar_step * sizeof(float)); - sub(reg_boxes_coord3, scalar_step * sizeof(float)); - - iou(scalar_step); - sub(reg_boxes_num, scalar_step); - - // soft suppressed by iou_threshold - if (jcp.is_soft_suppressed_by_iou) { - suppressed_by_iou(true); - - jz(tail_loop_label_soft, T_NEAR); - - uni_vpextrd(ptr[reg_candidate_status], Xmm(vmm_zero.getIdx()), 0); - - jmp(terminate_label, T_NEAR); - - L(tail_loop_label_soft); - } - - soft_coeff(); - - uni_vbroadcastss(vmm_temp1, ptr[reg_score]); - - // vmm3[0] is valide, no need horizontal mul. - uni_vmulps(vmm_temp3, vmm_temp3, vmm_temp1); - - uni_vmovss(ptr[reg_score], vmm_temp3); - - // cmpps(_CMP_LE_OS) if new score is less or equal than score_threshold - suppressed_by_score(); - - jz(tail_loop_label, T_NEAR); - - uni_vpextrd(ptr[reg_candidate_status], Xmm(vmm_zero.getIdx()), 0); - - jmp(terminate_label, T_NEAR); - } - - L(terminate_label); - } - - inline void suppressed_by_iou(bool is_scalar) { - if (mayiuse(cpu::x64::avx512_core)) { - vcmpps(k_mask, vmm_temp3, vmm_iou_threshold, 0x0D); // _CMP_GE_OS. 
vcmpps w/ kmask only on V5 - if (is_scalar) - kandw(k_mask, k_mask, k_mask_one); - kortestw(k_mask, k_mask); // bitwise check if all zero - } else if (mayiuse(cpu::x64::avx)) { - // vex instructions with xmm on avx and ymm on avx2 - vcmpps(vmm_temp4, vmm_temp3, vmm_iou_threshold, 0x0D); // xmm and ymm only on V1. - if (is_scalar) { - uni_vpextrd(reg_temp_32, Xmm(vmm_temp4.getIdx()), 0); - test(reg_temp_32, reg_temp_32); - } else { - uni_vtestps(vmm_temp4, vmm_temp4); // vtestps: sign bit check if all zeros, ymm and xmm only on V1, N/A on V5 - } - } else { - // pure sse path, make sure don't spoil vmm_temp3, which may used in after soft-suppression - uni_vmovups(vmm_temp4, vmm_temp3); - cmpps(vmm_temp4, vmm_iou_threshold, 0x07); // order compare, 0 for at least one is NaN - - uni_vmovups(vmm_temp2, vmm_temp3); - cmpps(vmm_temp2, vmm_iou_threshold, 0x05); // _CMP_GE_US on sse, no direct _CMP_GE_OS supported. - - uni_vandps(vmm_temp4, vmm_temp4, vmm_temp2); - if (is_scalar) { - uni_vpextrd(reg_temp_32, Xmm(vmm_temp4.getIdx()), 0); - test(reg_temp_32, reg_temp_32); - } else { - uni_vtestps(vmm_temp4, vmm_temp4); // ptest: bitwise check if all zeros, on sse41 - } - } - } - - inline void suppressed_by_score() { - if (mayiuse(cpu::x64::avx512_core)) { - vcmpps(k_mask, vmm_temp3, vmm_score_threshold, 0x02); // vcmpps w/ kmask only on V5, w/o kmask version N/A on V5 - kandw(k_mask, k_mask, k_mask_one); - kortestw(k_mask, k_mask); // bitwise check if all zero - } else if (mayiuse(cpu::x64::avx)) { - vcmpps(vmm_temp4, vmm_temp3, vmm_score_threshold, 0x02); - uni_vpextrd(reg_temp_32, Xmm(vmm_temp4.getIdx()), 0); - test(reg_temp_32, reg_temp_32); - } else { - cmpps(vmm_temp3, vmm_score_threshold, 0x02); // _CMP_LE_OS on sse - uni_vpextrd(reg_temp_32, Xmm(vmm_temp3.getIdx()), 0); - test(reg_temp_32, reg_temp_32); - } - } - - inline void iou(int ele_num) { - auto load = [&](Xbyak::Reg64 reg_src, Vmm vmm_dst) { - if (ele_num != scalar_step && ele_num != vector_step) - IE_THROW() 
<< "NMS JIT implementation supports load emitter with only element count scalar_step or vector_step! Get: " << ele_num; - - const auto& load_emitter = ele_num == 1 ? load_scalar_emitter : load_vector_emitter; - load_emitter->emit_code({static_cast(reg_src.getIdx())}, {static_cast(vmm_dst.getIdx())}, - {}, {load_pool_gpr_idxs}); - }; - load(reg_boxes_coord0, vmm_boxes_coord0); - load(reg_boxes_coord1, vmm_boxes_coord1); - load(reg_boxes_coord2, vmm_boxes_coord2); - load(reg_boxes_coord3, vmm_boxes_coord3); - - if (jcp.box_encode_type == NMSBoxEncodeType::CORNER) { - // box format: y1, x1, y2, x2 - uni_vminps(vmm_temp1, vmm_boxes_coord0, vmm_boxes_coord2); - uni_vmaxps(vmm_temp2, vmm_boxes_coord0, vmm_boxes_coord2); - uni_vmovups(vmm_boxes_coord0, vmm_temp1); - uni_vmovups(vmm_boxes_coord2, vmm_temp2); - - uni_vminps(vmm_temp1, vmm_boxes_coord1, vmm_boxes_coord3); - uni_vmaxps(vmm_temp2, vmm_boxes_coord1, vmm_boxes_coord3); - uni_vmovups(vmm_boxes_coord1, vmm_temp1); - uni_vmovups(vmm_boxes_coord3, vmm_temp2); - } else { - // box format: x_center, y_center, width, height --> y1, x1, y2, x2 - uni_vmulps(vmm_temp1, vmm_boxes_coord2, ptr[reg_table]); // width/2 - uni_vmulps(vmm_temp2, vmm_boxes_coord3, ptr[reg_table]); // height/2 - - uni_vaddps(vmm_temp3, vmm_boxes_coord0, vmm_temp1); // x_center + width/2 - uni_vmovups(vmm_boxes_coord3, vmm_temp3); - - uni_vaddps(vmm_temp3, vmm_boxes_coord1, vmm_temp2); // y_center + height/2 - uni_vmovups(vmm_boxes_coord2, vmm_temp3); - - uni_vsubps(vmm_temp3, vmm_boxes_coord0, vmm_temp1); // x_center - width/2 - uni_vsubps(vmm_temp4, vmm_boxes_coord1, vmm_temp2); // y_center - height/2 - - uni_vmovups(vmm_boxes_coord1, vmm_temp3); - uni_vmovups(vmm_boxes_coord0, vmm_temp4); - } - - uni_vsubps(vmm_temp1, vmm_boxes_coord2, vmm_boxes_coord0); - uni_vsubps(vmm_temp2, vmm_boxes_coord3, vmm_boxes_coord1); - uni_vmulps(vmm_temp1, vmm_temp1, vmm_temp2); // boxes area - - uni_vsubps(vmm_temp2, vmm_candidate_coord2, vmm_candidate_coord0); - 
uni_vsubps(vmm_temp3, vmm_candidate_coord3, vmm_candidate_coord1); - uni_vmulps(vmm_temp2, vmm_temp2, vmm_temp3); // candidate(bc) area // candidate area calculate once and check if 0 - - uni_vaddps(vmm_temp1, vmm_temp1, vmm_temp2); // areaI + areaJ to free vmm_temp2 - - // y of intersection - uni_vminps(vmm_temp3, vmm_boxes_coord2, vmm_candidate_coord2); // min(Ymax) - uni_vmaxps(vmm_temp4, vmm_boxes_coord0, vmm_candidate_coord0); // max(Ymin) - uni_vsubps(vmm_temp3, vmm_temp3, vmm_temp4); // min(Ymax) - max(Ymin) - uni_vmaxps(vmm_temp3, vmm_temp3, vmm_zero); - - // x of intersection - uni_vminps(vmm_temp4, vmm_boxes_coord3, vmm_candidate_coord3); // min(Xmax) - uni_vmaxps(vmm_temp2, vmm_boxes_coord1, vmm_candidate_coord1); // max(Xmin) - uni_vsubps(vmm_temp4, vmm_temp4, vmm_temp2); // min(Xmax) - max(Xmin) - uni_vmaxps(vmm_temp4, vmm_temp4, vmm_zero); - - // intersection_area - uni_vmulps(vmm_temp3, vmm_temp3, vmm_temp4); - - // iou: intersection_area / (areaI + areaJ - intersection_area); - uni_vsubps(vmm_temp1, vmm_temp1, vmm_temp3); - uni_vdivps(vmm_temp3, vmm_temp3, vmm_temp1); - } - - // std::exp(scale * iou * iou) - inline void soft_coeff() { - uni_vmulps(vmm_temp3, vmm_temp3, vmm_temp3); - uni_vmulps(vmm_temp3, vmm_temp3, vmm_scale); - exp_injector->compute_vector_range(vmm_temp3.getIdx(), vmm_temp3.getIdx() + 1); - } - - inline void horizontal_mul_xmm(const Xbyak::Xmm &xmm_weight, const Xbyak::Xmm &xmm_aux) { - uni_vmovshdup(xmm_aux, xmm_weight); // weight:1,2,3,4; aux:2,2,4,4 - uni_vmulps(xmm_weight, xmm_weight, xmm_aux); // weight:1*2,2*2,3*4,4*4 - uni_vmovhlps(xmm_aux, xmm_aux, xmm_weight); // aux:3*4,4*4,4,4 - uni_vmulps(xmm_weight, xmm_weight, xmm_aux); // weight:1*2*3*4,... 
- } - - // horizontal mul for vmm_weight(Vmm(3)), temp1 and temp2 as aux - inline void horizontal_mul() { - Xbyak::Xmm xmm_weight = Xbyak::Xmm(vmm_temp3.getIdx()); - Xbyak::Xmm xmm_temp1 = Xbyak::Xmm(vmm_temp1.getIdx()); - Xbyak::Xmm xmm_temp2 = Xbyak::Xmm(vmm_temp2.getIdx()); - if (isa == cpu::x64::sse41) { - horizontal_mul_xmm(xmm_weight, xmm_temp1); - } else if (isa == cpu::x64::avx2) { - Xbyak::Ymm ymm_weight = Xbyak::Ymm(vmm_temp3.getIdx()); - vextractf128(xmm_temp1, ymm_weight, 0); - vextractf128(xmm_temp2, ymm_weight, 1); - uni_vmulps(xmm_weight, xmm_temp1, xmm_temp2); - horizontal_mul_xmm(xmm_weight, xmm_temp1); - } else { - Xbyak::Zmm zmm_weight = Xbyak::Zmm(vmm_temp3.getIdx()); - vextractf32x4(xmm_temp1, zmm_weight, 0); - vextractf32x4(xmm_temp2, zmm_weight, 1); - uni_vmulps(xmm_temp1, xmm_temp1, xmm_temp2); - vextractf32x4(xmm_temp2, zmm_weight, 2); - vextractf32x4(xmm_weight, zmm_weight, 3); - uni_vmulps(xmm_weight, xmm_weight, xmm_temp2); - uni_vmulps(xmm_weight, xmm_weight, xmm_temp1); - horizontal_mul_xmm(xmm_weight, xmm_temp1); - } - } - - inline void prepare_table() { - auto broadcast_d = [&](int val) { - for (size_t d = 0; d < vlen / sizeof(int); ++d) { - dd(val); - } - }; - - align(64); - L(l_table_constant); - broadcast_d(0x3f000000); // 0.5f - dw(0x0001); - } -}; -#endif - -bool NonMaxSuppression::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { +bool NonMaxSuppression::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - using NonMaxSuppressionV9 = ngraph::op::v9::NonMaxSuppression; - if (!one_of(op->get_type_info(), NonMaxSuppressionV9::get_type_info_static(), - ov::op::internal::NonMaxSuppressionIEInternal::get_type_info_static())) { - errorMessage = "Only NonMaxSuppression v9 and NonMaxSuppressionIEInternal are supported"; + if (!one_of(op->get_type_info(), op::v9::NonMaxSuppression::get_type_info_static(), + 
op::internal::NonMaxSuppressionIEInternal::get_type_info_static(), + op::v13::NMSRotated::get_type_info_static())) { + errorMessage = "Only NonMaxSuppression from opset9, NonMaxSuppressionIEInternal and NMSRotated from opset13 are supported."; return false; } - if (const auto nms9 = std::dynamic_pointer_cast(op)) { + if (auto nms9 = as_type(op.get())) { const auto boxEncoding = nms9->get_box_encoding(); - if (!one_of(boxEncoding, NonMaxSuppressionV9::BoxEncodingType::CENTER, NonMaxSuppressionV9::BoxEncodingType::CORNER)) { + if (!one_of(boxEncoding, op::v9::NonMaxSuppression::BoxEncodingType::CENTER, op::v9::NonMaxSuppression::BoxEncodingType::CORNER)) { errorMessage = "Supports only CENTER and CORNER box encoding type"; return false; } @@ -576,107 +46,125 @@ bool NonMaxSuppression::isSupportedOperation(const std::shared_ptr& op, const GraphContext::CPtr context) - : Node(op, context, InternalDynShapeInferFactory()), - isSoftSuppressedByIOU(false) { +NonMaxSuppression::NonMaxSuppression(const std::shared_ptr& op, const GraphContext::CPtr& context) + : Node(op, context, InternalDynShapeInferFactory()), + m_is_soft_suppressed_by_iou(false) { std::string errorMessage; if (!isSupportedOperation(op, errorMessage)) { - IE_THROW(NotImplemented) << errorMessage; + OPENVINO_THROW(errorMessage); } - errorPrefix = "NMS layer with name '" + op->get_friendly_name() + "' "; - if (one_of(op->get_type_info(), ov::op::internal::NonMaxSuppressionIEInternal::get_type_info_static())) - m_outStaticShape = true; + if (one_of(op->get_type_info(), op::internal::NonMaxSuppressionIEInternal::get_type_info_static())) { + m_out_static_shape = true; + } - if (getOriginalInputsNumber() < 2 || getOriginalInputsNumber() > 6) - IE_THROW() << errorPrefix << "has incorrect number of input edges: " << getOriginalInputsNumber(); + if (getOriginalInputsNumber() < 2 || getOriginalInputsNumber() > NMS_SOFT_NMS_SIGMA + 1) { + THROW_CPU_NODE_ERR("has incorrect number of input edges: ", 
getOriginalInputsNumber()); + } + if (getOriginalOutputsNumber() != 3) { + THROW_CPU_NODE_ERR("has incorrect number of output edges: ", getOriginalOutputsNumber()); + } - if (getOriginalOutputsNumber() != 3) - IE_THROW() << errorPrefix << "has incorrect number of output edges: " << getOriginalOutputsNumber(); - - if (const auto nms9 = std::dynamic_pointer_cast(op)) { + if (auto nms9 = as_type(op.get())) { boxEncodingType = static_cast(nms9->get_box_encoding()); - sortResultDescending = nms9->get_sort_result_descending(); - } else if (const auto nmsIe = std::dynamic_pointer_cast(op)) { - boxEncodingType = nmsIe->m_center_point_box ? NMSBoxEncodeType::CENTER : NMSBoxEncodeType::CORNER; - sortResultDescending = nmsIe->m_sort_result_descending; - } else { - const auto &typeInfo = op->get_type_info(); - IE_THROW() << errorPrefix << " doesn't support NMS: " << typeInfo.name << " v" << typeInfo.version_id; - } + m_sort_result_descending = nms9->get_sort_result_descending(); + m_coord_num = 4lu; + } else if (auto nmsIe = as_type(op.get())) { + boxEncodingType = nmsIe->m_center_point_box ? 
NMSBoxEncodeType::CENTER : NMSBoxEncodeType::CORNER; + m_sort_result_descending = nmsIe->m_sort_result_descending; + m_coord_num = 4lu; + } else if (auto nms = as_type(op.get())) { + m_sort_result_descending = nms->get_sort_result_descending(); + m_clockwise = nms->get_clockwise(); + m_rotated_boxes = true; + m_coord_num = 5lu; + } else { + const auto &typeInfo = op->get_type_info(); + THROW_CPU_NODE_ERR("doesn't support NMS: ", typeInfo.name, " v", typeInfo.version_id); + } - const auto &boxes_dims = getInputShapeAtPort(NMS_BOXES).getDims(); - if (boxes_dims.size() != 3) - IE_THROW() << errorPrefix << "has unsupported 'boxes' input rank: " << boxes_dims.size(); - if (boxes_dims[2] != 4) - IE_THROW() << errorPrefix << "has unsupported 'boxes' input 3rd dimension size: " << boxes_dims[2]; + const auto &boxes_dims = getInputShapeAtPort(NMS_BOXES).getDims(); + if (boxes_dims.size() != 3) { + THROW_CPU_NODE_ERR("has unsupported 'boxes' input rank: ", boxes_dims.size()); + } + if (boxes_dims[2] != m_coord_num) { + THROW_CPU_NODE_ERR("has unsupported 'boxes' input 3rd dimension size: ", boxes_dims[2]); + } - const auto &scores_dims = getInputShapeAtPort(NMS_SCORES).getDims(); - if (scores_dims.size() != 3) - IE_THROW() << errorPrefix << "has unsupported 'scores' input rank: " << scores_dims.size(); + const auto &scores_dims = getInputShapeAtPort(NMS_SCORES).getDims(); + if (scores_dims.size() != 3) { + THROW_CPU_NODE_ERR("has unsupported 'scores' input rank: ", scores_dims.size()); + } - const Shape valid_outputs_shape = getOutputShapeAtPort(NMS_VALIDOUTPUTS); - if (valid_outputs_shape.getRank() != 1) - IE_THROW() << errorPrefix << "has unsupported 'valid_outputs' output rank: " << valid_outputs_shape.getRank(); - if (valid_outputs_shape.getDims()[0] != 1) - IE_THROW() << errorPrefix << "has unsupported 'valid_outputs' output 1st dimension size: " << valid_outputs_shape.getDims()[1]; + const auto& valid_outputs_shape = getOutputShapeAtPort(NMS_VALID_OUTPUTS); + if 
(valid_outputs_shape.getRank() != 1) { + THROW_CPU_NODE_ERR("has unsupported 'valid_outputs' output rank: ", valid_outputs_shape.getRank()); + } + if (valid_outputs_shape.getDims()[0] != 1) { + THROW_CPU_NODE_ERR("has unsupported 'valid_outputs' output 1st dimension size: ", valid_outputs_shape.getDims()[0]); + } + + for (size_t i = 0lu; i < op->get_output_size(); i++) { + m_defined_outputs[i] = !op->get_output_target_inputs(i).empty(); + } } void NonMaxSuppression::initSupportedPrimitiveDescriptors() { if (!supportedPrimitiveDescriptors.empty()) return; - const std::vector supportedFloatPrecision = {Precision::FP32, Precision::BF16, Precision::FP16}; - const std::vector supportedIntOutputPrecision = {Precision::I32, Precision::I64}; + const auto inputs_num = inputShapes.size(); + if (inputs_num > NMS_MAX_OUTPUT_BOXES_PER_CLASS) { + check1DInput(getInputShapeAtPort(NMS_MAX_OUTPUT_BOXES_PER_CLASS), "max_output_boxes_per_class", NMS_MAX_OUTPUT_BOXES_PER_CLASS); + } + if (inputs_num > NMS_IOU_THRESHOLD) { + check1DInput(getInputShapeAtPort(NMS_IOU_THRESHOLD), "iou_threshold", NMS_IOU_THRESHOLD); + } + if (inputs_num > NMS_SCORE_THRESHOLD) { + check1DInput(getInputShapeAtPort(NMS_SCORE_THRESHOLD), "score_threshold", NMS_SCORE_THRESHOLD); + } + if (inputs_num > NMS_SOFT_NMS_SIGMA) { + check1DInput(getInputShapeAtPort(NMS_SOFT_NMS_SIGMA), "soft_nms_sigma", NMS_SOFT_NMS_SIGMA); + } - checkPrecision(getOriginalInputPrecisionAtPort(NMS_BOXES), supportedFloatPrecision, "boxes", inType); - checkPrecision(getOriginalInputPrecisionAtPort(NMS_SCORES), supportedFloatPrecision, "scores", inType); - checkPrecision(getOriginalOutputPrecisionAtPort(NMS_VALIDOUTPUTS), supportedIntOutputPrecision, "valid_outputs", outType); - - const std::vector supportedPrecision = {Precision::I16, Precision::U8, Precision::I8, Precision::U16, Precision::I32, - Precision::U32, Precision::I64, Precision::U64}; - - if (inputShapes.size() > NMS_MAXOUTPUTBOXESPERCLASS) -
check1DInput(getInputShapeAtPort(NMS_MAXOUTPUTBOXESPERCLASS), supportedPrecision, "max_output_boxes_per_class", NMS_MAXOUTPUTBOXESPERCLASS); - if (inputShapes.size() > NMS_IOUTHRESHOLD) - check1DInput(getInputShapeAtPort(NMS_IOUTHRESHOLD), supportedFloatPrecision, "iou_threshold", NMS_IOUTHRESHOLD); - if (inputShapes.size() > NMS_SCORETHRESHOLD) - check1DInput(getInputShapeAtPort(NMS_SCORETHRESHOLD), supportedFloatPrecision, "score_threshold", NMS_SCORETHRESHOLD); - if (inputShapes.size() > NMS_SOFTNMSSIGMA) - check1DInput(getInputShapeAtPort(NMS_SCORETHRESHOLD), supportedFloatPrecision, "soft_nms_sigma", NMS_SCORETHRESHOLD); - - checkOutput(getOutputShapeAtPort(NMS_SELECTEDINDICES), supportedIntOutputPrecision, "selected_indices", NMS_SELECTEDINDICES); - checkOutput(getOutputShapeAtPort(NMS_SELECTEDSCORES), supportedFloatPrecision, "selected_scores", NMS_SELECTEDSCORES); + checkOutput(getOutputShapeAtPort(NMS_SELECTED_INDICES), "selected_indices", NMS_SELECTED_INDICES); + checkOutput(getOutputShapeAtPort(NMS_SELECTED_SCORES), "selected_scores", NMS_SELECTED_SCORES); std::vector inDataConf; - inDataConf.reserve(inputShapes.size()); - for (size_t i = 0; i < inputShapes.size(); ++i) { - Precision inPrecision = i == NMS_MAXOUTPUTBOXESPERCLASS ? Precision::I32 : Precision::FP32; + inDataConf.reserve(inputs_num); + for (size_t i = 0; i < inputs_num; ++i) { + Precision inPrecision = i == NMS_MAX_OUTPUT_BOXES_PER_CLASS ? Precision::I32 : Precision::FP32; inDataConf.emplace_back(LayoutType::ncsp, inPrecision); } std::vector outDataConf; outDataConf.reserve(outputShapes.size()); for (size_t i = 0; i < outputShapes.size(); ++i) { - Precision outPrecision = i == NMS_SELECTEDSCORES ? Precision::FP32 : Precision::I32; + Precision outPrecision = i == NMS_SELECTED_SCORES ? 
Precision::FP32 : Precision::I32; outDataConf.emplace_back(LayoutType::ncsp, outPrecision); } - impl_desc_type impl_type; - if (mayiuse(cpu::x64::avx512_core)) { - impl_type = impl_desc_type::jit_avx512; - } else if (mayiuse(cpu::x64::avx2)) { - impl_type = impl_desc_type::jit_avx2; - } else if (mayiuse(cpu::x64::sse41)) { - impl_type = impl_desc_type::jit_sse42; - } else { - impl_type = impl_desc_type::ref; + impl_desc_type impl_type = impl_desc_type::ref; + +#if defined(OPENVINO_ARCH_X86_64) + using namespace dnnl::impl::cpu; + + // Only FP32 and ncsp are supported and the kernel is shape agnostic, so it can be created here once; no recompilation is needed. + createJitKernel(); + + x64::cpu_isa_t actual_isa = x64::isa_undef; + if (m_jit_kernel) { + actual_isa = m_jit_kernel->getIsa(); } + switch (actual_isa) { + case x64::avx512_core: impl_type = impl_desc_type::jit_avx512; break; + case x64::avx2: impl_type = impl_desc_type::jit_avx2; break; + case x64::sse41: impl_type = impl_desc_type::jit_sse42; break; + default: impl_type = impl_desc_type::ref; + } +#endif // OPENVINO_ARCH_X86_64 addSupportedPrimDesc(inDataConf, outDataConf, impl_type); - - // as only FP32 and ncsp is supported, and kernel is shape agnostic, we can create here. There is no need to recompilation. - createJitKernel(); } void NonMaxSuppression::prepareParams() { @@ -685,156 +173,690 @@ void NonMaxSuppression::prepareParams() { const auto& scoresDims = isDynamicNode() ?
getParentEdgesAtPort(NMS_SCORES)[0]->getMemory().getStaticDims() : getInputShapeAtPort(NMS_SCORES).getStaticDims(); - numBatches = boxesDims[0]; - numBoxes = boxesDims[1]; - numClasses = scoresDims[1]; - if (numBatches != scoresDims[0]) - IE_THROW() << errorPrefix << " numBatches is different in 'boxes' and 'scores' inputs"; - if (numBoxes != scoresDims[2]) - IE_THROW() << errorPrefix << " numBoxes is different in 'boxes' and 'scores' inputs"; + m_batches_num = boxesDims[0]; + m_boxes_num = boxesDims[1]; + m_classes_num = scoresDims[1]; + if (m_batches_num != scoresDims[0]) { + THROW_CPU_NODE_ERR("Batches number is different in 'boxes' and 'scores' inputs"); + } + if (m_boxes_num != scoresDims[2]) { + THROW_CPU_NODE_ERR("Boxes number is different in 'boxes' and 'scores' inputs"); + } - numFiltBox.resize(numBatches); - for (auto & i : numFiltBox) - i.resize(numClasses); -} + m_output_boxes_per_class = std::min(m_max_output_boxes_per_class, m_boxes_num); + const auto max_number_of_boxes = m_output_boxes_per_class * m_batches_num * m_classes_num; + m_filtered_boxes.resize(max_number_of_boxes); -bool NonMaxSuppression::isExecutable() const { - return isDynamicNode() || Node::isExecutable(); + m_num_filtered_boxes.resize(m_batches_num); + for (auto & i : m_num_filtered_boxes) { + i.resize(m_classes_num); + } } void NonMaxSuppression::createJitKernel() { #if defined(OPENVINO_ARCH_X86_64) - auto jcp = jit_nms_config_params(); - jcp.box_encode_type = boxEncodingType; - jcp.is_soft_suppressed_by_iou = isSoftSuppressedByIOU; + if (!m_rotated_boxes) { + auto jcp = kernel::NmsCompileParams(); + jcp.box_encode_type = boxEncodingType; + jcp.is_soft_suppressed_by_iou = m_is_soft_suppressed_by_iou; - if (mayiuse(cpu::x64::avx512_core)) { - nms_kernel.reset(new jit_uni_nms_kernel_f32(jcp)); - } else if (mayiuse(cpu::x64::avx2)) { - nms_kernel.reset(new jit_uni_nms_kernel_f32(jcp)); - } else if (mayiuse(cpu::x64::sse41)) { - nms_kernel.reset(new jit_uni_nms_kernel_f32(jcp)); + 
m_jit_kernel = kernel::JitKernel::createInstance(jcp); } - - if (nms_kernel) - nms_kernel->create_ker(); -#endif +#endif // OPENVINO_ARCH_X86_64 } void NonMaxSuppression::executeDynamicImpl(dnnl::stream strm) { - if (hasEmptyInputTensors() || (inputShapes.size() > NMS_MAXOUTPUTBOXESPERCLASS && - reinterpret_cast(getParentEdgeAt(NMS_MAXOUTPUTBOXESPERCLASS)->getMemoryPtr()->getData())[0] == 0)) { + if (hasEmptyInputTensors() || (inputShapes.size() > NMS_MAX_OUTPUT_BOXES_PER_CLASS && + reinterpret_cast(getParentEdgeAt(NMS_MAX_OUTPUT_BOXES_PER_CLASS)->getMemoryPtr()->getData())[0] == 0)) { redefineOutputMemory({{0, 3}, {0, 3}, {1}}); - *reinterpret_cast(getChildEdgesAtPort(NMS_VALIDOUTPUTS)[0]->getMemoryPtr()->getData()) = 0; + *reinterpret_cast(getChildEdgesAtPort(NMS_VALID_OUTPUTS)[0]->getMemoryPtr()->getData()) = 0; return; } execute(strm); } void NonMaxSuppression::execute(dnnl::stream strm) { - const float *boxes = reinterpret_cast(getParentEdgeAt(NMS_BOXES)->getMemoryPtr()->getData()); - const float *scores = reinterpret_cast(getParentEdgeAt(NMS_SCORES)->getMemoryPtr()->getData()); + const auto inputs_num = inputShapes.size(); - if (inputShapes.size() > NMS_MAXOUTPUTBOXESPERCLASS) { - maxOutputBoxesPerClass = reinterpret_cast(getParentEdgeAt(NMS_MAXOUTPUTBOXESPERCLASS)->getMemoryPtr()->getData())[0]; + size_t max_number_of_boxes = m_output_boxes_per_class * m_batches_num * m_classes_num; + if (inputs_num > NMS_MAX_OUTPUT_BOXES_PER_CLASS) { + auto val = reinterpret_cast(getParentEdgeAt(NMS_MAX_OUTPUT_BOXES_PER_CLASS)->getMemoryPtr()->getData())[0]; + m_max_output_boxes_per_class = val <= 0l ? 
0lu : static_cast(val); + m_output_boxes_per_class = std::min(m_max_output_boxes_per_class, m_boxes_num); + max_number_of_boxes = m_output_boxes_per_class * m_batches_num * m_classes_num; + m_filtered_boxes.resize(max_number_of_boxes); } - - maxOutputBoxesPerClass = std::min(maxOutputBoxesPerClass, numBoxes); - - if (maxOutputBoxesPerClass == 0) { + if (m_max_output_boxes_per_class == 0lu) { return; } - if (inputShapes.size() > NMS_IOUTHRESHOLD) - iouThreshold = reinterpret_cast(getParentEdgeAt(NMS_IOUTHRESHOLD)->getMemoryPtr()->getData())[0]; - - if (inputShapes.size() > NMS_SCORETHRESHOLD) - scoreThreshold = reinterpret_cast(getParentEdgeAt(NMS_SCORETHRESHOLD)->getMemoryPtr()->getData())[0]; - - if (inputShapes.size() > NMS_SOFTNMSSIGMA) - softNMSSigma = reinterpret_cast(getParentEdgeAt(NMS_SOFTNMSSIGMA)->getMemoryPtr()->getData())[0]; - scale = 0.0f; - if (softNMSSigma > 0.0) { - scale = -0.5f / softNMSSigma; + if (inputs_num > NMS_IOU_THRESHOLD) { + m_iou_threshold = reinterpret_cast(getParentEdgeAt(NMS_IOU_THRESHOLD)->getMemoryPtr()->getData())[0]; + } + if (inputs_num > NMS_SCORE_THRESHOLD) { + m_score_threshold = reinterpret_cast(getParentEdgeAt(NMS_SCORE_THRESHOLD)->getMemoryPtr()->getData())[0]; + } + if (inputs_num > NMS_SOFT_NMS_SIGMA) { + m_soft_nms_sigma = reinterpret_cast(getParentEdgeAt(NMS_SOFT_NMS_SIGMA)->getMemoryPtr()->getData())[0]; + m_scale = (m_soft_nms_sigma > 0.f) ? 
(-0.5f / m_soft_nms_sigma) : 0.f; } - auto boxesStrides = getParentEdgeAt(NMS_BOXES)->getMemory().getDescWithType()->getStrides(); - auto scoresStrides = getParentEdgeAt(NMS_SCORES)->getMemory().getDescWithType()->getStrides(); + auto boxes_memory = getParentEdgeAt(NMS_BOXES)->getMemoryPtr(); + auto scores_memory = getParentEdgeAt(NMS_SCORES)->getMemoryPtr(); - const auto maxNumberOfBoxes = maxOutputBoxesPerClass * numBatches * numClasses; - std::vector filtBoxes(maxNumberOfBoxes); + auto boxes = reinterpret_cast(boxes_memory->getData()); + auto scores = reinterpret_cast(scores_memory->getData()); - if (softNMSSigma == 0.0f) { - nmsWithoutSoftSigma(boxes, scores, boxesStrides, scoresStrides, filtBoxes); + const auto& boxes_strides = boxes_memory->getDescWithType()->getStrides(); + const auto& scores_strides = scores_memory->getDescWithType()->getStrides(); + + if (m_rotated_boxes) { + nmsRotated(boxes, scores, boxes_strides, scores_strides, m_filtered_boxes); + } else if (m_soft_nms_sigma == 0.f) { + nmsWithoutSoftSigma(boxes, scores, boxes_strides, scores_strides, m_filtered_boxes); } else { - nmsWithSoftSigma(boxes, scores, boxesStrides, scoresStrides, filtBoxes); + nmsWithSoftSigma(boxes, scores, boxes_strides, scores_strides, m_filtered_boxes); } - size_t startOffset = numFiltBox[0][0]; - for (size_t b = 0; b < numFiltBox.size(); b++) { - size_t batchOffset = b*numClasses*maxOutputBoxesPerClass; - for (size_t c = (b == 0 ? 1 : 0); c < numFiltBox[b].size(); c++) { - size_t offset = batchOffset + c*maxOutputBoxesPerClass; - for (size_t i = 0; i < numFiltBox[b][c]; i++) { - filtBoxes[startOffset + i] = filtBoxes[offset + i]; + size_t start_offset = m_num_filtered_boxes[0][0]; + for (size_t b = 0lu; b < m_num_filtered_boxes.size(); b++) { + size_t batchOffset = b * m_classes_num * m_output_boxes_per_class; + for (size_t c = (b == 0lu ? 
1lu : 0lu); c < m_num_filtered_boxes[b].size(); c++) { + size_t offset = batchOffset + c * m_output_boxes_per_class; + for (size_t i = 0lu; i < m_num_filtered_boxes[b][c]; i++) { + m_filtered_boxes[start_offset + i] = m_filtered_boxes[offset + i]; } - startOffset += numFiltBox[b][c]; + start_offset += m_num_filtered_boxes[b][c]; } } - filtBoxes.resize(startOffset); + auto boxes_ptr = m_filtered_boxes.data(); // need more particular comparator to get deterministic behaviour // escape situation when filtred boxes with same score have different position from launch to launch - if (sortResultDescending) { - parallel_sort(filtBoxes.begin(), filtBoxes.end(), - [](const filteredBoxes& l, const filteredBoxes& r) { + if (m_sort_result_descending) { + parallel_sort(boxes_ptr, boxes_ptr + start_offset, + [](const FilteredBox& l, const FilteredBox& r) { return (l.score > r.score) || - (l.score == r.score && l.batch_index < r.batch_index) || - (l.score == r.score && l.batch_index == r.batch_index && l.class_index < r.class_index) || - (l.score == r.score && l.batch_index == r.batch_index && l.class_index == r.class_index && l.box_index < r.box_index); + (l.score == r.score && l.batch_index < r.batch_index) || + (l.score == r.score && l.batch_index == r.batch_index && l.class_index < r.class_index) || + (l.score == r.score && l.batch_index == r.batch_index && l.class_index == r.class_index && l.box_index < r.box_index); }); } - auto indicesMemPtr = getChildEdgesAtPort(NMS_SELECTEDINDICES)[0]->getMemoryPtr(); - auto scoresMemPtr = getChildEdgesAtPort(NMS_SELECTEDSCORES)[0]->getMemoryPtr(); - const size_t validOutputs = std::min(filtBoxes.size(), maxNumberOfBoxes); + const size_t valid_outputs = std::min(start_offset, max_number_of_boxes); - if (!m_outStaticShape) { - VectorDims newDims{validOutputs, 3}; - redefineOutputMemory({newDims, newDims, {1}}); + if (m_defined_outputs[NMS_SELECTED_INDICES]) { + const size_t stride = 3lu; + + if (!m_out_static_shape) { + 
redefineOutputMemory(NMS_SELECTED_INDICES, { valid_outputs, stride }); + } + + auto out_ptr = reinterpret_cast(getChildEdgesAtPort(NMS_SELECTED_INDICES)[0]->getMemoryPtr()->getData()); + int32_t* boxes_ptr = &(m_filtered_boxes[0].batch_index); + + size_t idx = 0lu; + for (; idx < valid_outputs; idx++) { + memcpy(out_ptr, boxes_ptr, 12); + out_ptr += stride; + boxes_ptr += 4; + } + + if (m_out_static_shape) { + std::fill(out_ptr, out_ptr + (max_number_of_boxes - idx) * stride, -1); + } } - int selectedIndicesStride = indicesMemPtr->getDescWithType()->getStrides()[0]; + if (m_defined_outputs[NMS_SELECTED_SCORES]) { + const size_t stride = 3lu; - int *selectedIndicesPtr = reinterpret_cast(indicesMemPtr->getData()); - float *selectedScoresPtr = reinterpret_cast(scoresMemPtr->getData()); + if (!m_out_static_shape) { + redefineOutputMemory(NMS_SELECTED_SCORES, { valid_outputs, stride }); + } - size_t idx = 0lu; - for (; idx < validOutputs; idx++) { - selectedIndicesPtr[0] = filtBoxes[idx].batch_index; - selectedIndicesPtr[1] = filtBoxes[idx].class_index; - selectedIndicesPtr[2] = filtBoxes[idx].box_index; - selectedIndicesPtr += selectedIndicesStride; + auto out_ptr = reinterpret_cast(getChildEdgesAtPort(NMS_SELECTED_SCORES)[0]->getMemoryPtr()->getData()); - selectedScoresPtr[0] = static_cast(filtBoxes[idx].batch_index); - selectedScoresPtr[1] = static_cast(filtBoxes[idx].class_index); - selectedScoresPtr[2] = static_cast(filtBoxes[idx].score); - selectedScoresPtr += selectedIndicesStride; + size_t idx = 0lu; + for (; idx < valid_outputs; idx++) { + out_ptr[0] = static_cast(m_filtered_boxes[idx].batch_index); + out_ptr[1] = static_cast(m_filtered_boxes[idx].class_index); + out_ptr[2] = m_filtered_boxes[idx].score; + out_ptr += stride; + } + + if (m_out_static_shape) { + std::fill(out_ptr, out_ptr + (max_number_of_boxes - idx) * stride, -1.f); + } } - if (m_outStaticShape) { - std::fill(selectedIndicesPtr, selectedIndicesPtr + (maxNumberOfBoxes - idx) * 
selectedIndicesStride, -1); - std::fill(selectedScoresPtr, selectedScoresPtr + (maxNumberOfBoxes - idx) * selectedIndicesStride, -1.f); + if (m_defined_outputs[NMS_VALID_OUTPUTS]) { + auto out_ptr = reinterpret_cast(getChildEdgesAtPort(NMS_VALID_OUTPUTS)[0]->getMemoryPtr()->getData()); + *out_ptr = static_cast(valid_outputs); } - - int *valid_outputs = reinterpret_cast(getChildEdgesAtPort(NMS_VALIDOUTPUTS)[0]->getMemoryPtr()->getData()); - *valid_outputs = static_cast(validOutputs); } -bool NonMaxSuppression::created() const { - return getType() == Type::NonMaxSuppression; +void NonMaxSuppression::nmsWithSoftSigma(const float *boxes, const float *scores, const VectorDims &boxesStrides, + const VectorDims &scoresStrides, std::vector &filtBoxes) { + auto less = [](const boxInfo& l, const boxInfo& r) { + return l.score < r.score || ((l.score == r.score) && (l.idx > r.idx)); + }; + + // update score, if iou is 0, weight is 1, score does not change + // if is_soft_suppressed_by_iou is false, apply for all iou, including iou>iou_threshold, soft suppressed when score < score_threshold + // if is_soft_suppressed_by_iou is true, hard suppressed by iou_threshold, then soft suppress + auto coeff = [&](float iou) { + if (m_is_soft_suppressed_by_iou && iou > m_iou_threshold) + return 0.0f; + return std::exp(m_scale * iou * iou); + }; + + parallel_for2d(m_batches_num, m_classes_num, [&](int batch_idx, int class_idx) { + std::vector selectedBoxes; + const float *boxesPtr = boxes + batch_idx * boxesStrides[0]; + const float *scoresPtr = scores + batch_idx * scoresStrides[0] + class_idx * scoresStrides[1]; + + std::priority_queue, decltype(less)> sorted_boxes(less); // score, box_id, suppress_begin_index + for (int box_idx = 0; box_idx < static_cast(m_boxes_num); box_idx++) { + if (scoresPtr[box_idx] > m_score_threshold) + sorted_boxes.emplace(boxInfo({scoresPtr[box_idx], box_idx, 0})); + } + size_t sorted_boxes_size = sorted_boxes.size(); + size_t maxSeletedBoxNum = 
std::min(sorted_boxes_size, m_output_boxes_per_class); + selectedBoxes.reserve(maxSeletedBoxNum); + if (maxSeletedBoxNum > 0) { + // include first directly + boxInfo candidateBox = sorted_boxes.top(); + sorted_boxes.pop(); + selectedBoxes.push_back({ candidateBox.score, batch_idx, class_idx, candidateBox.idx }); + if (maxSeletedBoxNum > 1) { + if (m_jit_kernel) { +#if defined(OPENVINO_ARCH_X86_64) + std::vector boxCoord0(maxSeletedBoxNum, 0.0f); + std::vector boxCoord1(maxSeletedBoxNum, 0.0f); + std::vector boxCoord2(maxSeletedBoxNum, 0.0f); + std::vector boxCoord3(maxSeletedBoxNum, 0.0f); + + boxCoord0[0] = boxesPtr[candidateBox.idx * m_coord_num]; + boxCoord1[0] = boxesPtr[candidateBox.idx * m_coord_num + 1]; + boxCoord2[0] = boxesPtr[candidateBox.idx * m_coord_num + 2]; + boxCoord3[0] = boxesPtr[candidateBox.idx * m_coord_num + 3]; + + auto arg = kernel::NmsCallArgs(); + arg.iou_threshold = static_cast(&m_iou_threshold); + arg.score_threshold = static_cast(&m_score_threshold); + arg.scale = static_cast(&m_scale); + while (selectedBoxes.size() < m_output_boxes_per_class && !sorted_boxes.empty()) { + boxInfo candidateBox = sorted_boxes.top(); + float origScore = candidateBox.score; + sorted_boxes.pop(); + + int candidateStatus = NMSCandidateStatus::SELECTED; // 0 for suppressed, 1 for selected, 2 for updated + arg.score = static_cast(&candidateBox.score); + arg.selected_boxes_num = selectedBoxes.size() - candidateBox.suppress_begin_index; + arg.selected_boxes_coord[0] = static_cast(&boxCoord0[candidateBox.suppress_begin_index]); + arg.selected_boxes_coord[1] = static_cast(&boxCoord1[candidateBox.suppress_begin_index]); + arg.selected_boxes_coord[2] = static_cast(&boxCoord2[candidateBox.suppress_begin_index]); + arg.selected_boxes_coord[3] = static_cast(&boxCoord3[candidateBox.suppress_begin_index]); + arg.candidate_box = static_cast(&boxesPtr[candidateBox.idx * m_coord_num]); + arg.candidate_status = static_cast(&candidateStatus); + (*m_jit_kernel)(&arg); + + if 
(candidateStatus == NMSCandidateStatus::SUPPRESSED) { + continue; + } else { + if (candidateBox.score == origScore) { + selectedBoxes.push_back({ candidateBox.score, batch_idx, class_idx, candidateBox.idx }); + int selectedSize = selectedBoxes.size(); + boxCoord0[selectedSize - 1] = boxesPtr[candidateBox.idx * m_coord_num]; + boxCoord1[selectedSize - 1] = boxesPtr[candidateBox.idx * m_coord_num + 1]; + boxCoord2[selectedSize - 1] = boxesPtr[candidateBox.idx * m_coord_num + 2]; + boxCoord3[selectedSize - 1] = boxesPtr[candidateBox.idx * m_coord_num + 3]; + } else { + candidateBox.suppress_begin_index = selectedBoxes.size(); + sorted_boxes.push(candidateBox); + } + } + } +#endif // OPENVINO_ARCH_X86_64 + } else { + while (selectedBoxes.size() < m_output_boxes_per_class && !sorted_boxes.empty()) { + boxInfo candidateBox = sorted_boxes.top(); + float origScore = candidateBox.score; + sorted_boxes.pop(); + + int candidateStatus = NMSCandidateStatus::SELECTED; // 0 for suppressed, 1 for selected, 2 for updated + for (int selected_idx = static_cast(selectedBoxes.size()) - 1; selected_idx >= candidateBox.suppress_begin_index; selected_idx--) { + float iou = intersectionOverUnion(&boxesPtr[candidateBox.idx * m_coord_num], + &boxesPtr[selectedBoxes[selected_idx].box_index * m_coord_num]); + + // when is_soft_suppressed_by_iou is true, score is decayed to zero and implicitly suppressed if iou > iou_threshold.
+ candidateBox.score *= coeff(iou); + // soft suppressed + if (candidateBox.score <= m_score_threshold) { + candidateStatus = NMSCandidateStatus::SUPPRESSED; + break; + } + } + + if (candidateStatus == NMSCandidateStatus::SUPPRESSED) { + continue; + } else { + if (candidateBox.score == origScore) { + selectedBoxes.push_back({ candidateBox.score, batch_idx, class_idx, candidateBox.idx }); + } else { + candidateBox.suppress_begin_index = selectedBoxes.size(); + sorted_boxes.push(candidateBox); + } + } + } + } + } + } + m_num_filtered_boxes[batch_idx][class_idx] = selectedBoxes.size(); + size_t offset = batch_idx * m_classes_num * m_output_boxes_per_class + class_idx * m_output_boxes_per_class; + for (size_t i = 0; i < selectedBoxes.size(); i++) { + filtBoxes[offset + i] = selectedBoxes[i]; + } + }); } +void NonMaxSuppression::nmsWithoutSoftSigma(const float *boxes, const float *scores, const VectorDims &boxesStrides, + const VectorDims &scoresStrides, std::vector &filtBoxes) { + int max_out_box = static_cast(m_output_boxes_per_class); + parallel_for2d(m_batches_num, m_classes_num, [&](int batch_idx, int class_idx) { + const float *boxesPtr = boxes + batch_idx * boxesStrides[0]; + const float *scoresPtr = scores + batch_idx * scoresStrides[0] + class_idx * scoresStrides[1]; + + std::vector> sorted_boxes; // score, box_idx + sorted_boxes.reserve(m_boxes_num); + for (size_t box_idx = 0; box_idx < m_boxes_num; box_idx++) { + if (scoresPtr[box_idx] > m_score_threshold) { + sorted_boxes.emplace_back(std::make_pair(scoresPtr[box_idx], box_idx)); + } + } + + int io_selection_size = 0; + const size_t sortedBoxSize = sorted_boxes.size(); + if (sortedBoxSize > 0lu) { + parallel_sort(sorted_boxes.begin(), sorted_boxes.end(), + [](const std::pair& l, const std::pair& r) { + return (l.first > r.first || ((l.first == r.first) && (l.second < r.second))); + }); + int offset = batch_idx * m_classes_num * m_output_boxes_per_class + class_idx * m_output_boxes_per_class; + 
filtBoxes[offset + 0] = FilteredBox(sorted_boxes[0].first, batch_idx, class_idx, sorted_boxes[0].second); + io_selection_size++; + if (sortedBoxSize > 1lu) { + if (m_jit_kernel) { +#if defined(OPENVINO_ARCH_X86_64) + std::vector boxCoord0(sortedBoxSize, 0.0f); + std::vector boxCoord1(sortedBoxSize, 0.0f); + std::vector boxCoord2(sortedBoxSize, 0.0f); + std::vector boxCoord3(sortedBoxSize, 0.0f); + + boxCoord0[0] = boxesPtr[sorted_boxes[0].second * m_coord_num]; + boxCoord1[0] = boxesPtr[sorted_boxes[0].second * m_coord_num + 1]; + boxCoord2[0] = boxesPtr[sorted_boxes[0].second * m_coord_num + 2]; + boxCoord3[0] = boxesPtr[sorted_boxes[0].second * m_coord_num + 3]; + + auto arg = kernel::NmsCallArgs(); + arg.iou_threshold = static_cast(&m_iou_threshold); + arg.score_threshold = static_cast(&m_score_threshold); + arg.scale = static_cast(&m_scale); + // box start index does not change for hard suppression + arg.selected_boxes_coord[0] = static_cast(&boxCoord0[0]); + arg.selected_boxes_coord[1] = static_cast(&boxCoord1[0]); + arg.selected_boxes_coord[2] = static_cast(&boxCoord2[0]); + arg.selected_boxes_coord[3] = static_cast(&boxCoord3[0]); + + for (size_t candidate_idx = 1; (candidate_idx < sortedBoxSize) && (io_selection_size < max_out_box); candidate_idx++) { + int candidateStatus = NMSCandidateStatus::SELECTED; // 0 for suppressed, 1 for selected + arg.selected_boxes_num = io_selection_size; + arg.candidate_box = static_cast(&boxesPtr[sorted_boxes[candidate_idx].second * m_coord_num]); + arg.candidate_status = static_cast(&candidateStatus); + (*m_jit_kernel)(&arg); + if (candidateStatus == NMSCandidateStatus::SELECTED) { + boxCoord0[io_selection_size] = boxesPtr[sorted_boxes[candidate_idx].second * m_coord_num]; + boxCoord1[io_selection_size] = boxesPtr[sorted_boxes[candidate_idx].second * m_coord_num + 1]; + boxCoord2[io_selection_size] = boxesPtr[sorted_boxes[candidate_idx].second * m_coord_num + 2]; + boxCoord3[io_selection_size] =
boxesPtr[sorted_boxes[candidate_idx].second * m_coord_num + 3]; + filtBoxes[offset + io_selection_size] = + FilteredBox(sorted_boxes[candidate_idx].first, batch_idx, class_idx, sorted_boxes[candidate_idx].second); + io_selection_size++; + } + } +#endif // OPENVINO_ARCH_X86_64 + } else { + for (size_t candidate_idx = 1; (candidate_idx < sortedBoxSize) && (io_selection_size < max_out_box); candidate_idx++) { + int candidateStatus = NMSCandidateStatus::SELECTED; // 0 for suppressed, 1 for selected + for (int selected_idx = io_selection_size - 1; selected_idx >= 0; selected_idx--) { + float iou = intersectionOverUnion(&boxesPtr[sorted_boxes[candidate_idx].second * m_coord_num], + &boxesPtr[filtBoxes[offset + selected_idx].box_index * m_coord_num]); + if (iou >= m_iou_threshold) { + candidateStatus = NMSCandidateStatus::SUPPRESSED; + break; + } + } + + if (candidateStatus == NMSCandidateStatus::SELECTED) { + filtBoxes[offset + io_selection_size] = + FilteredBox(sorted_boxes[candidate_idx].first, batch_idx, class_idx, sorted_boxes[candidate_idx].second); + io_selection_size++; + } + } + } + } + } + + m_num_filtered_boxes[batch_idx][class_idx] = io_selection_size; + }); +} + +////////// Rotated boxes ////////// + +struct RotatedBox { + float x_ctr, y_ctr, w, h, a; +}; + +inline float dot_2d(const NonMaxSuppression::Point2D& A, const NonMaxSuppression::Point2D& B) { + return A.x * B.x + A.y * B.y; +} + +inline float cross_2d(const NonMaxSuppression::Point2D& A, const NonMaxSuppression::Point2D& B) { + return A.x * B.y - B.x * A.y; +} + +inline void getRotatedVertices(const float* box, NonMaxSuppression::Point2D (&pts)[4], bool clockwise) { + auto theta = clockwise ? 
box[4] : -box[4]; + + auto cos_theta = std::cos(theta) * 0.5f; + auto sin_theta = std::sin(theta) * 0.5f; + + // y: top --> down; x: left --> right + // Left-Down + pts[0].x = box[0] - sin_theta * box[3] - cos_theta * box[2]; + pts[0].y = box[1] + cos_theta * box[3] - sin_theta * box[2]; + // Left-Top + pts[1].x = box[0] + sin_theta * box[3] - cos_theta * box[2]; + pts[1].y = box[1] - cos_theta * box[3] - sin_theta * box[2]; + // Right-Top + pts[2].x = 2 * box[0] - pts[0].x; + pts[2].y = 2 * box[1] - pts[0].y; + // Right-Down + pts[3].x = 2 * box[0] - pts[1].x; + pts[3].y = 2 * box[1] - pts[1].y; +} + +inline float polygonArea(const NonMaxSuppression::Point2D (&q)[24], const int64_t& m) { + if (m <= 2l) { + return 0.f; + } + + float area = 0.f; + size_t mlu = static_cast(m - 1l); + for (size_t i = 1lu; i < mlu; i++) { + area += std::abs(cross_2d(q[i] - q[0], q[i + 1] - q[0])); + } + + return area / 2.f; +} + +inline size_t convexHullGraham(const NonMaxSuppression::Point2D (&p)[24], + const size_t num_in, + NonMaxSuppression::Point2D (&q)[24]) { + OPENVINO_ASSERT(num_in >= 2lu); + + // Step 1: + // Find point with minimum y + // if more than 1 points have the same minimum y, + // pick the one with the minimum x. 
+ size_t t = 0lu; + for (size_t i = 1lu; i < num_in; i++) { + if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) { + t = i; + } + } + auto& start = p[t]; // starting point + + // Step 2: + // Subtract starting point from every point (for sorting in the next step) + for (size_t i = 0lu; i < num_in; i++) { + q[i] = p[i] - start; + } + + // Swap the starting point to position 0 + std::swap(q[t], q[0]); + + // Step 3: + // Sort point 1 ~ num_in according to their relative cross-product values + // (essentially sorting according to angles) + // If the angles are the same, sort according to their distance to origin + float dist[24]; + for (size_t i = 0lu; i < num_in; i++) { + dist[i] = dot_2d(q[i], q[i]); + } + + std::sort(q + 1, q + num_in, [](const NonMaxSuppression::Point2D& A, const NonMaxSuppression::Point2D& B) -> bool { + float temp = cross_2d(A, B); + if (std::abs(temp) < 1e-6f) { + return dot_2d(A, A) < dot_2d(B, B); + } else { + return temp > 0.f; + } + }); + // compute distance to origin after sort, since the points are now different. + for (size_t i = 0lu; i < num_in; i++) { + dist[i] = dot_2d(q[i], q[i]); + } + + // Step 4: + // Make sure there are at least 2 points (that don't overlap with each other) + // in the stack + size_t k = 1lu; // index of the non-overlapped second point + for (; k < num_in; k++) { + if (dist[k] > 1e-8f) { + break; + } + } + if (k == num_in) { + // We reach the end, which means the convex hull is just one point + q[0] = p[t]; + return 1lu; + } + q[1] = q[k]; + size_t m = 2lu; // 2 points in the stack + // Step 5: + // Finally we can start the scanning process.
+ // When a non-convex relationship between the 3 points is found + // (either concave shape or duplicated points), + // we pop the previous point from the stack + // until the 3-point relationship is convex again, or + // until the stack only contains two points + for (size_t i = k + 1lu; i < num_in; i++) { + while (m > 1lu && cross_2d(q[i] - q[m - 2], q[m - 1] - q[m - 2]) >= 0) { + m--; + } + q[m++] = q[i]; + } + + return m; +} + +inline size_t getIntersectionPoints(const NonMaxSuppression::Point2D (&pts1)[4], + const NonMaxSuppression::Point2D (&pts2)[4], + NonMaxSuppression::Point2D (&intersections)[24]) { + // Line vector + // A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1] + NonMaxSuppression::Point2D vec1[4], vec2[4]; + for (size_t i = 0lu; i < 4lu; i++) { + vec1[i] = pts1[(i + 1lu) % 4lu] - pts1[i]; + vec2[i] = pts2[(i + 1lu) % 4lu] - pts2[i]; + } + + // Line test - test all line combos for intersection + size_t num = 0lu; // number of intersections + for (size_t i = 0lu; i < 4lu; i++) { + for (size_t j = 0lu; j < 4lu; j++) { + // Solve for 2x2 Ax=b + float det = cross_2d(vec2[j], vec1[i]); + + // This takes care of parallel lines + if (std::abs(det) <= 1e-14f) { + continue; + } + + auto vec12 = pts2[j] - pts1[i]; + + auto t1 = cross_2d(vec2[j], vec12) / det; + auto t2 = cross_2d(vec1[i], vec12) / det; + + if (t1 >= 0.f && t1 <= 1.f && t2 >= 0.f && t2 <= 1.f) { + intersections[num++] = pts1[i] + vec1[i] * t1; + } + } + } + + // Check for vertices of rect1 inside rect2 + { + const auto& AB = vec2[0]; + const auto& DA = vec2[3]; + auto ABdotAB = dot_2d(AB, AB); + auto ADdotAD = dot_2d(DA, DA); + for (size_t i = 0lu; i < 4lu; i++) { + // Assume ABCD is the rectangle, and P is the point to be judged + // P is inside ABCD if P's projection on AB lies within AB + // and P's projection on AD lies within AD + + auto AP = pts1[i] - pts2[0]; + + auto APdotAB = dot_2d(AP, AB); + auto APdotAD = -dot_2d(AP, DA); + + if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= 
ABdotAB) && (APdotAD <= ADdotAD)) { + intersections[num++] = pts1[i]; + } + } + } + + // Reverse the check - check for vertices of rect2 inside rect1 + { + const auto& AB = vec1[0]; + const auto& DA = vec1[3]; + auto ABdotAB = dot_2d(AB, AB); + auto ADdotAD = dot_2d(DA, DA); + for (size_t i = 0lu; i < 4lu; i++) { + auto AP = pts2[i] - pts1[0]; + + auto APdotAB = dot_2d(AP, AB); + auto APdotAD = -dot_2d(AP, DA); + + if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && (APdotAD <= ADdotAD)) { + intersections[num++] = pts2[i]; + } + } + } + + return num; +} + +inline float rotatedBoxesIntersection(const NonMaxSuppression::Point2D (&vertices_0)[4], const float* box_1, const bool clockwise) { + // There are up to 4 x 4 + 4 + 4 = 24 intersections (including duplicates) returned + NonMaxSuppression::Point2D intersect_pts[24], ordered_pts[24]; + + NonMaxSuppression::Point2D vertices_1[4]; + getRotatedVertices(box_1, vertices_1, clockwise); + + auto num = getIntersectionPoints(vertices_0, vertices_1, intersect_pts); + + if (num <= 2lu) { + return 0.f; + } + + auto num_convex = convexHullGraham(intersect_pts, num, ordered_pts); + return polygonArea(ordered_pts, num_convex); +} + +inline float NonMaxSuppression::rotatedIntersectionOverUnion(const NonMaxSuppression::Point2D (&vertices_0)[4], const float area_0, const float* box_1) { + const auto area_1 = box_1[2] * box_1[3]; // W x H + if (area_1 <= 0.f) { + return 0.f; + } + + const auto intersection = rotatedBoxesIntersection(vertices_0, box_1, m_clockwise); + + return intersection / (area_0 + area_1 - intersection); +} + +void NonMaxSuppression::nmsRotated(const float* boxes, const float* scores, const VectorDims& boxes_strides, + const VectorDims& scores_strides, std::vector& filtered_boxes) { + if (m_jit_kernel) { + THROW_CPU_NODE_ERR("does not have implementation of the JIT kernel for Rotated boxes."); + } else { + parallel_for2d(m_batches_num, m_classes_num, [&](int64_t batch_idx, int64_t class_idx) { + const 
float *boxes_ptr = boxes + batch_idx * boxes_strides[0]; + const float *scores_ptr = scores + batch_idx * scores_strides[0] + class_idx * scores_strides[1]; + + std::vector> sorted_indices; // score, box_idx + sorted_indices.reserve(m_boxes_num); + for (size_t box_idx = 0lu; box_idx < m_boxes_num; box_idx++, scores_ptr++) { + if (*scores_ptr > m_score_threshold) { + sorted_indices.emplace_back(std::make_pair(*scores_ptr, box_idx)); + } + } + + size_t io_selection_size = 0lu; + const size_t sorted_boxes_size = sorted_indices.size(); + + if (sorted_boxes_size > 0lu) { + parallel_sort(sorted_indices.begin(), sorted_indices.end(), + [](const std::pair& l, const std::pair& r) { + return (l.first > r.first || ((l.first == r.first) && (l.second < r.second))); + }); + auto sorted_indices_ptr = sorted_indices.data(); + auto filtered_boxes_ptr = filtered_boxes.data() + + batch_idx * m_classes_num * m_output_boxes_per_class + class_idx * m_output_boxes_per_class; + *filtered_boxes_ptr = FilteredBox(sorted_indices[0].first, batch_idx, class_idx, sorted_indices[0].second); + io_selection_size++; + if (sorted_boxes_size > 1lu) { + sorted_indices_ptr++; + NMSCandidateStatus candidate_status; + + for (size_t candidate_idx = 1lu; (candidate_idx < sorted_boxes_size) && (io_selection_size < m_output_boxes_per_class); + candidate_idx++, sorted_indices_ptr++) { + candidate_status = NMSCandidateStatus::SELECTED; + auto box_0 = boxes_ptr + (*sorted_indices_ptr).second * m_coord_num; + const auto area_0 = box_0[2] * box_0[3]; // W x H + + if (area_0 > 0.f) { + NonMaxSuppression::Point2D vertices_0[4]; + getRotatedVertices(box_0, vertices_0, m_clockwise); + auto trg_boxes = reinterpret_cast(&((*filtered_boxes_ptr).box_index)); + for (size_t selected_idx = 0lu; selected_idx < io_selection_size; selected_idx++, trg_boxes -= 4) { + auto iou = rotatedIntersectionOverUnion(vertices_0, area_0, boxes_ptr + m_coord_num * (*trg_boxes)); + if (iou > m_iou_threshold) { + candidate_status = 
NMSCandidateStatus::SUPPRESSED; + break; + } + } + } else if (0.f > m_iou_threshold) { + candidate_status = NMSCandidateStatus::SUPPRESSED; + } + + if (candidate_status == NMSCandidateStatus::SELECTED) { + *(++filtered_boxes_ptr) = + FilteredBox((*sorted_indices_ptr).first, batch_idx, class_idx, (*sorted_indices_ptr).second); + io_selection_size++; + } + } + } + } + + m_num_filtered_boxes[batch_idx][class_idx] = io_selection_size; + }); + } +} + +/////////////// End of Rotated boxes /////////////// + float NonMaxSuppression::intersectionOverUnion(const float *boxesI, const float *boxesJ) { float yminI, xminI, ymaxI, xmaxI, yminJ, xminJ, ymaxJ, xmaxJ; if (boxEncodingType == NMSBoxEncodeType::CENTER) { @@ -870,240 +892,27 @@ float NonMaxSuppression::intersectionOverUnion(const float *boxesI, const float return intersection_area / (areaI + areaJ - intersection_area); } -void NonMaxSuppression::nmsWithSoftSigma(const float *boxes, const float *scores, const VectorDims &boxesStrides, - const VectorDims &scoresStrides, std::vector &filtBoxes) { - auto less = [](const boxInfo& l, const boxInfo& r) { - return l.score < r.score || ((l.score == r.score) && (l.idx > r.idx)); - }; - - // update score, if iou is 0, weight is 1, score does not change - // if is_soft_suppressed_by_iou is false, apply for all iou, including iou>iou_threshold, soft suppressed when score < score_threshold - // if is_soft_suppressed_by_iou is true, hard suppressed by iou_threshold, then soft suppress - auto coeff = [&](float iou) { - if (isSoftSuppressedByIOU && iou > iouThreshold) - return 0.0f; - return std::exp(scale * iou * iou); - }; - - parallel_for2d(numBatches, numClasses, [&](int batch_idx, int class_idx) { - std::vector selectedBoxes; - const float *boxesPtr = boxes + batch_idx * boxesStrides[0]; - const float *scoresPtr = scores + batch_idx * scoresStrides[0] + class_idx * scoresStrides[1]; - - std::priority_queue, decltype(less)> sorted_boxes(less); // score, box_id, suppress_begin_index 
- for (int box_idx = 0; box_idx < static_cast(numBoxes); box_idx++) { - if (scoresPtr[box_idx] > scoreThreshold) - sorted_boxes.emplace(boxInfo({scoresPtr[box_idx], box_idx, 0})); - } - size_t sortedBoxSize = sorted_boxes.size(); - size_t maxSeletedBoxNum = std::min(sortedBoxSize, maxOutputBoxesPerClass); - selectedBoxes.reserve(maxSeletedBoxNum); - if (maxSeletedBoxNum > 0) { - // include first directly - boxInfo candidateBox = sorted_boxes.top(); - sorted_boxes.pop(); - selectedBoxes.push_back({ candidateBox.score, batch_idx, class_idx, candidateBox.idx }); - if (maxSeletedBoxNum > 1) { - if (nms_kernel) { - std::vector boxCoord0(maxSeletedBoxNum, 0.0f); - std::vector boxCoord1(maxSeletedBoxNum, 0.0f); - std::vector boxCoord2(maxSeletedBoxNum, 0.0f); - std::vector boxCoord3(maxSeletedBoxNum, 0.0f); - - boxCoord0[0] = boxesPtr[candidateBox.idx * 4]; - boxCoord1[0] = boxesPtr[candidateBox.idx * 4 + 1]; - boxCoord2[0] = boxesPtr[candidateBox.idx * 4 + 2]; - boxCoord3[0] = boxesPtr[candidateBox.idx * 4 + 3]; - - auto arg = jit_nms_args(); - arg.iou_threshold = static_cast(&iouThreshold); - arg.score_threshold = static_cast(&scoreThreshold); - arg.scale = static_cast(&scale); - while (selectedBoxes.size() < maxOutputBoxesPerClass && !sorted_boxes.empty()) { - boxInfo candidateBox = sorted_boxes.top(); - float origScore = candidateBox.score; - sorted_boxes.pop(); - - int candidateStatus = NMSCandidateStatus::SELECTED; // 0 for suppressed, 1 for selected, 2 for updated - arg.score = static_cast(&candidateBox.score); - arg.selected_boxes_num = selectedBoxes.size() - candidateBox.suppress_begin_index; - arg.selected_boxes_coord[0] = static_cast(&boxCoord0[candidateBox.suppress_begin_index]); - arg.selected_boxes_coord[1] = static_cast(&boxCoord1[candidateBox.suppress_begin_index]); - arg.selected_boxes_coord[2] = static_cast(&boxCoord2[candidateBox.suppress_begin_index]); - arg.selected_boxes_coord[3] = static_cast(&boxCoord3[candidateBox.suppress_begin_index]); - 
arg.candidate_box = static_cast(&boxesPtr[candidateBox.idx * 4]); - arg.candidate_status = static_cast(&candidateStatus); - (*nms_kernel)(&arg); - - if (candidateStatus == NMSCandidateStatus::SUPPRESSED) { - continue; - } else { - if (candidateBox.score == origScore) { - selectedBoxes.push_back({ candidateBox.score, batch_idx, class_idx, candidateBox.idx }); - int selectedSize = selectedBoxes.size(); - boxCoord0[selectedSize - 1] = boxesPtr[candidateBox.idx * 4]; - boxCoord1[selectedSize - 1] = boxesPtr[candidateBox.idx * 4 + 1]; - boxCoord2[selectedSize - 1] = boxesPtr[candidateBox.idx * 4 + 2]; - boxCoord3[selectedSize - 1] = boxesPtr[candidateBox.idx * 4 + 3]; - } else { - candidateBox.suppress_begin_index = selectedBoxes.size(); - sorted_boxes.push(candidateBox); - } - } - } - } else { - while (selectedBoxes.size() < maxOutputBoxesPerClass && !sorted_boxes.empty()) { - boxInfo candidateBox = sorted_boxes.top(); - float origScore = candidateBox.score; - sorted_boxes.pop(); - - int candidateStatus = NMSCandidateStatus::SELECTED; // 0 for suppressed, 1 for selected, 2 for updated - for (int selected_idx = static_cast(selectedBoxes.size()) - 1; selected_idx >= candidateBox.suppress_begin_index; selected_idx--) { - float iou = intersectionOverUnion(&boxesPtr[candidateBox.idx * 4], &boxesPtr[selectedBoxes[selected_idx].box_index * 4]); - - // when is_soft_suppressed_by_iou is true, score is decayed to zero and implicitely suppressed if iou > iou_threshold. 
- candidateBox.score *= coeff(iou); - // soft suppressed - if (candidateBox.score <= scoreThreshold) { - candidateStatus = NMSCandidateStatus::SUPPRESSED; - break; - } - } - - if (candidateStatus == NMSCandidateStatus::SUPPRESSED) { - continue; - } else { - if (candidateBox.score == origScore) { - selectedBoxes.push_back({ candidateBox.score, batch_idx, class_idx, candidateBox.idx }); - } else { - candidateBox.suppress_begin_index = selectedBoxes.size(); - sorted_boxes.push(candidateBox); - } - } - } - } - } - } - numFiltBox[batch_idx][class_idx] = selectedBoxes.size(); - size_t offset = batch_idx*numClasses*maxOutputBoxesPerClass + class_idx*maxOutputBoxesPerClass; - for (size_t i = 0; i < selectedBoxes.size(); i++) { - filtBoxes[offset + i] = selectedBoxes[i]; - } - }); -} - -void NonMaxSuppression::nmsWithoutSoftSigma(const float *boxes, const float *scores, const VectorDims &boxesStrides, - const VectorDims &scoresStrides, std::vector &filtBoxes) { - int max_out_box = static_cast(maxOutputBoxesPerClass); - parallel_for2d(numBatches, numClasses, [&](int batch_idx, int class_idx) { - const float *boxesPtr = boxes + batch_idx * boxesStrides[0]; - const float *scoresPtr = scores + batch_idx * scoresStrides[0] + class_idx * scoresStrides[1]; - - std::vector> sorted_boxes; // score, box_idx - for (size_t box_idx = 0; box_idx < numBoxes; box_idx++) { - if (scoresPtr[box_idx] > scoreThreshold) - sorted_boxes.emplace_back(std::make_pair(scoresPtr[box_idx], box_idx)); - } - - int io_selection_size = 0; - size_t sortedBoxSize = sorted_boxes.size(); - if (sortedBoxSize > 0) { - parallel_sort(sorted_boxes.begin(), sorted_boxes.end(), - [](const std::pair& l, const std::pair& r) { - return (l.first > r.first || ((l.first == r.first) && (l.second < r.second))); - }); - int offset = batch_idx*numClasses*maxOutputBoxesPerClass + class_idx*maxOutputBoxesPerClass; - filtBoxes[offset + 0] = filteredBoxes(sorted_boxes[0].first, batch_idx, class_idx, sorted_boxes[0].second); - 
io_selection_size++; - if (sortedBoxSize > 1) { - if (nms_kernel) { - std::vector boxCoord0(sortedBoxSize, 0.0f); - std::vector boxCoord1(sortedBoxSize, 0.0f); - std::vector boxCoord2(sortedBoxSize, 0.0f); - std::vector boxCoord3(sortedBoxSize, 0.0f); - - boxCoord0[0] = boxesPtr[sorted_boxes[0].second * 4]; - boxCoord1[0] = boxesPtr[sorted_boxes[0].second * 4 + 1]; - boxCoord2[0] = boxesPtr[sorted_boxes[0].second * 4 + 2]; - boxCoord3[0] = boxesPtr[sorted_boxes[0].second * 4 + 3]; - - auto arg = jit_nms_args(); - arg.iou_threshold = static_cast(&iouThreshold); - arg.score_threshold = static_cast(&scoreThreshold); - arg.scale = static_cast(&scale); - // box start index do not change for hard supresion - arg.selected_boxes_coord[0] = static_cast(&boxCoord0[0]); - arg.selected_boxes_coord[1] = static_cast(&boxCoord1[0]); - arg.selected_boxes_coord[2] = static_cast(&boxCoord2[0]); - arg.selected_boxes_coord[3] = static_cast(&boxCoord3[0]); - - for (size_t candidate_idx = 1; (candidate_idx < sortedBoxSize) && (io_selection_size < max_out_box); candidate_idx++) { - int candidateStatus = NMSCandidateStatus::SELECTED; // 0 for suppressed, 1 for selected - arg.selected_boxes_num = io_selection_size; - arg.candidate_box = static_cast(&boxesPtr[sorted_boxes[candidate_idx].second * 4]); - arg.candidate_status = static_cast(&candidateStatus); - (*nms_kernel)(&arg); - if (candidateStatus == NMSCandidateStatus::SELECTED) { - boxCoord0[io_selection_size] = boxesPtr[sorted_boxes[candidate_idx].second * 4]; - boxCoord1[io_selection_size] = boxesPtr[sorted_boxes[candidate_idx].second * 4 + 1]; - boxCoord2[io_selection_size] = boxesPtr[sorted_boxes[candidate_idx].second * 4 + 2]; - boxCoord3[io_selection_size] = boxesPtr[sorted_boxes[candidate_idx].second * 4 + 3]; - filtBoxes[offset + io_selection_size] = - filteredBoxes(sorted_boxes[candidate_idx].first, batch_idx, class_idx, sorted_boxes[candidate_idx].second); - io_selection_size++; - } - } - } else { - for (size_t candidate_idx = 
1; (candidate_idx < sortedBoxSize) && (io_selection_size < max_out_box); candidate_idx++) { - int candidateStatus = NMSCandidateStatus::SELECTED; // 0 for suppressed, 1 for selected - for (int selected_idx = io_selection_size - 1; selected_idx >= 0; selected_idx--) { - float iou = intersectionOverUnion(&boxesPtr[sorted_boxes[candidate_idx].second * 4], - &boxesPtr[filtBoxes[offset + selected_idx].box_index * 4]); - if (iou >= iouThreshold) { - candidateStatus = NMSCandidateStatus::SUPPRESSED; - break; - } - } - - if (candidateStatus == NMSCandidateStatus::SELECTED) { - filtBoxes[offset + io_selection_size] = - filteredBoxes(sorted_boxes[candidate_idx].first, batch_idx, class_idx, sorted_boxes[candidate_idx].second); - io_selection_size++; - } - } - } - } - } - - numFiltBox[batch_idx][class_idx] = io_selection_size; - }); -} - -void NonMaxSuppression::checkPrecision(const Precision& prec, const std::vector& precList, - const std::string& name, const std::string& type) { - if (std::find(precList.begin(), precList.end(), prec) == precList.end()) - IE_THROW() << errorPrefix << "has unsupported '" << name << "' " << type << " precision: " << prec; -} - -void NonMaxSuppression::check1DInput(const Shape& shape, const std::vector& precList, - const std::string& name, const size_t port) { - checkPrecision(getOriginalInputPrecisionAtPort(port), precList, name, inType); - +void NonMaxSuppression::check1DInput(const Shape& shape, const std::string& name, const size_t port) { if (shape.getRank() != 0 && shape.getRank() != 1) - IE_THROW() << errorPrefix << "has unsupported '" << name << "' input rank: " << shape.getRank(); + THROW_CPU_NODE_ERR("has unsupported '", name, "' input rank: ", shape.getRank()); if (shape.getRank() == 1) if (shape.getDims()[0] != 1) - IE_THROW() << errorPrefix << "has unsupported '" << name << "' input 1st dimension size: " << MemoryDescUtils::dim2str(shape.getDims()[0]); + THROW_CPU_NODE_ERR("has unsupported '", name, "' input 1st dimension size: ", 
MemoryDescUtils::dim2str(shape.getDims()[0])); } -void NonMaxSuppression::checkOutput(const Shape& shape, const std::vector& precList, - const std::string& name, const size_t port) { - checkPrecision(getOriginalOutputPrecisionAtPort(port), precList, name, outType); - +void NonMaxSuppression::checkOutput(const Shape& shape, const std::string& name, const size_t port) { if (shape.getRank() != 2) - IE_THROW() << errorPrefix << "has unsupported '" << name << "' output rank: " << shape.getRank(); + THROW_CPU_NODE_ERR("has unsupported '", name, "' output rank: ", shape.getRank()); if (shape.getDims()[1] != 3) - IE_THROW() << errorPrefix << "has unsupported '" << name << "' output 2nd dimension size: " << MemoryDescUtils::dim2str(shape.getDims()[1]); + THROW_CPU_NODE_ERR("has unsupported '", name, "' output 2nd dimension size: ", MemoryDescUtils::dim2str(shape.getDims()[1])); +} + +bool NonMaxSuppression::isExecutable() const { + return isDynamicNode() || Node::isExecutable(); +} + +bool NonMaxSuppression::created() const { + return getType() == Type::NonMaxSuppression; } } // namespace node diff --git a/src/plugins/intel_cpu/src/nodes/non_max_suppression.h b/src/plugins/intel_cpu/src/nodes/non_max_suppression.h index 2599fa3843f..6547737ef99 100644 --- a/src/plugins/intel_cpu/src/nodes/non_max_suppression.h +++ b/src/plugins/intel_cpu/src/nodes/non_max_suppression.h @@ -4,82 +4,43 @@ #pragma once -#include -#include -#include -#include -#include +#include "node.h" +#include "kernels/x64/non_max_suppression.hpp" -#define BOX_COORD_NUM 4 - -using namespace InferenceEngine; namespace ov { namespace intel_cpu { namespace node { -enum class NMSBoxEncodeType { - CORNER, - CENTER -}; - enum NMSCandidateStatus { SUPPRESSED = 0, SELECTED = 1, UPDATED = 2 }; -struct jit_nms_config_params { - NMSBoxEncodeType box_encode_type; - bool is_soft_suppressed_by_iou; -}; - -struct jit_nms_args { - const void* selected_boxes_coord[BOX_COORD_NUM]; - size_t selected_boxes_num; - const void* 
candidate_box; - const void* iou_threshold; - void* candidate_status; - // for soft suppression, score *= scale * iou * iou; - const void* score_threshold; - const void* scale; - void* score; -}; - -struct jit_uni_nms_kernel { - void (*ker_)(const jit_nms_args *); - - void operator()(const jit_nms_args *args) { - assert(ker_); - ker_(args); - } - - explicit jit_uni_nms_kernel(jit_nms_config_params jcp_) : ker_(nullptr), jcp(jcp_) {} - virtual ~jit_uni_nms_kernel() {} - - virtual void create_ker() = 0; - - jit_nms_config_params jcp; -}; - class NonMaxSuppression : public Node { public: - NonMaxSuppression(const std::shared_ptr& op, const GraphContext::CPtr context); + NonMaxSuppression(const std::shared_ptr& op, const GraphContext::CPtr& context); void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void execute(dnnl::stream strm) override; + + void executeDynamicImpl(dnnl::stream strm) override; + bool created() const override; - static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; - struct filteredBoxes { + struct FilteredBox { float score; int batch_index; int class_index; int box_index; - filteredBoxes() = default; - filteredBoxes(float _score, int _batch_index, int _class_index, int _box_index) : + FilteredBox() = default; + FilteredBox(float _score, int _batch_index, int _class_index, int _box_index) : score(_score), batch_index(_batch_index), class_index(_class_index), box_index(_box_index) {} }; @@ -89,66 +50,101 @@ public: int suppress_begin_index; }; - float intersectionOverUnion(const float *boxesI, const float *boxesJ); - - void nmsWithSoftSigma(const float *boxes, const float *scores, const SizeVector &boxesStrides, - const SizeVector &scoresStrides, std::vector &filtBoxes); - - void nmsWithoutSoftSigma(const float *boxes, const float *scores, const SizeVector 
&boxesStrides, - const SizeVector &scoresStrides, std::vector &filtBoxes); - - void executeDynamicImpl(dnnl::stream strm) override; - bool isExecutable() const override; + bool needShapeInfer() const override { return false; } + void prepareParams() override; + struct Point2D { + float x, y; + Point2D(const float px = 0.f, const float py = 0.f) : x(px), y(py) {} + Point2D operator+(const Point2D& p) const { + return Point2D(x + p.x, y + p.y); + } + Point2D& operator+=(const Point2D& p) { + x += p.x; + y += p.y; + return *this; + } + Point2D operator-(const Point2D& p) const { + return Point2D(x - p.x, y - p.y); + } + Point2D operator*(const float coeff) const { + return Point2D(x * coeff, y * coeff); + } + }; + private: // input enum { NMS_BOXES, NMS_SCORES, - NMS_MAXOUTPUTBOXESPERCLASS, - NMS_IOUTHRESHOLD, - NMS_SCORETHRESHOLD, - NMS_SOFTNMSSIGMA, + NMS_MAX_OUTPUT_BOXES_PER_CLASS, + NMS_IOU_THRESHOLD, + NMS_SCORE_THRESHOLD, + NMS_SOFT_NMS_SIGMA, }; // output enum { - NMS_SELECTEDINDICES, - NMS_SELECTEDSCORES, - NMS_VALIDOUTPUTS + NMS_SELECTED_INDICES, + NMS_SELECTED_SCORES, + NMS_VALID_OUTPUTS }; - NMSBoxEncodeType boxEncodingType = NMSBoxEncodeType::CORNER; - bool sortResultDescending = true; + float intersectionOverUnion(const float *boxesI, const float *boxesJ); - size_t numBatches = 0; - size_t numBoxes = 0; - size_t numClasses = 0; + float rotatedIntersectionOverUnion(const Point2D (&vertices_0)[4], const float area_0, const float* box_1); - size_t maxOutputBoxesPerClass = 0lu; - float iouThreshold = 0.0f; - float scoreThreshold = 0.0f; - float softNMSSigma = 0.0f; - float scale = 1.f; - // control placeholder for NMS in new opset. 
- bool isSoftSuppressedByIOU = false; + void nmsWithSoftSigma(const float *boxes, const float *scores, const InferenceEngine::SizeVector &boxesStrides, + const InferenceEngine::SizeVector &scoresStrides, std::vector &filtBoxes); - bool m_outStaticShape = false; + void nmsWithoutSoftSigma(const float *boxes, const float *scores, const InferenceEngine::SizeVector &boxesStrides, + const InferenceEngine::SizeVector &scoresStrides, std::vector &filtBoxes); - std::string errorPrefix; + void nmsRotated(const float *boxes, const float *scores, const InferenceEngine::SizeVector &boxesStrides, + const InferenceEngine::SizeVector &scoresStrides, std::vector &filtBoxes); - std::vector> numFiltBox; - const std::string inType = "input", outType = "output"; + void check1DInput(const Shape& shape, + const std::string& name, + const size_t port); - void checkPrecision(const Precision& prec, const std::vector& precList, const std::string& name, const std::string& type); - void check1DInput(const Shape& shape, const std::vector& precList, const std::string& name, const size_t port); - void checkOutput(const Shape& shape, const std::vector& precList, const std::string& name, const size_t port); + void checkOutput(const Shape& shape, + const std::string& name, + const size_t port); void createJitKernel(); - std::shared_ptr nms_kernel = nullptr; + + + NMSBoxEncodeType boxEncodingType = NMSBoxEncodeType::CORNER; + bool m_sort_result_descending = true; + bool m_clockwise = false; + bool m_rotated_boxes = false; + size_t m_coord_num = 1lu; + + size_t m_batches_num = 0lu; + size_t m_boxes_num = 0lu; + size_t m_classes_num = 0lu; + + size_t m_max_output_boxes_per_class = 0lu; // Original value of input NMS_MAX_OUTPUT_BOXES_PER_CLASS + size_t m_output_boxes_per_class = 0lu; // Actual number of output boxes + float m_iou_threshold = 0.f; + float m_score_threshold = 0.f; + float m_soft_nms_sigma = 0.f; + float m_scale = 0.f; + // control placeholder for NMS in new opset. 
+ bool m_is_soft_suppressed_by_iou = false; + + bool m_out_static_shape = false; + + std::vector> m_num_filtered_boxes; + const std::string inType = "input"; + const std::string outType = "output"; + bool m_defined_outputs[NMS_VALID_OUTPUTS + 1] = { false, false, false }; + std::vector m_filtered_boxes; + + std::shared_ptr m_jit_kernel; }; } // namespace node diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp index 21483175aed..274d23ce23b 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp @@ -197,6 +197,8 @@ std::vector disabledTestPatterns() { R"(.*RDFTLayerTest.*SignalSize=().*)", // Issue: 123815 (Tests are sensintive to available thread count on testing machines) R"(.*smoke_Snippets_MHA_.?D_SplitDimensionM.*)", + // Issue: 122356 + R"(.*NmsRotatedOpTest.*(SortDesc=True|Clockwise=False).*)", }; #if defined(OPENVINO_ARCH_X86) diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/common/nms_rotated.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/common/nms_rotated.cpp new file mode 100644 index 00000000000..7888a88a602 --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/instances/common/nms_rotated.cpp @@ -0,0 +1,95 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "single_op_tests/nms_rotated.hpp" + +using namespace LayerTestsDefinitions; +using namespace ov::test; + + +static const std::vector> input_shapes = { + { + { {}, {{1, 5, 5}} }, + { {}, {{1, 7, 5}} } + }, + { + { {}, {{2, 9, 5}} }, + { {}, {{2, 15, 9}} } + }, + { + { {}, {{5, 17, 5}} }, + { {}, {{5, 7, 17}} } + }, + { + { {}, {{9, 75, 5}} }, + { {}, {{9, 55, 75}} } + }, + { + { {-1, -1, 5}, {{5, 20, 5}, {3, 
50, 5}, {2, 99, 5}} }, + { {-1, -1, -1}, {{5, 30, 20}, {3, 100, 50}, {2, 133, 99}} } + } +}; + +static const std::vector> input_shapes_nightly = { + { + { {}, {{3, 11, 5}} }, + { {}, {{3, 15, 11}} } + }, + { + { {}, {{15, 29, 5}} }, + { {}, {{15, 31, 29}} } + }, + { + { {}, {{21, 64, 5}} }, + { {}, {{21, 32, 64}} } + }, + { + { {-1, -1, 5}, {{7, 35, 5}, {7, 35, 5}, {7, 35, 5}} }, + { {-1, -1, -1}, {{7, 30, 35}, {7, 100, 35}, {7, 133, 35}} } + } +}; + +const ov::AnyMap empty_plugin_config{}; + +INSTANTIATE_TEST_SUITE_P(smoke_, NmsRotatedOpTest, + ::testing::Combine( + ::testing::ValuesIn(input_shapes), // Input shapes + ::testing::Values(ElementType::f32), // Boxes and scores input precisions + ::testing::Values(ElementType::i32), // Max output boxes input precisions + ::testing::Values(ElementType::f32), // Thresholds precisions + ::testing::Values(ElementType::i32), // Output type + ::testing::Values(5, 20), // Max output boxes per class + ::testing::Values(0.3f, 0.7f), // IOU threshold + ::testing::Values(0.3f, 0.7f), // Score threshold + ::testing::Values(true, false), // Sort result descending + ::testing::Values(true, false), // Clockwise + ::testing::Values(false), // Is 1st input constant + ::testing::Values(false), // Is 2nd input constant + ::testing::Values(false), // Is 3rd input constant + ::testing::Values(false), // Is 4th input constant + ::testing::Values(false), // Is 5th input constant + ::testing::Values(empty_plugin_config), // Additional plugin configuration + ::testing::Values(utils::DEVICE_CPU)), // Device name + NmsRotatedOpTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(nightly_, NmsRotatedOpTest, + ::testing::Combine( + ::testing::ValuesIn(input_shapes_nightly), + ::testing::Values(ElementType::f16, ElementType::bf16), + ::testing::Values(ElementType::i64), + ::testing::Values(ElementType::f16, ElementType::bf16), + ::testing::Values(ElementType::i64), + ::testing::Values(10), + ::testing::Values(0.5f), + ::testing::Values(0.4f), + 
::testing::Values(true, false), + ::testing::Values(true, false), + ::testing::Values(true, false), + ::testing::Values(true, false), + ::testing::Values(true, false), + ::testing::Values(true, false), + ::testing::Values(true, false), + ::testing::Values(empty_plugin_config), + ::testing::Values(utils::DEVICE_CPU)), + NmsRotatedOpTest::getTestCaseName); diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/non_max_suppression.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/non_max_suppression.cpp index 072b481dd3c..a43b208ad97 100644 --- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/non_max_suppression.cpp +++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/non_max_suppression.cpp @@ -43,9 +43,9 @@ using NmsParams = std::tuple; // Device name class NmsLayerCPUTest : public testing::WithParamInterface, virtual public SubgraphBaseTest, public CPUTestsBase { @@ -57,9 +57,9 @@ public: ngraph::helpers::InputLayerType maxOutBoxesType; ThresholdValues thrValues; float iouThr, scoreThr, softNmsSigma; - op::v9::NonMaxSuppression::BoxEncodingType boxEncoding; + ov::op::v9::NonMaxSuppression::BoxEncodingType boxEncoding; bool sortResDescend; - element::Type outType; + ElementType outType; std::string targetDevice; std::tie(inShapeParams, inPrecisions, maxOutBoxesPerClass, thrValues, maxOutBoxesType, boxEncoding, sortResDescend, outType, targetDevice) = obj.param; @@ -115,12 +115,12 @@ protected: ThresholdValues thrValues; ngraph::helpers::InputLayerType maxOutBoxesType; float iouThr, scoreThr, softNmsSigma; - op::v9::NonMaxSuppression::BoxEncodingType boxEncoding; + ov::op::v9::NonMaxSuppression::BoxEncodingType boxEncoding; bool sortResDescend; - element::Type outType; + ElementType outType; std::tie(inShapeParams, inPrecisions, maxOutBoxesPerClass, thrValues, maxOutBoxesType, boxEncoding, sortResDescend, outType, targetDevice) = this->GetParam(); - element::Type paramsPrec, maxBoxPrec, thrPrec; + ElementType 
paramsPrec, maxBoxPrec, thrPrec; std::tie(paramsPrec, maxBoxPrec, thrPrec) = inPrecisions; std::tie(iouThr, scoreThr, softNmsSigma) = thrValues; @@ -156,7 +156,7 @@ protected: if (maxOutBoxesType == ngraph::helpers::InputLayerType::PARAMETER) { inputDynamicShapes.push_back(ngraph::PartialShape{1}); - params.push_back(std::make_shared(element::Type_t::i32, inputDynamicShapes.back())); + params.push_back(std::make_shared(ElementType::i32, inputDynamicShapes.back())); params[1]->set_friendly_name("param_3"); maxOutBoxesPerClassNode = params.back(); } else { @@ -166,7 +166,7 @@ protected: auto iouThrNode = builder::makeConstant(thrPrec, ngraph::Shape{}, std::vector{iouThr})->output(0); auto scoreThrNode = builder::makeConstant(thrPrec, ngraph::Shape{}, std::vector{scoreThr})->output(0); auto softNmsSigmaNode = builder::makeConstant(thrPrec, ngraph::Shape{}, std::vector{softNmsSigma})->output(0); - auto nms = std::make_shared(params[0], params[1], maxOutBoxesPerClassNode, iouThrNode, scoreThrNode, + auto nms = std::make_shared(params[0], params[1], maxOutBoxesPerClassNode, iouThrNode, scoreThrNode, softNmsSigmaNode, boxEncoding, sortResDescend, outType); function = makeNgraphFunction(paramsPrec, params, nms, "NMS"); @@ -276,7 +276,7 @@ private: expectedList.resize(selected_indices_size); - if (indeces_iter->get_element_type() == ov::element::i32) { + if (indeces_iter->get_element_type() == ElementType::i32) { auto selected_indices_data = indeces_iter->data(); for (size_t i = 0; i < selected_indices_size; i += 3) { @@ -296,7 +296,7 @@ private: } } - if (scores_iter->get_element_type() == ov::element::f32) { + if (scores_iter->get_element_type() == ElementType::f32) { auto selected_scores_data = scores_iter->data(); for (size_t i = 0; i < selected_scores_size; i += 3) { expectedList[i/3].score = selected_scores_data[i+2]; @@ -319,7 +319,7 @@ private: size_t selected_indices_size = indeces_iter->get_size(); const auto selected_scores_data = scores_iter->data(); - if 
(indeces_iter->get_element_type() == ov::element::i32) { + if (indeces_iter->get_element_type() == ElementType::i32) { const auto selected_indices_data = indeces_iter->data(); for (size_t i = 0; i < selected_indices_size; i += 3) { const int32_t batchId = selected_indices_data[i+0]; @@ -415,10 +415,10 @@ const std::vector inShapeParams = { const std::vector maxOutBoxPerClass = {5, 20}; const std::vector threshold = {0.3f, 0.7f}; const std::vector sigmaThreshold = {0.0f, 0.5f}; -const std::vector encodType = {op::v9::NonMaxSuppression::BoxEncodingType::CENTER, - op::v9::NonMaxSuppression::BoxEncodingType::CORNER}; +const std::vector encodType = {ov::op::v9::NonMaxSuppression::BoxEncodingType::CENTER, + ov::op::v9::NonMaxSuppression::BoxEncodingType::CORNER}; const std::vector sortResDesc = {true, false}; -const std::vector outType = {element::i32, element::i64}; +const std::vector outType = {ElementType::i32, ElementType::i64}; const std::vector maxBoxInputTypes = {ngraph::helpers::InputLayerType::PARAMETER, ngraph::helpers::InputLayerType::CONSTANT}; const auto nmsParams = ::testing::Combine(::testing::ValuesIn(inShapeParams), diff --git a/src/tests/functional/plugin/shared/include/single_op_tests/nms_rotated.hpp b/src/tests/functional/plugin/shared/include/single_op_tests/nms_rotated.hpp new file mode 100644 index 00000000000..e0b83a31866 --- /dev/null +++ b/src/tests/functional/plugin/shared/include/single_op_tests/nms_rotated.hpp @@ -0,0 +1,15 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "shared_test_classes/single_op/nms_rotated.hpp" + +namespace LayerTestsDefinitions { + +TEST_P(NmsRotatedOpTest, CompareWithRefs) { + run(); +}; + +} // namespace LayerTestsDefinitions diff --git a/src/tests/functional/shared_test_classes/include/shared_test_classes/single_op/nms_rotated.hpp b/src/tests/functional/shared_test_classes/include/shared_test_classes/single_op/nms_rotated.hpp new file mode 
100644 index 00000000000..ec7b5a32ec3 --- /dev/null +++ b/src/tests/functional/shared_test_classes/include/shared_test_classes/single_op/nms_rotated.hpp @@ -0,0 +1,47 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "shared_test_classes/base/ov_subgraph.hpp" + +namespace LayerTestsDefinitions { + +typedef std::tuple< + std::vector, // Input shapes + ov::test::ElementType, // Boxes and scores input precisions + ov::test::ElementType, // Max output boxes input precisions + ov::test::ElementType, // Thresholds precisions + ov::test::ElementType, // Output type + int64_t, // Max output boxes per class + float, // IOU threshold + float, // Score threshold + bool, // Sort result descending + bool, // Clockwise + bool, // Is 1st input constant + bool, // Is 2nd input constant + bool, // Is 3rd input constant + bool, // Is 4th input constant + bool, // Is 5th input constant + ov::AnyMap, // Additional configuration + std::string // Device name +> NmsRotatedParams; + +class NmsRotatedOpTest : public testing::WithParamInterface, + public ov::test::SubgraphBaseTest { +public: + static std::string getTestCaseName(const testing::TestParamInfo& obj); + +protected: + void SetUp() override; + + void generate_inputs(const std::vector& target_shapes) override; + +private: + int64_t m_max_out_boxes_per_class; + float m_iou_threshold; + float m_score_threshold; +}; + +} // namespace LayerTestsDefinitions diff --git a/src/tests/functional/shared_test_classes/src/single_op/nms_rotated.cpp b/src/tests/functional/shared_test_classes/src/single_op/nms_rotated.cpp new file mode 100644 index 00000000000..c6c9e210633 --- /dev/null +++ b/src/tests/functional/shared_test_classes/src/single_op/nms_rotated.cpp @@ -0,0 +1,207 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "shared_test_classes/single_op/nms_rotated.hpp" +#include "ov_models/builders.hpp" +#include 
"common_test_utils/data_utils.hpp" +#include "openvino/op/nms_rotated.hpp" + +using namespace ov::test; + +namespace LayerTestsDefinitions { + +std::string NmsRotatedOpTest::getTestCaseName(const testing::TestParamInfo& obj) { + const auto& in_shapes = std::get<0>(obj.param); + + std::ostringstream result; + + result << "IS=("; + for (size_t i = 0lu; i < in_shapes.size(); i++) { + result << utils::partialShape2str({in_shapes[i].first}) << (i < in_shapes.size() - 1lu ? "_" : ""); + } + result << ")_TS="; + for (size_t i = 0lu; i < in_shapes.front().second.size(); i++) { + result << "{"; + for (size_t j = 0lu; j < in_shapes.size(); j++) { + result << utils::vec2str(in_shapes[j].second[i]) << (j < in_shapes.size() - 1lu ? "_" : ""); + } + result << "}_"; + } + result << "_BoxPrc=" << std::get<1>(obj.param); + result << "_MaxPrc=" << std::get<2>(obj.param); + result << "_ThrPrc=" << std::get<3>(obj.param); + result << "_OutPrc=" << std::get<4>(obj.param); + result << "_MaxBox=" << std::get<5>(obj.param); + result << "_IouThr=" << std::get<6>(obj.param); + result << "_ScoreThr=" << std::get<7>(obj.param); + result << "_SortDesc=" << utils::bool2str(std::get<8>(obj.param)); + result << "_Clockwise=" << utils::bool2str(std::get<9>(obj.param)); + result << "_ConstIn={" << utils::bool2str(std::get<10>(obj.param)) << "," + << utils::bool2str(std::get<11>(obj.param)) << "," + << utils::bool2str(std::get<12>(obj.param)) << "," + << utils::bool2str(std::get<13>(obj.param)) << "," + << utils::bool2str(std::get<14>(obj.param)) << "}"; + + const auto& config = std::get<15>(obj.param); + if (!config.empty()) { + result << "_Config={"; + for (const auto& conf_item : config) { + result << "_" << conf_item.first << "="; + conf_item.second.print(result); + } + result << "}"; + } + + result << "_Device=" << std::get<16>(obj.param); + + return result.str(); +} + +void NmsRotatedOpTest::SetUp() { + const auto& params = this->GetParam(); + const auto& in_shapes = std::get<0>(params); + 
const auto& boxes_prc = std::get<1>(params); + const auto& max_boxes_prc = std::get<2>(params); + const auto& thresholds_prc = std::get<3>(params); + const auto& out_prc = std::get<4>(params); + m_max_out_boxes_per_class = std::get<5>(params); + m_iou_threshold = std::get<6>(params); + m_score_threshold = std::get<7>(params); + const auto& sort_descending = std::get<8>(params); + const auto& clockwise = std::get<9>(params); + const auto& is_0_in_const = std::get<10>(params); + const auto& is_1_in_const = std::get<11>(params); + const auto& is_2_in_const = std::get<12>(params); + const auto& is_3_in_const = std::get<13>(params); + const auto& is_4_in_const = std::get<14>(params); + configuration = std::get<15>(params); + targetDevice = std::get<16>(params); + + std::vector actual_shapes; + ov::ParameterVector in_params; + std::vector> inputs; + const auto in_shape_1d = InputShape{{1}, {{1}}}; + +#define CONST_CASE(P, S, H, L) \ + case P: \ + inputs.push_back(ngraph::builder::makeConstant(P, S, std::vector::value_type>{}, true, \ + ov::element_type_traits

::value_type(H), ov::element_type_traits

::value_type(L))); \ + break; + +#define CREATE_INPUT(C, P, S, N, H, L) \ + if (C) { \ + switch (P) { \ + CONST_CASE(ElementType::f32, S.second[0], H, L) \ + CONST_CASE(ElementType::f16, S.second[0], H, L) \ + CONST_CASE(ElementType::bf16, S.second[0], H, L) \ + CONST_CASE(ElementType::i32, S.second[0], H, L) \ + CONST_CASE(ElementType::i64, S.second[0], H, L) \ + default: OPENVINO_THROW("NmsRotated does not support precision ", P, " for the ", N, " input."); \ + } \ + } else { \ + actual_shapes.push_back(S); \ + if (S.first.rank() == 0) { \ + in_params.push_back(std::make_shared(P, S.second.front())); \ + } else { \ + in_params.push_back(std::make_shared(P, S.first)); \ + } \ + in_params.back()->set_friendly_name(N); \ + inputs.push_back(in_params.back()); \ + } + + CREATE_INPUT(is_0_in_const, boxes_prc, in_shapes[0], "Boxes", 30, 10) + CREATE_INPUT(is_1_in_const, boxes_prc, in_shapes[1], "Scores", 1, 0) + CREATE_INPUT(is_2_in_const, max_boxes_prc, in_shape_1d, "MaxOutputBoxesPerClass", m_max_out_boxes_per_class, m_max_out_boxes_per_class) + CREATE_INPUT(is_3_in_const, thresholds_prc, in_shape_1d, "IouThreshold", m_iou_threshold, m_iou_threshold) + CREATE_INPUT(is_4_in_const, thresholds_prc, in_shape_1d, "ScoreThreshold", m_score_threshold, m_score_threshold) + +#undef CONST_CASE +#undef CREATE_INPUT + + init_input_shapes(actual_shapes); + + const auto nms_op = std::make_shared(inputs[0], inputs[1], inputs[2], inputs[3], inputs[4], + sort_descending, out_prc, clockwise); + ov::ResultVector results; + for (size_t i = 0lu; i < nms_op->get_output_size(); i++) { + results.push_back(std::make_shared(nms_op->output(i))); + } + + function = std::make_shared(results, in_params, "NMSRotated"); +} + +template +void fill_data(TD* dst, const TS* src, size_t len) { + for (size_t i = 0llu; i < len; i++) { + dst[i] = static_cast(src[i]); + } +} + +void NmsRotatedOpTest::generate_inputs(const std::vector& targetInputStaticShapes) { + inputs.clear(); + const auto& func_inputs = 
function->inputs(); + + for (size_t i = 0llu; i < func_inputs.size(); ++i) { + const auto& func_input = func_inputs[i]; + const auto& name = func_input.get_node()->get_friendly_name(); + const auto& in_prc = func_input.get_element_type(); + auto tensor = ov::Tensor(in_prc, targetInputStaticShapes[i]); + +#define FILL_DATA(P, S, L) \ +case P : \ +fill_data(tensor.data::value_type>(), S, L); break; + +#define GEN_DATA(P, R, S, K) \ +case P : \ +utils::fill_data_random(tensor.data::value_type>(), shape_size(targetInputStaticShapes[i]), R, S, K); break; + + if (name == "Boxes") { + switch (in_prc) { + GEN_DATA(ElementType::f32, 30, 20, 1) + GEN_DATA(ElementType::f16, 30, 20, 1) + GEN_DATA(ElementType::bf16, 30, 20, 1) + default: + OPENVINO_THROW("NmsRotated does not support precision ", in_prc, " for the Boxes input."); + } + } else if (name == "Scores") { + switch (in_prc) { + GEN_DATA(ElementType::f32, 1, 0, 100) + GEN_DATA(ElementType::f16, 1, 0, 100) + GEN_DATA(ElementType::bf16, 1, 0, 100) + default: + OPENVINO_THROW("NmsRotated does not support precision ", in_prc, " for the Scores input."); + } + } else if (name == "MaxOutputBoxesPerClass") { + switch (in_prc) { + FILL_DATA(ElementType::i64, &m_max_out_boxes_per_class, 1) + FILL_DATA(ElementType::i32, &m_max_out_boxes_per_class, 1) + default: + OPENVINO_THROW("NmsRotated does not support precision ", in_prc, " for the MaxOutputBoxesPerClass input."); + } + } else if (name == "IouThreshold") { + switch (in_prc) { + FILL_DATA(ElementType::f32, &m_iou_threshold, 1) + FILL_DATA(ElementType::f16, &m_iou_threshold, 1) + FILL_DATA(ElementType::bf16, &m_iou_threshold, 1) + default: + OPENVINO_THROW("NmsRotated does not support precision ", in_prc, " for the IouThreshold input."); + } + } else if (name == "ScoreThreshold") { + switch (in_prc) { + FILL_DATA(ElementType::f32, &m_score_threshold, 1) + FILL_DATA(ElementType::f16, &m_score_threshold, 1) + FILL_DATA(ElementType::bf16, &m_score_threshold, 1) + default: + 
OPENVINO_THROW("NmsRotated does not support precision ", in_prc, " for the ScoreThreshold input."); + } + } + +#undef GEN_DATA +#undef FILL_DATA + + inputs.insert({func_input.get_node_shared_ptr(), tensor}); + } +} + +} // namespace LayerTestsDefinitions diff --git a/src/tests/test_utils/functional_test_utils/layer_tests_summary/skip_configs/CPU/expected_failures_OP.csv b/src/tests/test_utils/functional_test_utils/layer_tests_summary/skip_configs/CPU/expected_failures_OP.csv index fa91f28719a..51b03e9f335 100644 --- a/src/tests/test_utils/functional_test_utils/layer_tests_summary/skip_configs/CPU/expected_failures_OP.csv +++ b/src/tests/test_utils/functional_test_utils/layer_tests_summary/skip_configs/CPU/expected_failures_OP.csv @@ -1131,5 +1131,4 @@ conformance_RegionYolo/ReadIRTest.ImportExport/Op=RegionYolo.1_Type=f32_IR=Regio conformance_Add/ReadIRTest.ImportExport/Op=Add.1_Type=i32_IR=28f23780d4ca0d40671caf79d5cd9223ad8f6dc2fa5ade2521f3d99586eeeb7f_Device=CPU_Shape=static_Config=(),9.72615e-07 conformance_Convolution/ReadIRTest.Inference/Op=Convolution.1_Type=f32_IR=c301804445f273eef62f41f02204711d9d6e571da28c76ab447d7d90983b0032_Device=CPU_Shape=dynamic_Config=(),0.000113281 conformance/OpImplCheckTest.checkPluginImplementation/Function=Multinomial_opset13_Device=CPU_Config=(),1 -conformance/OpImplCheckTest.checkPluginImplementation/Function=NMSRotated_opset13_Device=CPU_Config=(),1 conformance/OpImplCheckTest.checkPluginImplementation/Function=LSTMSequence_opset1_Device=CPU_Config=(),1