From c6528ee4ea1dec39c76df18a4c92fd194636b817 Mon Sep 17 00:00:00 2001
From: Konstantin Beluchenko <kostiantyn.bieliuchenko@altran.com>
Date: Wed, 2 Nov 2022 16:16:57 +0200
Subject: [PATCH] [GPU] Matrix NMS (#13137)

---
 .../convert_matrix_nms_to_matrix_nms_ie.cpp   |   5 +-
 .../intel_gpu/plugin/primitives_list.hpp      |   1 +
 .../intel_gpu/primitives/matrix_nms.hpp       | 154 ++++
 .../src/graph/impls/ocl/matrix_nms.cpp        | 117 +++
 .../src/graph/impls/ocl/register.cpp          |   1 +
 .../src/graph/impls/ocl/register.hpp          |   1 +
 .../src/graph/include/matrix_nms_inst.h       |  43 ++
 .../intel_gpu/src/graph/matrix_nms.cpp        |  88 +++
 src/plugins/intel_gpu/src/graph/program.cpp   |   3 +
 .../cl_kernels/matrix_nms_ref.cl              | 333 +++++++++
 .../src/kernel_selector/common_types.h        |   1 +
 .../kernel_selector_common.cpp                |   1 +
 .../matrix_nms/matrix_nms_kernel_ref.cpp      | 196 +++++
 .../matrix_nms/matrix_nms_kernel_ref.h        |  67 ++
 .../matrix_nms/matrix_nms_kernel_selector.cpp |  18 +
 .../matrix_nms/matrix_nms_kernel_selector.h   |  21 +
 .../intel_gpu/src/plugin/ops/matrix_nms.cpp   |  83 +++
 .../src/plugin/transformations_pipeline.cpp   |   2 +
 .../tests/test_cases/matrix_nms_gpu_test.cpp  | 673 ++++++++++++++++++
 .../single_layer_tests/matrix_nms.cpp         |  48 ++
 .../src/single_layer/matrix_nms.cpp           |   5 +-
 21 files changed, 1858 insertions(+), 3 deletions(-)
 create mode 100644 src/plugins/intel_gpu/include/intel_gpu/primitives/matrix_nms.hpp
 create mode 100644 src/plugins/intel_gpu/src/graph/impls/ocl/matrix_nms.cpp
 create mode 100644 src/plugins/intel_gpu/src/graph/include/matrix_nms_inst.h
 create mode 100644 src/plugins/intel_gpu/src/graph/matrix_nms.cpp
 create mode 100644 src/plugins/intel_gpu/src/kernel_selector/cl_kernels/matrix_nms_ref.cl
 create mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/matrix_nms/matrix_nms_kernel_ref.cpp
 create mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/matrix_nms/matrix_nms_kernel_ref.h
 create mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/matrix_nms/matrix_nms_kernel_selector.cpp
 create mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/matrix_nms/matrix_nms_kernel_selector.h
 create mode 100644 src/plugins/intel_gpu/src/plugin/ops/matrix_nms.cpp
 create mode 100644 src/plugins/intel_gpu/tests/test_cases/matrix_nms_gpu_test.cpp
 create mode 100644 src/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/matrix_nms.cpp

diff --git a/src/common/transformations/src/transformations/op_conversions/convert_matrix_nms_to_matrix_nms_ie.cpp b/src/common/transformations/src/transformations/op_conversions/convert_matrix_nms_to_matrix_nms_ie.cpp
index a84dd3886af..9f0d98f2143 100644
--- a/src/common/transformations/src/transformations/op_conversions/convert_matrix_nms_to_matrix_nms_ie.cpp
+++ b/src/common/transformations/src/transformations/op_conversions/convert_matrix_nms_to_matrix_nms_ie.cpp
@@ -14,6 +14,7 @@
 
 #include "itt.hpp"
 #include "ngraph_ops/nms_static_shape_ie.hpp"
+#include "transformations/utils/utils.hpp"
 
 ngraph::pass::ConvertMatrixNmsToMatrixNmsIE::ConvertMatrixNmsToMatrixNmsIE(bool force_i32_output_type) {
     MATCHER_SCOPE(ConvertMatrixNmsToMatrixNmsIE);
@@ -46,13 +47,13 @@ ngraph::pass::ConvertMatrixNmsToMatrixNmsIE::ConvertMatrixNmsToMatrixNmsIE(bool
 
         if (nms->output(1).get_element_type() != output_1.get_element_type()) {
             output_1 = std::make_shared<opset1::Convert>(output_1, nms->output(1).get_element_type());
-            output_1.get_node_shared_ptr()->set_friendly_name(nms->get_friendly_name() + "/convert.1");
+            output_1.get_node_shared_ptr()->set_friendly_name(op::util::create_ie_output_name(nms->output(1)));
             new_ops.emplace_back(output_1.get_node_shared_ptr());
         }
 
         if (nms->output(2).get_element_type() != output_2.get_element_type()) {
             output_2 = std::make_shared<opset1::Convert>(output_2, nms->output(2).get_element_type());
-            output_2.get_node_shared_ptr()->set_friendly_name(nms->get_friendly_name() + "/convert.2");
+            output_2.get_node_shared_ptr()->set_friendly_name(op::util::create_ie_output_name(nms->output(2)));
             new_ops.emplace_back(output_2.get_node_shared_ptr());
         }
 
diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp
index f2a7c54b292..6e1fa01dae3 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp
@@ -247,3 +247,4 @@ REGISTER_FACTORY(v9, Eye);
 // --------------------------- Supported internal ops --------------------------- //
 REGISTER_FACTORY(internal, NonMaxSuppressionIEInternal);
 REGISTER_FACTORY(internal, GenerateProposalsIEInternal);
+REGISTER_FACTORY(internal, NmsStaticShapeIE8);
diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/matrix_nms.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/matrix_nms.hpp
new file mode 100644
index 00000000000..d74d8a62295
--- /dev/null
+++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/matrix_nms.hpp
@@ -0,0 +1,154 @@
+// Copyright (C) 2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+#pragma once
+#include <vector>
+
+#include "ngraph/op/matrix_nms.hpp"
+#include "primitive.hpp"
+
+namespace cldnn {
+/// @addtogroup cpp_api C++ API
+/// @{
+/// @addtogroup cpp_topology Network Topology
+/// @{
+/// @addtogroup cpp_primitives Primitives
+/// @{
+
+/// @brief Performs matrix nms of input boxes and returns indices of selected boxes.
+struct matrix_nms : public primitive_base<matrix_nms> {
+    CLDNN_DECLARE_PRIMITIVE(matrix_nms)
+
+    enum decay_function { gaussian, linear };
+
+    enum sort_result_type {
+        class_id,  // sort selected boxes by class id (ascending) in each batch element
+        score,     // sort selected boxes by score (descending) in each batch element
+        none       // do not guarantee the order in each batch element
+    };
+
+    /// \brief Structure that specifies attributes of the operation
+    struct attributes {
+        // specifies order of output elements
+        sort_result_type sort_type = sort_result_type::none;
+        // specifies whenever it is necessary to sort selected boxes across batches or not
+        bool sort_result_across_batch = false;
+        // specifies minimum score to consider box for the processing
+        float score_threshold = 0.0f;
+        // specifies maximum number of boxes to be selected per class, -1 meaning to
+        // keep all boxes
+        int nms_top_k = -1;
+        // specifies maximum number of boxes to be selected per batch element, -1
+        // meaning to keep all boxes
+        int keep_top_k = -1;
+        // specifies the background class id, -1 meaning to keep all classes
+        int background_class = -1;
+        // specifies decay function used to decay scores
+        decay_function decay = decay_function::linear;
+        // specifies gaussian_sigma parameter for gaussian decay_function
+        float gaussian_sigma = 2.0f;
+        // specifies threshold to filter out boxes with low confidence score after
+        // decaying
+        float post_threshold = 0.0f;
+        // specifies whether boxes are normalized or not
+        bool normalized = true;
+
+        attributes() {}
+
+        attributes(const ngraph::op::v8::MatrixNms::Attributes& attrs)
+            : attributes(from(attrs.sort_result_type),
+                         attrs.sort_result_across_batch,
+                         attrs.score_threshold,
+                         attrs.nms_top_k,
+                         attrs.keep_top_k,
+                         attrs.background_class,
+                         from(attrs.decay_function),
+                         attrs.gaussian_sigma,
+                         attrs.post_threshold,
+                         attrs.normalized) {}
+
+        attributes(sort_result_type sort_type,
+                   bool sort_result_across_batch,
+                   float score_threshold,
+                   int nms_top_k,
+                   int keep_top_k,
+                   int background_class,
+                   decay_function decay,
+                   float gaussian_sigma,
+                   float post_threshold,
+                   bool normalized)
+            : sort_type(sort_type),
+              sort_result_across_batch(sort_result_across_batch),
+              score_threshold(score_threshold),
+              nms_top_k(nms_top_k),
+              keep_top_k(keep_top_k),
+              background_class(background_class),
+              decay(decay),
+              gaussian_sigma(gaussian_sigma),
+              post_threshold(post_threshold),
+              normalized(normalized) {}
+    };
+
+    /// @brief Constructs matrix_nms primitive.
+    /// @param id This primitive id.
+    /// @param boxes primitive id.
+    /// @param scores primitive id.
+    /// @param second_output primitive id.
+    /// @param third_output primitive id.
+    /// @param attrs attributes.
+    matrix_nms(const primitive_id& id,
+               const primitive_id& boxes,
+               const primitive_id& scores,
+               const primitive_id& second_output,
+               const primitive_id& third_output,
+               const matrix_nms::attributes& attrs)
+        : primitive_base(id, {boxes, scores, second_output, third_output}),
+          attribs(attrs) {}
+
+    /// @brief Constructs matrix_nms primitive.
+    /// @param id This primitive id.
+    /// @param boxes primitive id.
+    /// @param scores primitive id.
+    /// @param second_output primitive id.
+    /// @param third_output primitive id.
+    /// @param attrs operation attributes.
+    matrix_nms(const primitive_id& id,
+               const primitive_id& boxes,
+               const primitive_id& scores,
+               const primitive_id& second_output,
+               const primitive_id& third_output,
+               const ngraph::op::v8::MatrixNms::Attributes& attrs)
+        : primitive_base(id, {boxes, scores, second_output, third_output}),
+          attribs(attrs) {}
+
+    attributes attribs;
+
+private:
+    static cldnn::matrix_nms::decay_function from(ngraph::op::v8::MatrixNms::DecayFunction decay) {
+        switch (decay) {
+        case ngraph::op::v8::MatrixNms::DecayFunction::GAUSSIAN:
+            return cldnn::matrix_nms::decay_function::gaussian;
+        case ngraph::op::v8::MatrixNms::DecayFunction::LINEAR:
+        default:
+            return cldnn::matrix_nms::decay_function::linear;
+        }
+    }
+
+    static cldnn::matrix_nms::sort_result_type from(ngraph::op::v8::MatrixNms::SortResultType type) {
+        switch (type) {
+        case ngraph::op::v8::MatrixNms::SortResultType::CLASSID:
+            return cldnn::matrix_nms::sort_result_type::class_id;
+        case ngraph::op::v8::MatrixNms::SortResultType::SCORE:
+            return cldnn::matrix_nms::sort_result_type::score;
+        case ngraph::op::v8::MatrixNms::SortResultType::NONE:
+        default:
+            return cldnn::matrix_nms::sort_result_type::none;
+        }
+    }
+};
+/// @}
+/// @}
+/// @}
+}  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/matrix_nms.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/matrix_nms.cpp
new file mode 100644
index 00000000000..23d1b42aa4a
--- /dev/null
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/matrix_nms.cpp
@@ -0,0 +1,117 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "data_inst.h"
+#include "impls/implementation_map.hpp"
+#include "kernel_selector_helper.h"
+#include "matrix_nms/matrix_nms_kernel_ref.h"
+#include "matrix_nms/matrix_nms_kernel_selector.h"
+#include "matrix_nms_inst.h"
+#include "primitive_base.hpp"
+
+namespace cldnn {
+namespace ocl {
+
+namespace {
+kernel_selector::matrix_nms_params::decay_function from(matrix_nms::decay_function decay) {
+    switch (decay) {
+    case matrix_nms::decay_function::gaussian:
+        return kernel_selector::matrix_nms_params::decay_function::GAUSSIAN;
+    default:
+    case matrix_nms::decay_function::linear:
+        return kernel_selector::matrix_nms_params::decay_function::LINEAR;
+    }
+}
+
+kernel_selector::matrix_nms_params::sort_result_type from(matrix_nms::sort_result_type type) {
+    switch (type) {
+    case matrix_nms::sort_result_type::class_id:
+        return kernel_selector::matrix_nms_params::sort_result_type::CLASS_ID;
+    case matrix_nms::sort_result_type::score:
+        return kernel_selector::matrix_nms_params::sort_result_type::SCORE;
+    default:
+    case matrix_nms::sort_result_type::none:
+        return kernel_selector::matrix_nms_params::sort_result_type::NONE;
+    }
+}
+}  // namespace
+
+struct matrix_nms_impl : typed_primitive_impl_ocl<matrix_nms> {
+    using parent = typed_primitive_impl_ocl<matrix_nms>;
+    using parent::parent;
+
+    std::unique_ptr<primitive_impl> clone() const override {
+        return make_unique<matrix_nms_impl>(*this);
+    }
+
+protected:
+    kernel_arguments_data get_arguments(matrix_nms_inst& instance, int32_t) const override {
+        kernel_arguments_data args;
+        args.inputs = {instance.input_boxes_mem(),
+                       instance.input_scores_mem(),
+                       instance.input_selected_boxes_mem(),
+                       instance.input_valid_outputs_mem()};
+        args.outputs = {instance.output_memory_ptr()};
+
+        return args;
+    }
+
+public:
+    static primitive_impl* create(const matrix_nms_node& node, const kernel_impl_params& impl_param) {
+        auto params = get_default_params<kernel_selector::matrix_nms_params>(impl_param);
+        auto optional_params =
+            get_default_optional_params<kernel_selector::matrix_nms_optional_params>(node.get_program());
+
+        const auto& scores_layout = impl_param.get_input_layout(1);
+        const auto& second_output_layout = impl_param.get_input_layout(2);
+        const auto& third_output_layout = impl_param.get_input_layout(3);
+
+        params.inputs.push_back(convert_data_tensor(scores_layout));
+        params.inputs.push_back(convert_data_tensor(second_output_layout));
+        params.inputs.push_back(convert_data_tensor(third_output_layout));
+
+        const auto& primitive = node.get_primitive();
+        params.sort_type = from(primitive->attribs.sort_type);
+        params.sort_result_across_batch = primitive->attribs.sort_result_across_batch;
+        params.score_threshold = primitive->attribs.score_threshold;
+        params.nms_top_k = primitive->attribs.nms_top_k;
+        params.keep_top_k = primitive->attribs.keep_top_k;
+        params.background_class = primitive->attribs.background_class;
+        params.decay = from(primitive->attribs.decay);
+        params.gaussian_sigma = primitive->attribs.gaussian_sigma;
+        params.post_threshold = primitive->attribs.post_threshold;
+        params.normalized = primitive->attribs.normalized;
+
+        auto& kernel_selector = kernel_selector::matrix_nms_kernel_selector::Instance();
+        auto best_kernels = kernel_selector.GetBestKernels(params, optional_params);
+
+        CLDNN_ERROR_BOOL(node.id(),
+                         "Best_kernel.empty()",
+                         best_kernels.empty(),
+                         "Cannot find a proper kernel with this nodeuments");
+
+        auto matrix_nms_node = new matrix_nms_impl(node, best_kernels[0]);
+
+        return matrix_nms_node;
+    }
+};
+
+namespace detail {
+
+attach_matrix_nms_impl::attach_matrix_nms_impl() {
+    auto types = {data_types::f16, data_types::f32, data_types::i32};
+
+    auto formats = {format::bfyx,
+                    format::b_fs_yx_fsv16,
+                    format::b_fs_yx_fsv32,
+                    format::bs_fs_yx_bsv16_fsv16,
+                    format::bs_fs_yx_bsv32_fsv16,
+                    format::bs_fs_yx_bsv32_fsv32};
+
+    implementation_map<matrix_nms>::add(impl_types::ocl, matrix_nms_impl::create, types, formats);
+}
+
+}  // namespace detail
+}  // namespace ocl
+}  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp
index 06b5fd780b1..7061b8e284d 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp
@@ -50,6 +50,7 @@ void register_implementations() {
     REGISTER_OCL(mutable_data);
     REGISTER_OCL(mvn);
     REGISTER_OCL(non_max_suppression);
+    REGISTER_OCL(matrix_nms);
     REGISTER_OCL(normalize);
     REGISTER_OCL(one_hot);
     REGISTER_OCL(permute);
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp
index ca6372ce61a..98f5cab6380 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp
@@ -131,6 +131,7 @@ REGISTER_OCL(max_unpooling);
 REGISTER_OCL(mutable_data);
 REGISTER_OCL(mvn);
 REGISTER_OCL(non_max_suppression);
+REGISTER_OCL(matrix_nms);
 REGISTER_OCL(normalize);
 REGISTER_OCL(one_hot);
 REGISTER_OCL(permute);
diff --git a/src/plugins/intel_gpu/src/graph/include/matrix_nms_inst.h b/src/plugins/intel_gpu/src/graph/include/matrix_nms_inst.h
new file mode 100644
index 00000000000..15c400e282a
--- /dev/null
+++ b/src/plugins/intel_gpu/src/graph/include/matrix_nms_inst.h
@@ -0,0 +1,43 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+#pragma once
+#include <memory>
+#include <string>
+
+#include "intel_gpu/primitives/matrix_nms.hpp"
+#include "primitive_inst.h"
+
+namespace cldnn {
+
+using matrix_nms_node = typed_program_node<matrix_nms>;
+
+template <>
+class typed_primitive_inst<matrix_nms> : public typed_primitive_inst_base<matrix_nms> {
+    using parent = typed_primitive_inst_base<matrix_nms>;
+
+public:
+    typed_primitive_inst(network& network, const matrix_nms_node& node) : parent(network, node) {}
+
+    static layout calc_output_layout(const matrix_nms_node& node, const kernel_impl_params& impl_param);
+    static std::string to_string(const matrix_nms_node& node);
+
+    memory::ptr input_boxes_mem() const {
+        return dep_memory_ptr(0);
+    }
+    memory::ptr input_scores_mem() const {
+        return dep_memory_ptr(1);
+    }
+    memory::ptr input_selected_boxes_mem() const {
+        return dep_memory_ptr(2);
+    }
+    memory::ptr input_valid_outputs_mem() const {
+        return dep_memory_ptr(3);
+    }
+};
+
+using matrix_nms_inst = typed_primitive_inst<matrix_nms>;
+
+}  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/graph/matrix_nms.cpp b/src/plugins/intel_gpu/src/graph/matrix_nms.cpp
new file mode 100644
index 00000000000..e7154e28d33
--- /dev/null
+++ b/src/plugins/intel_gpu/src/graph/matrix_nms.cpp
@@ -0,0 +1,88 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include <json_object.h>
+
+#include <sstream>
+#include <string>
+
+#include "matrix_nms_inst.h"
+#include "openvino/core/enum_names.hpp"
+#include "primitive_type_base.h"
+
+namespace cldnn {
+primitive_type_id matrix_nms::type_id() {
+    static primitive_type_base<matrix_nms> instance;
+    return &instance;
+}
+
+layout matrix_nms_inst::calc_output_layout(const matrix_nms_node& node, const kernel_impl_params& impl_param) {
+    const auto primitive = impl_param.typed_desc<matrix_nms>();
+    const auto boxes_layout = impl_param.get_input_layout(0);
+    const auto scores_layout = impl_param.get_input_layout(1);
+
+    const auto batches_num = boxes_layout.batch();
+    auto classes_num = scores_layout.feature();
+    const auto boxes_num = boxes_layout.feature();
+
+    if (primitive->attribs.background_class >= 0 && primitive->attribs.background_class < classes_num)
+        classes_num = std::max(1, classes_num - 1);
+
+    int max_output_boxes_per_class{boxes_num};
+    if (primitive->attribs.nms_top_k >= 0)
+        max_output_boxes_per_class = std::min(max_output_boxes_per_class, primitive->attribs.nms_top_k);
+
+    auto max_output_boxes_per_batch = max_output_boxes_per_class * classes_num;
+    if (primitive->attribs.keep_top_k >= 0)
+        max_output_boxes_per_batch = std::min(max_output_boxes_per_batch, primitive->attribs.keep_top_k);
+
+    auto output_num = max_output_boxes_per_batch * batches_num;
+
+    // BOX_DATA: class_id, box_score, xmin, ymin, xmax, ymax
+    constexpr size_t BOX_DATA{6};
+    return layout(boxes_layout.data_type, boxes_layout.format, {output_num, BOX_DATA, 1, 1});
+}
+
+std::string matrix_nms_inst::to_string(const matrix_nms_node& node) {
+    json_composite matrix_nms_info;
+    matrix_nms_info.add("boxes id", node.input().id());
+    matrix_nms_info.add("scores id", node.get_dependency(1).id());
+    matrix_nms_info.add("sort_result_type", ov::as_string(node.get_primitive()->attribs.sort_type));
+    matrix_nms_info.add("decay_function", ov::as_string(node.get_primitive()->attribs.decay));
+    matrix_nms_info.add("sort_result_across_batch", node.get_primitive()->attribs.sort_result_across_batch);
+    matrix_nms_info.add("score_threshold", node.get_primitive()->attribs.score_threshold);
+    matrix_nms_info.add("nms_top_k", node.get_primitive()->attribs.nms_top_k);
+    matrix_nms_info.add("keep_top_k", node.get_primitive()->attribs.keep_top_k);
+    matrix_nms_info.add("background_class", node.get_primitive()->attribs.background_class);
+    matrix_nms_info.add("gaussian_sigma", node.get_primitive()->attribs.gaussian_sigma);
+    matrix_nms_info.add("post_threshold", node.get_primitive()->attribs.post_threshold);
+    matrix_nms_info.add("normalized", node.get_primitive()->attribs.normalized);
+
+    auto node_info = node.desc_to_json();
+    node_info->add("matrix_nms info", matrix_nms_info);
+    std::stringstream primitive_description;
+    node_info->dump(primitive_description);
+    return primitive_description.str();
+}
+
+}  // namespace cldnn
+
+namespace ov {
+template <>
+EnumNames<matrix_nms::decay_function>& EnumNames<matrix_nms::decay_function>::get() {
+    static auto enum_names = EnumNames<matrix_nms::decay_function>(
+        "decay_function",
+        {{"gaussian", matrix_nms::decay_function::gaussian}, {"linear", matrix_nms::decay_function::linear}});
+    return enum_names;
+}
+
+template <>
+EnumNames<matrix_nms::sort_result_type>& EnumNames<matrix_nms::sort_result_type>::get() {
+    static auto enum_names =
+        EnumNames<matrix_nms::sort_result_type>("sort_result_type",
+                                                {{"class_id", matrix_nms::sort_result_type::class_id},
+                                                 {"score", matrix_nms::sort_result_type::score},
+                                                 {"none", matrix_nms::sort_result_type::none}});
+    return enum_names;
+}
+}  // namespace ov
diff --git a/src/plugins/intel_gpu/src/graph/program.cpp b/src/plugins/intel_gpu/src/graph/program.cpp
index f706c00f26d..96cca47fc09 100644
--- a/src/plugins/intel_gpu/src/graph/program.cpp
+++ b/src/plugins/intel_gpu/src/graph/program.cpp
@@ -20,6 +20,7 @@
 #include "sliding_window_utils.hpp"
 #include "program_helpers.h"
 
+#include "matrix_nms_inst.h"
 #include "roi_pooling_inst.h"
 #include "reorg_yolo_inst.h"
 #include "eltwise_inst.h"
@@ -1442,6 +1443,7 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) {
             prim.type() != cldnn::ctc_loss::type_id() &&
             prim.type() != cldnn::non_max_suppression::type_id() &&
             prim.type() != cldnn::roi_align::type_id() &&
+            prim.type() != cldnn::matrix_nms::type_id() &&
             prim.type() != cldnn::adaptive_pooling::type_id() &&
             prim.type() != cldnn::bucketize::type_id() &&
             prim.type() != cldnn::roll::type_id() &&
@@ -1487,6 +1489,7 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) {
             prim.type() != cldnn::ctc_loss::type_id() &&
             prim.type() != cldnn::non_max_suppression::type_id() &&
             prim.type() != cldnn::roi_align::type_id() &&
+            prim.type() != cldnn::matrix_nms::type_id() &&
             prim.type() != cldnn::adaptive_pooling::type_id() &&
             prim.type() != cldnn::bucketize::type_id() &&
             prim.type() != cldnn::roll::type_id() &&
diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/matrix_nms_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/matrix_nms_ref.cl
new file mode 100644
index 00000000000..9f186d3b036
--- /dev/null
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/matrix_nms_ref.cl
@@ -0,0 +1,333 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "include/batch_headers/common.cl"
+#include "include/batch_headers/data_types.cl"
+
+#define NUM_BATCHES INPUT0_BATCH_NUM
+#define NUM_BOXES   INPUT0_FEATURE_NUM
+#define NUM_CLASSES INPUT1_FEATURE_NUM
+
+typedef struct {
+    int batch_idx;
+    int class_idx;
+    int box_idx;
+    INPUT1_TYPE score;
+} FUNC(BoxInfo);
+
+#define BOX_INFO FUNC(BoxInfo)
+
+#define unroll_for __attribute__((opencl_unroll_hint)) for
+
+inline INPUT1_TYPE FUNC(decay_gaussian)(INPUT1_TYPE iou, INPUT1_TYPE max_iou) {
+    return exp((max_iou * max_iou - iou * iou) * GAUSSIAN_SIGMA);
+}
+
+inline INPUT1_TYPE FUNC(decay_linear)(INPUT1_TYPE iou, INPUT1_TYPE max_iou) {
+    return (INPUT1_VAL_ONE - iou) / (INPUT1_VAL_ONE - max_iou + TINY);
+}
+
+inline void FUNC(swap)(int* a, int* b) {
+    int temp = *a;
+    *a = *b;
+    *b = temp;
+}
+
+inline void FUNC(sortIterative)(const __global INPUT1_TYPE* scores,
+                                      const int batchId,
+                                      const int classId,
+                                      int* indices,
+                                      const int size) {
+    for (int i = 1; i <= size; i++) {
+        bool swapped = false;
+        for (int j = 0; j < size - i; j++) {
+            const INPUT1_TYPE score_curr = scores[INPUT1_GET_INDEX(batchId, classId, 0, indices[j])];
+            const INPUT1_TYPE score_next = scores[INPUT1_GET_INDEX(batchId, classId, 0, indices[j + 1])];
+            if (score_curr < score_next) {
+                FUNC_CALL(swap)(&indices[j], &indices[j + 1]);
+                swapped = true;
+            }
+        }
+
+        if (!swapped)
+            break;
+    }
+}
+
+inline void FUNC(swap_boxes)(__global BOX_INFO* a, __global BOX_INFO* b) {
+    BOX_INFO temp = *a;
+    *a = *b;
+    *b = temp;
+}
+
+inline void FUNC(sortIterativeBoxes)(__global BOX_INFO* boxes, int l, int h) {
+    for (int i = 1; i < h - l; i++) {
+        bool swapped = false;
+        for (int j = l; j < h - i; j++) {
+            if ((boxes[j].score < boxes[j + 1].score) ||
+                (boxes[j].score == boxes[j + 1].score && boxes[j].class_idx > boxes[j + 1].class_idx) ||
+                (boxes[j].score == boxes[j + 1].score && boxes[j].class_idx == boxes[j + 1].class_idx &&
+                 boxes[j].box_idx > boxes[j + 1].box_idx)) {
+                FUNC_CALL(swap_boxes)(&boxes[j], &boxes[j + 1]);
+                swapped = true;
+            }
+        }
+
+        if (!swapped)
+            break;
+    }
+}
+
+inline void FUNC(sortIterativeBoxesAcrossBatches)(__global BOX_INFO* boxes) {
+    const int size = NUM_BATCHES * NUM_CLASSES * MAX_BOXES_PER_CLASS;
+    for (int i = 1; i < size; i++) {
+        bool swapped = false;
+        for (int j = 0; j < size - i; j++) {
+            __global BOX_INFO* l = boxes + j;
+            __global BOX_INFO* r = boxes + j + 1;
+// sort by score
+#if SORT_TYPE == 1
+            if ((l->score < r->score) || (l->score == r->score && l->batch_idx > r->batch_idx) ||
+                (l->score == r->score && l->batch_idx == r->batch_idx && l->class_idx > r->class_idx) ||
+                (l->score == r->score && l->batch_idx == r->batch_idx && l->class_idx == r->class_idx &&
+                 l->box_idx > r->box_idx)) {
+                FUNC_CALL(swap_boxes)(l, r);
+                swapped = true;
+            }
+// sort by class id
+#elif SORT_TYPE == 0
+            if (r->score != INPUT1_VAL_ZERO &&
+                ((l->score == INPUT1_VAL_ZERO) ||  // case with empty buffer
+                 (l->class_idx > r->class_idx) || (l->class_idx == r->class_idx && l->batch_idx > r->batch_idx) ||
+                 (l->class_idx == r->class_idx && l->batch_idx == r->batch_idx && l->score < r->score) ||
+                 (l->class_idx == r->class_idx && l->batch_idx == r->batch_idx && l->score == r->score &&
+                  l->box_idx > r->box_idx))) {
+                FUNC_CALL(swap_boxes)(l, r);
+                swapped = true;
+            }
+#endif
+        }
+
+        if (!swapped)
+            break;
+    }
+}
+
+inline COORD_TYPE_4 FUNC(getBoxCoords)(const __global INPUT0_TYPE* boxes, const short batch, const ushort box_idx) {
+    COORD_TYPE_4 coords = (COORD_TYPE_4)(boxes[INPUT0_GET_INDEX(batch, box_idx, 0, 0)],
+                                         boxes[INPUT0_GET_INDEX(batch, box_idx, 0, 1)],
+                                         boxes[INPUT0_GET_INDEX(batch, box_idx, 0, 2)],
+                                         boxes[INPUT0_GET_INDEX(batch, box_idx, 0, 3)]);
+
+    // uncomment when flipped coordinates will be fixed in reference impl
+    /*
+    const INPUT0_TYPE x1 = min(coords[0], coords[2]);
+    const INPUT0_TYPE x2 = max(coords[0], coords[2]);
+    const INPUT0_TYPE y1 = min(coords[1], coords[3]);
+    const INPUT0_TYPE y2 = max(coords[1], coords[3]);
+    coords[0] = x1;
+    coords[1] = y1;
+    coords[2] = x2;
+    coords[3] = y2;
+    */
+
+    return coords;
+}
+
+inline INPUT0_TYPE FUNC(area)(const INPUT0_TYPE w, const INPUT0_TYPE h) {
+    return (w + NORM) * (h + NORM);
+}
+
+inline INPUT0_TYPE FUNC(areaBox)(const COORD_TYPE_4 box) {
+    if (box[2] < box[0] || box[3] < box[1])
+        return INPUT0_VAL_ZERO;
+    return FUNC_CALL(area)(box[3] - box[1], box[2] - box[0]);
+}
+
+inline INPUT0_TYPE FUNC(intersectionOverUnion)(const COORD_TYPE_4 box1, const COORD_TYPE_4 box2) {
+    if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || box2[3] < box1[1])
+        return INPUT0_VAL_ZERO;
+
+    const INPUT0_TYPE area = FUNC_CALL(areaBox)(box1);
+    const INPUT0_TYPE areaBox = FUNC_CALL(areaBox)(box2);
+
+    const INPUT0_TYPE intersection_xmin = max(box1[0], box2[0]);
+    const INPUT0_TYPE intersection_ymin = max(box1[1], box2[1]);
+    const INPUT0_TYPE intersection_xmax = min(box1[2], box2[2]);
+    const INPUT0_TYPE intersection_ymax = min(box1[3], box2[3]);
+
+    const INPUT0_TYPE intersection_area =
+        FUNC_CALL(area)(intersection_xmax - intersection_xmin, intersection_ymax - intersection_ymin);
+    const INPUT0_TYPE union_area = area + areaBox - intersection_area;
+
+    return intersection_area / union_area;
+}
+
+#ifdef MATRIX_NMS_STAGE_0
+KERNEL(matrix_nms_ref_stage_0)
+(const __global INPUT0_TYPE* input_boxes,
+ const __global INPUT1_TYPE* input_scores,
+ __global uchar* buffer0,
+ __global int* selected_boxes_num) {
+    const int batchId = get_global_id(0);
+    const int classId = get_global_id(1);
+
+    if (classId == BACKGROUND_CLASS)
+        return;
+
+    int sorted_score_indices[NUM_BOXES];
+
+    for (int i = 0; i < NUM_BOXES; ++i)
+        sorted_score_indices[i] = i;
+
+    int valid_boxes_num = 0;
+    for (int i = 0; i < NUM_BOXES; i++) {
+        if (input_scores[INPUT1_GET_INDEX(batchId, classId, 0, i)] > SCORE_THRESHOLD)
+            ++valid_boxes_num;
+    }
+
+    // TODO: consider faster sorting algorithm
+    FUNC_CALL(sortIterative)(input_scores, batchId, classId, sorted_score_indices, NUM_BOXES);
+
+    valid_boxes_num = min(valid_boxes_num, MAX_BOXES_PER_CLASS);
+
+    const int matrix_size = MAX_BOXES_PER_CLASS < 3 ? 1 : (MAX_BOXES_PER_CLASS * (MAX_BOXES_PER_CLASS - 1)) >> 1;
+    INPUT1_TYPE iou_matrix[matrix_size];
+    INPUT1_TYPE iou_max[MAX_BOXES_PER_CLASS];
+
+    iou_max[0] = INPUT1_VAL_ZERO;
+    for (int i = 1; i < valid_boxes_num; ++i) {
+        INPUT1_TYPE max_iou = INPUT1_VAL_ZERO;
+        const COORD_TYPE_4 box_i = FUNC_CALL(getBoxCoords)(input_boxes, batchId, sorted_score_indices[i]);
+        for (int j = 0; j < i; ++j) {
+            const COORD_TYPE_4 box_j = FUNC_CALL(getBoxCoords)(input_boxes, batchId, sorted_score_indices[j]);
+            const INPUT1_TYPE iou = FUNC_CALL(intersectionOverUnion)(box_i, box_j);
+
+            max_iou = max(iou, max_iou);
+            iou_matrix[i * (i - 1) / 2 + j] = iou;
+        }
+        iou_max[i] = max_iou;
+    }
+
+    const INPUT1_TYPE first_score = input_scores[INPUT1_GET_INDEX(batchId, classId, 0, sorted_score_indices[0])];
+
+    __global BOX_INFO* box_info = (__global BOX_INFO*)buffer0;
+    box_info = &box_info[batchId * NUM_CLASSES * MAX_BOXES_PER_CLASS + classId * MAX_BOXES_PER_CLASS];
+
+    int box_info_counter = 0;
+    if (first_score > POST_THRESHOLD && valid_boxes_num > 0) {
+        box_info[box_info_counter].class_idx = classId;
+        box_info[box_info_counter].score = first_score;
+        box_info[box_info_counter].box_idx = sorted_score_indices[0];
+        box_info[box_info_counter].batch_idx = batchId;
+        ++box_info_counter;
+    }
+
+    for (int i = 1; i < valid_boxes_num; ++i) {
+        INPUT1_TYPE min_decay = INPUT1_VAL_ONE;
+        for (int j = 0; j < i; ++j) {
+            INPUT1_TYPE iou = iou_matrix[i * (i - 1) / 2 + j];
+            INPUT1_TYPE decay =
+                DECAY_FUNC == 0 ? FUNC_CALL(decay_gaussian)(iou, iou_max[j]) : FUNC_CALL(decay_linear)(iou, iou_max[j]);
+            min_decay = min(min_decay, decay);
+        }
+
+        INPUT1_TYPE ds = min_decay * input_scores[INPUT1_GET_INDEX(batchId, classId, 0, sorted_score_indices[i])];
+
+        if (ds <= POST_THRESHOLD)
+            continue;
+
+        box_info[box_info_counter].batch_idx = batchId;
+        box_info[box_info_counter].class_idx = classId;
+        box_info[box_info_counter].box_idx = sorted_score_indices[i];
+        box_info[box_info_counter].score = ds;
+        ++box_info_counter;
+    }
+
+    selected_boxes_num[batchId * NUM_CLASSES + classId] = box_info_counter;
+}
+#endif /* MATRIX_NMS_STAGE_0 */
+
+#ifdef MATRIX_NMS_STAGE_1
+KERNEL(matrix_nms_ref_stage_1)
+(__global INPUT3_TYPE* valid_outputs, __global uchar* buffer0, __global int* selected_boxes_num) {
+    const int batchId = get_global_id(0);
+
+    __global BOX_INFO* box_info = (__global BOX_INFO*)buffer0;
+
+    const int first_idx = batchId * NUM_CLASSES * MAX_BOXES_PER_CLASS;
+    const int last_idx = first_idx + NUM_CLASSES * MAX_BOXES_PER_CLASS;
+
+    // TODO: consider faster sorting algorithm
+    FUNC_CALL(sortIterativeBoxes)(box_info, first_idx, last_idx);
+
+    for (int i = 0; i < NUM_CLASSES; ++i) {
+        if (i == BACKGROUND_CLASS)
+            continue;
+
+        valid_outputs[INPUT3_GET_INDEX(batchId, 0, 0, 0)] += selected_boxes_num[batchId * NUM_CLASSES + i];
+    }
+}
+#endif /* MATRIX_NMS_STAGE_1 */
+
+#ifdef MATRIX_NMS_STAGE_2
+KERNEL(matrix_nms_ref_stage_2)
+(const __global INPUT0_TYPE* input_boxes,
+ __global OUTPUT_TYPE* output,
+ __global INPUT2_TYPE* selected_indices,
+ __global INPUT3_TYPE* valid_outputs,
+ __global uchar* buffer0) {
+    __global BOX_INFO* box_info = (__global BOX_INFO*)buffer0;
+
+    // TODO: consider faster sorting algorithm
+    // and index sorting instead of data sorting
+#if SORT_RESULT_ACROSS_BATCH == 1 && SORT_TYPE != 2
+    FUNC_CALL(sortIterativeBoxesAcrossBatches)(box_info);
+#endif
+
+    int output_idx = 0;
+    int box_info_idx = 0;
+    for (int i = 0; i < NUM_BATCHES; ++i) {
+        if (KEEP_TOP_K != -1 && KEEP_TOP_K < valid_outputs[INPUT3_GET_INDEX(i, 0, 0, 0)])
+            valid_outputs[INPUT3_GET_INDEX(i, 0, 0, 0)] = KEEP_TOP_K;
+
+#if SORT_RESULT_ACROSS_BATCH == 0
+        box_info_idx = i * NUM_CLASSES * MAX_BOXES_PER_CLASS;
+#endif
+
+        unroll_for(int j = 0; j < valid_outputs[INPUT3_GET_INDEX(i, 0, 0, 0)]; ++j) {
+            output[OUTPUT_GET_INDEX(output_idx, 0, 0, 0)] = box_info[box_info_idx].class_idx;
+            output[OUTPUT_GET_INDEX(output_idx, 1, 0, 0)] = box_info[box_info_idx].score;
+            output[OUTPUT_GET_INDEX(output_idx, 2, 0, 0)] =
+                input_boxes[INPUT0_GET_INDEX(box_info[box_info_idx].batch_idx, box_info[box_info_idx].box_idx, 0, 0)];
+            output[OUTPUT_GET_INDEX(output_idx, 3, 0, 0)] =
+                input_boxes[INPUT0_GET_INDEX(box_info[box_info_idx].batch_idx, box_info[box_info_idx].box_idx, 0, 1)];
+            output[OUTPUT_GET_INDEX(output_idx, 4, 0, 0)] =
+                input_boxes[INPUT0_GET_INDEX(box_info[box_info_idx].batch_idx, box_info[box_info_idx].box_idx, 0, 2)];
+            output[OUTPUT_GET_INDEX(output_idx, 5, 0, 0)] =
+                input_boxes[INPUT0_GET_INDEX(box_info[box_info_idx].batch_idx, box_info[box_info_idx].box_idx, 0, 3)];
+
+            selected_indices[INPUT2_GET_INDEX(output_idx, 0, 0, 0)] =
+                box_info[box_info_idx].batch_idx * NUM_BOXES + box_info[box_info_idx].box_idx;
+
+            ++output_idx;
+            ++box_info_idx;
+        }
+
+        // Paddings
+        while (output_idx < (i + 1) * MAX_BOXES_PER_BATCH) {
+            unroll_for(int j = 0; j < 6; ++j) {
+                output[OUTPUT_GET_INDEX(output_idx, j, 0, 0)] = -OUTPUT_VAL_ONE;
+            }
+            selected_indices[INPUT2_GET_INDEX(output_idx, 0, 0, 0)] = -INPUT2_VAL_ONE;
+            ++output_idx;
+        }
+    }
+}
+#endif /* MATRIX_NMS_STAGE_2 */
+
+#undef NUM_BATCHES
+#undef NUM_BOXES
+#undef NUM_CLASSES
+#undef BOX_INFO
diff --git a/src/plugins/intel_gpu/src/kernel_selector/common_types.h b/src/plugins/intel_gpu/src/kernel_selector/common_types.h
index 6287d834264..0a76ed0ed16 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/common_types.h
+++ b/src/plugins/intel_gpu/src/kernel_selector/common_types.h
@@ -90,6 +90,7 @@ enum class KernelType {
     CONVERT_COLOR,
     RANDOM_UNIFORM,
     ADAPTIVE_POOLING,
+    MATRIX_NMS,
     REVERSE,
     PRIOR_BOX,
     EYE,
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.cpp
index 8632e0a1db7..170200261da 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.cpp
@@ -175,6 +175,7 @@ std::string toString(KernelType kt) {
                                           return "BINARY_CONVOLUTION";
         case KernelType::NON_MAX_SUPPRESSION:
                                           return "NON_MAX_SUPPRESSION";
+        case KernelType::MATRIX_NMS:      return "MATRIX_NMS";
         default: return "";
     }
 }
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/matrix_nms/matrix_nms_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/matrix_nms/matrix_nms_kernel_ref.cpp
new file mode 100644
index 00000000000..df45a8b815a
--- /dev/null
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/matrix_nms/matrix_nms_kernel_ref.cpp
@@ -0,0 +1,196 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "matrix_nms_kernel_ref.h"
+
+#include <vector>
+
+#include "kernel_selector_utils.h"
+
+namespace kernel_selector {
+
+ParamsKey MatrixNmsKernelRef::GetSupportedKey() const {
+    ParamsKey k;
+    k.EnableInputDataType(Datatype::F16);
+    k.EnableInputDataType(Datatype::F32);
+    k.EnableInputDataType(Datatype::INT32);
+
+    k.EnableOutputDataType(Datatype::F16);
+    k.EnableOutputDataType(Datatype::F32);
+
+    k.EnableDifferentTypes();
+    k.EnableAllInputLayout();
+    k.EnableAllOutputLayout();
+    k.EnableTensorOffset();
+    k.EnableTensorPitches();
+    k.EnableBatching();
+    return k;
+}
+
+namespace {
+
+MatrixNmsKernelRef::DispatchData SetDefault(const matrix_nms_params& params, size_t idx) {
+    MatrixNmsKernelRef::DispatchData dispatch_data;
+
+    const auto& input_scores = params.inputs[1];
+    if (idx == 0) {
+        dispatch_data.gws = {input_scores.Batch().v, input_scores.Feature().v, 1};
+        dispatch_data.lws = GetOptimalLocalWorkGroupSizes(dispatch_data.gws, params.engineInfo);
+    } else if (idx == 1) {
+        dispatch_data.gws = {input_scores.Batch().v, 1, 1};
+        dispatch_data.lws = GetOptimalLocalWorkGroupSizes(dispatch_data.gws, params.engineInfo);
+    } else {
+        dispatch_data.gws = {1, 1, 1};
+        dispatch_data.lws = {1, 1, 1};
+    }
+
+    return dispatch_data;
+}
+
+std::tuple<int, int> GetMaxBoxes(const matrix_nms_params& params) {
+    const int classes_num = params.inputs[1].Feature().v;
+    const int boxes_num = params.inputs[0].Feature().v;
+
+    int max_boxes_per_class{boxes_num};
+    if (params.nms_top_k >= 0)
+        max_boxes_per_class = std::min(max_boxes_per_class, params.nms_top_k);
+
+    auto classes_num_adj = classes_num;
+    if (params.background_class >= 0 && params.background_class < classes_num)
+        classes_num_adj = std::max(1, classes_num - 1);
+
+    auto max_boxes_per_batch = max_boxes_per_class * classes_num_adj;
+    if (params.keep_top_k >= 0)
+        max_boxes_per_batch = std::min(max_boxes_per_batch, params.keep_top_k);
+
+    return {max_boxes_per_class, max_boxes_per_batch};
+}
+}  // anonymous namespace
+
+KernelsData MatrixNmsKernelRef::GetKernelsData(const Params& params, const optional_params& options) const {
+    if (!Validate(params, options)) {
+        return {};
+    }
+
+    constexpr size_t kernels_num{3};
+    KernelData kernel_data = KernelData::Default<matrix_nms_params>(params, kernels_num);
+    const matrix_nms_params& new_params = dynamic_cast<const matrix_nms_params&>(*kernel_data.params.get());
+
+    constexpr size_t BOX_INFO_SIZE{16};
+
+    const int batches_num = new_params.inputs[1].Batch().v;
+    const int classes_num = new_params.inputs[1].Feature().v;
+
+    int max_boxes_per_class, max_boxes_per_batch;
+    std::tie(max_boxes_per_class, max_boxes_per_batch) = GetMaxBoxes(new_params);
+
+    const size_t box_info_num = batches_num * classes_num * max_boxes_per_class;
+
+    const size_t box_info_buffer_size = box_info_num * BOX_INFO_SIZE;
+    const size_t sel_boxes_num_buffer_size = batches_num * classes_num * sizeof(int);
+
+    kernel_data.internalBufferSizes.push_back(box_info_buffer_size);
+    kernel_data.internalBufferSizes.push_back(sel_boxes_num_buffer_size);
+    kernel_data.internalBufferDataType = Datatype::F32;
+
+    for (size_t i{}; i < kernels_num; ++i) {
+        auto entry_point = GetEntryPoint(kernelName, new_params.layerID, params, options, i);
+        auto jit_constants = GetJitConstants(new_params);
+        jit_constants.AddConstant(MakeJitConstant("MATRIX_NMS_STAGE_" + std::to_string(i), "true"));
+
+        jit_constants.AddConstant(MakeJitConstant("MAX_BOXES_PER_CLASS", max_boxes_per_class));
+        jit_constants.AddConstant(MakeJitConstant("MAX_BOXES_PER_BATCH", max_boxes_per_batch));
+        auto jit = CreateJit(kernelName, jit_constants, entry_point);
+
+        DispatchData dispatch_data = SetDefault(new_params, i);
+        auto& kernel = kernel_data.kernels[i];
+        KernelBase::CheckDispatchData(kernelName, dispatch_data, params.engineInfo.maxWorkGroupSize);
+        kernel.params.workGroups.global = dispatch_data.gws;
+        kernel.params.workGroups.local = dispatch_data.lws;
+        kernel.code.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo);
+
+        SetKernelArguments(new_params, kernel, i);
+    }
+
+    return {kernel_data};
+}
+
+float MatrixNmsKernelRef::GetKernelsPriority(const Params& params, const optional_params& options) const {
+    return FORCE_PRIORITY_9;
+}
+
+bool MatrixNmsKernelRef::Validate(const Params& p, const optional_params& o) const {
+    if (p.GetType() != KernelType::MATRIX_NMS || o.GetType() != KernelType::MATRIX_NMS) {
+        return false;
+    }
+
+    const matrix_nms_params& params = static_cast<const matrix_nms_params&>(p);
+    // inputs: boxes, scores, second output, third output
+    if (params.inputs.size() != 4)
+        return false;
+
+    return true;
+}
+
+JitConstants MatrixNmsKernelRef::GetJitConstants(const matrix_nms_params& params) const {
+    JitConstants jit = MakeBaseParamsJitConstants(params);
+
+    const auto& boxes = params.inputs[0];
+    switch (boxes.GetDType()) {
+    case Datatype::F32:
+        jit.AddConstant(MakeJitConstant("COORD_TYPE_4", "float4"));
+        jit.AddConstant(MakeJitConstant("TINY", "1e-10f"));
+        break;
+
+    case Datatype::F16:
+        jit.AddConstant(MakeJitConstant("COORD_TYPE_4", "half4"));
+        jit.AddConstant(MakeJitConstant("TINY", "1e-7h"));
+        break;
+        break;
+
+    default:
+        throw std::invalid_argument("Matrix NMS boxes type should be one of F32 or F16.");
+    }
+
+    jit.AddConstant(MakeJitConstant("SORT_TYPE", params.sort_type));
+    jit.AddConstant(MakeJitConstant("SORT_RESULT_ACROSS_BATCH", params.sort_result_across_batch));
+    jit.AddConstant(MakeJitConstant("SCORE_THRESHOLD", params.score_threshold));
+    jit.AddConstant(MakeJitConstant("KEEP_TOP_K", params.keep_top_k));
+    jit.AddConstant(MakeJitConstant("BACKGROUND_CLASS", params.background_class));
+    jit.AddConstant(MakeJitConstant("DECAY_FUNC", params.decay));
+    jit.AddConstant(MakeJitConstant("GAUSSIAN_SIGMA", params.gaussian_sigma));
+    jit.AddConstant(MakeJitConstant("POST_THRESHOLD", params.post_threshold));
+    jit.AddConstant(MakeJitConstant("NORM", params.normalized ? "INPUT0_VAL_ZERO" : "INPUT0_VAL_ONE"));
+    return jit;
+}
+
+void MatrixNmsKernelRef::SetKernelArguments(const matrix_nms_params& params, clKernelData& kernel, size_t idx) const {
+    switch (idx) {
+    case 0:
+        kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 0});
+        kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 1});
+        kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0});
+        kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1});
+        break;
+
+    case 1:
+        kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 3});
+        kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0});
+        kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1});
+        break;
+
+    case 2:
+        kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 0});
+        kernel.params.arguments.push_back({ArgumentDescriptor::Types::OUTPUT, 0});
+        kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 2});
+        kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 3});
+        kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0});
+        break;
+
+    default:
+        throw std::invalid_argument("Matrix NMS has 3 kernels. valid index is 0 ~ 2.");
+    }
+}
+
+}  // namespace kernel_selector
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/matrix_nms/matrix_nms_kernel_ref.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/matrix_nms/matrix_nms_kernel_ref.h
new file mode 100644
index 00000000000..236ac5f6ffe
--- /dev/null
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/matrix_nms/matrix_nms_kernel_ref.h
@@ -0,0 +1,67 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "kernel_base_opencl.h"
+#include "kernel_selector_params.h"
+
+namespace kernel_selector {
+
+struct matrix_nms_params : public base_params {
+    matrix_nms_params() : base_params(KernelType::MATRIX_NMS) {}
+
+    enum decay_function { GAUSSIAN, LINEAR };
+
+    enum sort_result_type {
+        CLASS_ID,  // sort selected boxes by class id (ascending) in each batch element
+        SCORE,     // sort selected boxes by score (descending) in each batch element
+        NONE       // do not guarantee the order in each batch element
+    };
+
+    // specifies order of output elements
+    sort_result_type sort_type = sort_result_type::NONE;
+    // specifies whenever it is necessary to sort selected boxes across batches or not
+    bool sort_result_across_batch = false;
+    // specifies minimum score to consider box for the processing
+    float score_threshold = 0.0f;
+    // specifies maximum number of boxes to be selected per class, -1 meaning to
+    // keep all boxes
+    int nms_top_k = -1;
+    // specifies maximum number of boxes to be selected per batch element, -1
+    // meaning to keep all boxes
+    int keep_top_k = -1;
+    // specifies the background class id, -1 meaning to keep all classes
+    int background_class = -1;
+    // specifies decay function used to decay scores
+    decay_function decay = decay_function::LINEAR;
+    // specifies gaussian_sigma parameter for gaussian decay_function
+    float gaussian_sigma = 2.0f;
+    // specifies threshold to filter out boxes with low confidence score after
+    // decaying
+    float post_threshold = 0.0f;
+    // specifies whether boxes are normalized or not
+    bool normalized = true;
+};
+
+struct matrix_nms_optional_params : optional_params {
+    matrix_nms_optional_params() : optional_params(KernelType::MATRIX_NMS) {}
+};
+
+class MatrixNmsKernelRef : public KernelBaseOpenCL {
+public:
+    MatrixNmsKernelRef() : KernelBaseOpenCL("matrix_nms_ref") {}
+
+    using DispatchData = CommonDispatchData;
+    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
+    KernelsPriority GetKernelsPriority(const Params& params, const optional_params& options) const override;
+    ParamsKey GetSupportedKey() const override;
+
+protected:
+    JitConstants GetJitConstants(const matrix_nms_params& params) const;
+    bool Validate(const Params& p, const optional_params& o) const override;
+    void SetKernelArguments(const matrix_nms_params& params, clKernelData& kernel, size_t idx) const;
+};
+
+}  // namespace kernel_selector
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/matrix_nms/matrix_nms_kernel_selector.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/matrix_nms/matrix_nms_kernel_selector.cpp
new file mode 100644
index 00000000000..b3adceb017a
--- /dev/null
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/matrix_nms/matrix_nms_kernel_selector.cpp
@@ -0,0 +1,18 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "matrix_nms_kernel_selector.h"
+
+#include "matrix_nms_kernel_ref.h"
+
+namespace kernel_selector {
+
+matrix_nms_kernel_selector::matrix_nms_kernel_selector() {
+    Attach<MatrixNmsKernelRef>();
+}
+
+KernelsData matrix_nms_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const {
+    return GetNaiveBestKernel(params, options, KernelType::MATRIX_NMS);
+}
+}  // namespace kernel_selector
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/matrix_nms/matrix_nms_kernel_selector.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/matrix_nms/matrix_nms_kernel_selector.h
new file mode 100644
index 00000000000..271a34207ad
--- /dev/null
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/matrix_nms/matrix_nms_kernel_selector.h
@@ -0,0 +1,21 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "kernel_selector.h"
+
+namespace kernel_selector {
+class matrix_nms_kernel_selector : public kernel_selector_base {
+public:
+    static matrix_nms_kernel_selector& Instance() {
+        static matrix_nms_kernel_selector instance_;
+        return instance_;
+    }
+
+    matrix_nms_kernel_selector();
+
+    KernelsData GetBestKernels(const Params& params, const optional_params& options) const override;
+};
+}  // namespace kernel_selector
diff --git a/src/plugins/intel_gpu/src/plugin/ops/matrix_nms.cpp b/src/plugins/intel_gpu/src/plugin/ops/matrix_nms.cpp
new file mode 100644
index 00000000000..c539e6ca08e
--- /dev/null
+++ b/src/plugins/intel_gpu/src/plugin/ops/matrix_nms.cpp
@@ -0,0 +1,83 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+#include "intel_gpu/primitives/matrix_nms.hpp"
+
+#include <memory>
+#include <ngraph/opsets/opset8.hpp>
+
+#include "intel_gpu/plugin/common_utils.hpp"
+#include "intel_gpu/plugin/program.hpp"
+#include "intel_gpu/primitives/mutable_data.hpp"
+#include "ngraph_ops/nms_static_shape_ie.hpp"
+
+namespace ngraph {
+namespace op {
+namespace internal {
+using NmsStaticShapeIE8 = ngraph::op::internal::NmsStaticShapeIE<ngraph::opset8::MatrixNms>;
+}
+}  // namespace op
+}  // namespace ngraph
+
+namespace ov {
+namespace intel_gpu {
+
+namespace {
+void CreateNmsStaticShapeIE8Op(Program& p, const std::shared_ptr<ngraph::op::internal::NmsStaticShapeIE8>& op) {
+    validate_inputs_count(op, {2});
+    auto inputPrimitives = p.GetInputPrimitiveIDs(op);
+
+    std::vector<cldnn::memory::ptr> shared_memory;
+
+    auto outputIndices = op->get_output_shape(0)[0];
+    cldnn::layout mutableLayoutFirst = cldnn::layout(cldnn::element_type_to_data_type(ngraph::element::i32),
+                                                     cldnn::format::bfyx,
+                                                     cldnn::tensor(static_cast<int32_t>(outputIndices), 1, 1, 1));
+
+    shared_memory.emplace_back(p.GetEngine().allocate_memory(mutableLayoutFirst));
+
+    cldnn::primitive_id matrix_nms_mutable_id_w_first = layer_type_name_ID(op) + "_md_write_first";
+    auto matrix_nms_mutable_prim_first = cldnn::mutable_data(matrix_nms_mutable_id_w_first, shared_memory.back());
+    p.add_primitive(*op, matrix_nms_mutable_prim_first);
+    inputPrimitives.push_back(matrix_nms_mutable_id_w_first);
+
+    auto batches_num = op->get_output_shape(2)[0];
+    cldnn::layout mutableLayoutSecond = cldnn::layout(cldnn::element_type_to_data_type(ngraph::element::i32),
+                                                      cldnn::format::bfyx,
+                                                      cldnn::tensor(static_cast<int32_t>(batches_num), 1, 1, 1));
+
+    shared_memory.emplace_back(p.GetEngine().allocate_memory(mutableLayoutSecond));
+
+    cldnn::primitive_id matrix_nms_mutable_id_w_second = layer_type_name_ID(op) + "_md_write_second";
+    auto matrix_nms_mutable_prim_second = cldnn::mutable_data(matrix_nms_mutable_id_w_second, shared_memory.back());
+    p.add_primitive(*op, matrix_nms_mutable_prim_second);
+    inputPrimitives.push_back(matrix_nms_mutable_id_w_second);
+
+    auto matrixNmsLayerName = layer_type_name_ID(op) + ".out0";
+
+    auto prim = cldnn::matrix_nms(matrixNmsLayerName,
+                                  inputPrimitives[0],
+                                  inputPrimitives[1],
+                                  inputPrimitives[inputPrimitives.size() - 2],
+                                  inputPrimitives[inputPrimitives.size() - 1],
+                                  op->get_attrs());
+
+    p.add_primitive(*op, prim);
+
+    cldnn::primitive_id matrix_nms_id_r_first = layer_type_name_ID(op) + ".out1";
+    auto matrix_nms_mutable_prim_r_first =
+        cldnn::mutable_data(matrix_nms_id_r_first, {matrixNmsLayerName}, shared_memory.front());
+    p.add_primitive(*op, matrix_nms_mutable_prim_r_first);
+
+    cldnn::primitive_id matrix_nms_id_r_second = layer_type_name_ID(op) + ".out2";
+    auto matrix_nms_mutable_prim_r_second =
+        cldnn::mutable_data(matrix_nms_id_r_second, {matrixNmsLayerName}, shared_memory.back());
+    p.add_primitive(*op, matrix_nms_mutable_prim_r_second);
+}
+
+}  // anonymous namespace
+
+REGISTER_FACTORY_IMPL(internal, NmsStaticShapeIE8);
+
+}  // namespace intel_gpu
+}  // namespace ov
diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
index 23930ab618d..e88028a2882 100644
--- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
+++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
@@ -70,6 +70,7 @@
 #include <transformations/op_conversions/bidirectional_sequences_decomposition.hpp>
 #include <transformations/op_conversions/convert_previous_nms_to_nms_9.hpp>
 #include <transformations/op_conversions/convert_nms9_to_nms_ie_internal.hpp>
+#include <transformations/op_conversions/convert_matrix_nms_to_matrix_nms_ie.hpp>
 #include <transformations/op_conversions/convert_interpolate1_to_interpolate4.hpp>
 #include <transformations/op_conversions/convert_gather_downgrade.hpp>
 #include <transformations/op_conversions/convert_gather_0d.hpp>
@@ -165,6 +166,7 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
         manager.register_pass<ngraph::pass::ConvertNMS5ToNMS9>();
         manager.register_pass<ngraph::pass::ConvertNMS9ToNMSIEInternal>();
         manager.register_pass<ngraph::pass::ConvertGP9ToGPIEInternal>();
+        manager.register_pass<ngraph::pass::ConvertMatrixNmsToMatrixNmsIE>();
         manager.register_pass<ngraph::pass::ConvertGather0D>();
         manager.register_pass<ngraph::pass::ConvertPriorBox8To0, false>();
 
diff --git a/src/plugins/intel_gpu/tests/test_cases/matrix_nms_gpu_test.cpp b/src/plugins/intel_gpu/tests/test_cases/matrix_nms_gpu_test.cpp
new file mode 100644
index 00000000000..58f82204cf0
--- /dev/null
+++ b/src/plugins/intel_gpu/tests/test_cases/matrix_nms_gpu_test.cpp
@@ -0,0 +1,673 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <intel_gpu/graph/network.hpp>
+#include <intel_gpu/graph/topology.hpp>
+#include <intel_gpu/primitives/input_layout.hpp>
+#include <intel_gpu/primitives/matrix_nms.hpp>
+#include <intel_gpu/primitives/mutable_data.hpp>
+#include <intel_gpu/runtime/memory.hpp>
+
+#include "test_utils.h"
+
+using namespace cldnn;
+using namespace tests;
+
+namespace {
+
+#define PAD       -1.0
+#define PADI      -1
+#define THRESHOLD 1e-3f
+
+template <class T>
+std::vector<T> convert(const std::vector<float>& v) {
+    return {v.begin(), v.end()};
+}
+
+struct matrix_nms_test_inputs {
+    int num_butches;
+    int num_boxes;
+    int num_classes;
+    int num_selected_boxes;
+    bool sort_result_across_batch;
+    float score_threshold;
+    int nms_top_k;
+    int keep_top_k;
+    int background_class;
+    float gaussian_sigma;
+    float post_threshold;
+    bool normalized;
+    std::vector<float> boxes_values;
+    std::vector<float> scores_values;
+    std::vector<float> expected_output;
+    std::vector<int> expected_selected_boxes;
+    std::vector<int> expected_valid_outputs;
+    matrix_nms::sort_result_type sort_result_type;
+    matrix_nms::decay_function decay_function;
+    std::string test_name;
+};
+
+using matrix_nms_test_params = std::tuple<matrix_nms_test_inputs, format::type>;
+
+template <class T>
+struct matrix_nms_gpu_test : public testing::TestWithParam<matrix_nms_test_params> {
+public:
+    void test() {
+        format::type blocked_format;
+        matrix_nms_test_inputs test_inputs;
+        std::tie(test_inputs, blocked_format) = testing::TestWithParam<matrix_nms_test_params>::GetParam();
+
+        const auto data_type = type_to_data_type<T>::value;
+        const auto plain_format = format::bfyx;
+
+        auto& engine = get_test_engine();
+
+        auto boxes = engine.allocate_memory(
+            {data_type, plain_format, tensor{test_inputs.num_butches, test_inputs.num_boxes, 1, 4}});
+        auto scores = engine.allocate_memory(
+            {data_type,
+             plain_format,
+             tensor{test_inputs.num_butches, test_inputs.num_classes, 1, test_inputs.num_boxes}});
+
+        auto selected_boxes =
+            engine.allocate_memory({data_types::i32, plain_format, tensor{test_inputs.num_selected_boxes, 1, 1, 1}});
+        auto valid_outputs =
+            engine.allocate_memory({data_types::i32, plain_format, tensor{test_inputs.num_butches, 1, 1, 1}});
+
+        set_values(boxes, convert<T>(test_inputs.boxes_values));
+        set_values(scores, convert<T>(test_inputs.scores_values));
+
+        const matrix_nms::attributes attrs(test_inputs.sort_result_type,
+                                           test_inputs.sort_result_across_batch,
+                                           test_inputs.score_threshold,
+                                           test_inputs.nms_top_k,
+                                           test_inputs.keep_top_k,
+                                           test_inputs.background_class,
+                                           test_inputs.decay_function,
+                                           test_inputs.gaussian_sigma,
+                                           test_inputs.post_threshold,
+                                           test_inputs.normalized);
+
+        topology topology;
+        topology.add(input_layout("boxes", boxes->get_layout()));
+        topology.add(input_layout("scores", scores->get_layout()));
+        topology.add(mutable_data("selected_boxes", selected_boxes));
+        topology.add(mutable_data("valid_outputs", valid_outputs));
+
+        topology.add(reorder("reordered_boxes", "boxes", blocked_format, data_type));
+        topology.add(reorder("reordered_scores", "scores", blocked_format, data_type));
+
+        topology.add(matrix_nms("reordered_matrix_nms",
+                                "reordered_boxes",
+                                "reordered_scores",
+                                "selected_boxes",
+                                "valid_outputs",
+                                attrs));
+        topology.add(reorder("matrix_nms", "reordered_matrix_nms", plain_format, data_type));
+
+        network network(engine, topology);
+        network.set_input_data("boxes", boxes);
+        network.set_input_data("scores", scores);
+
+        auto outputs = network.execute();
+
+        auto output = outputs.at("matrix_nms").get_memory();
+        cldnn::mem_lock<T> output_ptr(output, get_test_stream());
+
+        cldnn::mem_lock<int> selected_boxes_ptr(selected_boxes, get_test_stream());
+        cldnn::mem_lock<int> valid_outputs_ptr(valid_outputs, get_test_stream());
+
+        const auto expected_output = convert<T>(test_inputs.expected_output);
+        ASSERT_EQ(expected_output.size(), output_ptr.size());
+        for (size_t i = 0; i < expected_output.size(); ++i) {
+            EXPECT_NEAR(expected_output[i], output_ptr[i], THRESHOLD);
+        }
+
+        ASSERT_EQ(test_inputs.expected_selected_boxes.size(), selected_boxes_ptr.size());
+        for (size_t i = 0; i < test_inputs.expected_selected_boxes.size(); ++i) {
+            EXPECT_EQ(test_inputs.expected_selected_boxes[i], selected_boxes_ptr[i]);
+        }
+
+        ASSERT_EQ(test_inputs.expected_valid_outputs.size(), valid_outputs_ptr.size());
+        for (size_t i = 0; i < test_inputs.expected_valid_outputs.size(); ++i) {
+            EXPECT_EQ(test_inputs.expected_valid_outputs[i], valid_outputs_ptr[i]);
+        }
+    }
+
+    static std::string PrintToStringParamName(const testing::TestParamInfo<matrix_nms_test_params>& info) {
+        auto& test_inputs = std::get<0>(info.param);
+        std::ostringstream result;
+
+        auto sort_res_type_str =
+            test_inputs.sort_result_type == matrix_nms::sort_result_type::score
+                ? "score"
+                : test_inputs.sort_result_type == matrix_nms::sort_result_type::class_id ? "class_id" : "none";
+        auto decay_function_str =
+            test_inputs.decay_function == matrix_nms::decay_function::linear
+                ? "linear"
+                : test_inputs.decay_function == matrix_nms::decay_function::gaussian ? "gaussian" : "none";
+
+        result << "SortResultAcrossBatch=" << bool_to_str(test_inputs.sort_result_across_batch) << "_";
+        result << "ScoreThreshold=" << test_inputs.score_threshold << "_";
+        result << "NmsTopK=" << test_inputs.nms_top_k << "_";
+        result << "KeepTopK=" << test_inputs.keep_top_k << "_";
+        result << "BackgroundClass=" << test_inputs.background_class << "_";
+        result << "GaussianSigma=" << test_inputs.gaussian_sigma << "_";
+        result << "PostThreshold=" << test_inputs.post_threshold << "_";
+        result << "Normalized=" << bool_to_str(test_inputs.normalized) << "_";
+        result << "sort_result_type=" << sort_res_type_str << "_";
+        result << "decay_function=" << decay_function_str << "_";
+        result << "Format=" << fmt_to_str(std::get<1>(info.param));
+
+        if (!test_inputs.test_name.empty())
+            result << "_TN=" << test_inputs.test_name;
+
+        return result.str();
+    }
+};
+
+matrix_nms_test_inputs get_matrix_nms_smoke_inputs() {
+    return {1,      // num_butches
+            6,      // num_boxes
+            2,      // num_classes
+            3,      // num_selected_boxes
+            false,  // sort_result_across_bch
+            0.0f,   // score_threshold
+            3,      // nms_top_k
+            -1,     // keep_top_k
+            0,      // background_class
+            2.0f,   // gaussian_sigma
+            0.0f,   // post_threshold
+            true,   // normalized
+            std::vector<float>{0.0, 0.0,  1.0, 1.0,  0.0, 0.1,  1.0, 1.1,  0.0, -0.1,  1.0, 0.9,  // boxes
+                               0.0, 10.0, 1.0, 11.0, 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0},
+            std::vector<float>{0.9, 0.75, 0.6, 0.95, 0.5, 0.3, 0.95, 0.75, 0.6, 0.80, 0.5, 0.3},  // scores
+            std::vector<float>{1.00,                                                              // expected_output
+                               0.95,
+                               0.00,
+                               0.00,
+                               1.00,
+                               1.00,
+                               1.00,
+                               0.8,
+                               0.00,
+                               10.00,
+                               1.00,
+                               11.00,
+                               1.00,
+                               0.13636364,
+                               0.0,
+                               0.1,
+                               1.0,
+                               1.1},
+            std::vector<int>{0, 3, 1},          // expected_selected_boxes
+            std::vector<int>{3},                // expected_valid_output
+            matrix_nms::sort_result_type::score,  // sort_result_type
+            matrix_nms::decay_function::linear,  // decay_function
+            "smoke"};
+}
+
+matrix_nms_test_inputs get_matrix_nms_gaussian_inputs() {
+    return {1,      // num_butches
+            6,      // num_boxes
+            2,      // num_classes
+            3,      // num_selected_boxes
+            false,  // sort_result_across_bch
+            0.0f,   // score_threshold
+            3,      // nms_top_k
+            -1,     // keep_top_k
+            0,      // background_class
+            2.0f,   // gaussian_sigma
+            0.0f,   // post_threshold
+            true,   // normalized
+            std::vector<float>{0.0, 0.0,  1.0, 1.0,  0.0, 0.1,  1.0, 1.1,  0.0, -0.1,  1.0, 0.9,  // boxes
+                               0.0, 10.0, 1.0, 11.0, 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0},
+            std::vector<float>{0.9, 0.75, 0.6, 0.95, 0.5, 0.3, 0.95, 0.75, 0.6, 0.80, 0.5, 0.3},  // scores
+            std::vector<float>{1.00,                                                              // expected_output
+                               0.95,
+                               0.00,
+                               0.00,
+                               1.00,
+                               1.00,
+                               1.00,
+                               0.8,
+                               0.00,
+                               10.00,
+                               1.00,
+                               11.00,
+                               1.00,
+                               0.1966116,
+                               0.0,
+                               0.1,
+                               1.0,
+                               1.1},
+            std::vector<int>{0, 3, 1},            // expected_selected_boxes
+            std::vector<int>{3},                  // expected_valid_output
+            matrix_nms::sort_result_type::score,    // sort_result_type
+            matrix_nms::decay_function::gaussian,  // decay_function
+            "gaussian"};
+}
+
+matrix_nms_test_inputs get_matrix_nms_two_batches_two_classes_inputs() {
+    return {2,      // num_butches
+            6,      // num_boxes
+            2,      // num_classes
+            6,      // num_selected_boxes
+            false,  // sort_result_across_bch
+            0.0f,   // score_threshold
+            3,      // nms_top_k
+            -1,     // keep_top_k
+            0,      // background_class
+            2.0f,   // gaussian_sigma
+            0.0f,   // post_threshold
+            true,   // normalized
+            std::vector<float>{0.0, 0.0,  1.0, 1.0,  0.0, 0.1,  1.0, 1.1,  0.0, -0.1,  1.0, 0.9,  // boxes
+                               0.0, 10.0, 1.0, 11.0, 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0,
+                               0.0, 0.0,  1.0, 1.0,  0.0, 0.1,  1.0, 1.1,  0.0, -0.1,  1.0, 0.9,
+                               0.0, 10.0, 1.0, 11.0, 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0},
+            std::vector<float>{0.9, 0.75, 0.6, 0.95, 0.5, 0.3, 0.95, 0.75, 0.6, 0.80, 0.5, 0.3,  // scores
+                               0.9, 0.75, 0.6, 0.95, 0.5, 0.3, 0.95, 0.75, 0.6, 0.80, 0.5, 0.3},
+            std::vector<float>{1.00, 0.95,  0.00, 0.00,  1.00, 1.00,  // expected_output
+                               1.00, 0.8,   0.00, 10.00, 1.00, 11.00,      1.00, 0.13636364, 0.0,  0.1,
+                               1.0,  1.1,   1.00, 0.95,  0.00, 0.00,       1.00, 1.00,       1.00, 0.8,
+                               0.00, 10.00, 1.00, 11.00, 1.00, 0.13636364, 0.0,  0.1,        1.0,  1.1},
+            std::vector<int>{0, 3, 1, 6, 9, 7},  // expected_selected_boxes
+            std::vector<int>{3, 3},              // expected_valid_output
+            matrix_nms::sort_result_type::score,   // sort_result_type
+            matrix_nms::decay_function::linear,   // decay_function
+            "two_batches_two_classes"};
+}
+
+matrix_nms_test_inputs get_matrix_nms_two_batches_two_classes_by_score_cross_batch_inputs() {
+    return {2,     // num_butches
+            6,     // num_boxes
+            2,     // num_classes
+            12,    // num_selected_boxes
+            true,  // sort_result_across_bch
+            0.0f,  // score_threshold
+            3,     // nms_top_k
+            -1,    // keep_top_k
+            -1,    // background_class
+            2.0f,  // gaussian_sigma
+            0.5f,  // post_threshold
+            true,  // normalized
+            std::vector<float>{0.0, 0.0,  1.0, 1.0,  0.0, 0.1,  1.0, 1.1,  0.0, -0.1,  1.0, 0.9,  // boxes
+                               0.0, 10.0, 1.0, 11.0, 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0,
+                               0.0, 0.0,  1.0, 1.0,  0.0, 0.1,  1.0, 1.1,  0.0, -0.1,  1.0, 0.9,
+                               0.0, 10.0, 1.0, 11.0, 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0},
+            std::vector<float>{0.9, 0.75, 0.6, 0.95, 0.5, 0.3, 0.95, 0.75, 0.6, 0.80, 0.5, 0.3,  // scores
+                               0.9, 0.75, 0.6, 0.95, 0.5, 0.3, 0.95, 0.75, 0.6, 0.80, 0.5, 0.3},
+            std::vector<float>{0.00, 0.95, 0.00, 10.00, 1.00, 11.00,  // expected_output
+                               1.00, 0.95, 0.00, 0.00,  1.00, 1.00,  0.00, 0.95, 0.00, 10.00, 1.00, 11.00, 1.00, 0.95,
+                               0.00, 0.00, 1.00, 1.00,  PAD,  PAD,   PAD,  PAD,  PAD,  PAD,   PAD,  PAD,   PAD,  PAD,
+                               PAD,  PAD,  0.00, 0.90,  0.00, 0.00,  1.00, 1.00, 0.00, 0.90,  0.00, 0.00,  1.00, 1.00,
+                               1.00, 0.80, 0.00, 10.00, 1.00, 11.00, 1.00, 0.80, 0.00, 10.00, 1.00, 11.00, PAD,  PAD,
+                               PAD,  PAD,  PAD,  PAD,   PAD,  PAD,   PAD,  PAD,  PAD,  PAD},
+            std::vector<int>{3, 0, 9, 6, PADI, PADI, 0, 6, 3, 9, PADI, PADI},  // expected_selected_boxes
+            std::vector<int>{4, 4},                                            // expected_valid_output
+            matrix_nms::sort_result_type::score,                                 // sort_result_type
+            matrix_nms::decay_function::linear,                                 // decay_function
+            "two_batches_two_classes_by_score_cross_batch"};
+}
+
+matrix_nms_test_inputs get_matrix_nms_two_batches_two_classes_by_classid_cross_batch_inputs() {
+    return {2,     // num_butches
+            6,     // num_boxes
+            2,     // num_classes
+            12,    // num_selected_boxes
+            true,  // sort_result_across_bch
+            0.0f,  // score_threshold
+            3,     // nms_top_k
+            -1,    // keep_top_k
+            -1,    // background_class
+            2.0f,  // gaussian_sigma
+            0.5f,  // post_threshold
+            true,  // normalized
+            std::vector<float>{0.0, 0.0,  1.0, 1.0,  0.0, 0.1,  1.0, 1.1,  0.0, -0.1,  1.0, 0.9,  // boxes
+                               0.0, 10.0, 1.0, 11.0, 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0,
+                               0.0, 0.0,  1.0, 1.0,  0.0, 0.1,  1.0, 1.1,  0.0, -0.1,  1.0, 0.9,
+                               0.0, 10.0, 1.0, 11.0, 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0},
+            std::vector<float>{0.9, 0.75, 0.6, 0.95, 0.5, 0.3, 0.95, 0.75, 0.6, 0.80, 0.5, 0.3,  // scores
+                               0.9, 0.75, 0.6, 0.95, 0.5, 0.3, 0.95, 0.75, 0.6, 0.80, 0.5, 0.3},
+            std::vector<float>{0.00, 0.95, 0.00, 10.00, 1.00, 11.00,  // expected_output
+                               0.00, 0.90, 0.00, 0.00,  1.00, 1.00,  0.00, 0.95, 0.00, 10.00, 1.00, 11.00, 0.00, 0.90,
+                               0.00, 0.00, 1.00, 1.00,  PAD,  PAD,   PAD,  PAD,  PAD,  PAD,   PAD,  PAD,   PAD,  PAD,
+                               PAD,  PAD,  1.00, 0.95,  0.00, 0.00,  1.00, 1.00, 1.00, 0.80,  0.00, 10.00, 1.00, 11.00,
+                               1.00, 0.95, 0.00, 0.00,  1.00, 1.00,  1.00, 0.80, 0.00, 10.00, 1.00, 11.00, PAD,  PAD,
+                               PAD,  PAD,  PAD,  PAD,   PAD,  PAD,   PAD,  PAD,  PAD,  PAD},
+            std::vector<int>{3, 0, 9, 6, PADI, PADI, 0, 3, 6, 9, PADI, PADI},  // expected_selected_boxes
+            std::vector<int>{4, 4},                                            // expected_valid_output
+            matrix_nms::sort_result_type::class_id,                              // sort_result_type
+            matrix_nms::decay_function::linear,                                 // decay_function
+            "matrix_nms_two_batches_two_classes_by_classid_cross_batch"};
+}
+
+matrix_nms_test_inputs get_matrix_nms_by_keep_top_k_inputs() {
+    return {2,      // num_butches
+            6,      // num_boxes
+            2,      // num_classes
+            6,      // num_selected_boxes
+            false,  // sort_result_across_bch
+            0.0f,   // score_threshold
+            3,      // nms_top_k
+            3,      // keep_top_k
+            0,      // background_class
+            2.0f,   // gaussian_sigma
+            0.0f,   // post_threshold
+            true,   // normalized
+            std::vector<float>{0.0, 0.0,  1.0, 1.0,  0.0, 0.1,  1.0, 1.1,  0.0, -0.1,  1.0, 0.9,  // boxes
+                               0.0, 10.0, 1.0, 11.0, 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0,
+                               0.0, 0.0,  1.0, 1.0,  0.0, 0.1,  1.0, 1.1,  0.0, -0.1,  1.0, 0.9,
+                               0.0, 10.0, 1.0, 11.0, 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0},
+            std::vector<float>{0.9, 0.75, 0.6, 0.95, 0.5, 0.3, 0.95, 0.75, 0.6, 0.80, 0.5, 0.3,  // scores
+                               0.9, 0.75, 0.6, 0.95, 0.5, 0.3, 0.95, 0.75, 0.6, 0.80, 0.5, 0.3},
+            std::vector<float>{1.00, 0.95,  0.00, 0.00,  1.00, 1.00,  // expected_output
+                               1.00, 0.8,   0.00, 10.00, 1.00, 11.00,      1.00, 0.13636364, 0.0,  0.1,
+                               1.0,  1.1,   1.00, 0.95,  0.00, 0.00,       1.00, 1.00,       1.00, 0.8,
+                               0.00, 10.00, 1.00, 11.00, 1.00, 0.13636364, 0.0,  0.1,        1.0,  1.1},
+            std::vector<int>{0, 3, 1, 6, 9, 7},    // expected_selected_boxes
+            std::vector<int>{3, 3},                // expected_valid_output
+            matrix_nms::sort_result_type::class_id,  // sort_result_type
+            matrix_nms::decay_function::linear,     // decay_function
+            "matrix_nms_by_keep_top_k"};
+}
+
+matrix_nms_test_inputs get_matrix_nms_background_inputs() {
+    return {1,      // num_butches
+            6,      // num_boxes
+            2,      // num_classes
+            6,      // num_selected_boxes
+            false,  // sort_result_across_bch
+            0.0f,   // score_threshold
+            3,      // nms_top_k
+            -1,     // keep_top_k
+            -1,     // background_class
+            2.0f,   // gaussian_sigma
+            0.0f,   // post_threshold
+            true,   // normalized
+            std::vector<float>{0.0, 0.0,  1.0, 1.0,  0.0, 0.1,  1.0, 1.1,  0.0, -0.1,  1.0, 0.9,  // boxes
+                               0.0, 10.0, 1.0, 11.0, 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0},
+            std::vector<float>{0.9, 0.75, 0.6, 0.95, 0.5, 0.3, 0.95, 0.75, 0.6, 0.80, 0.5, 0.3},  // scores
+            std::vector<float>{0.00, 0.95, 0.0,  10.0, 1.0,  11.0,                                // expected_output
+                               1.00, 0.95, 0.0,  0.0,  1.0,  1.0,        0.00, 0.9,  0.0,  0.0,
+                               1.0,  1.0,  1.00, 0.8,  0.0,  10.0,       1.0,  11.0, 0.00, 0.13636364,
+                               0.0,  0.1,  1.0,  1.1,  1.00, 0.13636364, 0.0,  0.1,  1.0,  1.1},
+            std::vector<int>{3, 0, 0, 3, 1, 1},  // expected_selected_boxes
+            std::vector<int>{6},                 // expected_valid_output
+            matrix_nms::sort_result_type::score,   // sort_result_type
+            matrix_nms::decay_function::linear,   // decay_function
+            "matrix_nms_background"};
+}
+
+matrix_nms_test_inputs get_matrix_nms_flipped_coordinates_inputs() {
+    return {1,      // num_butches
+            6,      // num_boxes
+            1,      // num_classes
+            3,      // num_selected_boxes
+            false,  // sort_result_across_bch
+            0.0f,   // score_threshold
+            3,      // nms_top_k
+            -1,     // keep_top_k
+            -1,     // background_class
+            2.0f,   // gaussian_sigma
+            0.0f,   // post_threshold
+            true,   // normalized
+            std::vector<float>{1.0, 1.0,  0.0, 0.0,  0.0, 0.1,  1.0, 1.1,  0.0, 0.9,   1.0, -0.1,  // boxes
+                               0.0, 10.0, 1.0, 11.0, 1.0, 10.1, 0.0, 11.1, 1.0, 101.0, 0.0, 100.0},
+            std::vector<float>{0.9, 0.75, 0.6, 0.95, 0.5, 0.3},  // scores
+            std::vector<float>{0.00,
+                               0.95,
+                               0.0,
+                               10.0,
+                               1.0,
+                               11.0,  // expected_output
+                               0.00,
+                               0.9,
+                               1.0,
+                               1.0,
+                               0.0,
+                               0.0,
+                               0.00,
+                               0.75,
+                               0.0,
+                               0.1,
+                               1.0,
+                               1.1},
+            std::vector<int>{3, 0, 1},          // expected_selected_boxes
+            std::vector<int>{3},                // expected_valid_output
+            matrix_nms::sort_result_type::score,  // sort_result_type
+            matrix_nms::decay_function::linear,  // decay_function
+            "flipped_coordinates"};
+}
+
+matrix_nms_test_inputs get_matrix_nms_post_threshold_inputs() {
+    return {1,      // num_butches
+            6,      // num_boxes
+            1,      // num_classes
+            3,      // num_selected_boxes
+            false,  // sort_result_across_bch
+            0.0f,   // score_threshold
+            3,      // nms_top_k
+            -1,     // keep_top_k
+            -1,     // background_class
+            2.0f,   // gaussian_sigma
+            0.8f,   // post_threshold
+            true,   // normalized
+            std::vector<float>{0.0, 0.0,  1.0, 1.0,  0.0, 0.1,  1.0, 1.1,  0.0, -0.1,  1.0, 0.9,  // boxes
+                               0.0, 10.0, 1.0, 11.0, 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0},
+            std::vector<float>{0.9, 0.75, 0.6, 0.95, 0.5, 0.3},  // scores
+            std::vector<float>{0.00,
+                               0.95,
+                               0.00,
+                               10.00,
+                               1.00,
+                               11.00,  // expected_output
+                               0.00,
+                               0.9,
+                               0.00,
+                               0.00,
+                               1.00,
+                               1.00,
+                               PAD,
+                               PAD,
+                               PAD,
+                               PAD,
+                               PAD,
+                               PAD},
+            std::vector<int>{3, 0, PADI},       // expected_selected_boxes
+            std::vector<int>{2},                // expected_valid_output
+            matrix_nms::sort_result_type::score,  // sort_result_type
+            matrix_nms::decay_function::linear,  // decay_function
+            "post_threshold"};
+}
+
+matrix_nms_test_inputs get_matrix_nms_identical_boxes_inputs() {
+    return {1,      // num_butches
+            10,     // num_boxes
+            1,      // num_classes
+            3,      // num_selected_boxes
+            false,  // sort_result_across_bch
+            0.0f,   // score_threshold
+            3,      // nms_top_k
+            -1,     // keep_top_k
+            -1,     // background_class
+            2.0f,   // gaussian_sigma
+            0.3f,   // post_threshold
+            true,   // normalized
+            std::vector<float>{0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0,  // boxes
+                               1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0,
+                               0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0},
+            std::vector<float>{0.4, 0.01, 0.2, 0.09, 0.15, 0.05, 0.02, 0.03, 0.05, 0.0},  // scores
+            std::vector<float>{0.00,
+                               0.40,
+                               0.00,
+                               0.00,
+                               1.00,
+                               1.00,  // expected_output
+                               PAD,
+                               PAD,
+                               PAD,
+                               PAD,
+                               PAD,
+                               PAD,
+                               PAD,
+                               PAD,
+                               PAD,
+                               PAD,
+                               PAD,
+                               PAD},
+            std::vector<int>{0, PADI, PADI},    // expected_selected_boxes
+            std::vector<int>{1},                // expected_valid_output
+            matrix_nms::sort_result_type::score,  // sort_result_type
+            matrix_nms::decay_function::linear,  // decay_function
+            "identical_boxes"};
+};
+
+matrix_nms_test_inputs get_matrix_nms_top_k_inputs() {
+    return {1,      // num_butches
+            6,      // num_boxes
+            1,      // num_classes
+            2,      // num_selected_boxes
+            false,  // sort_result_across_bch
+            0.0f,   // score_threshold
+            2,      // nms_top_k
+            -1,     // keep_top_k
+            -1,     // background_class
+            2.0f,   // gaussian_sigma
+            0.0f,   // post_threshold
+            true,   // normalized
+            std::vector<float>{0.0, 0.0,  1.0, 1.0,  0.0, 0.1,  1.0, 1.1,  0.0, -0.1,  1.0, 0.9,  // boxes
+                               0.0, 10.0, 1.0, 11.0, 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0},
+            std::vector<float>{0.9, 0.75, 0.6, 0.95, 0.5, 0.3},  // scores
+            std::vector<float>{0.00,
+                               0.95,
+                               0.00,
+                               10.00,
+                               1.00,
+                               11.00,  // expected_output
+                               0.00,
+                               0.90,
+                               0.00,
+                               0.00,
+                               1.00,
+                               1.00},
+            std::vector<int>{3, 0},             // expected_selected_boxes
+            std::vector<int>{2},                // expected_valid_output
+            matrix_nms::sort_result_type::score,  // sort_result_type
+            matrix_nms::decay_function::linear,  // decay_function
+            "matrix_nms_nms_top_k"};
+}
+
+matrix_nms_test_inputs get_matrix_nms_single_box_inputs() {
+    return {1,                                                       // num_butches
+            1,                                                       // num_boxes
+            1,                                                       // num_classes
+            1,                                                       // num_selected_boxes
+            false,                                                   // sort_result_across_bch
+            0.0f,                                                    // score_threshold
+            3,                                                       // nms_top_k
+            -1,                                                      // keep_top_k
+            -1,                                                      // background_class
+            2.0f,                                                    // gaussian_sigma
+            0.0f,                                                    // post_threshold
+            true,                                                    // normalized
+            std::vector<float>{0.0, 0.0, 1.0, 1.0},                  // boxes
+            std::vector<float>{0.9},                                 // scores
+            std::vector<float>{0.00, 0.90, 0.00, 0.00, 1.00, 1.00},  // expected_output
+            std::vector<int>{0},                                     // expected_selected_boxes
+            std::vector<int>{1},                                     // expected_valid_output
+            matrix_nms::sort_result_type::score,                       // sort_result_type
+            matrix_nms::decay_function::linear,                       // decay_function
+            "matrix_nms_single_box"};
+}
+
+matrix_nms_test_inputs get_matrix_nms_no_output_inputs() {
+    return {1,      // num_butches
+            6,      // num_boxes
+            1,      // num_classes
+            3,      // num_selected_boxes
+            false,  // sort_result_across_bch
+            2.0f,   // score_threshold
+            3,      // nms_top_k
+            -1,     // keep_top_k
+            -1,     // background_class
+            2.0f,   // gaussian_sigma
+            0.0f,   // post_threshold
+            true,   // normalized
+            std::vector<float>{0.0, 0.0,  1.0, 1.0,  0.0, 0.1,  1.0, 1.1,  0.0, -0.1,  1.0, 0.9,  // boxes
+                               0.0, 10.0, 1.0, 11.0, 0.0, 10.1, 1.0, 11.1, 0.0, 100.0, 1.0, 101.0},
+            std::vector<float>{0.9, 0.75, 0.6, 0.95, 0.5, 0.3},  // scores
+            std::vector<float>{PAD,
+                               PAD,
+                               PAD,
+                               PAD,
+                               PAD,
+                               PAD,  // expected_output
+                               PAD,
+                               PAD,
+                               PAD,
+                               PAD,
+                               PAD,
+                               PAD,
+                               PAD,
+                               PAD,
+                               PAD,
+                               PAD,
+                               PAD,
+                               PAD},
+            std::vector<int>{PADI, PADI, PADI},  // expected_selected_boxes
+            std::vector<int>{0},                 // expected_valid_output
+            matrix_nms::sort_result_type::score,   // sort_result_type
+            matrix_nms::decay_function::linear,   // decay_function
+            "matrix_nms_no_output"};
+}
+
+const std::vector<format::type> layout_formats = {format::bfyx,
+                                                  format::b_fs_yx_fsv16,
+                                                  format::b_fs_yx_fsv32,
+                                                  format::bs_fs_yx_bsv16_fsv16,
+                                                  format::bs_fs_yx_bsv32_fsv32,
+                                                  format::bs_fs_yx_bsv32_fsv16};
+
+#define INSTANTIATE_MATRIX_NMS_TEST_SUITE(input_type, func)                                                \
+    using matrix_nms_gpu_test_##input_type##func = matrix_nms_gpu_test<input_type>;                        \
+    TEST_P(matrix_nms_gpu_test_##input_type##func, test) {                                                 \
+        test();                                                                                            \
+    }                                                                                                      \
+    INSTANTIATE_TEST_SUITE_P(matrix_nms_test_##input_type##func,                                           \
+                             matrix_nms_gpu_test_##input_type##func,                                       \
+                             testing::Combine(testing::Values(func()), testing::ValuesIn(layout_formats)), \
+                             matrix_nms_gpu_test_##input_type##func::PrintToStringParamName);
+
+INSTANTIATE_MATRIX_NMS_TEST_SUITE(float, get_matrix_nms_smoke_inputs)
+INSTANTIATE_MATRIX_NMS_TEST_SUITE(float, get_matrix_nms_gaussian_inputs)
+INSTANTIATE_MATRIX_NMS_TEST_SUITE(float, get_matrix_nms_two_batches_two_classes_inputs)
+INSTANTIATE_MATRIX_NMS_TEST_SUITE(float, get_matrix_nms_two_batches_two_classes_by_classid_cross_batch_inputs)
+INSTANTIATE_MATRIX_NMS_TEST_SUITE(float, get_matrix_nms_two_batches_two_classes_by_score_cross_batch_inputs)
+INSTANTIATE_MATRIX_NMS_TEST_SUITE(float, get_matrix_nms_by_keep_top_k_inputs)
+INSTANTIATE_MATRIX_NMS_TEST_SUITE(float, get_matrix_nms_background_inputs)
+INSTANTIATE_MATRIX_NMS_TEST_SUITE(float, get_matrix_nms_flipped_coordinates_inputs)
+INSTANTIATE_MATRIX_NMS_TEST_SUITE(float, get_matrix_nms_post_threshold_inputs)
+INSTANTIATE_MATRIX_NMS_TEST_SUITE(float, get_matrix_nms_identical_boxes_inputs)
+INSTANTIATE_MATRIX_NMS_TEST_SUITE(float, get_matrix_nms_top_k_inputs)
+INSTANTIATE_MATRIX_NMS_TEST_SUITE(float, get_matrix_nms_single_box_inputs)
+INSTANTIATE_MATRIX_NMS_TEST_SUITE(float, get_matrix_nms_no_output_inputs)
+
+INSTANTIATE_MATRIX_NMS_TEST_SUITE(FLOAT16, get_matrix_nms_smoke_inputs)
+INSTANTIATE_MATRIX_NMS_TEST_SUITE(FLOAT16, get_matrix_nms_gaussian_inputs)
+INSTANTIATE_MATRIX_NMS_TEST_SUITE(FLOAT16, get_matrix_nms_two_batches_two_classes_inputs)
+INSTANTIATE_MATRIX_NMS_TEST_SUITE(FLOAT16, get_matrix_nms_by_keep_top_k_inputs)
+INSTANTIATE_MATRIX_NMS_TEST_SUITE(FLOAT16, get_matrix_nms_two_batches_two_classes_by_classid_cross_batch_inputs)
+INSTANTIATE_MATRIX_NMS_TEST_SUITE(FLOAT16, get_matrix_nms_two_batches_two_classes_by_score_cross_batch_inputs)
+INSTANTIATE_MATRIX_NMS_TEST_SUITE(FLOAT16, get_matrix_nms_background_inputs)
+INSTANTIATE_MATRIX_NMS_TEST_SUITE(FLOAT16, get_matrix_nms_flipped_coordinates_inputs)
+INSTANTIATE_MATRIX_NMS_TEST_SUITE(FLOAT16, get_matrix_nms_post_threshold_inputs)
+INSTANTIATE_MATRIX_NMS_TEST_SUITE(FLOAT16, get_matrix_nms_identical_boxes_inputs)
+INSTANTIATE_MATRIX_NMS_TEST_SUITE(FLOAT16, get_matrix_nms_top_k_inputs)
+INSTANTIATE_MATRIX_NMS_TEST_SUITE(FLOAT16, get_matrix_nms_single_box_inputs)
+INSTANTIATE_MATRIX_NMS_TEST_SUITE(FLOAT16, get_matrix_nms_no_output_inputs)
+
+#undef INSTANTIATE_MATRIX_NMS_TEST_SUITE
+
+}  // namespace
diff --git a/src/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/matrix_nms.cpp b/src/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/matrix_nms.cpp
new file mode 100644
index 00000000000..fd615903646
--- /dev/null
+++ b/src/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/matrix_nms.cpp
@@ -0,0 +1,48 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "single_layer_tests/matrix_nms.hpp"
+
+#include <tuple>
+#include <vector>
+
+#include "common_test_utils/test_constants.hpp"
+
+using namespace ov::test::subgraph;
+using namespace InferenceEngine;
+using namespace ngraph;
+
+const std::vector<std::vector<ov::Shape>> inStaticShapeParams = {{{3, 100, 4}, {3, 1, 100}},
+                                                                 {{1, 10, 4}, {1, 100, 10}}};
+
+const auto inputPrecisions = InputPrecisions{ov::element::f32, ov::element::i32, ov::element::f32};
+
+const std::vector<op::v8::MatrixNms::SortResultType> sortResultType = {op::v8::MatrixNms::SortResultType::CLASSID,
+                                                                       op::v8::MatrixNms::SortResultType::SCORE,
+                                                                       op::v8::MatrixNms::SortResultType::NONE};
+const std::vector<element::Type> outType = {element::i32, element::i64};
+const std::vector<TopKParams> topKParams = {TopKParams{-1, 5}, TopKParams{100, -1}};
+const std::vector<ThresholdParams> thresholdParams = {ThresholdParams{0.0f, 2.0f, 0.0f},
+                                                      ThresholdParams{0.1f, 1.5f, 0.2f}};
+const std::vector<int> backgroudClass = {-1, 1};
+const std::vector<bool> normalized = {true, false};
+const std::vector<op::v8::MatrixNms::DecayFunction> decayFunction = {op::v8::MatrixNms::DecayFunction::GAUSSIAN,
+                                                                     op::v8::MatrixNms::DecayFunction::LINEAR};
+
+const auto nmsParamsStatic =
+    ::testing::Combine(::testing::ValuesIn(ov::test::static_shapes_to_test_representation(inStaticShapeParams)),
+                       ::testing::Values(inputPrecisions),
+                       ::testing::ValuesIn(sortResultType),
+                       ::testing::ValuesIn(outType),
+                       ::testing::ValuesIn(topKParams),
+                       ::testing::ValuesIn(thresholdParams),
+                       ::testing::ValuesIn(backgroudClass),
+                       ::testing::ValuesIn(normalized),
+                       ::testing::ValuesIn(decayFunction),
+                       ::testing::Values(CommonTestUtils::DEVICE_GPU));
+
+INSTANTIATE_TEST_SUITE_P(smoke_MatrixNmsLayerTest_static,
+                         MatrixNmsLayerTest,
+                         nmsParamsStatic,
+                         MatrixNmsLayerTest::getTestCaseName);
diff --git a/src/tests/functional/shared_test_classes/src/single_layer/matrix_nms.cpp b/src/tests/functional/shared_test_classes/src/single_layer/matrix_nms.cpp
index 3c1e2c418e9..6d8f3c79de0 100644
--- a/src/tests/functional/shared_test_classes/src/single_layer/matrix_nms.cpp
+++ b/src/tests/functional/shared_test_classes/src/single_layer/matrix_nms.cpp
@@ -325,7 +325,10 @@ void MatrixNmsLayerTest::SetUp() {
     const auto paramOuts =
             ngraph::helpers::convert2OutputVector(ngraph::helpers::castOps2Nodes<ngraph::op::Parameter>(params));
     auto nms = std::make_shared<opset8::MatrixNms>(paramOuts[0], paramOuts[1], m_attrs);
-    if (!m_outStaticShape) {
+
+    if (targetDevice == CommonTestUtils::DEVICE_GPU) {
+        function = std::make_shared<Function>(nms, params, "MatrixNMS");
+    } else if (!m_outStaticShape) {
         auto result = std::make_shared<opset5::Result>(nms);
         function = std::make_shared<Function>(result, params, "MatrixNMS");
     } else {