[CPU] Add support for the 4th and 5th inputs of DetectionOutput (#1290)

* [CPU] Add support for the 4th and 5th inputs of DetectionOutput * address review comments * move the reference implementation to ngraph * some changes for mx NMS (nms_mx) * change the namespace of the reference implementation
2020-08-07 09:05:41 +03:00 · 2020-08-07 09:05:41 +03:00 · f9023ff7da
commit f9023ff7da
parent 8c118ef8b2
12 changed files with 1140 additions and 46 deletions
--- a/inference-engine/src/mkldnn_plugin/nodes/detectionoutput.cpp
+++ b/inference-engine/src/mkldnn_plugin/nodes/detectionoutput.cpp
@ -26,7 +26,7 @@ class DetectionOutputImpl: public ExtLayerBase {
 public:
    explicit DetectionOutputImpl(const CNNLayer* layer) {
        try {
-            if (layer->insData.size() != 3)
+            if (layer->insData.size() != 3 && layer->insData.size() != 5)
                THROW_IE_EXCEPTION << "Incorrect number of input edges for layer " << layer->name;
            if (layer->outData.empty())
                THROW_IE_EXCEPTION << "Incorrect number of output edges for layer " << layer->name;
@ -50,6 +50,9 @@ public:
            _offset = _normalized ? 0 : 1;
            _num_loc_classes = _share_location ? 1 : _num_classes;

+            with_add_box_pred = layer->insData.size() == 5;
+            _objectness_score = layer->GetParamAsFloat("objectness_score", 0.0f);
+
            std::string code_type_str = layer->GetParamAsString("code_type", "caffe.PriorBoxParameter.CORNER");
            _code_type = (code_type_str == "caffe.PriorBoxParameter.CENTER_SIZE" ? CodeType::CENTER_SIZE
                                                                                 : CodeType::CORNER);
@ -109,9 +112,8 @@ public:
            _num_priors_actual = InferenceEngine::make_shared_blob<int>({Precision::I32, num_priors_actual_size, C});
            _num_priors_actual->allocate();

-            addConfig(layer, {DataConfigurator(ConfLayout::PLN),
-                       DataConfigurator(ConfLayout::PLN),
-                       DataConfigurator(ConfLayout::PLN)}, {DataConfigurator(ConfLayout::PLN)});
+            std::vector<DataConfigurator> in_data_conf(layer->insData.size(), DataConfigurator(ConfLayout::PLN));
+            addConfig(layer, in_data_conf, {DataConfigurator(ConfLayout::PLN)});
        } catch (InferenceEngine::details::InferenceEngineException &ex) {
            errorMsg = ex.what();
        }
@ -121,51 +123,81 @@ public:
                       ResponseDesc *resp) noexcept override {
        float *dst_data = outputs[0]->buffer();

-        const float *loc_data    = inputs[idx_location]->buffer();
-        const float *conf_data   = inputs[idx_confidence]->buffer();
-        const float *prior_data  = inputs[idx_priors]->buffer();
+        const float *loc_data    = inputs[idx_location]->buffer().as<const float *>();
+        const float *conf_data   = inputs[idx_confidence]->buffer().as<const float *>();
+        const float *prior_data  = inputs[idx_priors]->buffer().as<const float *>();
+        const float *arm_conf_data = inputs.size() > 3 ? inputs[idx_arm_confidence]->buffer().as<const float *>() : nullptr;
+        const float *arm_loc_data = inputs.size() > 4 ? inputs[idx_arm_location]->buffer().as<const float *>() : nullptr;

        const int N = inputs[idx_confidence]->getTensorDesc().getDims()[0];

-        float *decoded_bboxes_data = _decoded_bboxes->buffer();
-        float *reordered_conf_data = _reordered_conf->buffer();
-        float *bbox_sizes_data     = _bbox_sizes->buffer();
-        int *detections_data       = _detections_count->buffer();
-        int *buffer_data           = _buffer->buffer();
-        int *indices_data          = _indices->buffer();
-        int *num_priors_actual     = _num_priors_actual->buffer();
+        float *decoded_bboxes_data = _decoded_bboxes->buffer().as<float *>();
+        float *reordered_conf_data = _reordered_conf->buffer().as<float *>();
+        float *bbox_sizes_data     = _bbox_sizes->buffer().as<float *>();
+        int *detections_data       = _detections_count->buffer().as<int *>();
+        int *buffer_data           = _buffer->buffer().as<int *>();
+        int *indices_data          = _indices->buffer().as<int *>();
+        int *num_priors_actual     = _num_priors_actual->buffer().as<int *>();

        for (int n = 0; n < N; ++n) {
            const float *ppriors = prior_data;
            const float *prior_variances = prior_data + _num_priors*_prior_size;
            if (_priors_batches) {
                ppriors += _variance_encoded_in_target ? n*_num_priors*_prior_size : 2*n*_num_priors*_prior_size;
-                prior_variances += _variance_encoded_in_target ? 0 : n*_num_priors*_prior_size;
+                prior_variances += _variance_encoded_in_target ? 0 : 2*n*_num_priors*_prior_size;
            }

            if (_share_location) {
                const float *ploc = loc_data + n*4*_num_priors;
                float *pboxes = decoded_bboxes_data + n*4*_num_priors;
                float *psizes = bbox_sizes_data + n*_num_priors;
-                decodeBBoxes(ppriors, ploc, prior_variances, pboxes, psizes, num_priors_actual, n);
+
+                if (with_add_box_pred) {
+                    const float *p_arm_loc = arm_loc_data + n*4*_num_priors;
+                    decodeBBoxes(ppriors, p_arm_loc, prior_variances, pboxes, psizes, num_priors_actual, n, _offset, _prior_size);
+                    decodeBBoxes(pboxes, ploc, prior_variances, pboxes, psizes, num_priors_actual, n, 0, 4, false);
+                } else {
+                    decodeBBoxes(ppriors, ploc, prior_variances, pboxes, psizes, num_priors_actual, n, _offset, _prior_size);
+                }
            } else {
                for (int c = 0; c < _num_loc_classes; ++c) {
                    if (c == _background_label_id) {
                        continue;
                    }
-
                    const float *ploc = loc_data + n*4*_num_loc_classes*_num_priors + c*4;
                    float *pboxes = decoded_bboxes_data + n*4*_num_loc_classes*_num_priors + c*4*_num_priors;
                    float *psizes = bbox_sizes_data + n*_num_loc_classes*_num_priors + c*_num_priors;
-                    decodeBBoxes(ppriors, ploc, prior_variances, pboxes, psizes, num_priors_actual, n);
+                    if (with_add_box_pred) {
+                        const float *p_arm_loc = arm_loc_data + n*4*_num_loc_classes*_num_priors + c*4;
+                        decodeBBoxes(ppriors, p_arm_loc, prior_variances, pboxes, psizes, num_priors_actual, n, _offset, _prior_size);
+                        decodeBBoxes(pboxes, ploc, prior_variances, pboxes, psizes, num_priors_actual, n, 0, 4, false);
+                    } else {
+                        decodeBBoxes(ppriors, ploc, prior_variances, pboxes, psizes, num_priors_actual, n, _offset, _prior_size);
+                    }
                }
            }
        }

-        for (int n = 0; n < N; ++n) {
-            for (int c = 0; c < _num_classes; ++c) {
+        if (with_add_box_pred) {
+            for (int n = 0; n < N; ++n) {
                for (int p = 0; p < _num_priors; ++p) {
-                    reordered_conf_data[n*_num_priors*_num_classes + c*_num_priors + p] = conf_data[n*_num_priors*_num_classes + p*_num_classes + c];
+                    if (arm_conf_data[n*_num_priors*2 + p * 2 + 1] < _objectness_score) {
+                        for (int c = 0; c < _num_classes; ++c) {
+                            reordered_conf_data[n*_num_priors*_num_classes + c*_num_priors + p] = c == _background_label_id ? 1.0f : 0.0f;
+                        }
+                    } else {
+                        for (int c = 0; c < _num_classes; ++c) {
+                            reordered_conf_data[n*_num_priors*_num_classes + c*_num_priors + p] = conf_data[n*_num_priors*_num_classes + p*_num_classes + c];
+                        }
+                    }
+                }
+            }
+        } else {
+            for (int n = 0; n < N; ++n) {
+                for (int c = 0; c < _num_classes; ++c) {
+                    for (int p = 0; p < _num_priors; ++p) {
+                        reordered_conf_data[n*_num_priors*_num_classes + c*_num_priors + p] = conf_data[n*_num_priors*_num_classes + p*_num_classes + c];
+                    }
                }
            }
        }
@ -204,8 +236,8 @@ public:
                int *pdetections = detections_data + n*_num_classes;

                const float *pconf = reordered_conf_data + n*_num_classes*_num_priors;
-                const float *pboxes = decoded_bboxes_data + n*4*_num_priors;
-                const float *psizes = bbox_sizes_data + n*_num_priors;
+                const float *pboxes = decoded_bboxes_data + n*4*_num_loc_classes*_num_priors;
+                const float *psizes = bbox_sizes_data + n*_num_loc_classes*_num_priors;

                nms_mx(pconf, pboxes, psizes, pbuffer, pindices, pdetections, _num_priors);
            }
@ -220,6 +252,7 @@ public:
                for (int c = 0; c < _num_classes; ++c) {
                    int detections = detections_data[n*_num_classes + c];
                    int *pindices = indices_data + n*_num_classes*_num_priors + c*_num_priors;
+
                    float *pconf  = reordered_conf_data + n*_num_classes*_num_priors + c*_num_priors;

                    for (int i = 0; i < detections; ++i) {
@ -310,7 +343,8 @@ private:
    const int idx_location = 0;
    const int idx_confidence = 1;
    const int idx_priors = 2;
-
+    const int idx_arm_confidence = 3;
+    const int idx_arm_location = 4;

    int _num_classes = 0;
    int _background_label_id = 0;
@ -324,6 +358,8 @@ private:
    bool _clip_after_nms    = false;  // clip bounding boxes after nms step
    bool _decrease_label_id = false;

+    bool with_add_box_pred = false;
+
    int _image_width = 0;
    int _image_height = 0;
    int _prior_size = 4;
@ -332,6 +368,7 @@ private:

    float _nms_threshold = 0.0f;
    float _confidence_threshold = 0.0f;
+    float _objectness_score = 0.0f;

    int _num = 0;
    int _num_loc_classes = 0;
@ -344,7 +381,8 @@ private:
    };

    void decodeBBoxes(const float *prior_data, const float *loc_data, const float *variance_data,
-                      float *decoded_bboxes, float *decoded_bbox_sizes, int* num_priors_actual, int n);
+                      float *decoded_bboxes, float *decoded_bbox_sizes, int* num_priors_actual, int n, const int& offs, const int& pr_size,
+                      bool decodeType = true); // after ARM = false

    void nms_cf(const float *conf_data, const float *bboxes, const float *sizes,
                int *buffer, int *indices, int &detections, int num_priors_actual);
@ -384,8 +422,8 @@ static inline float JaccardOverlap(const float *decoded_bbox,

    float xmin2 = decoded_bbox[idx2*4 + 0];
    float ymin2 = decoded_bbox[idx2*4 + 1];
-    float ymax2 = decoded_bbox[idx2*4 + 3];
    float xmax2 = decoded_bbox[idx2*4 + 2];
+    float ymax2 = decoded_bbox[idx2*4 + 3];

    if (xmin2 > xmax1 || xmax2 < xmin1 || ymin2 > ymax1 || ymax2 < ymin1) {
        return 0.0f;
@ -411,34 +449,36 @@ static inline float JaccardOverlap(const float *decoded_bbox,
 }

 void DetectionOutputImpl::decodeBBoxes(const float *prior_data,
-                                   const float *loc_data,
-                                   const float *variance_data,
-                                   float *decoded_bboxes,
-                                   float *decoded_bbox_sizes,
-                                   int* num_priors_actual,
-                                   int n) {
+                                       const float *loc_data,
+                                       const float *variance_data,
+                                       float *decoded_bboxes,
+                                       float *decoded_bbox_sizes,
+                                       int* num_priors_actual,
+                                       int n,
+                                       const int& offs,
+                                       const int& pr_size,
+                                       bool decodeType) {
    num_priors_actual[n] = _num_priors;
-    if (!_normalized) {
+    if (!_normalized && decodeType) {
        int num = 0;
        for (; num < _num_priors; ++num) {
-            float batch_id = prior_data[num * _prior_size + 0];
+            float batch_id = prior_data[num * pr_size + 0];
            if (batch_id == -1.f) {
                num_priors_actual[n] = num;
                break;
            }
        }
    }
-
    parallel_for(num_priors_actual[n], [&](int p) {
        float new_xmin = 0.0f;
        float new_ymin = 0.0f;
        float new_xmax = 0.0f;
        float new_ymax = 0.0f;

-        float prior_xmin = prior_data[p*_prior_size + 0 + _offset];
-        float prior_ymin = prior_data[p*_prior_size + 1 + _offset];
-        float prior_xmax = prior_data[p*_prior_size + 2 + _offset];
-        float prior_ymax = prior_data[p*_prior_size + 3 + _offset];
+        float prior_xmin = prior_data[p*pr_size + 0 + offs];
+        float prior_ymin = prior_data[p*pr_size + 1 + offs];
+        float prior_xmax = prior_data[p*pr_size + 2 + offs];
+        float prior_ymax = prior_data[p*pr_size + 3 + offs];

        float loc_xmin = loc_data[4*p*_num_loc_classes + 0];
        float loc_ymin = loc_data[4*p*_num_loc_classes + 1];
@ -591,7 +631,12 @@ void DetectionOutputImpl::nms_mx(const float* conf_data,
        bool keep = true;
        for (int k = 0; k < ndetection; ++k) {
            const int kept_idx = pindices[k];
-            float overlap = JaccardOverlap(bboxes, sizes, prior, kept_idx);
+            float overlap = 0.0f;
+            if (_share_location) {
+                overlap = JaccardOverlap(bboxes, sizes, prior, kept_idx);
+            } else {
+                overlap = JaccardOverlap(bboxes, sizes, cls*_num_priors + prior, cls*_num_priors + kept_idx);
+            }
            if (overlap > _nms_threshold) {
                keep = false;
                break;
--- a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/detection_output.cpp
+++ b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/detection_output.cpp
@ -0,0 +1,85 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "single_layer_tests/detection_output.hpp"
+
+using namespace LayerTestsDefinitions;
+
+namespace {
+
+const int numClasses = 11;
+const int backgroundLabelId = 0;
+const std::vector<int> topK = {75};
+const std::vector<std::vector<int>> keepTopK = { {50}, {100} };
+const std::vector<std::string> codeType = {"caffe.PriorBoxParameter.CORNER", "caffe.PriorBoxParameter.CENTER_SIZE"};
+const float nmsThreshold = 0.5f;
+const float confidenceThreshold = 0.3f;
+const std::vector<bool> clipAfterNms = {true, false};
+const std::vector<bool> clipBeforeNms = {true, false};
+const std::vector<bool> decreaseLabelId = {true, false};
+const float objectnessScore = 0.4f;  // objectness_score for the 5-input cases; the 3-input cases pass 0.0f instead
+const std::vector<size_t> numberBatch = {1, 2};  // the test SetUp writes this value into dim 0 of every input shape
+
+const auto commonAttributes = ::testing::Combine(  // element order must match DetectionOutputAttributes
+        ::testing::Values(numClasses),
+        ::testing::Values(backgroundLabelId),
+        ::testing::ValuesIn(topK),
+        ::testing::ValuesIn(keepTopK),
+        ::testing::ValuesIn(codeType),
+        ::testing::Values(nmsThreshold),
+        ::testing::Values(confidenceThreshold),
+        ::testing::ValuesIn(clipAfterNms),
+        ::testing::ValuesIn(clipBeforeNms),
+        ::testing::ValuesIn(decreaseLabelId)
+);
+
+/* =============== 3 inputs cases: ARM shapes are left empty, so the test keeps only 3 input shapes =============== */
+
+const std::vector<ParamsWhichSizeDepends> specificParams3In = {  // {varianceEncodedInTarget, shareLocation, normalized, inputHeight, inputWidth, Location, Confidence, Priors, ArmConfidence, ArmLocation}
+    ParamsWhichSizeDepends{true, true, true, 1, 1, {1, 60}, {1, 165}, {1, 1, 60}, {}, {}},
+    ParamsWhichSizeDepends{true, false, true, 1, 1, {1, 660}, {1, 165}, {1, 1, 60}, {}, {}},
+    ParamsWhichSizeDepends{false, true, true, 1, 1, {1, 60}, {1, 165}, {1, 2, 60}, {}, {}},
+    ParamsWhichSizeDepends{false, false, true, 1, 1, {1, 660}, {1, 165}, {1, 2, 60}, {}, {}},
+
+    ParamsWhichSizeDepends{true, true, false, 10, 10, {1, 60}, {1, 165}, {1, 1, 75}, {}, {}},
+    ParamsWhichSizeDepends{true, false, false, 10, 10, {1, 660}, {1, 165}, {1, 1, 75}, {}, {}},
+    ParamsWhichSizeDepends{false, true, false, 10, 10, {1, 60}, {1, 165}, {1, 2, 75}, {}, {}},
+    ParamsWhichSizeDepends{false, false, false, 10, 10, {1, 660}, {1, 165}, {1, 2, 75}, {}, {}}
+};
+
+const auto params3Inputs = ::testing::Combine(  // element order must match DetectionOutputParams
+        commonAttributes,
+        ::testing::ValuesIn(specificParams3In),
+        ::testing::ValuesIn(numberBatch),
+        ::testing::Values(0.0f),
+        ::testing::Values(CommonTestUtils::DEVICE_CPU)
+);
+
+INSTANTIATE_TEST_CASE_P(smoke_DetectionOutput3In, DetectionOutputLayerTest, params3Inputs, DetectionOutputLayerTest::getTestCaseName);
+
+/* =============== 5 inputs cases: ArmConfidence and ArmLocation shapes are provided =============== */
+
+const std::vector<ParamsWhichSizeDepends> specificParams5In = {  // same field order as specificParams3In; the last two shapes are the ARM inputs
+    ParamsWhichSizeDepends{true, true, true, 1, 1, {1, 60}, {1, 165}, {1, 1, 60}, {1, 30}, {1, 60}},
+    ParamsWhichSizeDepends{true, false, true, 1, 1, {1, 660}, {1, 165}, {1, 1, 60}, {1, 30}, {1, 660}},
+    ParamsWhichSizeDepends{false, true, true, 1, 1, {1, 60}, {1, 165}, {1, 2, 60}, {1, 30}, {1, 60}},
+    ParamsWhichSizeDepends{false, false, true, 1, 1, {1, 660}, {1, 165}, {1, 2, 60}, {1, 30}, {1, 660}},
+
+    ParamsWhichSizeDepends{true, true, false, 10, 10, {1, 60}, {1, 165}, {1, 1, 75}, {1, 30}, {1, 60}},
+    ParamsWhichSizeDepends{true, false, false, 10, 10, {1, 660}, {1, 165}, {1, 1, 75}, {1, 30}, {1, 660}},
+    ParamsWhichSizeDepends{false, true, false, 10, 10, {1, 60}, {1, 165}, {1, 2, 75}, {1, 30}, {1, 60}},
+    ParamsWhichSizeDepends{false, false, false, 10, 10, {1, 660}, {1, 165}, {1, 2, 75}, {1, 30}, {1, 660}}
+};
+
+const auto params5Inputs = ::testing::Combine(  // element order must match DetectionOutputParams
+        commonAttributes,
+        ::testing::ValuesIn(specificParams5In),
+        ::testing::ValuesIn(numberBatch),
+        ::testing::Values(objectnessScore),
+        ::testing::Values(CommonTestUtils::DEVICE_CPU)
+);
+
+INSTANTIATE_TEST_CASE_P(smoke_DetectionOutput5In, DetectionOutputLayerTest, params5Inputs, DetectionOutputLayerTest::getTestCaseName);
+
+}  // namespace
--- a/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/activation.hpp
+++ b/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/activation.hpp
@ -82,18 +82,18 @@ class ActivationLayerTest : public testing::WithParamInterface<activationParams>
 public:
    ngraph::helpers::ActivationTypes activationType;
    static std::string getTestCaseName(const testing::TestParamInfo<activationParams> &obj);
-    virtual InferenceEngine::Blob::Ptr GenerateInput(const InferenceEngine::InputInfo &info) const;
+    InferenceEngine::Blob::Ptr GenerateInput(const InferenceEngine::InputInfo &info) const override;

 protected:
-    void SetUp();
+    void SetUp() override;
 };

 class ActivationParamLayerTest : public ActivationLayerTest {
 public:
-    void Infer();
+    void Infer() override;

 protected:
-    void SetUp();
+    void SetUp() override;

 private:
    void generateActivationBlob();
--- a/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/detection_output.hpp
+++ b/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/detection_output.hpp
@ -0,0 +1,71 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <cstddef>
+#include <vector>
+#include <string>
+#include <tuple>
+
+#include "ngraph/op/detection_output.hpp"
+#include "functional_test_utils/layer_test_utils.hpp"
+
+namespace LayerTestsDefinitions {
+
+enum {  // positions of the input shapes inside the tests' inShapes vector
+    idxLocation,
+    idxConfidence,
+    idxPriors,
+    idxArmConfidence,  // present only in the 5-input variant
+    idxArmLocation,  // present only in the 5-input variant
+    numInputs  // number of entries above (5); SetUp uses it to size inShapes before unpacking
+};
+
+using DetectionOutputAttributes = std::tuple<  // unpacked with std::tie in SetUp/getTestCaseName, so the order is significant
+    int,                // numClasses
+    int,                // backgroundLabelId
+    int,                // topK
+    std::vector<int>,   // keepTopK
+    std::string,        // codeType
+    float,              // nmsThreshold
+    float,              // confidenceThreshold
+    bool,               // clipAfterNms
+    bool,               // clipBeforeNms
+    bool                // decreaseLabelId
+>;
+
+using ParamsWhichSizeDepends = std::tuple<  // attributes listed together with the input shapes they are tested with
+    bool,                        // varianceEncodedInTarget
+    bool,                        // shareLocation
+    bool,                        // normalized
+    size_t,                      // inputHeight
+    size_t,                      // inputWidth
+    InferenceEngine::SizeVector, // "Location" input
+    InferenceEngine::SizeVector, // "Confidence" input
+    InferenceEngine::SizeVector, // "Priors" input
+    InferenceEngine::SizeVector, // "ArmConfidence" input; an empty shape makes the test keep only the first 3 inputs
+    InferenceEngine::SizeVector  // "ArmLocation" input
+>;
+
+using DetectionOutputParams = std::tuple<  // full parameter set of one DetectionOutputLayerTest case
+    DetectionOutputAttributes,
+    ParamsWhichSizeDepends,
+    size_t,     // Batch size; written into dim 0 of every input shape
+    float,      // objectnessScore
+    std::string // Device name
+>;
+
+class DetectionOutputLayerTest : public testing::WithParamInterface<DetectionOutputParams>, public LayerTestsUtils::LayerTestsCommon {
+  public:
+    static std::string getTestCaseName(testing::TestParamInfo<DetectionOutputParams> obj);  // builds the gtest case name from every parameter
+    ngraph::op::DetectionOutputAttrs attrs;  // op attributes, filled from the test parameters in SetUp
+    std::vector<InferenceEngine::SizeVector> inShapes;  // input shapes in idx* order; 3 or 5 entries once SetUp has run
+    void Infer() override;  // fills every network input with random floats, then runs inference
+    void Compare(const std::vector<std::uint8_t> &expected, const InferenceEngine::Blob::Ptr &actual) override;  // compares only the values before the first -1 marker
+  protected:
+    void SetUp() override;  // unpacks the parameters and builds the single-op ngraph function
+};
+
+}  // namespace LayerTestsDefinitions
--- a/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/range.hpp
+++ b/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/range.hpp
@ -26,7 +26,7 @@ class RangeLayerTest : public testing::WithParamInterface<RangeParams>,
    float start, stop, step;
 public:
    static std::string getTestCaseName(testing::TestParamInfo<RangeParams> obj);
-    void Infer();
+    void Infer() override;

 protected:
    void SetUp() override;
--- a/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/detection_output.cpp
+++ b/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/detection_output.cpp
@ -0,0 +1,164 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <tuple>
+#include <vector>
+
+#include "ngraph_functions/builders.hpp"
+#include "common_test_utils/data_utils.hpp"
+#include "functional_test_utils/layer_test_utils.hpp"
+#include "single_layer_tests/detection_output.hpp"
+
+namespace LayerTestsDefinitions {
+
+std::string DetectionOutputLayerTest::getTestCaseName(testing::TestParamInfo<DetectionOutputParams> obj) {
+    // Produces the gtest case name: the input shapes (with the batch applied to dim 0)
+    // followed by every DetectionOutput attribute and the target device.
+    DetectionOutputAttributes sharedAttrs;
+    ParamsWhichSizeDepends shapeAttrs;
+    ngraph::op::DetectionOutputAttrs attrs;
+    size_t batch;
+    std::string targetDevice;
+    std::tie(sharedAttrs, shapeAttrs, batch, attrs.objectness_score, targetDevice) = obj.param;
+
+    std::tie(attrs.num_classes,
+             attrs.background_label_id,
+             attrs.top_k,
+             attrs.keep_top_k,
+             attrs.code_type,
+             attrs.nms_threshold,
+             attrs.confidence_threshold,
+             attrs.clip_after_nms,
+             attrs.clip_before_nms,
+             attrs.decrease_label_id) = sharedAttrs;
+
+    // Unpack all five shape slots first; the ARM slots may come back empty.
+    std::vector<InferenceEngine::SizeVector> inShapes(numInputs);
+    std::tie(attrs.variance_encoded_in_target,
+             attrs.share_location,
+             attrs.normalized,
+             attrs.input_height,
+             attrs.input_width,
+             inShapes[idxLocation],
+             inShapes[idxConfidence],
+             inShapes[idxPriors],
+             inShapes[idxArmConfidence],
+             inShapes[idxArmLocation]) = shapeAttrs;
+
+    // An empty "ArmConfidence" shape marks the 3-input variant: drop both ARM slots.
+    const bool hasArmInputs = !inShapes[idxArmConfidence].empty();
+    if (!hasArmInputs) {
+        inShapes.resize(idxArmConfidence);
+    }
+
+    // The batch parameter overrides dim 0 of every remaining shape.
+    for (auto &shape : inShapes) {
+        shape[0] = batch;
+    }
+
+    std::ostringstream result;
+    result << "IS = { "
+           << "LOC=" << CommonTestUtils::vec2str(inShapes[idxLocation]) << "_"
+           << "CONF=" << CommonTestUtils::vec2str(inShapes[idxConfidence]) << "_"
+           << "PRIOR=" << CommonTestUtils::vec2str(inShapes[idxPriors]);
+    if (hasArmInputs) {
+        result << "_ARM_CONF=" << CommonTestUtils::vec2str(inShapes[idxArmConfidence]) << "_"
+               << "ARM_LOC=" << CommonTestUtils::vec2str(inShapes[idxArmLocation]);
+    }
+    result << " }_";
+
+    result << "Classes=" << attrs.num_classes << "_"
+           << "backgrId=" << attrs.background_label_id << "_"
+           << "topK="  << attrs.top_k << "_"
+           << "varEnc=" << attrs.variance_encoded_in_target << "_"
+           << "keepTopK=" << CommonTestUtils::vec2str(attrs.keep_top_k) << "_"
+           << "codeType=" << attrs.code_type << "_"
+           << "shareLoc=" << attrs.share_location << "_"
+           << "nmsThr=" << attrs.nms_threshold << "_"
+           << "confThr=" << attrs.confidence_threshold << "_"
+           << "clipAfterNms=" << attrs.clip_after_nms << "_"
+           << "clipBeforeNms=" << attrs.clip_before_nms << "_"
+           << "decrId=" << attrs.decrease_label_id << "_"
+           << "norm=" << attrs.normalized << "_"
+           << "inH=" << attrs.input_height << "_"
+           << "inW=" << attrs.input_width << "_"
+           << "OS=" << attrs.objectness_score << "_"
+           << "TargetDevice=" << targetDevice;
+    return result.str();
+}
+
+void DetectionOutputLayerTest::Infer() {
+    // Creates a fresh infer request, fills every network input with random floats
+    // (generator settings depend on the input's position) and runs inference.
+    inferRequest = executableNetwork.CreateInferRequest();
+    inputs.clear();
+
+    // NOTE(review): inputIdx counts the iteration order of getInputsInfo(); it is
+    // presumably the same as the DetectionOutput input order — confirm.
+    size_t inputIdx = 0;
+    for (const auto &entry : cnnNetwork.getInputsInfo()) {
+        const auto &info = entry.second;
+
+        uint32_t range = 1;
+        int32_t resolution = 1;
+        switch (inputIdx) {
+        case idxPriors:
+            // Normalized priors keep range 1 with a finer resolution; otherwise only the range grows.
+            if (attrs.normalized) {
+                resolution = 100;
+            } else {
+                range = 10;
+            }
+            break;
+        case idxConfidence:
+        case idxArmConfidence:
+            resolution = 1000;
+            break;
+        default:
+            resolution = 10;
+            break;
+        }
+
+        InferenceEngine::Blob::Ptr blob = make_blob_with_precision(info->getTensorDesc());
+        blob->allocate();
+        CommonTestUtils::fill_data_random_float<InferenceEngine::Precision::FP32>(blob, range, 0, resolution);
+        inferRequest.SetBlob(info->name(), blob);
+        inputs.push_back(blob);
+        ++inputIdx;
+    }
+    inferRequest.Infer();
+}
+
+void DetectionOutputLayerTest::Compare(const std::vector<std::uint8_t> &expected, const InferenceEngine::Blob::Ptr &actual) {
+    // Compares the reference output with the plugin output.
+    // Both buffers are read as floats in rows of 7 values; a row whose first value is -1
+    // ends the valid data, so only the rows before it are compared (tolerance 1e-2).
+    //   expected - raw bytes of the reference output
+    //   actual   - output blob produced by the plugin
+    ASSERT_EQ(expected.size(), actual->byteSize());
+
+    auto memory = InferenceEngine::as<InferenceEngine::MemoryBlob>(actual);
+    IE_ASSERT(memory);
+    // The blob is only inspected here, so request a read mapping: a write mapping (wmap)
+    // does not guarantee that the mapped memory holds the blob's current contents.
+    const auto lockedMemory = memory->rmap();
+    const auto actualBuffer = lockedMemory.as<const std::uint8_t *>();
+
+    const float *expBuf = reinterpret_cast<const float *>(expected.data());
+    const float *actBuf = reinterpret_cast<const float *>(actualBuffer);
+
+    const size_t rowSize = 7;
+    const size_t totalSize = actual->size();
+    // Number of floats located before the first row that starts with -1.
+    const auto countValid = [rowSize, totalSize](const float *buf) {
+        size_t count = 0;
+        for (size_t i = 0; i < totalSize; i += rowSize) {
+            if (buf[i] == -1)
+                break;
+            count += rowSize;
+        }
+        return count;
+    };
+
+    const size_t expSize = countValid(expBuf);
+    const size_t actSize = countValid(actBuf);
+    ASSERT_EQ(expSize, actSize);
+    LayerTestsCommon::Compare<float>(expBuf, actBuf, expSize, 1e-2f);
+}
+
+void DetectionOutputLayerTest::SetUp() {
+    // Unpacks the test parameters into attrs/inShapes and builds an ngraph function
+    // consisting of the input Parameters, one DetectionOutput op and a Result.
+    DetectionOutputAttributes sharedAttrs;
+    ParamsWhichSizeDepends shapeAttrs;
+    size_t batch;
+    std::tie(sharedAttrs, shapeAttrs, batch, attrs.objectness_score, targetDevice) = this->GetParam();
+
+    std::tie(attrs.num_classes,
+             attrs.background_label_id,
+             attrs.top_k,
+             attrs.keep_top_k,
+             attrs.code_type,
+             attrs.nms_threshold,
+             attrs.confidence_threshold,
+             attrs.clip_after_nms,
+             attrs.clip_before_nms,
+             attrs.decrease_label_id) = sharedAttrs;
+
+    // Make room for all five shapes before unpacking; the ARM ones may be empty.
+    inShapes.resize(numInputs);
+    std::tie(attrs.variance_encoded_in_target,
+             attrs.share_location,
+             attrs.normalized,
+             attrs.input_height,
+             attrs.input_width,
+             inShapes[idxLocation],
+             inShapes[idxConfidence],
+             inShapes[idxPriors],
+             inShapes[idxArmConfidence],
+             inShapes[idxArmLocation]) = shapeAttrs;
+
+    // An empty "ArmConfidence" shape selects the 3-input variant.
+    if (inShapes[idxArmConfidence].empty()) {
+        inShapes.resize(idxArmConfidence);
+    }
+
+    // The batch parameter overrides dim 0 of every input shape.
+    for (auto &shape : inShapes) {
+        shape[0] = batch;
+    }
+
+    auto inputParams = ngraph::builder::makeParams(ngraph::element::f32, inShapes);
+    auto inputOutputs = ngraph::helpers::convert2OutputVector(
+            ngraph::helpers::castOps2Nodes<ngraph::opset3::Parameter>(inputParams));
+    auto detectionOutput = ngraph::builder::makeDetectionOutput(inputOutputs, attrs);
+    ngraph::ResultVector results{std::make_shared<ngraph::opset3::Result>(detectionOutput)};
+    function = std::make_shared<ngraph::Function>(results, inputParams, "DetectionOutput");
+}
+
+// Runs the common layer-test flow via Run() for every instantiated parameter set.
+TEST_P(DetectionOutputLayerTest, CompareWithRefs) {
+    Run();
+}
+
+}  // namespace LayerTestsDefinitions
+
--- a/inference-engine/tests/ie_test_utils/functional_test_utils/layer_test_utils.hpp
+++ b/inference-engine/tests/ie_test_utils/functional_test_utils/layer_test_utils.hpp
@ -96,7 +96,7 @@ protected:

    void LoadNetwork();

-    void Infer();
+    virtual void Infer();

    TargetDevice targetDevice;
    std::shared_ptr<ngraph::Function> function;
--- a/inference-engine/tests/ngraph_functions/include/ngraph_functions/builders.hpp
+++ b/inference-engine/tests/ngraph_functions/include/ngraph_functions/builders.hpp
@ -314,5 +314,8 @@ std::shared_ptr<ngraph::Node> makeLogical(const ngraph::Output<Node> &in0,
                                          const ngraph::Output<Node> &in1,
                                          ngraph::helpers::LogicalTypes logicalType);

+std::shared_ptr<ngraph::Node> makeDetectionOutput(const ngraph::OutputVector &inputs,  // must hold exactly 3 or 5 outputs; otherwise std::runtime_error is thrown
+                                                  const ngraph::op::DetectionOutputAttrs& attrs);  // passed to the opset3::DetectionOutput constructor
+
 }  // namespace builder
 }  // namespace ngraph
--- a/inference-engine/tests/ngraph_functions/src/detection_output.cpp
+++ b/inference-engine/tests/ngraph_functions/src/detection_output.cpp
@ -0,0 +1,21 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "ngraph_functions/builders.hpp"
+
+namespace ngraph {
+namespace builder {
+
+std::shared_ptr<ngraph::Node> makeDetectionOutput(const ngraph::OutputVector &inputs,
+                                                  const ngraph::op::DetectionOutputAttrs& attrs) {
+    // Creates an opset3::DetectionOutput node from 3 or 5 inputs, passing attrs to its constructor.
+    // Any other number of inputs is rejected with std::runtime_error.
+    switch (inputs.size()) {
+    case 3:
+        return std::make_shared<ngraph::opset3::DetectionOutput>(inputs[0], inputs[1], inputs[2], attrs);
+    case 5:
+        return std::make_shared<ngraph::opset3::DetectionOutput>(inputs[0], inputs[1], inputs[2], inputs[3], inputs[4], attrs);
+    default:
+        throw std::runtime_error("DetectionOutput layer supports only 3 or 5 inputs");
+    }
+}
+
+}  // namespace builder
+}  // namespace ngraph
--- a/ngraph/test/runtime/interpreter/int_executable.hpp
+++ b/ngraph/test/runtime/interpreter/int_executable.hpp
@ -92,6 +92,8 @@
 #include "op/convolution.hpp"
 #include "op/group_conv.hpp"

+#include "reference/detection_output.hpp"
+
 namespace ngraph
 {
    namespace runtime
@ -1112,6 +1114,36 @@ protected:
            }
            break;
        }
+        case OP_TYPEID::DetectionOutput_v0:
+        {
+            // The reference kernel takes the two extra tensors as nullable pointers, so the 3- and
+            // 5-input forms share a single run() call; only the last two pointers differ.
+            const auto* detectionOutput = static_cast<const op::DetectionOutput*>(&node);
+            reference::referenceDetectionOutput<T> kernel(
+                detectionOutput->get_attrs(), node.get_input_shape(0), node.get_input_shape(2));
+
+            const auto inputCount = node.get_input_size();
+            if (inputCount != 3 && inputCount != 5)
+            {
+                throw ngraph_error("DetectionOutput layer supports only 3 or 5 inputs");
+            }
+
+            const bool hasExtraInputs = (inputCount == 5);
+            const T* armConfidence = hasExtraInputs ? args[3]->get_data_ptr<const T>() : nullptr;
+            const T* armLocation = hasExtraInputs ? args[4]->get_data_ptr<const T>() : nullptr;
+            kernel.run(args[0]->get_data_ptr<const T>(),
+                       args[1]->get_data_ptr<const T>(),
+                       args[2]->get_data_ptr<const T>(),
+                       armConfidence,
+                       armLocation,
+                       out[0]->get_data_ptr<T>());
+
+            break;
+        }

        // Fused Ops are not supported in interpreter. They need to be decomposed before execution
        case OP_TYPEID::DepthToSpace:
--- a/ngraph/test/runtime/interpreter/opset_int_tbl.hpp
+++ b/ngraph/test/runtime/interpreter/opset_int_tbl.hpp
@ -18,6 +18,10 @@
 #include "opset0_tbl.hpp"
 #undef ID_SUFFIX

+// DetectionOutput (op::v0) is listed on its own with the _v0 suffix.
+// NOTE(review): presumably this is what yields OP_TYPEID::DetectionOutput_v0 for the
+// interpreter's dispatch switch -- confirm against the NGRAPH_OP definition.
+#define ID_SUFFIX(NAME) NAME##_v0
+NGRAPH_OP(DetectionOutput, op::v0)
+#undef ID_SUFFIX
+
 #define ID_SUFFIX(NAME) NAME##_v1
 NGRAPH_OP(LessEqual, op::v1)
 NGRAPH_OP(LogicalAnd, op::v1)
--- a/ngraph/test/runtime/interpreter/reference/detection_output.hpp
+++ b/ngraph/test/runtime/interpreter/reference/detection_output.hpp
@ -0,0 +1,669 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <map>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "ngraph/shape.hpp"
+
+namespace ngraph
+{
+    namespace runtime
+    {
+        namespace reference
+        {
+            // Positions of the DetectionOutput inputs. The order matches the pointer parameters
+            // of referenceDetectionOutput::run(); numInputs is the number of entries before it.
+            enum
+            {
+                idxLocation = 0,
+                idxConfidence = 1,
+                idxPriors = 2,
+                idxArmConfidence = 3,
+                idxArmLocation = 4,
+                numInputs = 5
+            };
+
+            template <typename dataType>
+            class referenceDetectionOutput
+            {
+            private:
+                // Four box values plus a cached area.
+                // For prior and decoded boxes the fields are the corner coordinates; for location
+                // predictions the same four fields carry the raw values read from the tensor, which
+                // DecodeBBox() later combines with a prior box.
+                struct NormalizedBBox
+                {
+                    dataType xmin = 0;
+                    dataType ymin = 0;
+                    dataType xmax = 0;
+                    dataType ymax = 0;
+                    // Area as returned by BBoxSize(); set by GetPriorBBoxes() and DecodeBBox().
+                    dataType size = 0;
+                };
+                // Boxes of one image grouped by label. The key -1 is used when
+                // attrs.share_location is set, i.e. one box list serves every class.
+                using LabelBBox = std::map<int, std::vector<NormalizedBBox>>;
+
+                // Copy of the operation attributes taken at construction.
+                ngraph::op::DetectionOutputAttrs attrs;
+                // First dimension of the location input shape.
+                size_t numImages;
+                // Values stored per prior box: 4 when attrs.normalized, otherwise 5.
+                size_t priorSize;
+                // Third dimension of the priors input shape divided by priorSize.
+                size_t numPriors;
+                // 1 when attrs.share_location, otherwise attrs.num_classes.
+                size_t numLocClasses;
+                // Index of the first coordinate inside a prior record: 0 when attrs.normalized,
+                // otherwise 1.
+                size_t offset;
+
+                // Unpacks the flat location tensor into one LabelBBox map per image.
+                // locData is read as [image][prior][location class][4 values]; the four values are
+                // stored in xmin, ymin, xmax, ymax in that order. The map key is -1 when
+                // attrs.share_location is set, otherwise the location class index.
+                void GetLocPredictions(const dataType* locData, std::vector<LabelBBox>& locations)
+                {
+                    const size_t valuesPerPrior = numLocClasses * 4;
+                    const size_t valuesPerImage = numPriors * valuesPerPrior;
+
+                    locations.resize(numImages);
+                    for (size_t image = 0; image < numImages; ++image)
+                    {
+                        LabelBBox& boxesByLabel = locations[image];
+                        const dataType* imageData = locData + image * valuesPerImage;
+                        for (size_t prior = 0; prior < numPriors; ++prior)
+                        {
+                            const dataType* priorData = imageData + prior * valuesPerPrior;
+                            for (size_t cls = 0; cls < numLocClasses; ++cls)
+                            {
+                                const int label =
+                                    attrs.share_location ? -1 : static_cast<int>(cls);
+                                // A label seen for the first time gets a list sized for all priors.
+                                auto entry = boxesByLabel.find(label);
+                                if (entry == boxesByLabel.end())
+                                {
+                                    entry = boxesByLabel
+                                                .emplace(label,
+                                                         std::vector<NormalizedBBox>(numPriors))
+                                                .first;
+                                }
+                                const dataType* values = priorData + cls * 4;
+                                NormalizedBBox& box = entry->second[prior];
+                                box.xmin = values[0];
+                                box.ymin = values[1];
+                                box.xmax = values[2];
+                                box.ymax = values[3];
+                            }
+                        }
+                    }
+                }
+
+                // Regroups the flat confidence tensor, read as [image][prior][class], into one
+                // score list per class for every image. Each list ends up ordered by prior index.
+                void GetConfidenceScores(
+                    const dataType* confData,
+                    std::vector<std::map<int, std::vector<dataType>>>& confPreds)
+                {
+                    confPreds.resize(numImages);
+                    for (auto& scoresByLabel : confPreds)
+                    {
+                        for (size_t prior = 0; prior < numPriors; ++prior)
+                        {
+                            const int firstScore = static_cast<int>(prior) * attrs.num_classes;
+                            for (int cls = 0; cls < attrs.num_classes; ++cls)
+                            {
+                                scoresByLabel[cls].push_back(confData[firstScore + cls]);
+                            }
+                        }
+                        // Step to the scores of the next image.
+                        confData += numPriors * attrs.num_classes;
+                    }
+                }
+
+                // Variant of GetConfidenceScores() that consults a second confidence tensor.
+                // armConfData holds two values per prior; when the second one is below
+                // attrs.objectness_score the prior's scores are replaced by 1 for
+                // attrs.background_label_id and 0 for every other class. Otherwise the scores
+                // from confData are taken unchanged.
+                void OSGetConfidenceScores(
+                    const dataType* confData,
+                    const dataType* armConfData,
+                    std::vector<std::map<int, std::vector<dataType>>>& confPreds)
+                {
+                    confPreds.resize(numImages);
+                    for (auto& scoresByLabel : confPreds)
+                    {
+                        for (size_t prior = 0; prior < numPriors; ++prior)
+                        {
+                            const bool belowObjectness =
+                                armConfData[prior * 2 + 1] < attrs.objectness_score;
+                            const int firstScore = static_cast<int>(prior) * attrs.num_classes;
+                            for (int cls = 0; cls < attrs.num_classes; ++cls)
+                            {
+                                if (belowObjectness)
+                                {
+                                    scoresByLabel[cls].push_back(
+                                        cls == attrs.background_label_id ? 1 : 0);
+                                }
+                                else
+                                {
+                                    scoresByLabel[cls].push_back(confData[firstScore + cls]);
+                                }
+                            }
+                        }
+                        // Step both tensors to the next image.
+                        confData += numPriors * attrs.num_classes;
+                        armConfData += numPriors * 2;
+                    }
+                }
+
+                // Returns the area of a box, or 0 when either max coordinate is smaller than the
+                // matching min coordinate.
+                dataType BBoxSize(const NormalizedBBox& bbox)
+                {
+                    const bool inverted = bbox.xmax < bbox.xmin || bbox.ymax < bbox.ymin;
+                    if (inverted)
+                    {
+                        return 0;
+                    }
+                    const dataType width = bbox.xmax - bbox.xmin;
+                    const dataType height = bbox.ymax - bbox.ymin;
+                    return width * height;
+                }
+
+                // Reads the prior boxes (and, unless attrs.variance_encoded_in_target is set, the
+                // variances) for every image.
+                //
+                // Per image the tensor holds numPriors records of priorSize values, followed by a
+                // variance block when variances are not encoded in the target. Box coordinates
+                // start `offset` values into each record; variances are read 4 per prior.
+                //
+                // The per-image base is computed from the start of the tensor. Adding n * stride
+                // to a running pointer would accumulate across iterations and misplace every
+                // image with index >= 2.
+                //
+                // NOTE(review): this indexes the priors tensor by image for all numImages images;
+                // confirm callers never pass a priors tensor with a smaller batch.
+                void GetPriorBBoxes(const dataType* priorData,
+                                    std::vector<std::vector<NormalizedBBox>>& priorBboxes,
+                                    std::vector<std::vector<std::vector<dataType>>>& priorVariances)
+                {
+                    priorBboxes.resize(numImages);
+                    priorVariances.resize(numImages);
+
+                    const size_t boxValuesPerImage = numPriors * priorSize;
+                    const size_t imageStride = attrs.variance_encoded_in_target
+                                                   ? boxValuesPerImage
+                                                   : 2 * boxValuesPerImage;
+                    for (size_t n = 0; n < numImages; ++n)
+                    {
+                        const dataType* imagePriors = priorData + n * imageStride;
+                        std::vector<NormalizedBBox>& currPrBbox = priorBboxes[n];
+                        std::vector<std::vector<dataType>>& currPrVar = priorVariances[n];
+
+                        currPrBbox.reserve(currPrBbox.size() + numPriors);
+                        for (size_t i = 0; i < numPriors; ++i)
+                        {
+                            const dataType* coords = imagePriors + i * priorSize + offset;
+                            NormalizedBBox bbox;
+                            bbox.xmin = coords[0];
+                            bbox.ymin = coords[1];
+                            bbox.xmax = coords[2];
+                            bbox.ymax = coords[3];
+                            bbox.size = BBoxSize(bbox);
+                            currPrBbox.push_back(bbox);
+                        }
+                        if (!attrs.variance_encoded_in_target)
+                        {
+                            // The variance block follows the box records of the same image.
+                            const dataType* priorVar = imagePriors + boxValuesPerImage;
+                            currPrVar.reserve(currPrVar.size() + numPriors);
+                            for (size_t i = 0; i < numPriors; ++i)
+                            {
+                                const dataType* var = priorVar + i * 4;
+                                currPrVar.emplace_back(var, var + 4);
+                            }
+                        }
+                    }
+                }
+
+                // Combines one prior box with one location prediction into a decoded box.
+                //  - When attrs.normalized is false the prior is first divided by
+                //    attrs.input_width / attrs.input_height.
+                //  - CORNER code type: the prediction is added to the prior corners.
+                //  - CENTER_SIZE code type: xmin/ymin of the prediction shift the prior centre,
+                //    xmax/ymax scale the prior width/height through exp().
+                //  - priorVariances is only read when attrs.variance_encoded_in_target is false;
+                //    it then scales the four prediction values.
+                //  - With attrs.clip_before_nms the result is clamped to [0, 1].
+                // Any other code type leaves the corners of decodeBbox as passed in. The cached
+                // size of decodeBbox is always refreshed.
+                void DecodeBBox(const NormalizedBBox& priorBboxes,
+                                const std::vector<dataType>& priorVariances,
+                                const NormalizedBBox& bbox,
+                                NormalizedBBox& decodeBbox)
+                {
+                    dataType pXmin = priorBboxes.xmin;
+                    dataType pYmin = priorBboxes.ymin;
+                    dataType pXmax = priorBboxes.xmax;
+                    dataType pYmax = priorBboxes.ymax;
+                    if (!attrs.normalized)
+                    {
+                        pXmin /= attrs.input_width;
+                        pYmin /= attrs.input_height;
+                        pXmax /= attrs.input_width;
+                        pYmax /= attrs.input_height;
+                    }
+
+                    const bool varianceInTarget = attrs.variance_encoded_in_target;
+                    const bool cornerCoding = attrs.code_type == "caffe.PriorBoxParameter.CORNER";
+                    const bool centerSizeCoding =
+                        !cornerCoding && attrs.code_type == "caffe.PriorBoxParameter.CENTER_SIZE";
+
+                    if (cornerCoding)
+                    {
+                        if (varianceInTarget)
+                        {
+                            decodeBbox.xmin = pXmin + bbox.xmin;
+                            decodeBbox.ymin = pYmin + bbox.ymin;
+                            decodeBbox.xmax = pXmax + bbox.xmax;
+                            decodeBbox.ymax = pYmax + bbox.ymax;
+                        }
+                        else
+                        {
+                            decodeBbox.xmin = pXmin + priorVariances[0] * bbox.xmin;
+                            decodeBbox.ymin = pYmin + priorVariances[1] * bbox.ymin;
+                            decodeBbox.xmax = pXmax + priorVariances[2] * bbox.xmax;
+                            decodeBbox.ymax = pYmax + priorVariances[3] * bbox.ymax;
+                        }
+                    }
+                    else if (centerSizeCoding)
+                    {
+                        const dataType pWidth = pXmax - pXmin;
+                        const dataType pHeight = pYmax - pYmin;
+                        const dataType pCenterX = (pXmin + pXmax) / 2;
+                        const dataType pCenterY = (pYmin + pYmax) / 2;
+                        dataType centerX, centerY;
+                        dataType width, height;
+                        if (varianceInTarget)
+                        {
+                            centerX = bbox.xmin * pWidth + pCenterX;
+                            centerY = bbox.ymin * pHeight + pCenterY;
+                            width = std::exp(bbox.xmax) * pWidth;
+                            height = std::exp(bbox.ymax) * pHeight;
+                        }
+                        else
+                        {
+                            centerX = priorVariances[0] * bbox.xmin * pWidth + pCenterX;
+                            centerY = priorVariances[1] * bbox.ymin * pHeight + pCenterY;
+                            width = std::exp(priorVariances[2] * bbox.xmax) * pWidth;
+                            height = std::exp(priorVariances[3] * bbox.ymax) * pHeight;
+                        }
+                        decodeBbox.xmin = centerX - width / 2;
+                        decodeBbox.ymin = centerY - height / 2;
+                        decodeBbox.xmax = centerX + width / 2;
+                        decodeBbox.ymax = centerY + height / 2;
+                    }
+
+                    if (attrs.clip_before_nms)
+                    {
+                        const auto clampToUnit = [](dataType value) -> dataType {
+                            return std::max<dataType>(0, std::min<dataType>(1, value));
+                        };
+                        decodeBbox.xmin = clampToUnit(decodeBbox.xmin);
+                        decodeBbox.ymin = clampToUnit(decodeBbox.ymin);
+                        decodeBbox.xmax = clampToUnit(decodeBbox.xmax);
+                        decodeBbox.ymax = clampToUnit(decodeBbox.ymax);
+                    }
+                    decodeBbox.size = BBoxSize(decodeBbox);
+                }
+
+                // Decodes every prior box against the location prediction with the same index and
+                // appends the results to decodeBboxes.
+                //
+                // GetPriorBBoxes() stores no variances when attrs.variance_encoded_in_target is
+                // set, so priorVariances is empty in that mode. DecodeBBox() does not read the
+                // variances then, but indexing the empty vector is still out of range, so an empty
+                // placeholder is passed instead.
+                //
+                // labelLocPreds must hold at least priorBboxes.size() entries.
+                void DecodeBBoxes(const std::vector<NormalizedBBox>& priorBboxes,
+                                  const std::vector<std::vector<dataType>>& priorVariances,
+                                  const std::vector<NormalizedBBox>& labelLocPreds,
+                                  std::vector<NormalizedBBox>& decodeBboxes)
+                {
+                    const std::vector<dataType> noVariance;
+                    const size_t numBboxes = priorBboxes.size();
+                    decodeBboxes.reserve(decodeBboxes.size() + numBboxes);
+                    for (size_t i = 0; i < numBboxes; ++i)
+                    {
+                        const std::vector<dataType>& variance =
+                            attrs.variance_encoded_in_target ? noVariance : priorVariances[i];
+                        NormalizedBBox decodeBbox;
+                        DecodeBBox(priorBboxes[i], variance, labelLocPreds[i], decodeBbox);
+                        decodeBboxes.push_back(decodeBbox);
+                    }
+                }
+
+                // Decodes the location predictions of every image and location class against that
+                // image's priors. The label equal to attrs.background_label_id is skipped, so no
+                // entry is created for it in the output map.
+                void DecodeBBoxesAll(
+                    const std::vector<LabelBBox>& locPreds,
+                    const std::vector<std::vector<NormalizedBBox>>& priorBboxes,
+                    const std::vector<std::vector<std::vector<dataType>>>& priorVariances,
+                    std::vector<LabelBBox>& decodeBboxes)
+                {
+                    decodeBboxes.resize(numImages);
+                    for (size_t image = 0; image < numImages; ++image)
+                    {
+                        const auto& imagePriors = priorBboxes[image];
+                        const auto& imageVariances = priorVariances[image];
+                        LabelBBox& decodedByLabel = decodeBboxes[image];
+                        for (size_t cls = 0; cls < numLocClasses; ++cls)
+                        {
+                            const int label = attrs.share_location ? -1 : static_cast<int>(cls);
+                            if (label != attrs.background_label_id)
+                            {
+                                const auto& predictions = locPreds[image].find(label)->second;
+                                DecodeBBoxes(imagePriors,
+                                             imageVariances,
+                                             predictions,
+                                             decodedByLabel[label]);
+                            }
+                        }
+                    }
+                }
+
+                // Two-stage decode used when the extra location input is present. For every image
+                // and non-background label, armLocPreds is decoded against the priors first, and
+                // the resulting boxes then serve as priors for decoding locPreds. The same
+                // variances are used in both stages.
+                void CasRegDecodeBBoxesAll(
+                    const std::vector<LabelBBox>& locPreds,
+                    const std::vector<std::vector<NormalizedBBox>>& priorBboxes,
+                    const std::vector<std::vector<std::vector<dataType>>>& priorVariances,
+                    std::vector<LabelBBox>& decodeBboxes,
+                    const std::vector<LabelBBox>& armLocPreds)
+                {
+                    decodeBboxes.resize(numImages);
+                    for (size_t image = 0; image < numImages; ++image)
+                    {
+                        const auto& imagePriors = priorBboxes[image];
+                        const auto& imageVariances = priorVariances[image];
+                        LabelBBox& decodedByLabel = decodeBboxes[image];
+                        for (size_t cls = 0; cls < numLocClasses; ++cls)
+                        {
+                            const int label = attrs.share_location ? -1 : static_cast<int>(cls);
+                            if (label == attrs.background_label_id)
+                            {
+                                continue;
+                            }
+                            // Stage 1: refine the priors with the extra location predictions.
+                            const auto& armPredictions = armLocPreds[image].find(label)->second;
+                            std::vector<NormalizedBBox> refinedPriors;
+                            DecodeBBoxes(
+                                imagePriors, imageVariances, armPredictions, refinedPriors);
+                            // Stage 2: decode the main predictions against the refined priors.
+                            const auto& predictions = locPreds[image].find(label)->second;
+                            DecodeBBoxes(refinedPriors,
+                                         imageVariances,
+                                         predictions,
+                                         decodedByLabel[label]);
+                        }
+                    }
+                }
+
+                // Ordering predicate for (score, payload) pairs: true when the first pair has the
+                // strictly greater score, which sorts scores in descending order.
+                template <typename T>
+                static bool SortScorePairDescend(const std::pair<dataType, T>& lhs,
+                                                 const std::pair<dataType, T>& rhs)
+                {
+                    const bool lhsScoresHigher = lhs.first > rhs.first;
+                    return lhsScoresHigher;
+                }
+
+                // Appends (score, index) for every score strictly above threshold, stable-sorts
+                // the whole output vector by descending score, and truncates it to topK entries
+                // when topK is non-negative and smaller than the current size.
+                void GetMaxScoreIndex(const std::vector<dataType>& scores,
+                                      const dataType threshold,
+                                      const int topK,
+                                      std::vector<std::pair<dataType, int>>& scoreIndexVec)
+                {
+                    const int numScores = static_cast<int>(scores.size());
+                    for (int idx = 0; idx < numScores; ++idx)
+                    {
+                        const dataType score = scores[idx];
+                        if (!(score > threshold))
+                        {
+                            continue;
+                        }
+                        scoreIndexVec.emplace_back(score, idx);
+                    }
+
+                    std::stable_sort(
+                        scoreIndexVec.begin(), scoreIndexVec.end(), SortScorePairDescend<int>);
+
+                    const bool limited = topK > -1;
+                    if (limited && static_cast<size_t>(topK) < scoreIndexVec.size())
+                    {
+                        scoreIndexVec.resize(topK);
+                    }
+                }
+
+                // Writes the corners of the intersection of two boxes into intersectBbox.
+                // When the boxes do not overlap on some axis all four corners are set to 0.
+                // The size field of intersectBbox is not touched.
+                void IntersectBBox(const NormalizedBBox& bbox1,
+                                   const NormalizedBBox& bbox2,
+                                   NormalizedBBox& intersectBbox)
+                {
+                    const bool disjoint = bbox2.xmin > bbox1.xmax || bbox2.xmax < bbox1.xmin ||
+                                          bbox2.ymin > bbox1.ymax || bbox2.ymax < bbox1.ymin;
+                    if (!disjoint)
+                    {
+                        intersectBbox.xmin = std::max<dataType>(bbox1.xmin, bbox2.xmin);
+                        intersectBbox.ymin = std::max<dataType>(bbox1.ymin, bbox2.ymin);
+                        intersectBbox.xmax = std::min<dataType>(bbox1.xmax, bbox2.xmax);
+                        intersectBbox.ymax = std::min<dataType>(bbox1.ymax, bbox2.ymax);
+                        return;
+                    }
+                    intersectBbox.xmin = 0;
+                    intersectBbox.ymin = 0;
+                    intersectBbox.xmax = 0;
+                    intersectBbox.ymax = 0;
+                }
+
+                // Intersection-over-union of two boxes. Returns 0 unless the intersection has
+                // both a positive width and a positive height. Box areas are recomputed with
+                // BBoxSize() rather than read from the cached size field.
+                dataType JaccardOverlap(const NormalizedBBox& bbox1, const NormalizedBBox& bbox2)
+                {
+                    NormalizedBBox common;
+                    IntersectBBox(bbox1, bbox2, common);
+                    const dataType commonWidth = common.xmax - common.xmin;
+                    const dataType commonHeight = common.ymax - common.ymin;
+                    if (!(commonWidth > 0 && commonHeight > 0))
+                    {
+                        return 0.0f;
+                    }
+
+                    const dataType commonArea = commonWidth * commonHeight;
+                    const dataType area1 = BBoxSize(bbox1);
+                    const dataType area2 = BBoxSize(bbox2);
+                    return commonArea / (area1 + area2 - commonArea);
+                }
+
+                // Greedy non-maximum suppression for one class.
+                // Candidates are the scores above attrs.confidence_threshold, limited to
+                // attrs.top_k and visited in descending score order. A candidate is appended to
+                // indices unless its overlap with a box already in indices exceeds
+                // attrs.nms_threshold.
+                void caffeNMS(const std::vector<NormalizedBBox>& bboxes,
+                              const std::vector<dataType>& scores,
+                              std::vector<int>& indices)
+                {
+                    std::vector<std::pair<dataType, int>> candidates;
+                    GetMaxScoreIndex(scores, attrs.confidence_threshold, attrs.top_k, candidates);
+                    for (const auto& candidate : candidates)
+                    {
+                        const int idx = candidate.second;
+                        bool suppressed = false;
+                        for (const int keptIdx : indices)
+                        {
+                            const dataType overlap = JaccardOverlap(bboxes[idx], bboxes[keptIdx]);
+                            if (overlap > attrs.nms_threshold)
+                            {
+                                suppressed = true;
+                                break;
+                            }
+                        }
+                        if (!suppressed)
+                        {
+                            indices.push_back(idx);
+                        }
+                    }
+                }
+
+                // Non-maximum suppression where each prior competes only with its best class.
+                //  1. For every prior pick the class with the highest score among classes
+                //     1..num_classes-1 (class 0 is never considered) and keep it when the score is
+                //     at least attrs.confidence_threshold.
+                //  2. Sort the candidates by descending score and keep the first attrs.top_k
+                //     unless attrs.top_k is -1.
+                //  3. Visit the candidates in order; a prior is appended to indices[class] unless
+                //     it overlaps a prior already kept for that class by more than
+                //     attrs.nms_threshold.
+                // The box list is bound by reference and the candidates are iterated in place.
+                // Copying the list for every comparison, or erasing from the front of the
+                // candidate vector, gives the same result at a much higher cost.
+                void mxNetNms(const LabelBBox& decodeBboxesImage,
+                              const std::map<int, std::vector<dataType>>& confScores,
+                              std::map<int, std::vector<int>>& indices)
+                {
+                    std::vector<std::pair<dataType, std::pair<int, int>>> scoreIndexPairs;
+                    for (size_t p = 0; p < numPriors; p++)
+                    {
+                        dataType conf = -1;
+                        int id = 0;
+                        for (int c = 1; c < attrs.num_classes; c++)
+                        {
+                            const dataType temp = confScores.at(c)[p];
+                            if (temp > conf)
+                            {
+                                conf = temp;
+                                id = c;
+                            }
+                        }
+                        if (id > 0 && conf >= attrs.confidence_threshold)
+                        {
+                            scoreIndexPairs.push_back(
+                                std::make_pair(conf, std::make_pair(id, static_cast<int>(p))));
+                        }
+                    }
+                    std::sort(scoreIndexPairs.begin(),
+                              scoreIndexPairs.end(),
+                              SortScorePairDescend<std::pair<int, int>>);
+
+                    if (attrs.top_k != -1 &&
+                        scoreIndexPairs.size() > static_cast<size_t>(attrs.top_k))
+                    {
+                        scoreIndexPairs.resize(attrs.top_k);
+                    }
+
+                    for (const auto& candidate : scoreIndexPairs)
+                    {
+                        const int cls = candidate.second.first;
+                        const int prior = candidate.second.second;
+                        std::vector<int>& currInd = indices[cls];
+                        bool keep = true;
+                        if (!currInd.empty())
+                        {
+                            // Looked up only when there is something to compare against, so the
+                            // first candidate of a class is accepted without touching the box map.
+                            const std::vector<NormalizedBBox>& currBbox =
+                                decodeBboxesImage.at(attrs.share_location ? -1 : cls);
+                            for (const int keptIdx : currInd)
+                            {
+                                const dataType overlap =
+                                    JaccardOverlap(currBbox[prior], currBbox[keptIdx]);
+                                if (overlap > attrs.nms_threshold)
+                                {
+                                    keep = false;
+                                    break;
+                                }
+                            }
+                        }
+                        if (keep)
+                        {
+                            currInd.push_back(prior);
+                        }
+                    }
+                }
+
+            public:
+                // Captures the attributes and derives the tensor geometry.
+                //  - numImages comes from the first dimension of locShape.
+                //  - A prior record has 4 values when the attributes are normalized, otherwise 5
+                //    with the coordinates starting at index 1.
+                //  - numPriors is the third dimension of priorsShape divided by the record size.
+                // The initializers follow the member declaration order; numPriors relies on
+                // priorSize being initialized before it.
+                referenceDetectionOutput(const ngraph::op::DetectionOutputAttrs& _attrs,
+                                         const ngraph::Shape& locShape,
+                                         const ngraph::Shape& priorsShape)
+                    : attrs(_attrs)
+                    , numImages(locShape[0])
+                    , priorSize(_attrs.normalized ? 4 : 5)
+                    , numPriors(priorsShape[2] / priorSize)
+                    , numLocClasses(_attrs.share_location
+                                        ? 1
+                                        : static_cast<size_t>(_attrs.num_classes))
+                    , offset(_attrs.normalized ? 0 : 1)
+                {
+                }
+
+                void run(const dataType* _location,
+                         const dataType* _confidence,
+                         const dataType* _priors,
+                         const dataType* _armConfidence,
+                         const dataType* _armLocation,
+                         dataType* result)
+                {
+                    bool withAddBoxPred = _armConfidence != nullptr && _armLocation != nullptr;
+                    std::vector<LabelBBox> armLocPreds;
+                    if (withAddBoxPred)
+                    {
+                        GetLocPredictions(_armLocation, armLocPreds);
+                    }
+                    std::vector<LabelBBox> locPreds;
+                    GetLocPredictions(_location, locPreds);
+                    std::vector<std::map<int, std::vector<dataType>>> confPreds;
+                    if (withAddBoxPred)
+                    {
+                        OSGetConfidenceScores(_confidence, _armConfidence, confPreds);
+                    }
+                    else
+                    {
+                        GetConfidenceScores(_confidence, confPreds);
+                    }
+                    std::vector<std::vector<NormalizedBBox>> priorBboxes;
+                    std::vector<std::vector<std::vector<dataType>>> priorVariances;
+                    GetPriorBBoxes(_priors, priorBboxes, priorVariances);
+                    std::vector<LabelBBox> decodeBboxes;
+                    if (withAddBoxPred)
+                    {
+                        CasRegDecodeBBoxesAll(
+                            locPreds, priorBboxes, priorVariances, decodeBboxes, armLocPreds);
+                    }
+                    else
+                    {
+                        DecodeBBoxesAll(locPreds, priorBboxes, priorVariances, decodeBboxes);
+                    }
+
+                    int numKept = 0;
+                    std::vector<std::map<int, std::vector<int>>> allIndices;
+                    for (int i = 0; i < numImages; ++i)
+                    {
+                        const LabelBBox& decodeBboxesImage = decodeBboxes[i];
+                        const std::map<int, std::vector<dataType>>& confScores = confPreds[i];
+                        std::map<int, std::vector<int>> indices;
+                        int numDet = 0;
+                        if (!attrs.decrease_label_id)
+                        {
+                            // Caffe style
+                            for (int c = 0; c < attrs.num_classes; ++c)
+                            {
+                                if (c == attrs.background_label_id)
+                                {
+                                    continue;
+                                }
+                                const std::vector<dataType>& scores = confScores.find(c)->second;
+                                int label = attrs.share_location ? -1 : c;
+                                const std::vector<NormalizedBBox>& bboxes =
+                                    decodeBboxesImage.find(label)->second;
+                                caffeNMS(bboxes, scores, indices[c]);
+                                numDet += indices[c].size();
+                            }
+                        }
+                        else
+                        {
+                            // MXNet style
+                            mxNetNms(decodeBboxesImage, confScores, indices);
+                            for (auto it = indices.begin(); it != indices.end(); it++)
+                                numDet += it->second.size();
+                        }
+                        if (attrs.keep_top_k[0] > -1 && numDet > attrs.keep_top_k[0])
+                        {
+                            std::vector<std::pair<dataType, std::pair<int, int>>> scoreIndexPairs;
+                            for (auto it = indices.begin(); it != indices.end(); ++it)
+                            {
+                                int label = it->first;
+                                const std::vector<int>& labelIndices = it->second;
+                                const std::vector<dataType>& scores =
+                                    confScores.find(label)->second;
+                                for (int j = 0; j < labelIndices.size(); ++j)
+                                {
+                                    int idx = labelIndices[j];
+                                    scoreIndexPairs.push_back(
+                                        std::make_pair(scores[idx], std::make_pair(label, idx)));
+                                }
+                            }
+                            std::sort(scoreIndexPairs.begin(),
+                                      scoreIndexPairs.end(),
+                                      SortScorePairDescend<std::pair<int, int>>);
+                            scoreIndexPairs.resize(attrs.keep_top_k[0]);
+                            std::map<int, std::vector<int>> newIndices;
+                            for (int j = 0; j < scoreIndexPairs.size(); ++j)
+                            {
+                                int label = scoreIndexPairs[j].second.first;
+                                int idx = scoreIndexPairs[j].second.second;
+                                newIndices[label].push_back(idx);
+                            }
+                            allIndices.push_back(newIndices);
+                            numKept += attrs.top_k;
+                        }
+                        else
+                        {
+                            allIndices.push_back(indices);
+                            numKept += numDet;
+                        }
+                    }
+
+                    int count = 0;
+                    for (int i = 0; i < numImages; ++i)
+                    {
+                        const std::map<int, std::vector<dataType>>& confScores = confPreds[i];
+                        const LabelBBox& decodeBboxesImage = decodeBboxes[i];
+                        for (auto it = allIndices[i].begin(); it != allIndices[i].end(); ++it)
+                        {
+                            int label = it->first;
+                            const std::vector<dataType>& scores = confScores.find(label)->second;
+                            int loc_label = attrs.share_location ? -1 : label;
+                            const std::vector<NormalizedBBox>& bboxes =
+                                decodeBboxesImage.find(loc_label)->second;
+                            std::vector<int>& indices = it->second;
+                            for (int j = 0; j < indices.size(); ++j)
+                            {
+                                int idx = indices[j];
+                                result[count * 7 + 0] = i;
+                                result[count * 7 + 1] =
+                                    attrs.decrease_label_id ? (label - 1) : label;
+                                result[count * 7 + 2] = scores[idx];
+                                const NormalizedBBox& bbox = bboxes[idx];
+
+                                dataType xmin = bbox.xmin;
+                                dataType ymin = bbox.ymin;
+                                dataType xmax = bbox.xmax;
+                                dataType ymax = bbox.ymax;
+
+                                if (attrs.clip_after_nms)
+                                {
+                                    xmin = std::max<dataType>(0, std::min<dataType>(1, xmin));
+                                    ymin = std::max<dataType>(0, std::min<dataType>(1, ymin));
+                                    xmax = std::max<dataType>(0, std::min<dataType>(1, xmax));
+                                    ymax = std::max<dataType>(0, std::min<dataType>(1, ymax));
+                                }
+
+                                result[count * 7 + 3] = xmin;
+                                result[count * 7 + 4] = ymin;
+                                result[count * 7 + 5] = xmax;
+                                result[count * 7 + 6] = ymax;
+                                ++count;
+                            }
+                        }
+                    }
+                    if (count < numImages * attrs.keep_top_k[0])
+                    {
+                        result[count * 7 + 0] = -1;
+                    }
+                }
+            };
+        } // namespace reference
+    }     // namespace runtime
+} // namespace ngraph