[CPU] DetectionOutput perf fix (#19773)

2023-09-28 19:36:12 +08:00 · 2023-09-28 19:36:12 +08:00 · c5320bf7f0
commit c5320bf7f0
parent 23e602f06f
3 changed files with 214 additions and 134 deletions
--- a/src/plugins/intel_cpu/src/nodes/detection_output.cpp
+++ b/src/plugins/intel_cpu/src/nodes/detection_output.cpp
@ -84,6 +84,7 @@ DetectionOutput::DetectionOutput(const std::shared_ptr<ngraph::Node>& op, const
    imgWidth = attributes.input_width;
    priorSize = normalized ? 4 : 5;
    coordOffset = normalized ? 0 : 1;
+    cacheSizeL3 = utils::get_cache_size(3, true);

    withAddBoxPred = getOriginalInputsNumber() == 5;
    objScore = attributes.objectness_score;
@ -123,14 +124,13 @@ void DetectionOutput::prepareParams() {
        confInfoForPrior.resize(imgNum * priorsNum);

    // confs...count...indices for caffe style and sparsity case.
-    // caffe: conf_info for sparsity or indices for dense --> topk(buffer) --> nms(indices)
+    // caffe: filter(conf_info for sparsity or indices for dense) --> topk(buffer) --> nms(indices)
    //        --> g_topk(vector<>(all detections) --> indices per class))
    // MXNet: max conf for prior within img, filter(indices) --> topk_img(buffer) --> nms_cls(indices)
    //        --> g_topk(vector<>(all detections) --> indices per class))
-    unsigned cacheSizeL3 = utils::get_cache_size(3, true);
    isSparsityWorthwhile =
        (confidenceThreshold > sparsityThreshold) &&
-        ((classesNum * priorsNum * sizeof(float) * 2) > cacheSizeL3);
+        ((classesNum * priorsNum * sizeof(float) * 2) > static_cast<size_t>(cacheSizeL3));
    confInfoLen = (!decreaseClassId && isSparsityWorthwhile) ? (2 * priorsNum + 1) : priorsNum;
    reorderedConf.resize(imgNum * classesNum * confInfoLen);

@ -202,12 +202,6 @@ void DetectionOutput::execute(dnnl::stream strm) {

    if (!isSparsityWorthwhile) {
        confReorderDense(confData, ARMConfData, reorderedConfData);
-
-        if (!decreaseClassId) {
-            confFilterCF(reorderedConfData, indicesData, indicesBufData, detectionsData);
-        } else {
-            confFilterMX(confData, ARMConfData, reorderedConfData, indicesData, indicesBufData, detectionsData);
-        }
    } else { // sparsity
        if (!decreaseClassId) {
            confReorderAndFilterSparsityCF(confData, ARMConfData, reorderedConfData, indicesData, indicesBufData, detectionsData);
@ -270,10 +264,15 @@ void DetectionOutput::execute(dnnl::stream strm) {
            // Caffe style
            parallel_for(classesNum, [&](int c) {
                if (c != backgroundClassId) {  // Ignore background class
-                    int *pindices    = indicesData + n * classesNum * priorsNum + c * priorsNum;
-                    int *pbuffer     = indicesBufData + n * classesNum * priorsNum + c * priorsNum;
+                    const int off = n * priorsNum * classesNum + c * priorsNum;
+                    const float *pconfReorder = reorderedConfData + off;
+                    int *pindices = indicesData + off;
+                    int *pbuffer = indicesBufData + off;
                    int *pdetections = detectionsData + n * classesNum + c;

+                    if (!isSparsityWorthwhile)
+                        confFilterCF(pconfReorder, pindices, pbuffer, pdetections, n);
+
                    const float *pboxes;
                    const float *psizes;
                    if (isShareLoc) {
@ -289,9 +288,16 @@ void DetectionOutput::execute(dnnl::stream strm) {
            });
        } else {
            // MXNet style
-            int *pbuffer = indicesBufData + n * classesNum * priorsNum;
+            const int offImg = n * priorsNum * classesNum;
+            const float *pconf = confData + offImg;
+            float *pconfReorder = reorderedConfData + offImg;
+            int *pbuffer = indicesBufData + offImg;
+            int *pindices = indicesData + offImg;
            int *pdetections = detectionsData + n * classesNum;
-            int *pindices = indicesData + n * classesNum * priorsNum;
+
+            if (!isSparsityWorthwhile)
+                confFilterMX(pconf, ARMConfData, pconfReorder, pindices, pbuffer, pdetections, n);
+
            const float *pboxes = decodedBboxesData + n * 4 * locNumForClasses * priorsNum;
            const float *psizes = bboxSizesData + n * locNumForClasses * priorsNum;

@ -309,7 +315,7 @@ void DetectionOutput::execute(dnnl::stream strm) {

            std::mutex mtx;
            parallel_for(classesNum, [&](int c) {
-                int detections = detectionsData[n * classesNum + c];
+                const int detections = detectionsData[n * classesNum + c];
                int *pindices = indicesData + n * classesNum * priorsNum + c * priorsNum;

                float *pconf  = reorderedConfData + n * classesNum * confInfoLen + c * confInfoLen;
@ -330,8 +336,8 @@ void DetectionOutput::execute(dnnl::stream strm) {
            memset(detectionsData + n * classesNum, 0, classesNum * sizeof(int));

            for (size_t j = 0; j < confIndicesClassMap.size(); ++j) {
-                int cls = confIndicesClassMap[j].second.first;
-                int pr = confIndicesClassMap[j].second.second;
+                const int cls = confIndicesClassMap[j].second.first;
+                const int pr = confIndicesClassMap[j].second.second;
                int *pindices = indicesData + n * classesNum * priorsNum + cls * priorsNum;
                pindices[detectionsData[n * classesNum + cls]] = pr;
                detectionsData[n * classesNum + cls]++;
@ -343,6 +349,85 @@ void DetectionOutput::execute(dnnl::stream strm) {
    generateOutput(reorderedConfData, indicesData, detectionsData, decodedBboxesData, dstData);
 }

+inline void DetectionOutput::confFilterCF(const float* pconf, int* pindices, int* pbuffer, int* detectionsData, const int& n) {
+    // in:  reorderedConf
+    // out: pindices count
+    int count = 0;
+    for (int i = 0; i < numPriorsActual[n]; ++i) {
+        if (pconf[i] > confidenceThreshold) {
+            pindices[count] = i;
+            count++;
+        }
+    }
+
+    // in:  pindices count
+    // out: buffer detectionCount
+    int k = (topK == -1 ? count : (std::min)(topK, count));
+    topk(pindices, pbuffer, pconf, count, k);
+    detectionsData[0] = k;
+}
+
+// MX filter is per image filter, max output is prior num(select max for all class within this prior)
+// NMS is per class, keep topk is per image, final output is per class
+inline void DetectionOutput::confFilterMX(const float* confData, const float* ARMConfData, float* reorderedConfData,
+    int* indicesData, int* indicesBufData, int* detectionsData, const int& n) {
+    std::mutex mtx;
+    parallel_for(numPriorsActual[n], [&](size_t p) {
+        // in:  origin conf
+        // out: pindices, detectionCount
+        // intentionally code branch from higher level
+        if (withAddBoxPred) {
+            const bool isARMPrior = ARMConfData[n*priorsNum*2 + p * 2 + 1] < objScore;
+            float maxConf = -1;
+            int maxCIdx = 0;
+            for (int c = 1; c < classesNum; ++c) {
+                float conf = confData[p * classesNum + c];
+                if (isARMPrior)
+                    conf = (c == backgroundClassId) ? 1.0f : 0.0f;  // still need refresh conf due to read from origin conf
+                if (conf >= confidenceThreshold && conf > maxConf) {
+                    maxConf = conf;
+                    maxCIdx = c;
+                }
+            }
+            if (maxCIdx > 0) {
+                // include this prior
+                mtx.lock();
+                indicesData[detectionsData[0]] = maxCIdx*priorsNum + p;  // de-refer to get prior and class id.
+                detectionsData[0]++;
+                mtx.unlock();
+            }
+        } else {
+            float maxConf = -1;
+            int maxCIdx = 0;
+            for (int c = 1; c < classesNum; ++c) {
+                float conf = confData[p * classesNum + c];
+                if (conf >= confidenceThreshold && conf > maxConf) {
+                    maxConf = conf;
+                    maxCIdx = c;
+                }
+            }
+            if (maxCIdx > 0) {
+                // include this prior and class with max conf
+                mtx.lock();
+                indicesData[detectionsData[0]] = maxCIdx*priorsNum + p;  // de-refer to get prior and class id.
+                detectionsData[0]++;
+                mtx.unlock();
+            }
+        }
+    });
+
+    // in:  pindices, detectionCount(filtered num)
+    // out: buffer, detectionCount(k)
+    int count = detectionsData[0];
+    int k = (topK == -1 ? count : (std::min)(topK, count));
+
+    const float *pconf = reorderedConfData;
+    // int *indices = indicesData;
+    // int *pbuffer = indicesBufData;
+    topk(indicesData, indicesBufData, pconf, count, k);
+    detectionsData[0] = k;
+}
+
 inline void DetectionOutput::getActualPriorNum(const float *priorData, int* numPriorsActual, int n) {
    numPriorsActual[n] = priorsNum;
    if (!normalized) {
@ -374,7 +459,7 @@ inline void DetectionOutput::confReorderDense(const float *confData, const float
    }
    // withAddBoxPred is false
    parallel_for2d(imgNum, classesNum, [&](size_t n, size_t c) {
-        int offset = n * priorsNum * classesNum;
+        const int offset = n * priorsNum * classesNum;
        for (int p = 0; p < priorsNum; ++p) {
            reorderedConfData[offset + c * priorsNum + p] =
            confData[offset + p * classesNum + c];
@ -382,108 +467,17 @@ inline void DetectionOutput::confReorderDense(const float *confData, const float
    });
 }

-inline void DetectionOutput::confFilterCF(float* reorderedConfData, int* indicesData, int* indicesBufData, int* detectionsData) {
-    parallel_for2d(imgNum, classesNum, [&](size_t n, size_t c) {
-        // in:  reorderedConf
-        // out: pindices count
-        if (c == static_cast<size_t>(backgroundClassId))
-            return;
-        int off = n * priorsNum * classesNum + c * priorsNum;
-        const float *pconf = reorderedConfData + off;
-        int *pindices = indicesData + off;
-        int *pbuffer = indicesBufData + off;
-
-        int count = 0;
-        for (int i = 0; i < numPriorsActual[n]; ++i) {
-            if (pconf[i] > confidenceThreshold) {
-                pindices[count] = i;
-                count++;
-            }
-        }
-
-        // in:  pindices count
-        // out: buffer detectionCount
-        int k = (topK == -1 ? count : (std::min)(topK, count));
-        topk(pindices, pbuffer, pconf, count, k);
-        detectionsData[n*classesNum + c] = k;
-    });
-}
-
-// MX filter is per image filter, max output is prior num(select max for all class within this prior)
-// NMS is per class, keep topk is per image, final output is per class
-inline void DetectionOutput::confFilterMX(const float* confData, const float* ARMConfData, float* reorderedConfData,
-    int* indicesData, int* indicesBufData, int* detectionsData) {
-    for (int n = 0; n < imgNum; ++n) {
-        int offB = n * priorsNum * classesNum;
-        std::mutex mtx;
-        parallel_for(numPriorsActual[n], [&](size_t p) {
-            // in:  origin conf
-            // out: pindices, detectionCount
-            // intentionally code branch from higher level
-            if (withAddBoxPred) {
-                bool isARMPrior = ARMConfData[n*priorsNum*2 + p * 2 + 1] < objScore;
-                float maxConf = -1;
-                int maxCIdx = 0;
-                for (int c = 1; c < classesNum; ++c) {
-                    float conf = confData[offB + p * classesNum + c];
-                    if (isARMPrior)
-                        conf = (c == backgroundClassId) ? 1.0f : 0.0f;  // still need refresh conf due to read from origin conf
-                    if (conf >= confidenceThreshold && conf > maxConf) {
-                        maxConf = conf;
-                        maxCIdx = c;
-                    }
-                }
-                if (maxCIdx > 0) {
-                    // include this prior
-                    mtx.lock();
-                    indicesData[offB + detectionsData[n*classesNum]] = maxCIdx*priorsNum + p;  // de-refer to get prior and class id.
-                    detectionsData[n*classesNum]++;
-                    mtx.unlock();
-                }
-            } else {
-                float maxConf = -1;
-                int maxCIdx = 0;
-                for (int c = 1; c < classesNum; ++c) {
-                    float conf = confData[offB + p * classesNum + c];
-                    if (conf >= confidenceThreshold && conf > maxConf) {
-                        maxConf = conf;
-                        maxCIdx = c;
-                    }
-                }
-                if (maxCIdx > 0) {
-                    // include this prior and class with max conf
-                    mtx.lock();
-                    indicesData[offB + detectionsData[n*classesNum]] = maxCIdx*priorsNum + p;  // de-refer to get prior and class id.
-                    detectionsData[n*classesNum]++;
-                    mtx.unlock();
-                }
-            }
-        });
-
-        // in:  pindices, detectionCount(filtered num)
-        // out: buffer, detectionCount(k)
-        int count = detectionsData[n*classesNum];
-        int k = (topK == -1 ? count : (std::min)(topK, count));
-
-        const float *pconf = reorderedConfData + offB;
-        int *indices = indicesData + offB;
-        int *pbuffer = indicesBufData + offB;
-        topk(indices, pbuffer, pconf, count, k);
-        detectionsData[n * classesNum] = k;
-    }
-}
-
 inline void DetectionOutput::confReorderAndFilterSparsityCF(const float* confData, const float* ARMConfData, float* reorderedConfData,
    int* indicesData, int* indicesBufData, int* detectionsData) {
    int* reorderedConfDataIndices = reinterpret_cast<int*>(reorderedConfData);
    for (int n = 0; n < imgNum; ++n) {
-        int off = n * priorsNum * classesNum;
-        int offV = n * priorsNum;  // vertical info
+        const int off = n * priorsNum * classesNum;
+        const int offV = n * priorsNum;  // vertical info

-        int offH = n * confInfoLen * classesNum; // horizontal info
+        const int offH = n * confInfoLen * classesNum; // horizontal info
        // reset count
        parallel_for(classesNum, [&](size_t c) {
-            int countIdx = offH + c * confInfoLen + priorsNum;
+            const int countIdx = offH + c * confInfoLen + priorsNum;
            reorderedConfDataIndices[countIdx] = 0;
        });

@ -491,7 +485,7 @@ inline void DetectionOutput::confReorderAndFilterSparsityCF(const float* confDat
        parallel_for(numPriorsActual[n], [&](size_t p) {
            // intentionally code branch from higher level
            if (withAddBoxPred) {
-                bool isARMPrior = ARMConfData[n * priorsNum * 2 + p * 2 + 1] < objScore;
+                const bool isARMPrior = ARMConfData[n * priorsNum * 2 + p * 2 + 1] < objScore;
                bool priorStatusSet = false;
                if (isShareLoc)
                    confInfoForPrior[offV + p] = -1;
@ -501,7 +495,7 @@ inline void DetectionOutput::confReorderAndFilterSparsityCF(const float* confDat
                    if (isARMPrior)
                        conf = (c == backgroundClassId) ? 1.0f : 0.0f;
                    if (conf > confidenceThreshold) {
-                        int idx = offH + c * confInfoLen;
+                        const int idx = offH + c * confInfoLen;
                        reorderedConfData[idx + p] = conf;
                        mtx.lock();
                        reorderedConfDataIndices[idx + priorsNum]++;
@ -522,7 +516,7 @@ inline void DetectionOutput::confReorderAndFilterSparsityCF(const float* confDat
                for (int c = 0; c < classesNum; ++c) {
                    float conf = confData[confIdxPrior + c];
                    if (conf > confidenceThreshold) {
-                        int idx = offH + c * confInfoLen;
+                        const int idx = offH + c * confInfoLen;
                        reorderedConfData[idx + p] = conf;
                        mtx.lock();
                        reorderedConfDataIndices[idx + priorsNum]++;
@ -542,9 +536,9 @@ inline void DetectionOutput::confReorderAndFilterSparsityCF(const float* confDat
            // out: buffer, detectionCount(k)
            if (c == static_cast<size_t>(backgroundClassId))  // Ignore background class
                return;
-            int countIdx = offH + c * confInfoLen + priorsNum;
-            int count = reorderedConfDataIndices[countIdx];
-            int k = (topK == -1 ? count : (std::min)(topK, count));
+            const int countIdx = offH + c * confInfoLen + priorsNum;
+            const int count = reorderedConfDataIndices[countIdx];
+            const int k = (topK == -1 ? count : (std::min)(topK, count));

            int *reorderedConfIndices = reorderedConfDataIndices + countIdx + 1;
            int *pbuffer = indicesBufData + off + c * priorsNum;
@ -559,8 +553,8 @@ inline void DetectionOutput::confReorderAndFilterSparsityCF(const float* confDat
 inline void DetectionOutput::confReorderAndFilterSparsityMX(const float* confData, const float* ARMConfData, float* reorderedConfData,
    int* indicesData, int* indicesBufData, int* detectionsData) {
    for (int n = 0; n < imgNum; ++n) {
-        int off = n * priorsNum * classesNum;
-        int offV = n * priorsNum;  // vertical info
+        const int off = n * priorsNum * classesNum;
+        const int offV = n * priorsNum;  // vertical info

        std::mutex mtx;
        parallel_for(numPriorsActual[n], [&](size_t p) {
@ -605,8 +599,8 @@ inline void DetectionOutput::confReorderAndFilterSparsityMX(const float* confDat
        // topk
        // in:  indicesData, detection_count(filtered num)
        // out: buffer, detection_count(k)
-        int count = detectionsData[n * classesNum];
-        int k = (topK == -1 ? count : (std::min)(topK, count));
+        const int count = detectionsData[n * classesNum];
+        const int k = (topK == -1 ? count : (std::min)(topK, count));

        const float *pconf = reorderedConfData + off;
        int *indices = indicesData + off;
@ -616,6 +610,7 @@ inline void DetectionOutput::confReorderAndFilterSparsityMX(const float* confDat
    }
 }

+// apply locData(offset) to priordata, generate decodedBox
 inline void DetectionOutput::decodeBBoxes(const float *priorData,
                                       const float *locData,
                                       const float *varianceData,
@ -729,15 +724,15 @@ static inline float JaccardOverlap(const float *decodedBbox,
                                   const float *bboxSizes,
                                   const int idx1,
                                   const int idx2) {
-    float xmin1 = decodedBbox[idx1 * 4 + 0];
-    float ymin1 = decodedBbox[idx1 * 4 + 1];
-    float xmax1 = decodedBbox[idx1 * 4 + 2];
-    float ymax1 = decodedBbox[idx1 * 4 + 3];
+    const float xmin1 = decodedBbox[idx1 * 4 + 0];
+    const float ymin1 = decodedBbox[idx1 * 4 + 1];
+    const float xmax1 = decodedBbox[idx1 * 4 + 2];
+    const float ymax1 = decodedBbox[idx1 * 4 + 3];

-    float xmin2 = decodedBbox[idx2 * 4 + 0];
-    float ymin2 = decodedBbox[idx2 * 4 + 1];
-    float xmax2 = decodedBbox[idx2 * 4 + 2];
-    float ymax2 = decodedBbox[idx2 * 4 + 3];
+    const float xmin2 = decodedBbox[idx2 * 4 + 0];
+    const float ymin2 = decodedBbox[idx2 * 4 + 1];
+    const float xmax2 = decodedBbox[idx2 * 4 + 2];
+    const float ymax2 = decodedBbox[idx2 * 4 + 3];

    if (xmin2 > xmax1 || xmax2 < xmin1 || ymin2 > ymax1 || ymax2 < ymin1) {
        return 0.0f;
--- a/src/plugins/intel_cpu/src/nodes/detection_output.h
+++ b/src/plugins/intel_cpu/src/nodes/detection_output.h
@ -60,6 +60,7 @@ private:
    int imgWidth = 0;
    int imgHeight = 0;
    int coordOffset = 0;
+    int cacheSizeL3 = 0;

    enum CodeType {
        CORNER = 1,
@ -73,10 +74,10 @@ private:

    inline void confReorderDense(const float* confData, const float* ARMConfData, float* reorderedConfData);

-    inline void confFilterCF(float* reorderedConfData, int* indicesData, int* indicesBufData, int* detectionsData);
+    inline void confFilterCF(const float* pConf, int* pindices, int* pbuffer, int* detectionsData, const int& n);

    inline void confFilterMX(const float* confData, const float* ARMConfData, float* reorderedConfData,
-        int* indicesData, int* indicesBufData, int* detectionsData);
+        int* indicesData, int* indicesBufData, int* detectionsData, const int& n);

    inline void confReorderAndFilterSparsityCF(const float* confData, const float* ARMConfData, float* reorderedConfData,
        int* indicesData, int* indicesBufData, int* detectionsData);
--- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/detection_output.cpp
+++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/detection_output.cpp
@ -90,8 +90,6 @@ public:
            inShapes[i].first[0] = batch;
        }

-
-
        std::ostringstream result;
        result << "IS = { ";

@ -373,6 +371,57 @@ INSTANTIATE_TEST_SUITE_P(
        params3InputsDynamic,
        DetectionOutputLayerCPUTest::getTestCaseName);

+//////////////////large tensor/////////////////
+// There are two major implemenation for DO node, sparsity and dense manner.
+// This test config(shapes, threshold...) go to sparsity path in most machines(as long as L3 per core cache is smaller than 8M).
+const std::vector<ParamsWhichSizeDependsDynamic> specificParams3InDynamicLargeTensor = {
+    // dynamic input shapes
+    ParamsWhichSizeDependsDynamic {
+        true, true, true, 1, 1,
+        {{ov::Dimension::dynamic(), ov::Dimension::dynamic()}, {{1, 381360}, {1, 381360}}},
+        {{ov::Dimension::dynamic(), ov::Dimension::dynamic()}, {{1, 1048740}, {1, 1048740}}},
+        {{ov::Dimension::dynamic(), ov::Dimension::dynamic(), ov::Dimension::dynamic()}, {{1, 1, 381360}, {1, 1, 381360}}},
+        {},
+        {}
+    },
+    ParamsWhichSizeDependsDynamic {
+        false, true, true, 1, 1,
+        {{ov::Dimension::dynamic(), ov::Dimension::dynamic()}, {{1, 381360}, {1, 381360}}},
+        {{ov::Dimension::dynamic(), ov::Dimension::dynamic()}, {{1, 1048740}, {1, 1048740}}},
+        {{ov::Dimension::dynamic(), ov::Dimension::dynamic(), ov::Dimension::dynamic()}, {{1, 1, 381360}, {1, 1, 381360}}},
+        {},
+        {}
+    },
+};
+
+const std::vector<float> confThreshold = {0.032f, 0.88f};
+const auto commonAttributesLargeTensor = ::testing::Combine(
+    ::testing::Values(numClasses),
+    ::testing::Values(backgroundLabelId),
+    ::testing::ValuesIn(topK),
+    ::testing::ValuesIn(keepTopK),
+    ::testing::ValuesIn(codeType),
+    ::testing::Values(nmsThreshold),
+    ::testing::ValuesIn(confThreshold),
+    ::testing::ValuesIn(clipAfterNms),
+    ::testing::ValuesIn(clipBeforeNms),
+    ::testing::Values(false)
+);
+
+const auto params3InputsDynamicLargeTensor = ::testing::Combine(
+        commonAttributesLargeTensor,
+        ::testing::ValuesIn(specificParams3InDynamicLargeTensor),
+        ::testing::ValuesIn(numberBatch),
+        ::testing::Values(0.0f),
+        ::testing::Values(false, true),
+        ::testing::Values(ov::test::utils::DEVICE_CPU)
+);
+INSTANTIATE_TEST_SUITE_P(
+        CPUDetectionOutputDynamic3InLargeTensor,
+        DetectionOutputLayerCPUTest,
+        params3InputsDynamicLargeTensor,
+        DetectionOutputLayerCPUTest::getTestCaseName);
+
 /* =============== 5 inputs cases =============== */

 const std::vector<ParamsWhichSizeDependsDynamic> specificParams5InDynamic = {
@ -459,5 +508,40 @@ INSTANTIATE_TEST_SUITE_P(
        params5InputsDynamic,
        DetectionOutputLayerCPUTest::getTestCaseName);

+//////////////////large tensor/////////////////
+const std::vector<ParamsWhichSizeDependsDynamic> specificParams5InDynamicLargeTensor = {
+    // dynamic input shapes
+    ParamsWhichSizeDependsDynamic {
+        true, true, true, 1, 1,
+        {{ov::Dimension::dynamic(), ov::Dimension::dynamic()}, {{1, 381360}, {1, 381360}}},
+        {{ov::Dimension::dynamic(), ov::Dimension::dynamic()}, {{1, 1048740}, {1, 1048740}}},
+        {{ov::Dimension::dynamic(), ov::Dimension::dynamic(), ov::Dimension::dynamic()}, {{1, 1, 381360}, {1, 1, 381360}}},
+        {{ov::Dimension::dynamic(), ov::Dimension::dynamic()}, {{1, 190680}, {1, 190680}}},
+        {{ov::Dimension::dynamic(), ov::Dimension::dynamic()}, {{1, 381360}, {1, 381360}}},
+    },
+    ParamsWhichSizeDependsDynamic {
+        true, false, true, 1, 1,
+        {{ov::Dimension::dynamic(), ov::Dimension::dynamic()}, {{1, 4194960}, {1, 4194960}}},
+        {{ov::Dimension::dynamic(), ov::Dimension::dynamic()}, {{1, 1048740}, {1, 1048740}}},
+        {{ov::Dimension::dynamic(), ov::Dimension::dynamic(), ov::Dimension::dynamic()}, {{1, 1, 381360}, {1, 1, 381360}}},
+        {{ov::Dimension::dynamic(), ov::Dimension::dynamic()}, {{1, 190680}, {1, 190680}}},
+        {{ov::Dimension::dynamic(), ov::Dimension::dynamic()}, {{1, 4194960}, {1, 4194960}}},
+    },
+};
+const auto params5InputsDynamicLargeTensor = ::testing::Combine(
+        commonAttributesLargeTensor,
+        ::testing::ValuesIn(specificParams5InDynamicLargeTensor),
+        ::testing::ValuesIn(numberBatch),
+        ::testing::Values(objectnessScore),
+        ::testing::Values(false, true),
+        ::testing::Values(ov::test::utils::DEVICE_CPU)
+);
+
+INSTANTIATE_TEST_SUITE_P(
+        CPUDetectionOutputDynamic5InLargeTensor,
+        DetectionOutputLayerCPUTest,
+        params5InputsDynamicLargeTensor,
+        DetectionOutputLayerCPUTest::getTestCaseName);
+
 }  // namespace
 }  // namespace CPULayerTestsDefinitions