[CPU] Extend Concat node logic to avoid fallback on slow ref implementation. (#4129)

Maksim Kutakov 2021-05-31 18:49:57 +03:00 committed by GitHub
parent 315c8d4eec
commit 7fb9bac24a
7 changed files with 411 additions and 339 deletions

View File

@@ -134,6 +134,16 @@ PartialBlkDesc PartialBlkDesc::makeCBlocked(const InferenceEngine::SizeVector &d
return res;
}
PartialBlkDesc PartialBlkDesc::makeTailC(const InferenceEngine::SizeVector &dims) {
PartialBlkDesc res = makePlain(dims);
if (dims.size() > 2) {
auto itr = res.outer_order.begin() + 1;
std::rotate(itr, itr + 1, res.outer_order.end());
}
return res;
}
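A minimal standalone sketch (not part of this commit) of what the std::rotate call in makeTailC produces for a 4-D shape: the plain order {0, 1, 2, 3} becomes the channels-last order {0, 2, 3, 1}, i.e. NHWC.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
    // Plain (NCHW-like) dimension order of a 4-D tensor.
    std::vector<size_t> order = {0, 1, 2, 3};
    // Move the channel index (position 1) to the end, as makeTailC does.
    auto itr = order.begin() + 1;
    std::rotate(itr, itr + 1, order.end());
    for (size_t d : order)
        std::cout << d << ' ';  // prints: 0 2 3 1 (tail-C / NHWC order)
    std::cout << '\n';
    return 0;
}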
PartialBlkDesc PartialBlkDesc::extractFrom(const InferenceEngine::TensorDesc &desc) {
if (desc.getLayout() == InferenceEngine::ANY)
IE_THROW() << "Cannot extract partial blocked descriptor for `ANY` layout";

View File

@@ -59,6 +59,9 @@ public:
/** Construct blocked Channel PartialBlkDesc based on dims information */
static PartialBlkDesc makeCBlocked(const InferenceEngine::SizeVector &dims, size_t block_size);
/** Construct channels-last (tail-C) PartialBlkDesc based on dims information */
static PartialBlkDesc makeTailC(const InferenceEngine::SizeVector &dims);
/** Comparison operators. Allow using it as a key for std::map */
bool operator == (const PartialBlkDesc& it) const;
bool operator < (const PartialBlkDesc& it) const;

View File

@@ -21,11 +21,15 @@
#include "mkldnn_eltwise_node.h"
#include <limits>
#include "common/cpu_memcpy.h"
#include "common/tensor_desc_creator.h"
using namespace mkldnn;
using namespace MKLDNNPlugin;
using namespace InferenceEngine;
namespace {
constexpr size_t channelAxis = 1lu;
}
bool MKLDNNConcatNode::isSupportedOperation(const std::shared_ptr<const ngraph::Node>& op, std::string& errorMessage) noexcept {
try {
@@ -89,308 +93,120 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() {
}
}
// MKLDNN doesn't support different precision on inputs so fallback on FP32 in such case
// Concat doesn't support different precision on inputs so fallback on FP32 in such case
if (isMixedPrecision)
inputPrecision = Precision::FP32;
// Concat node supports int8 implementations only for NHWC and NDHWC layouts
if (inputPrecision == Precision::U8 || inputPrecision == Precision::I8) {
int ndims = getChildEdgeAt(0)->getDims().ndims();
if (ndims != 2 && ndims != 4 && ndims != 5)
inputPrecision = Precision::FP32;
}
// MKLDNN supports only equal precisions for inputs and output
// Concat supports only equal precisions for inputs and output
outputPrecision = inputPrecision;
auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(inputPrecision);
auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(outputPrecision);
auto& dstDims = getChildEdgeAt(0)->getDims();
std::vector<TensorDescCreatorTypes> tdCreatorTypes = {TensorDescCreatorTypes::ncsp, TensorDescCreatorTypes::nspc};
MKLDNNDims dstDims = getChildEdgeAt(0)->getDims();
InferenceEngine::LayerConfig config;
config.dynBatchSupport = true;
// check if blocked layouts are available: the channels size should be evenly divisible by the block size, otherwise we fall back on the slow oneDNN ref implementation
if (dstDims.ndims() > channelAxis) {
for (auto item : { std::make_pair(8lu, TensorDescCreatorTypes::nCsp8c), std::make_pair(16lu, TensorDescCreatorTypes::nCsp16c)}) {
SizeVector blkDims = dstDims.ToSizeVector();
if (blkDims[channelAxis] % item.first)
continue;
for (size_t i = 0; i < getParentEdges().size(); i++) {
auto parentEdge = getParentEdgeAt(i);
InferenceEngine::DataConfig dataConfig;
dataConfig.inPlace = -1;
dataConfig.constant = false;
auto fmt = (inputPrecision == Precision::U8 || inputPrecision == Precision::I8) ? parentEdge->getDims().ndims() == 2 ? memory::format_tag::nc :
parentEdge->getDims().ndims() == 4 ? memory::format_tag::nhwc :
memory::format_tag::ndhwc
: memory::format_tag::any;
dataConfig.desc = MKLDNNExtensionUtils::getUninitTensorDesc(MKLDNNMemoryDesc(parentEdge->getDims(), inputDataType, fmt));
config.inConfs.push_back(dataConfig);
}
auto dims = getChildEdgeAt(0)->getDims();
config.outConfs.resize(1);
config.outConfs[0].inPlace = -1;
config.outConfs[0].constant = false;
if ((!isMixedPrecision && outputPrecision != Precision::U8 && outputPrecision != Precision::I8) || axis != 1) {
auto fmt = (inputPrecision == Precision::U8 || inputPrecision == Precision::I8) ? dims.ndims() == 2 ? memory::format_tag::nc :
dims.ndims() == 4 ? memory::format_tag::nhwc :
memory::format_tag::ndhwc
: MKLDNNMemory::GetPlainFormat(dims);
config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(MKLDNNMemoryDesc(dims, outputDataType, fmt));
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, fmt);
if (inputPrecision != Precision::U8 && inputPrecision != Precision::I8) {
if (dims.ndims() == 4) {
if (dims[1] % 8 == 0) {
config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(
MKLDNNMemoryDesc(dims, outputDataType, memory::format_tag::nChw8c));
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, memory::format_tag::nChw8c);
if (dims[1] % 16 == 0) {
config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(
MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::format_tag::nChw16c));
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, mkldnn::memory::format_tag::nChw16c);
}
}
} else if (dims.ndims() == 5) {
if (dims[1] % 8 == 0) {
config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(
MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::format_tag::nCdhw8c));
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, mkldnn::memory::format_tag::nCdhw8c);
if (dims[1] % 16 == 0) {
config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(
MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::format_tag::nCdhw16c));
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, mkldnn::memory::format_tag::nCdhw16c);
}
bool blocked = true;
for (size_t i = 0; i < getParentEdges().size(); i++) {
auto& srcDims = getParentEdgeAt(i)->getDims();
if (srcDims[channelAxis] % item.first) {
blocked = false;
break;
}
}
if (blocked) {
tdCreatorTypes.push_back(item.second);
}
}
}
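The loop above only adds the nCsp8c / nCsp16c creator types when the channel dimension of the output and of every input is a multiple of the block size; otherwise oneDNN would have to fall back on its slow reference concat. A hedged standalone sketch of that rule (the helper name and the plain std::vector shapes are illustrative, not the plugin's API):

#include <cstddef>
#include <vector>

// Illustrative only: true when a channel-blocked layout with the given block size
// can be offered for concat without hitting the slow ref path.
static bool canUseChannelBlocked(const std::vector<std::vector<size_t>>& inputShapes,
                                 const std::vector<size_t>& outputShape,
                                 size_t blockSize, size_t channelAxis = 1) {
    if (outputShape.size() <= channelAxis || outputShape[channelAxis] % blockSize)
        return false;
    for (const auto& shape : inputShapes) {
        if (shape.size() <= channelAxis || shape[channelAxis] % blockSize)
            return false;
    }
    return true;
}

// For example, with the test shapes {1, 8, 3, 5} and {1, 16, 3, 5} concatenated into
// {1, 24, 3, 5}: block size 8 passes, block size 16 does not (8 % 16 != 0).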
if (axis != 1)
std::vector<size_t> pdIndexesToReuse;
auto& creatorsMap = TensorDescCreator::getCommonCreators();
auto itrRange = TensorDescCreator::makeFilteredRange(creatorsMap, static_cast<unsigned>(dstDims.ndims()), tdCreatorTypes);
for (auto itr = itrRange.first; itr != itrRange.second; ++itr) {
InferenceEngine::LayerConfig config;
config.dynBatchSupport = true;
config.outConfs.resize(1);
config.outConfs[0].inPlace = -1;
config.outConfs[0].constant = false;
config.outConfs[0].desc = itr->second->createDesc(outputPrecision, dstDims.ToSizeVector());
memory::format_tag outFmt = MKLDNNMemoryDesc(config.outConfs[0].desc).getFormat();
config.inConfs.resize(getParentEdges().size());
for (size_t i = 0; i < getParentEdges().size(); ++i) {
config.inConfs[i].inPlace = -1;
config.inConfs[i].constant = false;
config.inConfs[i].desc = MKLDNNExtensionUtils::getUninitTensorDesc(
itr->second->createDesc(inputPrecision, getParentEdgeAt(i)->getDims().ToSizeVector()));
}
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, outFmt);
if (itr->first != TensorDescCreatorTypes::nspc) {
pdIndexesToReuse.push_back(supportedPrimitiveDescriptors.size() - 1);
}
}
if (axis != channelAxis)
return;
auto numOfDim = static_cast<size_t>(dstDims.ndims());
// Optimized inplace case
SizeVector order(numOfDim);
SizeVector offsets(numOfDim, 0lu);
size_t offset = (std::numeric_limits<size_t>::max)();
for (size_t i = 0; i < numOfDim; i++) {
order[i] = i;
}
for (auto refPdIndex : pdIndexesToReuse) {
const auto& refConfig = supportedPrimitiveDescriptors[refPdIndex].getConfig();
auto config = refConfig;
if (outputPrecision == Precision::I8 || outputPrecision == Precision::U8) {
if (numOfDim == 4) {
// Here we assume NHWC layout (channels are the last)
const auto& order = refConfig.outConfs[0].desc.getBlockingDesc().getOrder();
const auto& blkDims = refConfig.outConfs[0].desc.getBlockingDesc().getBlockDims();
auto numOfDim = blkDims.size();
order = {0, 2, 3, 1};
offsets = {0, 0, 0, 0};
SizeVector offsets(numOfDim, 0lu);
SizeVector strides(numOfDim);
strides.back() = 1lu;
size_t offset = (std::numeric_limits<size_t>::max)();
SizeVector blkDims = dstDims.ToSizeVector();
blkDims = { blkDims[0], blkDims[2], blkDims[3], blkDims[1] };
SizeVector strides(numOfDim);
strides.resize(numOfDim);
// C is the last in NHWC, so all strides are max()
for (size_t i = 0; i < numOfDim; i++) {
strides[i] = (std::numeric_limits<size_t>::max)();
}
config.outConfs[0].desc = TensorDesc(outputPrecision,
dstDims.ToSizeVector(),
{ blkDims, order, offset, offsets, strides });
for (size_t i = 0; i < getParentEdges().size(); i++) {
auto parentEdge = getParentEdgeAt(i);
SizeVector blkDims = parentEdge->getDims().ToSizeVector();
blkDims = { blkDims[0], blkDims[2], blkDims[3], blkDims[1] };
config.inConfs[i].inPlace = -1; // Change to 0 here if inplace concat is supported for NHWC in mkldnn
config.inConfs[i].desc = TensorDesc(inputPrecision, parentEdge->getDims().ToSizeVector(),
{blkDims, order, offset, offsets, strides});
}
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, mkldnn::memory::format_tag::nhwc);
return;
} else if (numOfDim == 5) {
// Here we assume NDHWC layout (channels are the last)
order = {0, 2, 3, 4, 1};
offsets = {0, 0, 0, 0, 0};
SizeVector blkDims = dstDims.ToSizeVector();
blkDims = { blkDims[0], blkDims[2], blkDims[3], blkDims[4], blkDims[1] };
SizeVector strides(numOfDim);
strides.resize(numOfDim);
// C is the last in NDHWC, so all strides are max()
for (size_t i = 0; i < numOfDim; i++) {
strides[i] = (std::numeric_limits<size_t>::max)();
}
config.outConfs[0].desc = TensorDesc(outputPrecision,
dstDims.ToSizeVector(),
{ blkDims, order, offset, offsets, strides });
for (size_t i = 0; i < getParentEdges().size(); i++) {
auto parentEdge = getParentEdgeAt(i);
SizeVector blkDims = parentEdge->getDims().ToSizeVector();
blkDims = { blkDims[0], blkDims[2], blkDims[3], blkDims[4], blkDims[1] };
config.inConfs[i].inPlace = -1; // Change to 0 here if inplace concat is supported for NDHWC in mkldnn
config.inConfs[i].desc = TensorDesc(inputPrecision, parentEdge->getDims().ToSizeVector(),
{blkDims, order, offset, offsets, strides});
}
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, mkldnn::memory::format_tag::ndhwc);
return;
}
}
SizeVector strides(numOfDim);
strides[numOfDim - 1] = 1;
for (size_t i = 2; i <= numOfDim; i++) {
if (numOfDim - i < axis) {
strides[numOfDim - i] = (std::numeric_limits<size_t>::max)();
} else {
strides[numOfDim - i] = strides[numOfDim - i + 1] * dstDims[numOfDim - i + 1];
}
}
config.outConfs[0].desc = TensorDesc(
MKLDNNExtensionUtils::DataTypeToIEPrecision(outputDataType),
dstDims.ToSizeVector(),
{dstDims.ToSizeVector(), order, offset, offsets, strides});
for (size_t i = 0; i < getParentEdges().size(); i++) {
auto parentEdge = getParentEdgeAt(i);
config.inConfs[i].inPlace = 0;
config.inConfs[i].desc = TensorDesc(MKLDNNExtensionUtils::DataTypeToIEPrecision(inputDataType), parentEdge->getDims().ToSizeVector(),
{parentEdge->getDims().ToSizeVector(), order, offset, offsets, strides});
}
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown, MKLDNNMemory::Convert(config.outConfs[0].desc.getLayout()));
if (numOfDim == 4lu || numOfDim == 5lu) {
size_t blkDimsLen = numOfDim + 1;
order.resize(blkDimsLen);
for (size_t i = 0; i < numOfDim; i++) {
order[i] = i;
}
order[numOfDim] = 1lu;
offsets = SizeVector(blkDimsLen, 0lu);
// nChw8c, nChw16c, nCdhw8c, nCdhw16c
for (size_t sizeS : {8lu, 16lu}) {
SizeVector blkDims = dstDims.ToSizeVector();
if (blkDims[1] % sizeS)
continue;
blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 1lu : 0lu);
blkDims.push_back(sizeS);
strides.resize(blkDimsLen);
strides[blkDimsLen - 1] = 1;
for (size_t i = 2lu; i <= blkDimsLen; i++) {
if (blkDimsLen - i < axis) {
strides[blkDimsLen - i] = (std::numeric_limits<size_t>::max)();
} else {
strides[blkDimsLen - i] = strides[blkDimsLen - i + 1] * blkDims[blkDimsLen - i + 1];
}
}
config.outConfs[0].desc = TensorDesc(
MKLDNNExtensionUtils::DataTypeToIEPrecision(outputDataType),
dstDims.ToSizeVector(), {blkDims, order, offset, offsets, strides});
bool canInplace = true;
for (size_t i = 0lu; canInplace && i < getParentEdges().size(); i++) {
auto parentEdge = getParentEdgeAt(i);
blkDims = parentEdge->getDims().ToSizeVector();
if (blkDims[1] % sizeS)
canInplace = false;
blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 1lu : 0lu);
blkDims.push_back(sizeS);
config.inConfs[i].desc = TensorDesc(MKLDNNExtensionUtils::DataTypeToIEPrecision(inputDataType), parentEdge->getDims().ToSizeVector(),
{blkDims, order, offset, offsets, strides});
}
if (canInplace) {
auto dstFormat = numOfDim == 4lu ? sizeS == 8lu ? mkldnn::memory::format_tag::nChw8c : mkldnn::memory::format_tag::nChw16c
: sizeS == 8lu ? mkldnn::memory::format_tag::nCdhw8c : mkldnn::memory::format_tag::nCdhw16c;
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown, dstFormat);
for (size_t i = 2; i <= numOfDim; i++) {
if (numOfDim - i < axis) {
strides[numOfDim - i] = (std::numeric_limits<size_t>::max)();
} else {
strides[numOfDim - i] = strides[numOfDim - i + 1] * blkDims[numOfDim - i + 1];
}
}
config.outConfs[0].desc = TensorDesc(outputPrecision, dstDims.ToSizeVector(), {blkDims, order, offset, offsets, strides});
memory::format_tag outFmt = MKLDNNMemoryDesc(config.outConfs[0].desc).getFormat();
for (size_t i = 0; i < getParentEdges().size(); i++) {
const auto& srcBlkDims = refConfig.inConfs[i].desc.getBlockingDesc().getBlockDims();
const auto& dims = refConfig.inConfs[i].desc.getDims();
config.inConfs[i].inPlace = 0;
config.inConfs[i].desc = TensorDesc(inputPrecision, dims, {srcBlkDims, order, offset, offsets, strides});
}
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown, outFmt);
}
}
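For reference, a small standalone sketch (an assumed helper, not part of this commit) of the channel-blocked dims that the nChw8c / nChw16c / nCdhw8c / nCdhw16c descriptors above describe: the channel dim is divided by the block size (rounded up) and the block size is appended as an extra innermost dim.

#include <cstddef>
#include <vector>

// Illustrative: plain NC[D]HW dims -> channel-blocked dims (nChw8c/16c, nCdhw8c/16c style).
static std::vector<size_t> makeChannelBlockedDims(std::vector<size_t> dims, size_t blockSize) {
    const size_t channelAxis = 1;
    dims[channelAxis] = dims[channelAxis] / blockSize + (dims[channelAxis] % blockSize ? 1 : 0);
    dims.push_back(blockSize);  // the extra innermost dim holds one channel block
    return dims;
}

// Example: makeChannelBlockedDims({2, 32, 3, 5}, 16) yields {2, 2, 3, 5, 16}.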
void MKLDNNConcatNode::selectOptimalPrimitiveDescriptor() {
bool hasUnknown = false;
std::vector<size_t> canSelectPrimitive;
for (size_t i = 0; i < supportedPrimitiveDescriptors.size(); i++) {
bool hasAny = true;
auto &primDescInfo = supportedPrimitiveDescriptors[i];
if (primDescInfo.getImplementationType() != impl_desc_type::unknown ||
primDescInfo.getConfig().inConfs[0].inPlace < 0)
continue;
hasUnknown = true;
for (auto iInfo : primDescInfo.getConfig().inConfs) {
if (iInfo.desc.getLayout() != InferenceEngine::Layout::ANY) {
hasAny = false;
break;
}
}
if (hasAny) {
for (auto oInfo : primDescInfo.getConfig().outConfs) {
if (oInfo.desc.getLayout() != InferenceEngine::Layout::ANY) {
hasAny = false;
break;
}
}
}
if (!hasAny) {
canSelectPrimitive.push_back(i);
}
}
bool hasDoubleConnection = false;
for (int i = 0; i < getParentEdges().size(); i++) {
for (int j = i + 1; j < getParentEdges().size(); j++) {
if (getParentEdgeAt(i) == getParentEdgeAt(j)) hasDoubleConnection = true;
}
}
if (hasDoubleConnection) {
// The double connection marks that some tensor should
// be replicated. Inplace approach is not applicable
// for that case. Descriptor with index 0 is pure copy
// implementation
selectPrimitiveDescriptorByIndex(0);
return;
}
bool canOptimize = true;
for (size_t i = 0; canOptimize && i < getParentEdges().size(); i++) {
const auto& parent = getParentEdgeAt(i)->getParent();
for (size_t j = 0; canOptimize && j < parent->getChildEdges().size(); j++) {
const auto& child = parent->getChildEdgeAt(j)->getChild();
const auto* childConcat = dynamic_cast<MKLDNNConcatNode *>(child.get());
if (!childConcat || childConcat == this)
continue;
if (childConcat->isOptimized())
canOptimize = false;
// A double connection means the same tensor feeds more than one input, so it
// has to be replicated. The in-place approach is not applicable in that case.
for (int i = 0; i < getParentEdges().size(); i++) {
for (int j = i + 1; j < getParentEdges().size(); j++) {
if (getParentEdgeAt(i) == getParentEdgeAt(j)) canOptimize = false;
}
}
if (hasUnknown && axis == 1) {
if (canSelectPrimitive.size() == 1) {
selectPrimitiveDescriptorByIndex(static_cast<int>(canSelectPrimitive[0]));
return;
}
} else {
if (axis != channelAxis) {
canOptimize = false;
}
@@ -432,44 +248,57 @@ void MKLDNNConcatNode::selectOptimalPrimitiveDescriptor() {
}
size_t maxCount = 0;
auto convertTo = PartialBlkDesc::makePlain(getChildEdgeAt(0)->getDims().ToSizeVector());
auto outDims = getChildEdgeAt(0)->getDims().ToSizeVector();
auto convertTo = PartialBlkDesc::makePlain(outDims);
for (auto &it : formatFrequency) {
if (it.second > maxCount) {
maxCount = it.second;
convertTo = it.first;
} else if (it.second == maxCount) {
if (isInQuantizedGraph && it.first == PartialBlkDesc::makeTailC(outDims)) {
convertTo = it.first;
} else if (it.first == PartialBlkDesc::makeCBlocked(outDims, 8) || it.first == PartialBlkDesc::makeCBlocked(outDims, 16)) {
convertTo = it.first;
}
}
}
if (canOptimize && convertTo.isAutoExtendedWith(getChildEdgeAt(0)->getDims().ToSizeVector()))
convertTo = PartialBlkDesc::makePlain(getChildEdgeAt(0)->getDims().ToSizeVector());
for (size_t i = 0; canOptimize && i < getParentEdges().size(); i++) {
if (convertTo.isAutoExtendedWith(outDims))
convertTo = PartialBlkDesc::makePlain(outDims);
for (size_t i = 0; i < getParentEdges().size(); i++) {
if (convertTo.isAutoExtendedWith(getParentEdgeAt(i)->getDims().ToSizeVector()))
convertTo = PartialBlkDesc::makePlain(getChildEdgeAt(0)->getDims().ToSizeVector());
convertTo = PartialBlkDesc::makePlain(outDims);
}
for (auto supportedPdIndex : canSelectPrimitive) {
if (PartialBlkDesc::extractFrom(supportedPrimitiveDescriptors[supportedPdIndex].getConfig().inConfs[0].desc) == convertTo) {
selectPrimitiveDescriptorByIndex(static_cast<int>(supportedPdIndex));
for (size_t i = 0; i < supportedPrimitiveDescriptors.size(); ++i) {
if (PartialBlkDesc::extractFrom(supportedPrimitiveDescriptors[i].getConfig().outConfs[0].desc) == convertTo) {
if (IMPLICATION(supportedPrimitiveDescriptors[i].getImplementationType() == impl_desc_type::unknown, canOptimize)) {
canSelectPrimitive.push_back(i);
}
}
}
if (canSelectPrimitive.size() == 1) {
selectPrimitiveDescriptorByIndex(static_cast<int>(canSelectPrimitive[0]));
return;
}
// if there is more than one PD with similar data layouts, select the optimized one
for (auto indx : canSelectPrimitive) {
if (supportedPrimitiveDescriptors[indx].getImplementationType() == impl_desc_type::unknown) {
selectPrimitiveDescriptorByIndex(static_cast<int>(indx));
return;
}
}
// if there are no matching data layouts, select first optimized implementation
for (size_t i = 0; i < supportedPrimitiveDescriptors.size(); i++) {
auto &primDescInfo = supportedPrimitiveDescriptors[i];
if (primDescInfo.getImplementationType() == impl_desc_type::unknown)
continue;
if (convertTo == PartialBlkDesc::extractFrom(supportedPrimitiveDescriptors[i].getConfig().outConfs[0].desc)) {
size_t num = 0;
for (num = 0; num < getParentEdges().size(); num++) {
if (convertTo.isAutoExtendedWith(getParentEdgeAt(num)->getDims().ToSizeVector()))
break;
}
if (num == getParentEdges().size()) {
selectPrimitiveDescriptorByIndex(i);
return;
}
if (canOptimize && supportedPrimitiveDescriptors[i].getImplementationType() == impl_desc_type::unknown) {
selectPrimitiveDescriptorByIndex(static_cast<int>(i));
return;
}
}
selectPrimitiveDescriptorByIndex(0);
}
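The selection above counts how often each partial blocked descriptor occurs among the neighbouring edges, keeps the most frequent one, and on a tie prefers tail-C (nspc) in quantized graphs or a channel-blocked layout otherwise. A rough standalone sketch of that voting scheme, with descriptors reduced to plain strings for illustration (not the plugin's actual types):

#include <map>
#include <string>
#include <vector>

// Illustrative only: pick the layout most frequent among the neighbours, using the
// tie-break described above.
static std::string pickLayout(const std::vector<std::string>& neighbourLayouts,
                              bool isInQuantizedGraph) {
    std::map<std::string, size_t> formatFrequency;
    for (const auto& layout : neighbourLayouts)
        ++formatFrequency[layout];

    std::string convertTo = "plain";
    size_t maxCount = 0;
    for (const auto& it : formatFrequency) {
        if (it.second > maxCount) {
            maxCount = it.second;
            convertTo = it.first;
        } else if (it.second == maxCount) {
            if (isInQuantizedGraph && it.first == "tailC")
                convertTo = it.first;   // prefer nspc on ties in int8 graphs
            else if (it.first == "blocked8" || it.first == "blocked16")
                convertTo = it.first;   // otherwise prefer channel-blocked layouts
        }
    }
    return convertTo;
}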
@@ -491,6 +320,12 @@ void MKLDNNConcatNode::createPrimitive() {
if (getSelectedPrimitiveDescriptor() == nullptr)
IE_THROW() << "Preferable primitive descriptor is not set.";
// check if the selected tensor descriptor has the nspc layout and the concat axis is C
if (axis == channelAxis && getChildEdgeAt(0)->getMemory().GetDesc().isTailCFormat()) {
canOptimizeNspc = true;
return;
}
std::vector<memory::desc> srcs_d;
for (size_t i = 0; i < getParentEdges().size(); i++) {
@@ -540,7 +375,7 @@ void MKLDNNConcatNode::initOptimalPrimitiveDescriptor() {
if (!isInitConfig(config)) {
for (size_t i = 0; i < config.inConfs.size(); i++) {
config.inConfs[i].desc = getConfiguredInputDesc(config, i);
// MKLDNN doesn't support different precision on inputs
// Concat doesn't support different precision on inputs
config.inConfs[i].desc.setPrecision(inputPrecision);
}
@@ -560,8 +395,7 @@ void MKLDNNConcatNode::initOptimalPrimitiveDescriptor() {
return;
for (size_t i = 0; i < config.outConfs.size(); i++) {
if (config.outConfs[i].desc.getLayout() == InferenceEngine::Layout::ANY ||
!isUninitTensorDesc(config.outConfs[i].desc))
if (!isUninitTensorDesc(config.outConfs[i].desc))
continue;
int num = getChildEdgeAt(i)->getOutputNum();
@@ -621,49 +455,53 @@ void MKLDNNConcatNode::execute(mkldnn::stream strm) {
return;
}
const MKLDNNMemory& dst_memory = getChildEdgeAt(0)->getMemory();
const mkldnn::memory::data_type data_type = dst_memory.GetDataType();
const size_t num_src = getParentEdges().size();
const bool isInt8 = (data_type == mkldnn_s8 || data_type == mkldnn_u8);
if (isInt8) {
uint8_t* dst_ptr = reinterpret_cast<uint8_t*>(dst_memory.GetData());
std::vector<size_t> channels;
size_t channels_size = 0;
std::vector<const uint8_t*> src_ptrs;
std::vector<uint8_t*> dst_ptrs;
for (size_t i = 0; i < num_src; i++) {
const MKLDNNMemory& src_mem = getParentEdgeAt(i)->getMemory();
const size_t num_channels = src_mem.GetDims()[1];
channels.push_back(num_channels);
src_ptrs.push_back(reinterpret_cast<const uint8_t*>(src_mem.GetData()));
dst_ptrs.push_back(dst_ptr + channels_size);
channels_size += num_channels;
}
const size_t iter_count = getParentEdgeAt(0)->getMemory().GetSize() / channels[0];
parallel_for(iter_count, [&](int i) {
const size_t dst_off = i * channels_size;
for (int j = 0; j < num_src; j++) {
cpu_memcpy(dst_ptrs[j] + dst_off, src_ptrs[j] + i * channels[j], channels[j]);
}
});
} else {
std::unordered_map<int, memory> mem_ags {{DNNL_ARG_DST, dst_memory.GetPrimitive()}};
for (int i = 0; i < num_src; i++)
mem_ags[DNNL_ARG_MULTIPLE_SRC + i] = getParentEdgeAt(i)->getMemory().GetPrimitive();
(*prim).execute(strm, mem_ags);
if (canOptimizeNspc) {
execNspcSpecCase();
return;
}
const MKLDNNMemory& dst_memory = getChildEdgeAt(0)->getMemory();
const size_t num_src = getParentEdges().size();
std::unordered_map<int, memory> mem_ags {{DNNL_ARG_DST, dst_memory.GetPrimitive()}};
for (int i = 0; i < num_src; i++)
mem_ags[DNNL_ARG_MULTIPLE_SRC + i] = getParentEdgeAt(i)->getMemory().GetPrimitive();
(*prim).execute(strm, mem_ags);
}
InferenceEngine::Precision MKLDNNConcatNode::getRuntimePrecision() const {
return MKLDNNExtensionUtils::getMaxPrecision(getInputPrecisions());
}
void MKLDNNConcatNode::execNspcSpecCase() {
const MKLDNNMemory& dst_memory = getChildEdgeAt(0)->getMemory();
const size_t num_src = getParentEdges().size();
uint8_t* dst_ptr = reinterpret_cast<uint8_t*>(dst_memory.GetData());
const size_t dataSize = MKLDNNExtensionUtils::sizeOfDataType(dst_memory.GetDataType());
std::vector<size_t> channelsDataSize;
size_t channels_size = 0;
std::vector<const uint8_t*> src_ptrs;
std::vector<uint8_t*> dst_ptrs;
for (size_t i = 0; i < num_src; i++) {
const MKLDNNMemory& src_mem = getParentEdgeAt(i)->getMemory();
const size_t num_channels = src_mem.GetDims()[channelAxis];
channelsDataSize.push_back(num_channels * dataSize);
src_ptrs.push_back(reinterpret_cast<const uint8_t*>(src_mem.GetData()));
dst_ptrs.push_back(dst_ptr + channels_size);
channels_size += num_channels * dataSize;
}
const size_t iter_count = getParentEdgeAt(0)->getMemory().GetSize() / channelsDataSize[0];
parallel_for(iter_count, [&](int i) {
const size_t dst_off = i * channels_size;
for (int j = 0; j < num_src; j++) {
cpu_memcpy(dst_ptrs[j] + dst_off, src_ptrs[j] + i * channelsDataSize[j], channelsDataSize[j]);
}
});
}
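execNspcSpecCase boils down to: for every outer position (batch times spatial point) copy each input's contiguous channel slab into its offset inside the output row. A simplified standalone sketch of the same copy pattern for float channels-last buffers, using plain std::memcpy and sequential loops instead of cpu_memcpy / parallel_for:

#include <cstddef>
#include <cstring>
#include <vector>

// Illustrative only: concat along C for channels-last (NHWC-style) float buffers.
// srcs[i] holds outerCount * srcChannels[i] contiguous values.
static void concatChannelsLast(const std::vector<const float*>& srcs,
                               const std::vector<size_t>& srcChannels,
                               size_t outerCount,  // N * D * H * W
                               float* dst) {
    size_t totalChannels = 0;
    std::vector<size_t> dstOffsets;
    for (size_t c : srcChannels) {
        dstOffsets.push_back(totalChannels);
        totalChannels += c;
    }
    for (size_t i = 0; i < outerCount; ++i) {
        for (size_t j = 0; j < srcs.size(); ++j) {
            std::memcpy(dst + i * totalChannels + dstOffsets[j],
                        srcs[j] + i * srcChannels[j],
                        srcChannels[j] * sizeof(float));
        }
    }
}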
REG_MKLDNN_PRIM_FOR(MKLDNNConcatNode, Concatenation);

View File

@@ -30,8 +30,10 @@ public:
private:
size_t axis = 0;
bool canOptimizeNspc = false;
size_t inverseOrder(const InferenceEngine::SizeVector& order, size_t axis);
void execNspcSpecCase();
InferenceEngine::Precision inputPrecision = InferenceEngine::Precision::FP32;
InferenceEngine::Precision outputPrecision = InferenceEngine::Precision::FP32;

View File

@@ -94,11 +94,15 @@ void MKLDNNShuffleChannelsNode::initSupportedPrimitiveDescriptors() {
impl_type = impl_desc_type::ref;
}
addSupportedPrimDesc({{TensorDescCreatorTypes::nspc, precision}},
{{TensorDescCreatorTypes::nspc, precision}},
// use ncsp as default for non-quantized networks and nspc for quantized
auto firstCreatorType = isInQuantizedGraph ? TensorDescCreatorTypes::nspc : TensorDescCreatorTypes::ncsp;
auto secondCreatorType = isInQuantizedGraph ? TensorDescCreatorTypes::ncsp : TensorDescCreatorTypes::nspc;
addSupportedPrimDesc({{firstCreatorType, precision}},
{{firstCreatorType, precision}},
impl_type, supportDynamicBatch_);
addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, precision}},
{{TensorDescCreatorTypes::ncsp, precision}},
addSupportedPrimDesc({{secondCreatorType, precision}},
{{secondCreatorType, precision}},
impl_type, supportDynamicBatch_);
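The ordering above, reduced to a sketch (the enum and helper are illustrative, not the plugin's API): registering the preferred layout first effectively biases the default descriptor selection, so quantized graphs put nspc ahead of ncsp and non-quantized graphs do the opposite.

#include <array>

enum class LayoutKind { ncsp, nspc };

// Illustrative only: the order in which layout creators are registered; the
// first entry is the one preferred when several descriptors match.
static std::array<LayoutKind, 2> creatorOrder(bool isInQuantizedGraph) {
    return isInQuantizedGraph ? std::array<LayoutKind, 2>{LayoutKind::nspc, LayoutKind::ncsp}
                              : std::array<LayoutKind, 2>{LayoutKind::ncsp, LayoutKind::nspc};
}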
// canUseBlocked
if (axis_ != 1) {

View File

@@ -0,0 +1,214 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "ngraph_functions/builders.hpp"
#include "test_utils/cpu_test_utils.hpp"
using namespace InferenceEngine;
using namespace CPUTestUtils;
namespace CPULayerTestsDefinitions {
typedef std::tuple<
size_t, // Concat axis
std::vector<std::vector<size_t>>, // Input shapes
InferenceEngine::Precision, // Network precision
std::string, // Device name
CPUSpecificParams
> concatCPUTestParams;
class ConcatLayerCPUTest : public testing::WithParamInterface<concatCPUTestParams>,
virtual public LayerTestsUtils::LayerTestsCommon, public CPUTestsBase {
public:
static std::string getTestCaseName(testing::TestParamInfo<concatCPUTestParams> obj) {
int axis;
std::vector<std::vector<size_t>> inputShapes;
InferenceEngine::Precision netPrecision;
std::string targetName;
CPUSpecificParams cpuParams;
std::tie(axis, inputShapes, netPrecision, targetName, cpuParams) = obj.param;
std::ostringstream result;
result << "IS=" << CommonTestUtils::vec2str(inputShapes) << "_";
result << "axis=" << axis << "_";
result << "netPRC=" << netPrecision.name() << "_";
result << "trgDev=" << targetName << "_";
result << CPUTestsBase::getTestCaseName(cpuParams);
return result.str();
}
protected:
void SetUp() override {
int axis;
std::vector<std::vector<size_t>> inputShape;
InferenceEngine::Precision netPrecision;
CPUSpecificParams cpuParams;
std::tie(axis, inputShape, netPrecision, targetDevice, cpuParams) = this->GetParam();
inPrc = outPrc = netPrecision;
std::tie(inFmts, outFmts, priority, selectedType) = cpuParams;
selectedType += std::string("_") + inPrc.name();
auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
auto params = ngraph::builder::makeParams(ngPrc, inputShape);
auto paramOuts = ngraph::helpers::convert2OutputVector(
ngraph::helpers::castOps2Nodes<ngraph::op::Parameter>(params));
auto concat = std::make_shared<ngraph::opset1::Concat>(paramOuts, axis);
function = makeNgraphFunction(ngPrc, params, concat, "concat");
}
};
TEST_P(ConcatLayerCPUTest, CompareWithRefs) {
SKIP_IF_CURRENT_TEST_IS_DISABLED()
Run();
CheckPluginRelatedResults(executableNetwork, "Concatenation");
}
namespace {
const auto planar_4D_ref = CPUSpecificParams{{nchw}, {nchw}, {"ref"}, "ref"};
const auto planar_5D_ref = CPUSpecificParams{{ncdhw}, {ncdhw}, {"ref"}, "ref"};
const auto planar_4D = CPUSpecificParams{{nchw}, {nchw}, {}, "unknown"};
const auto planar_5D = CPUSpecificParams{{ncdhw}, {ncdhw}, {}, "unknown"};
const auto planarChannels_4D = CPUSpecificParams{{nhwc}, {nhwc}, {}, "ref"};
const auto planarChannels_5D = CPUSpecificParams{{ndhwc}, {ndhwc}, {}, "ref"};
const auto blocked8_4D = CPUSpecificParams{{nChw8c}, {nChw8c}, {}, "unknown"};
const auto blocked8_5D = CPUSpecificParams{{nCdhw8c}, {nCdhw8c}, {}, "unknown"};
const auto blocked8_4D_ref = CPUSpecificParams{{nChw8c}, {nChw8c}, {}, "ref"};
const auto blocked8_5D_ref = CPUSpecificParams{{nCdhw8c}, {nCdhw8c}, {}, "ref"};
const auto blocked16_4D = CPUSpecificParams{{nChw16c}, {nChw16c}, {}, "unknown"};
const auto blocked16_5D = CPUSpecificParams{{nCdhw16c}, {nCdhw16c}, {}, "unknown"};
const auto blocked16_4D_ref = CPUSpecificParams{{nChw16c}, {nChw16c}, {}, "ref"};
const auto blocked16_5D_ref = CPUSpecificParams{{nCdhw16c}, {nCdhw16c}, {}, "ref"};
// List of precisions natively supported by mkldnn.
const std::vector<Precision> netPrecisions = {
Precision::I8,
Precision::I32,
Precision::FP32,
Precision::BF16
};
INSTANTIATE_TEST_CASE_P(concat_Concat4D_CPU_Block8inPlace, ConcatLayerCPUTest,
::testing::Combine(
::testing::Values(1),
::testing::Values(std::vector<std::vector<size_t>>{{1, 8, 3, 5},
{1, 16, 3, 5}}),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(planar_4D, planarChannels_4D, blocked8_4D)),
ConcatLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_Concat4D_CPU_Block8, ConcatLayerCPUTest,
::testing::Combine(
::testing::Values(0, 2, 3),
::testing::Values(std::vector<std::vector<size_t>>{{2, 16, 3, 5},
{2, 16, 3, 5}}),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(planar_4D_ref, planarChannels_4D, blocked8_4D_ref)),
ConcatLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_Concat4D_CPU_Block16inPlace, ConcatLayerCPUTest,
::testing::Combine(
::testing::Values(1),
::testing::Values(std::vector<std::vector<size_t>>{{2, 16, 3, 5},
{2, 32, 3, 5}}),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(blocked16_4D)),
ConcatLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_Concat4D_CPU_Block16, ConcatLayerCPUTest,
::testing::Combine(
::testing::Values(0, 2, 3),
::testing::Values(std::vector<std::vector<size_t>>{{2, 32, 3, 5},
{2, 32, 3, 5}}),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(blocked16_4D_ref)),
ConcatLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(concat_Concat5D_CPU_Block8inPlace, ConcatLayerCPUTest,
::testing::Combine(
::testing::Values(1),
::testing::Values(std::vector<std::vector<size_t>>{{1, 8, 3, 5, 7},
{1, 16, 3, 5, 7}}),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(planar_5D, planarChannels_5D, blocked8_5D)),
ConcatLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_Concat5D_CPU_Block8, ConcatLayerCPUTest,
::testing::Combine(
::testing::Values(0, 2, 3, 4),
::testing::Values(std::vector<std::vector<size_t>>{{2, 16, 3, 5, 7},
{2, 16, 3, 5, 7}}),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(planar_5D_ref, planarChannels_5D, blocked8_5D_ref)),
ConcatLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_Concat5D_CPU_Block16inPlace, ConcatLayerCPUTest,
::testing::Combine(
::testing::Values(1),
::testing::Values(std::vector<std::vector<size_t>>{{2, 16, 3, 5, 7},
{2, 32, 3, 5, 7}}),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(blocked16_5D)),
ConcatLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_Concat5D_CPU_Block16, ConcatLayerCPUTest,
::testing::Combine(
::testing::Values(0, 2, 3, 4),
::testing::Values(std::vector<std::vector<size_t>>{{2, 32, 3, 5, 7},
{2, 32, 3, 5, 7}}),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(blocked16_5D_ref)),
ConcatLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_Concat_inPlace, ConcatLayerCPUTest,
::testing::Combine(
::testing::Values(1),
::testing::Values(std::vector<std::vector<size_t>>{{2, 3, 5},
{2, 4, 5}},
std::vector<std::vector<size_t>>{{2, 3},
{2, 4}}),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(CPUSpecificParams{{}, {}, {}, "unknown"})),
ConcatLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_Concat3D, ConcatLayerCPUTest,
::testing::Combine(
::testing::Values(0, 2),
::testing::Values(std::vector<std::vector<size_t>>{{2, 4, 5},
{2, 4, 5}}),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(CPUSpecificParams{{}, {}, {}, "ref"})),
ConcatLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_Concat_1D_2D, ConcatLayerCPUTest,
::testing::Combine(
::testing::Values(0),
::testing::Values(std::vector<std::vector<size_t>>{{2, 4},
{3, 4}},
std::vector<std::vector<size_t>>{{2}, {3}}),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(CPUSpecificParams{{}, {}, {}, "ref"})),
ConcatLayerCPUTest::getTestCaseName);
} // namespace
} // namespace CPULayerTestsDefinitions

View File

@@ -222,6 +222,7 @@ void FuseTransposeAndReorderTest2::CreateGraph() {
transpose2->get_rt_info() = makeCPUInfo({memFmt2}, {memFmt2}, {});
auto concat = ngraph::builder::makeConcat({transpose1, transpose2}, 1);
concat->get_rt_info() = makeCPUInfo({memFmt1, memFmt1}, {memFmt1}, {});
ngraph::ResultVector results{std::make_shared<ngraph::opset5::Result>(concat)};
function = std::make_shared<ngraph::Function>(results, params, "Transpose_Transpose_Concat");