[CPU] Split supported PD initialization refactoring plus tests for multiple port connections. (#4521)

Maksim Kutakov 2021-04-26 15:18:26 +03:00 committed by GitHub
parent ed5313b2e1
commit 9f2a3d0edc
2 changed files with 92 additions and 100 deletions

View File

@@ -4,6 +4,7 @@
#include "mkldnn_split_node.h"
#include "common/cpu_memcpy.h"
#include "common/tensor_desc_creator.h"
#include <legacy/ie_layers.h>
#include <vector>
#include <mkldnn_types.h>
@@ -16,47 +17,6 @@ using namespace mkldnn;
using namespace MKLDNNPlugin;
using namespace InferenceEngine;
static TensorDesc makePlainTensorDesc(const Precision& precision, const SizeVector& srcDims) {
SizeVector order(srcDims.size());
std::iota(order.begin(), order.end(), 0);
return TensorDesc(precision, srcDims, {srcDims, order});
}
static TensorDesc makePerChannelTensorDesc(const Precision& precision, const SizeVector& srcDims) {
constexpr size_t channelsPos = 1lu;
SizeVector order(srcDims.size());
std::iota(order.begin(), order.end(), 0);
SizeVector blkDims = srcDims;
if (srcDims.size() > 2) {
auto moveElementBack = [](SizeVector& vector, size_t indx) {
auto itr = vector.begin() + indx;
std::rotate(itr, itr + 1, vector.end());
};
moveElementBack(order, channelsPos);
moveElementBack(blkDims, channelsPos);
}
return TensorDesc(precision, srcDims, {blkDims, order});
}
static TensorDesc makeChannelBlockedTensorDesc(const Precision& precision, const SizeVector& srcDims, size_t blockSize) {
if (srcDims.size() < 2) {
IE_THROW() << "Can't create blocked tensor descriptor!";
}
constexpr size_t channelsPos = 1lu;
SizeVector order(srcDims.size());
std::iota(order.begin(), order.end(), 0);
order.push_back(channelsPos);
SizeVector blkDims = srcDims;
blkDims[1] = blkDims[1] / blockSize + (blkDims[1] % blockSize ? 1 : 0);
blkDims.push_back(blockSize);
return TensorDesc(precision, srcDims, {blkDims, order});
}
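
For context, the removed makeChannelBlockedTensorDesc helper only computed blocked dims and a permutation order by hand. Below is a minimal standalone sketch of that arithmetic, using an assumed NCHW shape of {3, 24, 24, 9} and block size 8 (nCsp8c); it is not part of the commit.

#include <iostream>
#include <numeric>
#include <vector>

// Worked example of the blocked-layout arithmetic performed by the removed
// makeChannelBlockedTensorDesc helper (assumed shape {3, 24, 24, 9}, block 8).
int main() {
    const std::vector<size_t> srcDims{3, 24, 24, 9};
    const size_t blockSize = 8;
    const size_t channelsPos = 1;

    std::vector<size_t> order(srcDims.size());
    std::iota(order.begin(), order.end(), static_cast<size_t>(0)); // {0, 1, 2, 3}
    order.push_back(channelsPos);                                  // {0, 1, 2, 3, 1}

    std::vector<size_t> blkDims = srcDims;                         // {3, 24, 24, 9}
    blkDims[channelsPos] = blkDims[channelsPos] / blockSize +
                           (blkDims[channelsPos] % blockSize ? 1 : 0); // 24 -> 3
    blkDims.push_back(blockSize);                                  // {3, 3, 24, 9, 8}

    for (auto d : blkDims) std::cout << d << ' ';
    std::cout << '\n';
    return 0;
}
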
MKLDNNSplitNode::MKLDNNSplitNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache) :
MKLDNNNode(layer, eng, cache) {}
@@ -77,7 +37,6 @@ void MKLDNNSplitNode::getSupportedDescriptors() {
}
void MKLDNNSplitNode::initSupportedPrimitiveDescriptors() {
using TensorDescFactory = std::function<TensorDesc(const Precision&, const SizeVector&)>;
constexpr size_t channelsPos = 1lu;
if (!supportedPrimitiveDescriptors.empty())
@@ -113,24 +72,49 @@ void MKLDNNSplitNode::initSupportedPrimitiveDescriptors() {
if (dstFirstDims.size() != srcDims.size())
THROW_ERROR << "sizes of input blob and sum of output blobs are not equal.";
InferenceEngine::Precision inpPrecision = inpData->getPrecision();
auto outPrecision = inpPrecision; // the split layer doesn't convert precisions
// make primitive descriptor factory function for different configurations
bool dynBatchSupport = true;
if (axis < 1) {
dynBatchSupport = false;
}
auto makePdInfo = [dynBatchSupport](TensorDescFactory getTensorDesc, const Precision& precision, const MKLDNNDims& srcDims,
const std::vector<MKLDNNDims>& outDims, impl_desc_type type) -> PrimitiveDescInfo {
//Set plain and tailC formats
std::vector<TensorDescCreatorTypes> tdCreatorTypes{ TensorDescCreatorTypes::ncsp, TensorDescCreatorTypes::nspc };
//Support channel blocked format
if (srcDims.ndims() > 2) {
for (auto item : { std::make_pair(8lu, TensorDescCreatorTypes::nCsp8c), std::make_pair(16lu, TensorDescCreatorTypes::nCsp16c) }) {
SizeVector blkDims = srcDims.ToSizeVector();
if (blkDims[channelsPos] % item.first)
continue;
bool blocked = true;
for (size_t i = 0; i < outDims.size(); i++) {
if (outDims[i].ToSizeVector()[channelsPos] % item.first) {
blocked = false;
break;
}
}
if (blocked) {
tdCreatorTypes.push_back(item.second);
}
}
}
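
A hedged illustration of the eligibility check above (the shapes are assumed and the helper name is hypothetical): a channel-blocked layout is offered only when the channel dimension of the input and of every output is divisible by the block size.

#include <iostream>
#include <vector>

// Hypothetical helper mirroring the check above: a blocked layout with the given
// block size is usable only if the channel dim (index 1) of the input shape and
// of every output shape is a multiple of the block size.
static bool blockedLayoutApplicable(const std::vector<size_t>& srcDims,
                                    const std::vector<std::vector<size_t>>& outDims,
                                    size_t blockSize) {
    constexpr size_t channelsPos = 1;
    if (srcDims.size() <= channelsPos || srcDims[channelsPos] % blockSize)
        return false;
    for (const auto& o : outDims)
        if (o[channelsPos] % blockSize)
            return false;
    return true;
}

int main() {
    // Splitting an assumed {3, 24, 24, 9} input into channel chunks of 8 and 16:
    std::vector<std::vector<size_t>> outs{{3, 8, 24, 9}, {3, 16, 24, 9}};
    std::cout << blockedLayoutApplicable({3, 24, 24, 9}, outs, 8)  << '\n'; // 1: all divisible by 8
    std::cout << blockedLayoutApplicable({3, 24, 24, 9}, outs, 16) << '\n'; // 0: 24 % 16 != 0
    return 0;
}
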
std::vector<size_t> pdIndexesToReuse;
auto& creatorsMap = TensorDescCreator::getCommonCreators();
auto itrRange = TensorDescCreator::makeFilteredRange(creatorsMap, static_cast<unsigned>(srcDims.ndims()), tdCreatorTypes);
for (auto itr = itrRange.first; itr != itrRange.second; ++itr) {
InferenceEngine::LayerConfig config;
config.dynBatchSupport = dynBatchSupport;
config.inConfs.resize(1);
config.inConfs[0].inPlace = -1;
config.inConfs[0].constant = false;
config.inConfs[0].desc = getTensorDesc(precision, srcDims.ToSizeVector());
config.inConfs[0].desc = itr->second->createDesc(inpPrecision, srcDims.ToSizeVector());
config.outConfs.resize(outDims.size());
std::vector<memory::format_tag> outFormats;
@@ -140,49 +124,22 @@ void MKLDNNSplitNode::initSupportedPrimitiveDescriptors() {
config.outConfs[i].inPlace = -1;
config.outConfs[i].constant = false;
config.outConfs[i].desc = getTensorDesc(precision, o_Dims.ToSizeVector());
config.outConfs[i].desc = itr->second->createDesc(inpPrecision, o_Dims.ToSizeVector());
outFormats.push_back(MKLDNNMemoryDesc(config.outConfs[i].desc).getFormat());
}
return {config, type, outFormats};
};
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, outFormats);
//Set plain format
supportedPrimitiveDescriptors.push_back(makePdInfo(&makePlainTensorDesc, inpPrecision, srcDims, outDims, impl_desc_type::ref));
//Set per channel format.
supportedPrimitiveDescriptors.push_back(makePdInfo(&makePerChannelTensorDesc, inpPrecision, srcDims, outDims, impl_desc_type::ref));
//Support channel blocked format
std::vector<size_t> blockedPdIndexes;
if (srcDims.ndims() > channelsPos) {
for (size_t sizeS : {8lu, 16lu}) {
SizeVector blkDims = srcDims.ToSizeVector();
if (blkDims[channelsPos] % sizeS)
continue;
bool blocked = true;
for (size_t i = 0; i < outDims.size(); i++) {
if (outDims[i].ToSizeVector()[channelsPos] % sizeS) {
blocked = false;
break;
}
}
if (blocked) {
using std::placeholders::_1;
using std::placeholders::_2;
supportedPrimitiveDescriptors.push_back(makePdInfo(std::bind(&makeChannelBlockedTensorDesc, _1, _2, sizeS),
inpPrecision, srcDims, outDims, impl_desc_type::ref));
blockedPdIndexes.push_back(supportedPrimitiveDescriptors.size() - 1);
if (itr->first == TensorDescCreatorTypes::ncsp) {
// at least the plain layout can be optimized inplace.
pdIndexesToReuse.emplace_back(supportedPrimitiveDescriptors.size() - 1);
} else if (itr->first == TensorDescCreatorTypes::nCsp8c || itr->first == TensorDescCreatorTypes::nCsp16c) {
if (axis < 2) {
pdIndexesToReuse.emplace_back(supportedPrimitiveDescriptors.size() - 1);
}
}
}
// Optimized inplace case
std::vector<size_t> pdIndexesToReuse(1, 0); // at least the first plain layout can be optimized inplace.
if (axis < 2) {
pdIndexesToReuse.insert(pdIndexesToReuse.end(), blockedPdIndexes.begin(), blockedPdIndexes.end());
}
for (auto refPdIndex : pdIndexesToReuse) {
const auto& refConfig = supportedPrimitiveDescriptors[refPdIndex].getConfig();
auto config = refConfig;
@@ -220,12 +177,26 @@ void MKLDNNSplitNode::initSupportedPrimitiveDescriptors() {
// Special nspc -> ncsp case when splitting channels
if (axis == 1 && (dstFirstDims.ndims() == 4 || dstFirstDims.ndims() == 5)) {
auto plain = makePdInfo(&makePlainTensorDesc, inpPrecision, srcDims, outDims, impl_desc_type::ref);
auto perChannel = makePdInfo(&makePerChannelTensorDesc, inpPrecision, srcDims, outDims, impl_desc_type::ref);
InferenceEngine::LayerConfig config;
plain.getConfig().inConfs[0].desc = perChannel.getConfig().inConfs[0].desc;
config.dynBatchSupport = dynBatchSupport;
config.inConfs.resize(1);
config.inConfs[0].inPlace = -1;
config.inConfs[0].constant = false;
config.inConfs[0].desc = creatorsMap.at(TensorDescCreatorTypes::nspc)->createDesc(inpPrecision, srcDims.ToSizeVector());
config.outConfs.resize(outDims.size());
supportedPrimitiveDescriptors.push_back(plain);
std::vector<memory::format_tag> outFormats;
for (size_t i = 0; i < outDims.size(); i++) {
auto o_Dims = outDims[i];
config.outConfs[i].inPlace = -1;
config.outConfs[i].constant = false;
config.outConfs[i].desc = creatorsMap.at(TensorDescCreatorTypes::ncsp)->createDesc(inpPrecision, o_Dims.ToSizeVector());
outFormats.push_back(MKLDNNMemoryDesc(config.outConfs[i].desc).getFormat());
}
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, outFormats);
}
}
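
As a rough sketch of the special configuration built above (shapes are assumed and the snippet is not part of the commit): the input descriptor is channels-last (nspc) while the outputs are plain (ncsp), which is the layout pair later handled by optimizedNspc2Ncsp().

#include <ie_layouts.h>
#include <iostream>

using namespace InferenceEngine;

int main() {
    // Assumed shapes: a {3, 28, 24, 9} input split along axis 1 into two halves.
    TensorDesc inDesc(Precision::FP32, {3, 28, 24, 9}, Layout::NHWC);  // nspc input
    TensorDesc outDesc(Precision::FP32, {3, 14, 24, 9}, Layout::NCHW); // ncsp output

    // The blocking orders make the difference visible: {0, 2, 3, 1} vs {0, 1, 2, 3}.
    for (auto d : inDesc.getBlockingDesc().getOrder()) std::cout << d << ' ';
    std::cout << '\n';
    for (auto d : outDesc.getBlockingDesc().getOrder()) std::cout << d << ' ';
    std::cout << '\n';
    return 0;
}
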
@@ -362,9 +333,19 @@ void MKLDNNSplitNode::initOptimalPrimitiveDescriptor() {
}
void MKLDNNSplitNode::selectOptimalPrimitiveDescriptor() {
if (implPriorities.size() > 0 && implPriorities[0] == impl_desc_type::ref) {
selectPrimitiveDescriptorByIndex(0);
return;
// Enforce the reference implementation for the planar layout if it is specified in the impl priorities list.
// This is needed mostly for testing purposes: since Split always works in place for the planar layout, the reference
// implementation has to be enforced explicitly when a test selects it, so that this code path gets exercised.
if (!implPriorities.empty() && implPriorities[0] == impl_desc_type::ref) {
auto plain = PartialBlkDesc::makePlain(getParentEdgeAt(0)->getDims().ToSizeVector());
for (size_t i = 0; i < supportedPrimitiveDescriptors.size(); ++i) {
auto& pd = supportedPrimitiveDescriptors[i];
if (PartialBlkDesc::extractFrom(pd.getConfig().inConfs[0].desc) == plain &&
impl_desc_type::ref == pd.getImplementationType()) {
selectPrimitiveDescriptorByIndex(static_cast<int>(i));
return;
}
}
}
//check the descriptors and select the ones that have the same data format as the input
@@ -504,6 +485,7 @@ void MKLDNNSplitNode::prepareOptimizedParams() {
optimizedParams.srcDataOffsets[i] = optimizedParams.srcDataOffsets[i - 1] + optimizedParams.dataSize[i - 1];
}
}
void MKLDNNSplitNode::optimizedNspc2Ncsp(size_t MB) {
auto parentEdge = getParentEdgeAt(0);
const int ndims = parentEdge->getDims().ndims();

View File

@@ -69,8 +69,14 @@ protected:
auto split = std::dynamic_pointer_cast<ngraph::opset5::Split>(ngraph::builder::makeSplit(paramOuts[0],
ngPrc, numSplits, axis));
ngraph::ResultVector results;
for (int i = 0; i < outIndices.size(); i++) {
results.push_back(std::make_shared<ngraph::opset5::Result>(split->output(outIndices[i])));
// This WA is necessary because Result nodes connected to the same output of the Split node (or any other node) are
// deduplicated at the CNNNetwork level. It may become unnecessary once the CPU plugin moves completely to nGraph.
// This is still a single-layer test, since the Ceiling nodes are added only as part of the WA.
auto fakeMultiplication = std::make_shared<ngraph::opset5::Ceiling>(split->output(outIndices[i]));
results.push_back(std::make_shared<ngraph::opset5::Result>(fakeMultiplication));
}
split->get_rt_info() = getCPUInfo();
function = std::make_shared<ngraph::Function>(results, params, "split");
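
A condensed, hedged sketch of the graph pattern the test constructs (shape, precision, and the helper name are assumed): the intermediate Ceiling nodes keep two Result nodes fed by the same Split output from being merged.

#include <memory>
#include <vector>

#include <ngraph/ngraph.hpp>
#include <ngraph/opsets/opset5.hpp>

using namespace ngraph;

// Hypothetical builder: a Split whose output 1 is consumed twice. Without the
// Ceiling nodes the two Result nodes attached to that output would collapse
// into one when the function is converted to a CNNNetwork.
std::shared_ptr<Function> makeSplitWithDuplicatedOutputs() {
    auto param = std::make_shared<opset5::Parameter>(element::f32, Shape{3, 24, 24, 9});
    auto axis = opset5::Constant::create(element::i64, Shape{}, {1});
    auto split = std::make_shared<opset5::Split>(param, axis, 3);

    std::vector<size_t> outIndices{0, 1, 1, 2};
    ResultVector results;
    for (auto idx : outIndices) {
        auto ceilNode = std::make_shared<opset5::Ceiling>(split->output(idx));
        results.push_back(std::make_shared<opset5::Result>(ceilNode));
    }
    return std::make_shared<Function>(results, ParameterVector{param}, "split");
}
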
@@ -117,13 +123,17 @@ const std::vector<Precision> netPrecisions = {
Precision::BF16
};
const std::vector<std::vector<size_t>> outIndices3 = {{0, 1, 2}, {0, 1, 1, 0, 2}, {0, 0, 0, 2}};
const std::vector<std::vector<size_t>> outIndices4 = {{0, 1, 2, 3}, {0, 1, 1, 0, 2, 3}, {0, 0, 0, 2, 3}};
INSTANTIATE_TEST_CASE_P(smoke_Split4D_CPU_Nspc2NcspSpecial, SplitLayerCPUTest,
::testing::Combine(
::testing::Values(4),
::testing::Values(1),
::testing::ValuesIn(netPrecisions),
::testing::Values(std::vector<size_t>({3, 28, 24, 9})),
::testing::Values(std::vector<size_t>({})),
::testing::ValuesIn(outIndices4),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(perChannelsToPlanar_4D)),
SplitLayerCPUTest::getTestCaseName);
@@ -134,7 +144,7 @@ INSTANTIATE_TEST_CASE_P(smoke_Split5D_CPU_Nspc2NcspSpecial, SplitLayerCPUTest,
::testing::Values(1),
::testing::ValuesIn(netPrecisions),
::testing::Values(std::vector<size_t>({3, 21, 24, 9, 15})),
::testing::Values(std::vector<size_t>({})),
::testing::ValuesIn(outIndices3),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(perChannelsToPlanar_5D)),
SplitLayerCPUTest::getTestCaseName);
@@ -145,7 +155,7 @@ INSTANTIATE_TEST_CASE_P(smoke_Split4D_CPU_Block8inPlace, SplitLayerCPUTest,
::testing::Values(0, 1),
::testing::ValuesIn(netPrecisions),
::testing::Values(std::vector<size_t>({3, 24, 24, 9})),
::testing::Values(std::vector<size_t>({})),
::testing::ValuesIn(outIndices3),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(planar_4D, planar_4D_ref, perChannels_4D, blocked8_4D)),
SplitLayerCPUTest::getTestCaseName);
@@ -156,7 +166,7 @@ INSTANTIATE_TEST_CASE_P(smoke_Split4D_CPU_Block8, SplitLayerCPUTest,
::testing::Values(2, 3),
::testing::ValuesIn(netPrecisions),
::testing::Values(std::vector<size_t>({3, 24, 24, 9})),
::testing::Values(std::vector<size_t>({})),
::testing::ValuesIn(outIndices3),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(planar_4D, planar_4D_ref, perChannels_4D, blocked8_4D_ref)),
SplitLayerCPUTest::getTestCaseName);
@@ -167,7 +177,7 @@ INSTANTIATE_TEST_CASE_P(smoke_Split4D_CPU_Block16inPlace, SplitLayerCPUTest,
::testing::Values(0, 1),
::testing::ValuesIn(netPrecisions),
::testing::Values(std::vector<size_t>({4, 64, 32, 12})),
::testing::Values(std::vector<size_t>({})),
::testing::ValuesIn(outIndices3),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(blocked16_4D)),
SplitLayerCPUTest::getTestCaseName);
@@ -178,7 +188,7 @@ INSTANTIATE_TEST_CASE_P(smoke_Split4D_CPU_Block16, SplitLayerCPUTest,
::testing::Values(2, 3),
::testing::ValuesIn(netPrecisions),
::testing::Values(std::vector<size_t>({4, 64, 32, 12})),
::testing::Values(std::vector<size_t>({})),
::testing::ValuesIn(outIndices4),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(blocked16_4D_ref)),
SplitLayerCPUTest::getTestCaseName);
@@ -189,7 +199,7 @@ INSTANTIATE_TEST_CASE_P(smoke_Split5D_CPU_Block8inPlace, SplitLayerCPUTest,
::testing::Values(0, 1),
::testing::ValuesIn(netPrecisions),
::testing::Values(std::vector<size_t>({3, 24, 24, 9, 15})),
::testing::Values(std::vector<size_t>({})),
::testing::ValuesIn(outIndices3),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(planar_5D, planar_5D_ref, perChannels_5D, blocked8_5D)),
SplitLayerCPUTest::getTestCaseName);
@@ -200,7 +210,7 @@ INSTANTIATE_TEST_CASE_P(smoke_Split5D_CPU_Block8, SplitLayerCPUTest,
::testing::Values(2, 3, 4),
::testing::ValuesIn(netPrecisions),
::testing::Values(std::vector<size_t>({3, 24, 24, 9, 15})),
::testing::Values(std::vector<size_t>({})),
::testing::ValuesIn(outIndices3),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(planar_5D, planar_5D_ref, perChannels_5D, blocked8_5D_ref)),
SplitLayerCPUTest::getTestCaseName);
@@ -211,7 +221,7 @@ INSTANTIATE_TEST_CASE_P(smoke_Split5D_CPU_Block16inPlace, SplitLayerCPUTest,
::testing::Values(0, 1),
::testing::ValuesIn(netPrecisions),
::testing::Values(std::vector<size_t>({4, 64, 32, 12, 20})),
::testing::Values(std::vector<size_t>({})),
::testing::ValuesIn(outIndices4),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(blocked16_5D)),
SplitLayerCPUTest::getTestCaseName);
@@ -222,7 +232,7 @@ INSTANTIATE_TEST_CASE_P(smoke_Split5D_CPU_Block16, SplitLayerCPUTest,
::testing::Values(2, 3, 4),
::testing::ValuesIn(netPrecisions),
::testing::Values(std::vector<size_t>({4, 64, 32, 12, 20})),
::testing::Values(std::vector<size_t>({})),
::testing::ValuesIn(outIndices4),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(blocked16_5D_ref)),
SplitLayerCPUTest::getTestCaseName);