From cf52ba5c0865b1657a4a365c2a1cfe99c3975565 Mon Sep 17 00:00:00 2001 From: Ivan Novoselov Date: Mon, 18 Oct 2021 18:31:23 +0300 Subject: [PATCH] [CPU] Bug/custom_inplace_reorders/fix unit (#7222) --- .../nodes/mkldnn_reorder_node.cpp | 70 +++-- .../mkldnn_plugin/nodes/mkldnn_reorder_node.h | 1 + .../tests/unit/cpu/CMakeLists.txt | 2 + .../unit/cpu/nodes/mkldnn_reorder_node.cpp | 262 ++++++++++++++++++ 4 files changed, 318 insertions(+), 17 deletions(-) create mode 100644 inference-engine/tests/unit/cpu/nodes/mkldnn_reorder_node.cpp diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.cpp index 1fdaf02315d..546f0afef67 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.cpp @@ -71,7 +71,6 @@ void MKLDNNReorderNode::initSupportedPrimitiveDescriptors() { if (isDynamic && (config.inConfs[0].desc->getShape().getRank() != config.outConfs[0].desc->getShape().getRank())) IE_THROW() << "Reorder node doesn't support case when input and output shapes have different rank and dynamic"; - if (!isOptimized) { const auto &inShape = getInputShapeAtPort(0); if (MKLDNNPlugin::one_of(inShape.getRank(), 4, 5) && @@ -88,7 +87,7 @@ void MKLDNNReorderNode::initSupportedPrimitiveDescriptors() { config.inConfs[0].desc->getPrecision() == config.outConfs[0].desc->getPrecision() && config.inConfs[0].desc->getPrecision().size() == 1) { // oneDNN doesn't provide JIT reorder impl for non-avx2 targets so we fallback on simple c++ implementation which shows better perf - canUseNcsp2Nspc = true; + isNcsp2NspcCase = true; } } } @@ -116,14 +115,46 @@ void MKLDNNReorderNode::prepareParams() { if (getSelectedPrimitiveDescriptor() == nullptr) IE_THROW() << "Preferable primitive descriptor is not set."; - if (isNspc2NcspCase) { + auto isSupportedDesc = [](const MemoryDesc& desc) { + if (!desc.isDefined()) { + return false; + 
} + if (!(desc.getType() & MemoryDescType::Blocked)) { + return false; + } + if ((desc.getType() & MemoryDescType::Mkldnn) && !desc.as()->hasEmptyExtraData()) { + return false; + } + return true; + }; + + const auto& parentDesc = srcMemPtr->getDesc(); + const auto& childDesc = dstMemPtr->getDesc(); + if ((isNspc2NcspCase || isNcsp2NspcCase) && isSupportedDesc(childDesc) && isSupportedDesc(parentDesc)) { const auto &inDims = srcMemPtr->getStaticDims(); - canUseNspc2Ncsp = inDims[1] <= 64 && inDims[1] >= 16 && - (srcMemPtr->GetDescWithType()->getPaddedElementsCount() / inDims[1]) >= 128; + // Check that child strides are consistent with parent dims if the child is inplace. + // The strides must be dense except for the channel one (since the child num channels might differ) + const auto childSubBlocksAreDense = [&]() { + const auto& dstStrides = childDesc.as()->getStrides(); + const auto& dstOrder = childDesc.as()->getOrder(); + const size_t channelDim = 1; + if (dstStrides.back() != 1) + return false; + for (int i = inDims.size() - 1; i > 0; i--) { + if (dstStrides[i-1] != dstStrides[i] * inDims[dstOrder[i]] && dstOrder[i] != channelDim) + return false; + } + return true; + }; + if (isNspc2NcspCase) { + canUseNspc2Ncsp = inDims[1] <= 64 && inDims[1] >= 16 && + (parentDesc.as()->getPaddedElementsCount() / inDims[1]) >= 128 && + childSubBlocksAreDense(); + } else if (isNcsp2NspcCase) { + canUseNcsp2Nspc = childSubBlocksAreDense(); + } } if (!canUseNcsp2Nspc && !canUseNspc2Ncsp) { - auto &dstMemPtr = getChildEdgeAt(0)->getMemoryPtr(); - auto &srcMemPtr = getParentEdgeAt(0)->getMemoryPtr(); if (!dstMemPtr || !dstMemPtr->GetPrimitivePtr()) IE_THROW() << "Destination memory didn't allocate."; if (!srcMemPtr || !srcMemPtr->GetPrimitivePtr()) @@ -207,6 +238,7 @@ void MKLDNNReorderNode::optimizedNcsp2Nspc() { auto childEdge = getChildEdgeAt(0); auto inDims = parentEdge->getMemory().GetShape().getStaticDims(); + const auto dstStrides = 
childEdge->getMemoryPtr()->GetDescWithType()->getStrides(); const size_t ndims = inDims.size(); const size_t DIM0 = inDims[0]; const size_t DIM1 = inDims[1]; @@ -217,18 +249,20 @@ void MKLDNNReorderNode::optimizedNcsp2Nspc() { auto src_data = reinterpret_cast(parentEdge->getMemoryPtr()->GetPtr()); auto dst_data = reinterpret_cast(childEdge->getMemoryPtr()->GetPtr()); - const size_t stride0 = DIM1 * DIM2 * DIM3 * DIM4; + const size_t src_batch_stride = DIM1 * DIM2 * DIM3 * DIM4; + const size_t dst_batch_stride = dstStrides[0]; + const size_t dst_channel_stride = dstStrides[ndims-2]; const size_t stride1 = DIM2 * DIM3 * DIM4; const size_t stride2 = DIM2 * DIM3; parallel_for3d(DIM0, DIM1, stride2, [&](size_t dim0, size_t dim1, size_t j) { - size_t src_off = dim0 * stride0 + j * DIM4 + dim1 * stride1; - size_t dst_off = dim0 * stride0 + j * DIM4 * DIM1 + dim1; + size_t src_off = dim0 * src_batch_stride + j * DIM4 + dim1 * stride1; + size_t dst_off = dim0 * dst_batch_stride + j * DIM4 * dst_channel_stride + dim1; for (size_t dim4 = 0; dim4 < DIM4; ++dim4) { dst_data[dst_off] = src_data[src_off]; src_off++; - dst_off += DIM1; + dst_off += dst_channel_stride; } }); } @@ -248,15 +282,17 @@ void MKLDNNReorderNode::optimizedNspc2Ncsp() { auto src_data = reinterpret_cast(parentEdge->getMemoryPtr()->GetPtr()); auto dst_data = reinterpret_cast(childEdge->getMemoryPtr()->GetPtr()); - const size_t stride1 = DIM2 * DIM3 * DIM4; - const size_t stride0 = stride1 * DIM1; - parallel_for2d(DIM0, stride1, [&](size_t b, size_t j) { - auto src_off = b*stride0 + j*DIM1; - auto dst_off = b*stride0 + j; + const auto dstStrides = childEdge->getMemoryPtr()->GetDescWithType()->getStrides(); + const size_t block_size = DIM2 * DIM3 * DIM4; + const size_t src_batch_stride = block_size * DIM1; + const size_t dst_batch_stride = dstStrides[0]; + parallel_for2d(DIM0, block_size, [&](size_t b, size_t j) { + auto src_off = b * src_batch_stride + j * DIM1; + auto dst_off = b * dst_batch_stride + j; for 
(size_t dim1 = 0; dim1 < DIM1; ++dim1) { dst_data[dst_off] = src_data[src_off]; src_off++; - dst_off += stride1; + dst_off += block_size; } }); } diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.h index 95d51b93523..1ccb23768d3 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.h @@ -73,6 +73,7 @@ private: bool isOptimized = false; bool isNspc2NcspCase = false; + bool isNcsp2NspcCase = false; bool canUseNspc2Ncsp = false; bool canUseNcsp2Nspc = false; diff --git a/inference-engine/tests/unit/cpu/CMakeLists.txt b/inference-engine/tests/unit/cpu/CMakeLists.txt index 683c23d77a3..f90dd34bdd7 100644 --- a/inference-engine/tests/unit/cpu/CMakeLists.txt +++ b/inference-engine/tests/unit/cpu/CMakeLists.txt @@ -9,6 +9,8 @@ addIeTargetTest( ROOT ${CMAKE_CURRENT_SOURCE_DIR} INCLUDES ${IE_MAIN_SOURCE_DIR}/src/mkldnn_plugin + ${IE_MAIN_SOURCE_DIR}/src/mkldnn_plugin/nodes + $ OBJECT_FILES $ LINK_LIBRARIES diff --git a/inference-engine/tests/unit/cpu/nodes/mkldnn_reorder_node.cpp b/inference-engine/tests/unit/cpu/nodes/mkldnn_reorder_node.cpp new file mode 100644 index 00000000000..0d39ff48027 --- /dev/null +++ b/inference-engine/tests/unit/cpu/nodes/mkldnn_reorder_node.cpp @@ -0,0 +1,262 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include + +#include "mkldnn_reorder_node.h" +#include "mkldnn_input_node.h" +#include "mkldnn_edge.h" +#include "mkldnn_node.h" +/* + * Test MKLDNNReorderNode::optimizedNcsp2Nspc() and MKLDNNReorderNode::optimizedNspc2Ncsp() for + * inPlace and non-inPlace cases. Specifically, the test checks that dst batch strides are + * correctly taken into account by the custom impls (the case when the reorder is followed by an inplace concat). 
+ */ +typedef std::tuple< + std::vector, // srcDims + bool> // forceInplace; + ReorderCustomImplTestParamSet; + +class ReorderCustomImplTestBase: public ::testing::Test { +public: + static std::string getTestCaseName(const testing::TestParamInfo &obj) { + std::vector srcDims; + bool inPlace; + std::tie(srcDims, inPlace) = obj.param; + std::ostringstream result; + result << "IS:("; + for (const auto s : srcDims) + result << s << "."; + result.seekp(-1, result.cur); + result << ")"; + result << "_InPlace:" << inPlace; + return result.str(); + } + +protected: + void executeReorderNode(const void* srcData, void* dstData) { + auto getBlockedDims = [](const std::vector& dims, const std::vector& order){ + std::vector result; + result.reserve(order.size()); + for (auto i : order) + result.push_back(dims[i]); + return result; + }; + auto getStrides = [](const std::vector& dims){ + std::vector result(dims.size()); + result[dims.size() - 1] = 1; + for (int i = dims.size() - 2; i >= 0; --i) { + result[i] = result[i+1] * dims[i+1]; + } + return result; + }; + const mkldnn::engine cpuEngine(dnnl::engine::kind::cpu, 0); + MKLDNNPlugin::MKLDNNWeightsSharing::Ptr weightsCache; + + auto inputNode = std::make_shared(MKLDNNPlugin::Shape(srcDims), + prec, + "Reorder_Input", "Input", + cpuEngine, weightsCache); + auto reorderNode = std::make_shared("Reorder", cpuEngine, weightsCache); + auto outputNode = std::make_shared(MKLDNNPlugin::Shape(dstDims), + prec, + "Reorder_Output", "Output", + cpuEngine, weightsCache); + + auto parentEdge = std::make_shared(inputNode, reorderNode, 0, 0); + auto childEdge = std::make_shared(reorderNode, outputNode, 0, 0); + parentEdge->changeStatus(MKLDNNPlugin::MKLDNNEdge::Status::NeedAllocation); + childEdge->changeStatus(MKLDNNPlugin::MKLDNNEdge::Status::NeedAllocation); + reorderNode->addEdge(parentEdge); + reorderNode->addEdge(childEdge); + + const std::vector srcBlockedDims = getBlockedDims(srcDims, srcOrder); + const std::vector srcStrides = 
getStrides(srcBlockedDims); + const std::vector offsetPaddingToData(srcDims.size(), 0); + + const std::vector dstBlockedDims = getBlockedDims(dstDims, dstOrder); + const std::vector dstStrides = getStrides(dstBlockedDims); + + const MKLDNNPlugin::CpuBlockedMemoryDesc inputDesc(prec, MKLDNNPlugin::Shape(srcDims), + srcBlockedDims, srcOrder, + 0, offsetPaddingToData, srcStrides); + + const MKLDNNPlugin::CpuBlockedMemoryDesc outputDesc(prec, MKLDNNPlugin::Shape(srcDims), + getBlockedDims(srcDims, dstOrder), dstOrder, + 0, offsetPaddingToData, dstStrides); + + auto parentMemory = std::make_shared(cpuEngine); + auto childMemory = std::make_shared(cpuEngine); + parentMemory->Create(inputDesc, srcData); + childMemory->Create(outputDesc, dstData); + parentEdge->reuse(parentMemory); + childEdge->reuse(childMemory); + + reorderNode->setDescs(inputDesc, outputDesc); + std::vector> nodes {inputNode, reorderNode, outputNode}; + for (auto &n : nodes) { + n->init(); + n->getSupportedDescriptors(); + n->initSupportedPrimitiveDescriptors(); + n->selectPrimitiveDescriptorByIndex(0); + } + auto config = outputNode->getSelectedPrimitiveDescriptor()->getConfig(); + config.inConfs.resize(1); + config.inConfs[0].inPlace = forceInplace ? 0 : -1; + outputNode->getSelectedPrimitiveDescriptor()->setConfig(config); + reorderNode->createPrimitive(); + + mkldnn::stream strm(cpuEngine); + reorderNode->execute(strm); + return; + } + + template + void Run(const std::vector& srcData, std::vector& dstData) { + fillData(); + executeReorderNode(srcData.data(), dstData.data()); + EXPECT_TRUE(resultIsCorrect(dstData)); + } + // Fill srcData so that the results of NSPC2NCSP and NCSP2NSPC reorders are incremental numbers 0,1,2,... 
+ // Fill dstData with zeros + virtual void fillData() = 0; + template + bool resultIsCorrect(const std::vector& dstData) { + const size_t numElems = getNumElems(dstDims); + auto b = dstData.begin(); + std::vector expectedData(blockSize); + for (int i = 0; i < numElems / blockSize; i++, b += blockSize) { + if (i % 2 == 0) { + std::iota(expectedData.begin(), expectedData.end(), i / 2 * blockSize); + if (!std::equal(b, b + blockSize, expectedData.begin())) + return false; + } else if (!std::all_of(b, b + blockSize, [](T x){return x == 0;})) { + return false; + } + } + return true; + } + size_t getNumElems(const std::vector& dims) { + size_t result = 1; + for (auto d : dims) + result *= d; + return result; + } + std::vector srcDims; + std::vector srcOrder; + std::vector dstDims; + std::vector dstOrder; + InferenceEngine::Precision prec; + bool forceInplace; + size_t blockSize; +}; + +class ReorderNSPC2NCSPTest: public testing::WithParamInterface, + public ReorderCustomImplTestBase{ +protected: + void SetUp() override { + std::tie(srcDims, forceInplace) = this->GetParam(); + // The custom NSPC2NCSP impl is used only if an input shape complies with: + assert(srcDims[1] <= 64 && srcDims[1] >= 16 && (getNumElems(srcDims) / srcDims[1]) >= 128); + // The custom NSPC2NCSP impl is used only for FP32 + prec = InferenceEngine::Precision::FP32; + srcOrder = std::vector {0, 2, 3, 1}; + dstOrder = std::vector {0, 1, 2, 3}; + dstDims = srcDims; + blockSize = getNumElems(srcDims); + // Create channel-strided dst layout for the inPlace case + // Other dstDims could also be supported, but fillData() and resultIsCorrect() should be updated accordingly. 
+ if (forceInplace) { + dstDims[1] *= 2; + blockSize /= srcDims[0]; + } + } + void Run() { + ReorderCustomImplTestBase::Run(srcData, dstData); + } + void fillData() override { + dstData.resize(getNumElems(dstDims)); + std::fill(dstData.begin(), dstData.end(), 0); + srcData.resize(getNumElems(srcDims)); + const int numChannels = srcDims[1]; + const int spBlockSize = srcDims[2] * srcDims[3]; + const int batchSize = spBlockSize * numChannels; + int i = 0; + for (int n = 0; n < getNumElems(srcDims); n += batchSize) { + for (int sp = n; sp < n + spBlockSize; sp++) { + for (int c = sp; c < sp + batchSize; c += spBlockSize) { + srcData[i++] = static_cast(c); + } + } + } + } + std::vector dstData; + std::vector srcData; +}; + +class ReorderNCSP2NSPCTest: public testing::WithParamInterface, + public ReorderCustomImplTestBase{ +protected: + void SetUp() override { + std::tie(srcDims, forceInplace) = this->GetParam(); + // Avoid uint8_t overflow or modify fillData() and resultIsCorrect() + assert(getNumElems(srcDims) <= 256); + srcOrder = std::vector {0, 1, 2, 3}; + dstOrder = std::vector {0, 2, 3, 1}; + // The custom NCSP2NSPC impl is used only for U8 + prec = InferenceEngine::Precision::U8; + dstDims = srcDims; + blockSize = getNumElems(srcDims); + // Create channel-strided dst layout for the inPlace case + // Other dstDims could also be supported, but fillData() and resultIsCorrect() should be updated accordingly. 
+ if (forceInplace) { + dstDims[1] *= 2; + blockSize = srcDims[1]; + } + } + void Run() { + ReorderCustomImplTestBase::Run(srcData, dstData); + } + void fillData() override { + dstData.resize(getNumElems(dstDims)); + std::fill(dstData.begin(), dstData.end(), 0); + srcData.resize(getNumElems(srcDims)); + const int numChannels = srcDims[1]; + const int batchSize = srcDims[2] * srcDims[3] * numChannels; + int i = 0; + for (int n = 0; n < getNumElems(srcDims); n += batchSize) { + for (int c = n; c < n + numChannels; c ++) { + for (int sp = c; sp < c + batchSize; sp += numChannels) { + srcData[i++] = static_cast(sp); + } + } + } + } + std::vector dstData; + std::vector srcData; +}; + +TEST_P(ReorderNSPC2NCSPTest, NSPC2NCSP) { + Run(); +} + +TEST_P(ReorderNCSP2NSPCTest, NCSP2NSPC) { + Run(); +} + +const std::vector forceInplace {false, true}; +const auto NSPC2NCSPparams =::testing::Combine( + ::testing::Values(std::vector {2, 16, 8, 8}), + ::testing::ValuesIn(forceInplace)); + +INSTANTIATE_TEST_SUITE_P(smoke_ReorderTestCustomNSPC, ReorderNSPC2NCSPTest, NSPC2NCSPparams, + ReorderCustomImplTestBase::getTestCaseName); + +const auto NCSP2NSPCparams =::testing::Combine( + ::testing::Values(std::vector {2, 8, 4, 4}), + ::testing::ValuesIn(forceInplace)); + +INSTANTIATE_TEST_SUITE_P(smoke_ReorderTestCustomNCSP, ReorderNCSP2NSPCTest, NCSP2NSPCparams, + ReorderCustomImplTestBase::getTestCaseName); \ No newline at end of file