From cf52ba5c0865b1657a4a365c2a1cfe99c3975565 Mon Sep 17 00:00:00 2001 From: Ivan Novoselov Date: Mon, 18 Oct 2021 18:31:23 +0300 Subject: [PATCH] [CPU] Bug/custom_inplace_reorders/fix unit (#7222) --- .../nodes/mkldnn_reorder_node.cpp | 70 +++-- .../mkldnn_plugin/nodes/mkldnn_reorder_node.h | 1 + .../tests/unit/cpu/CMakeLists.txt | 2 + .../unit/cpu/nodes/mkldnn_reorder_node.cpp | 262 ++++++++++++++++++ 4 files changed, 318 insertions(+), 17 deletions(-) create mode 100644 inference-engine/tests/unit/cpu/nodes/mkldnn_reorder_node.cpp diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.cpp index 1fdaf02315d..546f0afef67 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.cpp +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.cpp @@ -71,7 +71,6 @@ void MKLDNNReorderNode::initSupportedPrimitiveDescriptors() { if (isDynamic && (config.inConfs[0].desc->getShape().getRank() != config.outConfs[0].desc->getShape().getRank())) IE_THROW() << "Reorder node doesn't support case when input and output shapes have different rank and dynamic"; - if (!isOptimized) { const auto &inShape = getInputShapeAtPort(0); if (MKLDNNPlugin::one_of(inShape.getRank(), 4, 5) && @@ -88,7 +87,7 @@ void MKLDNNReorderNode::initSupportedPrimitiveDescriptors() { config.inConfs[0].desc->getPrecision() == config.outConfs[0].desc->getPrecision() && config.inConfs[0].desc->getPrecision().size() == 1) { // oneDNN doesn't provide JIT reorder impl for non-avx2 targets so we fallback on simple c++ implementation which shows better perf - canUseNcsp2Nspc = true; + isNcsp2NspcCase = true; } } } @@ -116,14 +115,46 @@ void MKLDNNReorderNode::prepareParams() { if (getSelectedPrimitiveDescriptor() == nullptr) IE_THROW() << "Preferable primitive descriptor is not set."; - if (isNspc2NcspCase) { + auto isSupportedDesc = [](const MemoryDesc& desc) { + if (!desc.isDefined()) { + return false; + 
} + if (!(desc.getType() & MemoryDescType::Blocked)) { + return false; + } + if ((desc.getType() & MemoryDescType::Mkldnn) && !desc.as()->hasEmptyExtraData()) { + return false; + } + return true; + }; + + const auto& parentDesc = srcMemPtr->getDesc(); + const auto& childDesc = dstMemPtr->getDesc(); + if ((isNspc2NcspCase || isNcsp2NspcCase) && isSupportedDesc(childDesc) && isSupportedDesc(parentDesc)) { const auto &inDims = srcMemPtr->getStaticDims(); - canUseNspc2Ncsp = inDims[1] <= 64 && inDims[1] >= 16 && - (srcMemPtr->GetDescWithType()->getPaddedElementsCount() / inDims[1]) >= 128; + // Check that child strides are consistent with parent dims if the child is inplace. + // The strides must be dense except for the channel one (since the child num channels might differ) + const auto childSubBlocksAreDense = [&]() { + const auto& dstStrides = childDesc.as()->getStrides(); + const auto& dstOrder = childDesc.as()->getOrder(); + const size_t channelDim = 1; + if (dstStrides.back() != 1) + return false; + for (int i = inDims.size() - 1; i > 0; i--) { + if (dstStrides[i-1] != dstStrides[i] * inDims[dstOrder[i]] && dstOrder[i] != channelDim) + return false; + } + return true; + }; + if (isNspc2NcspCase) { + canUseNspc2Ncsp = inDims[1] <= 64 && inDims[1] >= 16 && + (parentDesc.as()->getPaddedElementsCount() / inDims[1]) >= 128 && + childSubBlocksAreDense(); + } else if (isNcsp2NspcCase) { + canUseNcsp2Nspc = childSubBlocksAreDense(); + } } if (!canUseNcsp2Nspc && !canUseNspc2Ncsp) { - auto &dstMemPtr = getChildEdgeAt(0)->getMemoryPtr(); - auto &srcMemPtr = getParentEdgeAt(0)->getMemoryPtr(); if (!dstMemPtr || !dstMemPtr->GetPrimitivePtr()) IE_THROW() << "Destination memory didn't allocate."; if (!srcMemPtr || !srcMemPtr->GetPrimitivePtr()) @@ -207,6 +238,7 @@ void MKLDNNReorderNode::optimizedNcsp2Nspc() { auto childEdge = getChildEdgeAt(0); auto inDims = parentEdge->getMemory().GetShape().getStaticDims(); + const auto dstStrides = 
childEdge->getMemoryPtr()->GetDescWithType()->getStrides(); const size_t ndims = inDims.size(); const size_t DIM0 = inDims[0]; const size_t DIM1 = inDims[1]; @@ -217,18 +249,20 @@ void MKLDNNReorderNode::optimizedNcsp2Nspc() { auto src_data = reinterpret_cast(parentEdge->getMemoryPtr()->GetPtr()); auto dst_data = reinterpret_cast(childEdge->getMemoryPtr()->GetPtr()); - const size_t stride0 = DIM1 * DIM2 * DIM3 * DIM4; + const size_t src_batch_stride = DIM1 * DIM2 * DIM3 * DIM4; + const size_t dst_batch_stride = dstStrides[0]; + const size_t dst_channel_stride = dstStrides[ndims-2]; const size_t stride1 = DIM2 * DIM3 * DIM4; const size_t stride2 = DIM2 * DIM3; parallel_for3d(DIM0, DIM1, stride2, [&](size_t dim0, size_t dim1, size_t j) { - size_t src_off = dim0 * stride0 + j * DIM4 + dim1 * stride1; - size_t dst_off = dim0 * stride0 + j * DIM4 * DIM1 + dim1; + size_t src_off = dim0 * src_batch_stride + j * DIM4 + dim1 * stride1; + size_t dst_off = dim0 * dst_batch_stride + j * DIM4 * dst_channel_stride + dim1; for (size_t dim4 = 0; dim4 < DIM4; ++dim4) { dst_data[dst_off] = src_data[src_off]; src_off++; - dst_off += DIM1; + dst_off += dst_channel_stride; } }); } @@ -248,15 +282,17 @@ void MKLDNNReorderNode::optimizedNspc2Ncsp() { auto src_data = reinterpret_cast(parentEdge->getMemoryPtr()->GetPtr()); auto dst_data = reinterpret_cast(childEdge->getMemoryPtr()->GetPtr()); - const size_t stride1 = DIM2 * DIM3 * DIM4; - const size_t stride0 = stride1 * DIM1; - parallel_for2d(DIM0, stride1, [&](size_t b, size_t j) { - auto src_off = b*stride0 + j*DIM1; - auto dst_off = b*stride0 + j; + const auto dstStrides = childEdge->getMemoryPtr()->GetDescWithType()->getStrides(); + const size_t block_size = DIM2 * DIM3 * DIM4; + const size_t src_batch_stride = block_size * DIM1; + const size_t dst_batch_stride = dstStrides[0]; + parallel_for2d(DIM0, block_size, [&](size_t b, size_t j) { + auto src_off = b * src_batch_stride + j * DIM1; + auto dst_off = b * dst_batch_stride + j; for 
(size_t dim1 = 0; dim1 < DIM1; ++dim1) { dst_data[dst_off] = src_data[src_off]; src_off++; - dst_off += stride1; + dst_off += block_size; } }); } diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.h index 95d51b93523..1ccb23768d3 100644 --- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.h +++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.h @@ -73,6 +73,7 @@ private: bool isOptimized = false; bool isNspc2NcspCase = false; + bool isNcsp2NspcCase = false; bool canUseNspc2Ncsp = false; bool canUseNcsp2Nspc = false; diff --git a/inference-engine/tests/unit/cpu/CMakeLists.txt b/inference-engine/tests/unit/cpu/CMakeLists.txt index 683c23d77a3..f90dd34bdd7 100644 --- a/inference-engine/tests/unit/cpu/CMakeLists.txt +++ b/inference-engine/tests/unit/cpu/CMakeLists.txt @@ -9,6 +9,8 @@ addIeTargetTest( ROOT ${CMAKE_CURRENT_SOURCE_DIR} INCLUDES ${IE_MAIN_SOURCE_DIR}/src/mkldnn_plugin + ${IE_MAIN_SOURCE_DIR}/src/mkldnn_plugin/nodes + $ OBJECT_FILES $ LINK_LIBRARIES diff --git a/inference-engine/tests/unit/cpu/nodes/mkldnn_reorder_node.cpp b/inference-engine/tests/unit/cpu/nodes/mkldnn_reorder_node.cpp new file mode 100644 index 00000000000..0d39ff48027 --- /dev/null +++ b/inference-engine/tests/unit/cpu/nodes/mkldnn_reorder_node.cpp @@ -0,0 +1,262 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include + +#include "mkldnn_reorder_node.h" +#include "mkldnn_input_node.h" +#include "mkldnn_edge.h" +#include "mkldnn_node.h" +/* + * Test MKLDNNReorderNode::optimizedNcsp2Nspc() and MKLDNNReorderNode::optimizedNspc2Ncsp() for + * inPlace and non-inPlace cases. Specifically, the test checks that dst batch strides are + * correctly taken into account by the custom impls (the case when the reorder is followed by an inplace concat). 
+ */ +typedef std::tuple< + std::vector, // srcDims + bool> // forceInplace; + ReorderCustomImplTestParamSet; + +class ReorderCustomImplTestBase: public ::testing::Test { +public: + static std::string getTestCaseName(const testing::TestParamInfo &obj) { + std::vector srcDims; + bool inPlace; + std::tie(srcDims, inPlace) = obj.param; + std::ostringstream result; + result << "IS:("; + for (const auto s : srcDims) + result << s << "."; + result.seekp(-1, result.cur); + result << ")"; + result << "_InPlace:" << inPlace; + return result.str(); + } + +protected: + void executeReorderNode(const void* srcData, void* dstData) { + auto getBlockedDims = [](const std::vector& dims, const std::vector& order){ + std::vector result; + result.reserve(order.size()); + for (auto i : order) + result.push_back(dims[i]); + return result; + }; + auto getStrides = [](const std::vector& dims){ + std::vector result(dims.size()); + result[dims.size() - 1] = 1; + for (int i = dims.size() - 2; i >= 0; --i) { + result[i] = result[i+1] * dims[i+1]; + } + return result; + }; + const mkldnn::engine cpuEngine(dnnl::engine::kind::cpu, 0); + MKLDNNPlugin::MKLDNNWeightsSharing::Ptr weightsCache; + + auto inputNode = std::make_shared(MKLDNNPlugin::Shape(srcDims), + prec, + "Reorder_Input", "Input", + cpuEngine, weightsCache); + auto reorderNode = std::make_shared("Reorder", cpuEngine, weightsCache); + auto outputNode = std::make_shared(MKLDNNPlugin::Shape(dstDims), + prec, + "Reorder_Output", "Output", + cpuEngine, weightsCache); + + auto parentEdge = std::make_shared(inputNode, reorderNode, 0, 0); + auto childEdge = std::make_shared(reorderNode, outputNode, 0, 0); + parentEdge->changeStatus(MKLDNNPlugin::MKLDNNEdge::Status::NeedAllocation); + childEdge->changeStatus(MKLDNNPlugin::MKLDNNEdge::Status::NeedAllocation); + reorderNode->addEdge(parentEdge); + reorderNode->addEdge(childEdge); + + const std::vector srcBlockedDims = getBlockedDims(srcDims, srcOrder); + const std::vector srcStrides = 
getStrides(srcBlockedDims); + const std::vector offsetPaddingToData(srcDims.size(), 0); + + const std::vector dstBlockedDims = getBlockedDims(dstDims, dstOrder); + const std::vector dstStrides = getStrides(dstBlockedDims); + + const MKLDNNPlugin::CpuBlockedMemoryDesc inputDesc(prec, MKLDNNPlugin::Shape(srcDims), + srcBlockedDims, srcOrder, + 0, offsetPaddingToData, srcStrides); + + const MKLDNNPlugin::CpuBlockedMemoryDesc outputDesc(prec, MKLDNNPlugin::Shape(srcDims), + getBlockedDims(srcDims, dstOrder), dstOrder, + 0, offsetPaddingToData, dstStrides); + + auto parentMemory = std::make_shared(cpuEngine); + auto childMemory = std::make_shared(cpuEngine); + parentMemory->Create(inputDesc, srcData); + childMemory->Create(outputDesc, dstData); + parentEdge->reuse(parentMemory); + childEdge->reuse(childMemory); + + reorderNode->setDescs(inputDesc, outputDesc); + std::vector> nodes {inputNode, reorderNode, outputNode}; + for (auto &n : nodes) { + n->init(); + n->getSupportedDescriptors(); + n->initSupportedPrimitiveDescriptors(); + n->selectPrimitiveDescriptorByIndex(0); + } + auto config = outputNode->getSelectedPrimitiveDescriptor()->getConfig(); + config.inConfs.resize(1); + config.inConfs[0].inPlace = forceInplace ? 0 : -1; + outputNode->getSelectedPrimitiveDescriptor()->setConfig(config); + reorderNode->createPrimitive(); + + mkldnn::stream strm(cpuEngine); + reorderNode->execute(strm); + return; + } + + template + void Run(const std::vector& srcData, std::vector& dstData) { + fillData(); + executeReorderNode(srcData.data(), dstData.data()); + EXPECT_TRUE(resultIsCorrect(dstData)); + } + // Fill srcData so that the results of NSPC2NCSP and NCSP2NSPC reorders are incremental numbers 0,1,2,... 
+ // Fill dstData with zeros + virtual void fillData() = 0; + template + bool resultIsCorrect(const std::vector& dstData) { + const size_t numElems = getNumElems(dstDims); + auto b = dstData.begin(); + std::vector expectedData(blockSize); + for (int i = 0; i < numElems / blockSize; i++, b += blockSize) { + if (i % 2 == 0) { + std::iota(expectedData.begin(), expectedData.end(), i / 2 * blockSize); + if (!std::equal(b, b + blockSize, expectedData.begin())) + return false; + } else if (!std::all_of(b, b + blockSize, [](T x){return x == 0;})) { + return false; + } + } + return true; + } + size_t getNumElems(const std::vector& dims) { + size_t result = 1; + for (auto d : dims) + result *= d; + return result; + } + std::vector srcDims; + std::vector srcOrder; + std::vector dstDims; + std::vector dstOrder; + InferenceEngine::Precision prec; + bool forceInplace; + size_t blockSize; +}; + +class ReorderNSPC2NCSPTest: public testing::WithParamInterface, + public ReorderCustomImplTestBase{ +protected: + void SetUp() override { + std::tie(srcDims, forceInplace) = this->GetParam(); + // The custom NSPC2NCSP impl is used only if an input shape complies with: + assert(srcDims[1] <= 64 && srcDims[1] >= 16 && (getNumElems(srcDims) / srcDims[1]) >= 128); + // The custom NSPC2NCSP impl is used only for FP32 + prec = InferenceEngine::Precision::FP32; + srcOrder = std::vector {0, 2, 3, 1}; + dstOrder = std::vector {0, 1, 2, 3}; + dstDims = srcDims; + blockSize = getNumElems(srcDims); + // Create channel-strided dst layout for the inPlace case + // Other dstDims could also be supported, but fillData() and resultIsCorrect() should be updated accordingly. 
+ if (forceInplace) { + dstDims[1] *= 2; + blockSize /= srcDims[0]; + } + } + void Run() { + ReorderCustomImplTestBase::Run(srcData, dstData); + } + void fillData() override { + dstData.resize(getNumElems(dstDims)); + std::fill(dstData.begin(), dstData.end(), 0); + srcData.resize(getNumElems(srcDims)); + const int numChannels = srcDims[1]; + const int spBlockSize = srcDims[2] * srcDims[3]; + const int batchSize = spBlockSize * numChannels; + int i = 0; + for (int n = 0; n < getNumElems(srcDims); n += batchSize) { + for (int sp = n; sp < n + spBlockSize; sp++) { + for (int c = sp; c < sp + batchSize; c += spBlockSize) { + srcData[i++] = static_cast(c); + } + } + } + } + std::vector dstData; + std::vector srcData; +}; + +class ReorderNCSP2NSPCTest: public testing::WithParamInterface, + public ReorderCustomImplTestBase{ +protected: + void SetUp() override { + std::tie(srcDims, forceInplace) = this->GetParam(); + // Avoid uint8_t overflow or modify fillData() and resultIsCorrect() + assert(getNumElems(srcDims) <= 256); + srcOrder = std::vector {0, 1, 2, 3}; + dstOrder = std::vector {0, 2, 3, 1}; + // The custom NCSP2NSPC impl is used only for U8 + prec = InferenceEngine::Precision::U8; + dstDims = srcDims; + blockSize = getNumElems(srcDims); + // Create channel-strided dst layout for the inPlace case + // Other dstDims could also be supported, but fillData() and resultIsCorrect() should be updated accordingly. 
+ if (forceInplace) { + dstDims[1] *= 2; + blockSize = srcDims[1]; + } + } + void Run() { + ReorderCustomImplTestBase::Run(srcData, dstData); + } + void fillData() override { + dstData.resize(getNumElems(dstDims)); + std::fill(dstData.begin(), dstData.end(), 0); + srcData.resize(getNumElems(srcDims)); + const int numChannels = srcDims[1]; + const int batchSize = srcDims[2] * srcDims[3] * numChannels; + int i = 0; + for (int n = 0; n < getNumElems(srcDims); n += batchSize) { + for (int c = n; c < n + numChannels; c ++) { + for (int sp = c; sp < c + batchSize; sp += numChannels) { + srcData[i++] = static_cast(sp); + } + } + } + } + std::vector dstData; + std::vector srcData; +}; + +TEST_P(ReorderNSPC2NCSPTest, NSPC2NCSP) { + Run(); +} + +TEST_P(ReorderNCSP2NSPCTest, NCSP2NSPC) { + Run(); +} + +const std::vector forceInplace {false, true}; +const auto NSPC2NCSPparams =::testing::Combine( + ::testing::Values(std::vector {2, 16, 8, 8}), + ::testing::ValuesIn(forceInplace)); + +INSTANTIATE_TEST_SUITE_P(smoke_ReorderTestCustomNSPC, ReorderNSPC2NCSPTest, NSPC2NCSPparams, + ReorderCustomImplTestBase::getTestCaseName); + +const auto NCSP2NSPCparams =::testing::Combine( + ::testing::Values(std::vector {2, 8, 4, 4}), + ::testing::ValuesIn(forceInplace)); + +INSTANTIATE_TEST_SUITE_P(smoke_ReorderTestCustomNCSP, ReorderNCSP2NSPCTest, NCSP2NSPCparams, + ReorderCustomImplTestBase::getTestCaseName); \ No newline at end of file