[CPU] Bug/custom_inplace_reorders/fix unit (#7222)
This commit is contained in:
parent
ac9c2f19dc
commit
cf52ba5c08
@ -71,7 +71,6 @@ void MKLDNNReorderNode::initSupportedPrimitiveDescriptors() {
|
||||
|
||||
if (isDynamic && (config.inConfs[0].desc->getShape().getRank() != config.outConfs[0].desc->getShape().getRank()))
|
||||
IE_THROW() << "Reorder node doesn't support case when input and output shapes have different rank and dynamic";
|
||||
|
||||
if (!isOptimized) {
|
||||
const auto &inShape = getInputShapeAtPort(0);
|
||||
if (MKLDNNPlugin::one_of(inShape.getRank(), 4, 5) &&
|
||||
@ -88,7 +87,7 @@ void MKLDNNReorderNode::initSupportedPrimitiveDescriptors() {
|
||||
config.inConfs[0].desc->getPrecision() == config.outConfs[0].desc->getPrecision() &&
|
||||
config.inConfs[0].desc->getPrecision().size() == 1) {
|
||||
// oneDNN doesn't provide JIT reorder impl for non-avx2 targets so we fallback on simple c++ implementation which shows better perf
|
||||
canUseNcsp2Nspc = true;
|
||||
isNcsp2NspcCase = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -116,14 +115,46 @@ void MKLDNNReorderNode::prepareParams() {
|
||||
if (getSelectedPrimitiveDescriptor() == nullptr)
|
||||
IE_THROW() << "Preferable primitive descriptor is not set.";
|
||||
|
||||
if (isNspc2NcspCase) {
|
||||
auto isSupportedDesc = [](const MemoryDesc& desc) {
|
||||
if (!desc.isDefined()) {
|
||||
return false;
|
||||
}
|
||||
if (!(desc.getType() & MemoryDescType::Blocked)) {
|
||||
return false;
|
||||
}
|
||||
if ((desc.getType() & MemoryDescType::Mkldnn) && !desc.as<const DnnlMemoryDesc>()->hasEmptyExtraData()) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
};
|
||||
|
||||
const auto& parentDesc = srcMemPtr->getDesc();
|
||||
const auto& childDesc = dstMemPtr->getDesc();
|
||||
if ((isNspc2NcspCase || isNcsp2NspcCase) && isSupportedDesc(childDesc) && isSupportedDesc(parentDesc)) {
|
||||
const auto &inDims = srcMemPtr->getStaticDims();
|
||||
// Check that child strides are consistent with parent dims if the child is inplace.
|
||||
// The strides must be dense except for the channel one (since the child num channels might differ)
|
||||
const auto childSubBlocksAreDense = [&]() {
|
||||
const auto& dstStrides = childDesc.as<BlockedMemoryDesc>()->getStrides();
|
||||
const auto& dstOrder = childDesc.as<BlockedMemoryDesc>()->getOrder();
|
||||
const size_t channelDim = 1;
|
||||
if (dstStrides.back() != 1)
|
||||
return false;
|
||||
for (int i = inDims.size() - 1; i > 0; i--) {
|
||||
if (dstStrides[i-1] != dstStrides[i] * inDims[dstOrder[i]] && dstOrder[i] != channelDim)
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
};
|
||||
if (isNspc2NcspCase) {
|
||||
canUseNspc2Ncsp = inDims[1] <= 64 && inDims[1] >= 16 &&
|
||||
(srcMemPtr->GetDescWithType<BlockedMemoryDesc>()->getPaddedElementsCount() / inDims[1]) >= 128;
|
||||
(parentDesc.as<BlockedMemoryDesc>()->getPaddedElementsCount() / inDims[1]) >= 128 &&
|
||||
childSubBlocksAreDense();
|
||||
} else if (isNcsp2NspcCase) {
|
||||
canUseNcsp2Nspc = childSubBlocksAreDense();
|
||||
}
|
||||
}
|
||||
if (!canUseNcsp2Nspc && !canUseNspc2Ncsp) {
|
||||
auto &dstMemPtr = getChildEdgeAt(0)->getMemoryPtr();
|
||||
auto &srcMemPtr = getParentEdgeAt(0)->getMemoryPtr();
|
||||
if (!dstMemPtr || !dstMemPtr->GetPrimitivePtr())
|
||||
IE_THROW() << "Destination memory didn't allocate.";
|
||||
if (!srcMemPtr || !srcMemPtr->GetPrimitivePtr())
|
||||
@ -207,6 +238,7 @@ void MKLDNNReorderNode::optimizedNcsp2Nspc() {
|
||||
auto childEdge = getChildEdgeAt(0);
|
||||
|
||||
auto inDims = parentEdge->getMemory().GetShape().getStaticDims();
|
||||
const auto dstStrides = childEdge->getMemoryPtr()->GetDescWithType<BlockedMemoryDesc>()->getStrides();
|
||||
const size_t ndims = inDims.size();
|
||||
const size_t DIM0 = inDims[0];
|
||||
const size_t DIM1 = inDims[1];
|
||||
@ -217,18 +249,20 @@ void MKLDNNReorderNode::optimizedNcsp2Nspc() {
|
||||
auto src_data = reinterpret_cast<const uint8_t *>(parentEdge->getMemoryPtr()->GetPtr());
|
||||
auto dst_data = reinterpret_cast<uint8_t *>(childEdge->getMemoryPtr()->GetPtr());
|
||||
|
||||
const size_t stride0 = DIM1 * DIM2 * DIM3 * DIM4;
|
||||
const size_t src_batch_stride = DIM1 * DIM2 * DIM3 * DIM4;
|
||||
const size_t dst_batch_stride = dstStrides[0];
|
||||
const size_t dst_channel_stride = dstStrides[ndims-2];
|
||||
const size_t stride1 = DIM2 * DIM3 * DIM4;
|
||||
const size_t stride2 = DIM2 * DIM3;
|
||||
|
||||
parallel_for3d(DIM0, DIM1, stride2, [&](size_t dim0, size_t dim1, size_t j) {
|
||||
size_t src_off = dim0 * stride0 + j * DIM4 + dim1 * stride1;
|
||||
size_t dst_off = dim0 * stride0 + j * DIM4 * DIM1 + dim1;
|
||||
size_t src_off = dim0 * src_batch_stride + j * DIM4 + dim1 * stride1;
|
||||
size_t dst_off = dim0 * dst_batch_stride + j * DIM4 * dst_channel_stride + dim1;
|
||||
|
||||
for (size_t dim4 = 0; dim4 < DIM4; ++dim4) {
|
||||
dst_data[dst_off] = src_data[src_off];
|
||||
src_off++;
|
||||
dst_off += DIM1;
|
||||
dst_off += dst_channel_stride;
|
||||
}
|
||||
});
|
||||
}
|
||||
@ -248,15 +282,17 @@ void MKLDNNReorderNode::optimizedNspc2Ncsp() {
|
||||
auto src_data = reinterpret_cast<const float *>(parentEdge->getMemoryPtr()->GetPtr());
|
||||
auto dst_data = reinterpret_cast<float *>(childEdge->getMemoryPtr()->GetPtr());
|
||||
|
||||
const size_t stride1 = DIM2 * DIM3 * DIM4;
|
||||
const size_t stride0 = stride1 * DIM1;
|
||||
parallel_for2d(DIM0, stride1, [&](size_t b, size_t j) {
|
||||
auto src_off = b*stride0 + j*DIM1;
|
||||
auto dst_off = b*stride0 + j;
|
||||
const auto dstStrides = childEdge->getMemoryPtr()->GetDescWithType<BlockedMemoryDesc>()->getStrides();
|
||||
const size_t block_size = DIM2 * DIM3 * DIM4;
|
||||
const size_t src_batch_stride = block_size * DIM1;
|
||||
const size_t dst_batch_stride = dstStrides[0];
|
||||
parallel_for2d(DIM0, block_size, [&](size_t b, size_t j) {
|
||||
auto src_off = b * src_batch_stride + j * DIM1;
|
||||
auto dst_off = b * dst_batch_stride + j;
|
||||
for (size_t dim1 = 0; dim1 < DIM1; ++dim1) {
|
||||
dst_data[dst_off] = src_data[src_off];
|
||||
src_off++;
|
||||
dst_off += stride1;
|
||||
dst_off += block_size;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
@ -73,6 +73,7 @@ private:
|
||||
bool isOptimized = false;
|
||||
|
||||
bool isNspc2NcspCase = false;
|
||||
bool isNcsp2NspcCase = false;
|
||||
bool canUseNspc2Ncsp = false;
|
||||
bool canUseNcsp2Nspc = false;
|
||||
|
||||
|
@ -9,6 +9,8 @@ addIeTargetTest(
|
||||
ROOT ${CMAKE_CURRENT_SOURCE_DIR}
|
||||
INCLUDES
|
||||
${IE_MAIN_SOURCE_DIR}/src/mkldnn_plugin
|
||||
${IE_MAIN_SOURCE_DIR}/src/mkldnn_plugin/nodes
|
||||
$<TARGET_PROPERTY:openvino::conditional_compilation,INTERFACE_INCLUDE_DIRECTORIES>
|
||||
OBJECT_FILES
|
||||
$<TARGET_OBJECTS:MKLDNNPlugin_obj>
|
||||
LINK_LIBRARIES
|
||||
|
262
inference-engine/tests/unit/cpu/nodes/mkldnn_reorder_node.cpp
Normal file
262
inference-engine/tests/unit/cpu/nodes/mkldnn_reorder_node.cpp
Normal file
@ -0,0 +1,262 @@
|
||||
// Copyright (C) 2021 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include <gtest/gtest.h>
#include <ie_common.h>

#include <algorithm>
#include <functional>
#include <memory>
#include <numeric>
#include <sstream>
#include <string>
#include <tuple>
#include <vector>

#include "mkldnn_reorder_node.h"
#include "mkldnn_input_node.h"
#include "mkldnn_edge.h"
#include "mkldnn_node.h"
|
||||
/*
|
||||
* Test MKLDNNReorderNode::optimizedNcsp2Nspc() and MKLDNNReorderNode::optimizedNspc2Ncsp() for
|
||||
* inPlace and non-inPlace cases. Specifically, the test checks that dst batch strides are
|
||||
* correctly taken into account by the custom impls (the case when the reorder is followed by an inplace concat).
|
||||
*/
|
||||
// Parameters of a single reorder test case:
//   <0> srcDims      - logical dimensions of the source tensor
//   <1> forceInplace - whether the consumer of the reorder is configured as in-place
using ReorderCustomImplTestParamSet = std::tuple<std::vector<size_t>, bool>;
|
||||
|
||||
class ReorderCustomImplTestBase: public ::testing::Test {
|
||||
public:
|
||||
static std::string getTestCaseName(const testing::TestParamInfo<ReorderCustomImplTestParamSet> &obj) {
|
||||
std::vector<size_t> srcDims;
|
||||
bool inPlace;
|
||||
std::tie(srcDims, inPlace) = obj.param;
|
||||
std::ostringstream result;
|
||||
result << "IS:(";
|
||||
for (const auto s : srcDims)
|
||||
result << s << ".";
|
||||
result.seekp(-1, result.cur);
|
||||
result << ")";
|
||||
result << "_InPlace:" << inPlace;
|
||||
return result.str();
|
||||
}
|
||||
|
||||
protected:
|
||||
void executeReorderNode(const void* srcData, void* dstData) {
|
||||
auto getBlockedDims = [](const std::vector<size_t>& dims, const std::vector<size_t>& order){
|
||||
std::vector<size_t> result;
|
||||
result.reserve(order.size());
|
||||
for (auto i : order)
|
||||
result.push_back(dims[i]);
|
||||
return result;
|
||||
};
|
||||
auto getStrides = [](const std::vector<size_t>& dims){
|
||||
std::vector<size_t> result(dims.size());
|
||||
result[dims.size() - 1] = 1;
|
||||
for (int i = dims.size() - 2; i >= 0; --i) {
|
||||
result[i] = result[i+1] * dims[i+1];
|
||||
}
|
||||
return result;
|
||||
};
|
||||
const mkldnn::engine cpuEngine(dnnl::engine::kind::cpu, 0);
|
||||
MKLDNNPlugin::MKLDNNWeightsSharing::Ptr weightsCache;
|
||||
|
||||
auto inputNode = std::make_shared<MKLDNNPlugin::MKLDNNInputNode>(MKLDNNPlugin::Shape(srcDims),
|
||||
prec,
|
||||
"Reorder_Input", "Input",
|
||||
cpuEngine, weightsCache);
|
||||
auto reorderNode = std::make_shared<MKLDNNPlugin::MKLDNNReorderNode>("Reorder", cpuEngine, weightsCache);
|
||||
auto outputNode = std::make_shared<MKLDNNPlugin::MKLDNNInputNode>(MKLDNNPlugin::Shape(dstDims),
|
||||
prec,
|
||||
"Reorder_Output", "Output",
|
||||
cpuEngine, weightsCache);
|
||||
|
||||
auto parentEdge = std::make_shared<MKLDNNPlugin::MKLDNNEdge>(inputNode, reorderNode, 0, 0);
|
||||
auto childEdge = std::make_shared<MKLDNNPlugin::MKLDNNEdge>(reorderNode, outputNode, 0, 0);
|
||||
parentEdge->changeStatus(MKLDNNPlugin::MKLDNNEdge::Status::NeedAllocation);
|
||||
childEdge->changeStatus(MKLDNNPlugin::MKLDNNEdge::Status::NeedAllocation);
|
||||
reorderNode->addEdge(parentEdge);
|
||||
reorderNode->addEdge(childEdge);
|
||||
|
||||
const std::vector<size_t> srcBlockedDims = getBlockedDims(srcDims, srcOrder);
|
||||
const std::vector<size_t> srcStrides = getStrides(srcBlockedDims);
|
||||
const std::vector<size_t> offsetPaddingToData(srcDims.size(), 0);
|
||||
|
||||
const std::vector<size_t> dstBlockedDims = getBlockedDims(dstDims, dstOrder);
|
||||
const std::vector<size_t> dstStrides = getStrides(dstBlockedDims);
|
||||
|
||||
const MKLDNNPlugin::CpuBlockedMemoryDesc inputDesc(prec, MKLDNNPlugin::Shape(srcDims),
|
||||
srcBlockedDims, srcOrder,
|
||||
0, offsetPaddingToData, srcStrides);
|
||||
|
||||
const MKLDNNPlugin::CpuBlockedMemoryDesc outputDesc(prec, MKLDNNPlugin::Shape(srcDims),
|
||||
getBlockedDims(srcDims, dstOrder), dstOrder,
|
||||
0, offsetPaddingToData, dstStrides);
|
||||
|
||||
auto parentMemory = std::make_shared<MKLDNNPlugin::MKLDNNMemory>(cpuEngine);
|
||||
auto childMemory = std::make_shared<MKLDNNPlugin::MKLDNNMemory>(cpuEngine);
|
||||
parentMemory->Create(inputDesc, srcData);
|
||||
childMemory->Create(outputDesc, dstData);
|
||||
parentEdge->reuse(parentMemory);
|
||||
childEdge->reuse(childMemory);
|
||||
|
||||
reorderNode->setDescs(inputDesc, outputDesc);
|
||||
std::vector<std::shared_ptr<MKLDNNPlugin::MKLDNNNode>> nodes {inputNode, reorderNode, outputNode};
|
||||
for (auto &n : nodes) {
|
||||
n->init();
|
||||
n->getSupportedDescriptors();
|
||||
n->initSupportedPrimitiveDescriptors();
|
||||
n->selectPrimitiveDescriptorByIndex(0);
|
||||
}
|
||||
auto config = outputNode->getSelectedPrimitiveDescriptor()->getConfig();
|
||||
config.inConfs.resize(1);
|
||||
config.inConfs[0].inPlace = forceInplace ? 0 : -1;
|
||||
outputNode->getSelectedPrimitiveDescriptor()->setConfig(config);
|
||||
reorderNode->createPrimitive();
|
||||
|
||||
mkldnn::stream strm(cpuEngine);
|
||||
reorderNode->execute(strm);
|
||||
return;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
void Run(const std::vector<T>& srcData, std::vector<T>& dstData) {
|
||||
fillData();
|
||||
executeReorderNode(srcData.data(), dstData.data());
|
||||
EXPECT_TRUE(resultIsCorrect(dstData));
|
||||
}
|
||||
// Fill srcData so that the results of NSPC2NCSP and NCSP2NSPC reorders are incremental numbers 0,1,2,...
|
||||
// Fill dstData with zeros
|
||||
virtual void fillData() = 0;
|
||||
template<typename T>
|
||||
bool resultIsCorrect(const std::vector<T>& dstData) {
|
||||
const size_t numElems = getNumElems(dstDims);
|
||||
auto b = dstData.begin();
|
||||
std::vector<T> expectedData(blockSize);
|
||||
for (int i = 0; i < numElems / blockSize; i++, b += blockSize) {
|
||||
if (i % 2 == 0) {
|
||||
std::iota(expectedData.begin(), expectedData.end(), i / 2 * blockSize);
|
||||
if (!std::equal(b, b + blockSize, expectedData.begin()))
|
||||
return false;
|
||||
} else if (!std::all_of(b, b + blockSize, [](T x){return x == 0;})) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
size_t getNumElems(const std::vector<size_t>& dims) {
|
||||
size_t result = 1;
|
||||
for (auto d : dims)
|
||||
result *= d;
|
||||
return result;
|
||||
}
|
||||
std::vector<size_t> srcDims;
|
||||
std::vector<size_t> srcOrder;
|
||||
std::vector<size_t> dstDims;
|
||||
std::vector<size_t> dstOrder;
|
||||
InferenceEngine::Precision prec;
|
||||
bool forceInplace;
|
||||
size_t blockSize;
|
||||
};
|
||||
|
||||
// Fixture for the NSPC -> NCSP reorder test (FP32, 4D shapes).
class ReorderNSPC2NCSPTest: public testing::WithParamInterface<ReorderCustomImplTestParamSet>,
                            public ReorderCustomImplTestBase{
protected:
    void SetUp() override {
        std::tie(srcDims, forceInplace) = this->GetParam();
        // srcOrder/dstOrder below and fillData() index four dimensions.
        ASSERT_EQ(srcDims.size(), 4u);
        // The custom NSPC2NCSP impl is used only if an input shape complies with the condition below.
        // A gtest assertion is used instead of assert() so the check is not compiled out under NDEBUG.
        ASSERT_TRUE(srcDims[1] <= 64 && srcDims[1] >= 16 && (getNumElems(srcDims) / srcDims[1]) >= 128);
        // The custom NSPC2NCSP impl is used only for FP32
        prec = InferenceEngine::Precision::FP32;
        srcOrder = std::vector<size_t> {0, 2, 3, 1};
        dstOrder = std::vector<size_t> {0, 1, 2, 3};
        dstDims = srcDims;
        blockSize = getNumElems(srcDims);
        // Create channel-strided dst layout for the inPlace case
        // Other dstDims could also be supported, but fillData() and resultIsCorrect() should be updated accordingly.
        if (forceInplace) {
            dstDims[1] *= 2;
            blockSize /= srcDims[0];
        }
    }
    void Run() {
        ReorderCustomImplTestBase::Run(srcData, dstData);
    }
    // Zeroes dstData and writes srcData in NSPC element order so that the NCSP result is 0,1,2,...
    void fillData() override {
        dstData.assign(getNumElems(dstDims), 0.f);
        const size_t numElems = getNumElems(srcDims);
        srcData.resize(numElems);
        const size_t numChannels = srcDims[1];
        const size_t spBlockSize = srcDims[2] * srcDims[3];
        const size_t batchSize = spBlockSize * numChannels;
        size_t i = 0;
        for (size_t n = 0; n < numElems; n += batchSize) {
            for (size_t sp = n; sp < n + spBlockSize; sp++) {
                // Channels of one spatial point are spBlockSize apart in the NCSP numbering.
                for (size_t c = sp; c < sp + batchSize; c += spBlockSize) {
                    srcData[i++] = static_cast<float>(c);
                }
            }
        }
    }
    std::vector<float> dstData;
    std::vector<float> srcData;
};
|
||||
|
||||
// Fixture for the NCSP -> NSPC reorder test (U8, 4D shapes).
class ReorderNCSP2NSPCTest: public testing::WithParamInterface<ReorderCustomImplTestParamSet>,
                            public ReorderCustomImplTestBase{
protected:
    void SetUp() override {
        std::tie(srcDims, forceInplace) = this->GetParam();
        // srcOrder/dstOrder below and fillData() index four dimensions.
        ASSERT_EQ(srcDims.size(), 4u);
        // Avoid uint8_t overflow or modify fillData() and resultIsCorrect().
        // A gtest assertion is used instead of assert() so the check is not compiled out under NDEBUG.
        ASSERT_LE(getNumElems(srcDims), 256u);
        // fillData() steps its loops by these values; zero would never terminate.
        ASSERT_GT(getNumElems(srcDims), 0u);
        srcOrder = std::vector<size_t> {0, 1, 2, 3};
        dstOrder = std::vector<size_t> {0, 2, 3, 1};
        // The test exercises the NCSP2NSPC path with U8 data
        prec = InferenceEngine::Precision::U8;
        dstDims = srcDims;
        blockSize = getNumElems(srcDims);
        // Create channel-strided dst layout for the inPlace case
        // Other dstDims could also be supported, but fillData() and resultIsCorrect() should be updated accordingly.
        if (forceInplace) {
            dstDims[1] *= 2;
            blockSize = srcDims[1];
        }
    }
    void Run() {
        ReorderCustomImplTestBase::Run(srcData, dstData);
    }
    // Zeroes dstData and writes srcData in NCSP element order so that the NSPC result is 0,1,2,...
    void fillData() override {
        dstData.assign(getNumElems(dstDims), 0);
        const size_t numElems = getNumElems(srcDims);
        srcData.resize(numElems);
        const size_t numChannels = srcDims[1];
        const size_t batchSize = srcDims[2] * srcDims[3] * numChannels;
        size_t i = 0;
        for (size_t n = 0; n < numElems; n += batchSize) {
            for (size_t c = n; c < n + numChannels; c++) {
                // Spatial points of one channel are numChannels apart in the NSPC numbering.
                for (size_t sp = c; sp < c + batchSize; sp += numChannels) {
                    srcData[i++] = static_cast<uint8_t>(sp);
                }
            }
        }
    }
    std::vector<uint8_t> dstData;
    std::vector<uint8_t> srcData;
};
|
||||
|
||||
TEST_P(ReorderNSPC2NCSPTest, NSPC2NCSP) {
|
||||
Run();
|
||||
}
|
||||
|
||||
TEST_P(ReorderNCSP2NSPCTest, NCSP2NSPC) {
|
||||
Run();
|
||||
}
|
||||
|
||||
const std::vector<bool> forceInplace {false, true};
|
||||
const auto NSPC2NCSPparams =::testing::Combine(
|
||||
::testing::Values(std::vector<size_t> {2, 16, 8, 8}),
|
||||
::testing::ValuesIn(forceInplace));
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(smoke_ReorderTestCustomNSPC, ReorderNSPC2NCSPTest, NSPC2NCSPparams,
|
||||
ReorderCustomImplTestBase::getTestCaseName);
|
||||
|
||||
const auto NCSP2NSPCparams =::testing::Combine(
|
||||
::testing::Values(std::vector<size_t> {2, 8, 4, 4}),
|
||||
::testing::ValuesIn(forceInplace));
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(smoke_ReorderTestCustomNCSP, ReorderNCSP2NSPCTest, NCSP2NSPCparams,
|
||||
ReorderCustomImplTestBase::getTestCaseName);
|
Loading…
Reference in New Issue
Block a user