[CPU] Bug/custom_inplace_reorders/fix unit (#7222)

This commit is contained in:
Ivan Novoselov 2021-10-18 18:31:23 +03:00 committed by GitHub
parent ac9c2f19dc
commit cf52ba5c08
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 318 additions and 17 deletions

View File

@ -71,7 +71,6 @@ void MKLDNNReorderNode::initSupportedPrimitiveDescriptors() {
if (isDynamic && (config.inConfs[0].desc->getShape().getRank() != config.outConfs[0].desc->getShape().getRank()))
IE_THROW() << "Reorder node doesn't support case when input and output shapes have different rank and dynamic";
if (!isOptimized) {
const auto &inShape = getInputShapeAtPort(0);
if (MKLDNNPlugin::one_of(inShape.getRank(), 4, 5) &&
@ -88,7 +87,7 @@ void MKLDNNReorderNode::initSupportedPrimitiveDescriptors() {
config.inConfs[0].desc->getPrecision() == config.outConfs[0].desc->getPrecision() &&
config.inConfs[0].desc->getPrecision().size() == 1) {
// oneDNN doesn't provide JIT reorder impl for non-avx2 targets so we fallback on simple c++ implementation which shows better perf
canUseNcsp2Nspc = true;
isNcsp2NspcCase = true;
}
}
}
@ -116,14 +115,46 @@ void MKLDNNReorderNode::prepareParams() {
if (getSelectedPrimitiveDescriptor() == nullptr)
IE_THROW() << "Preferable primitive descriptor is not set.";
if (isNspc2NcspCase) {
auto isSupportedDesc = [](const MemoryDesc& desc) {
if (!desc.isDefined()) {
return false;
}
if (!(desc.getType() & MemoryDescType::Blocked)) {
return false;
}
if ((desc.getType() & MemoryDescType::Mkldnn) && !desc.as<const DnnlMemoryDesc>()->hasEmptyExtraData()) {
return false;
}
return true;
};
const auto& parentDesc = srcMemPtr->getDesc();
const auto& childDesc = dstMemPtr->getDesc();
if ((isNspc2NcspCase || isNcsp2NspcCase) && isSupportedDesc(childDesc) && isSupportedDesc(parentDesc)) {
const auto &inDims = srcMemPtr->getStaticDims();
// Check that child strides are consistent with parent dims if the child is inplace.
// The strides must be dense except for the channel one (since the child num channels might differ)
const auto childSubBlocksAreDense = [&]() {
const auto& dstStrides = childDesc.as<BlockedMemoryDesc>()->getStrides();
const auto& dstOrder = childDesc.as<BlockedMemoryDesc>()->getOrder();
const size_t channelDim = 1;
if (dstStrides.back() != 1)
return false;
for (int i = inDims.size() - 1; i > 0; i--) {
if (dstStrides[i-1] != dstStrides[i] * inDims[dstOrder[i]] && dstOrder[i] != channelDim)
return false;
}
return true;
};
if (isNspc2NcspCase) {
canUseNspc2Ncsp = inDims[1] <= 64 && inDims[1] >= 16 &&
(srcMemPtr->GetDescWithType<BlockedMemoryDesc>()->getPaddedElementsCount() / inDims[1]) >= 128;
(parentDesc.as<BlockedMemoryDesc>()->getPaddedElementsCount() / inDims[1]) >= 128 &&
childSubBlocksAreDense();
} else if (isNcsp2NspcCase) {
canUseNcsp2Nspc = childSubBlocksAreDense();
}
}
if (!canUseNcsp2Nspc && !canUseNspc2Ncsp) {
auto &dstMemPtr = getChildEdgeAt(0)->getMemoryPtr();
auto &srcMemPtr = getParentEdgeAt(0)->getMemoryPtr();
if (!dstMemPtr || !dstMemPtr->GetPrimitivePtr())
IE_THROW() << "Destination memory didn't allocate.";
if (!srcMemPtr || !srcMemPtr->GetPrimitivePtr())
@ -207,6 +238,7 @@ void MKLDNNReorderNode::optimizedNcsp2Nspc() {
auto childEdge = getChildEdgeAt(0);
auto inDims = parentEdge->getMemory().GetShape().getStaticDims();
const auto dstStrides = childEdge->getMemoryPtr()->GetDescWithType<BlockedMemoryDesc>()->getStrides();
const size_t ndims = inDims.size();
const size_t DIM0 = inDims[0];
const size_t DIM1 = inDims[1];
@ -217,18 +249,20 @@ void MKLDNNReorderNode::optimizedNcsp2Nspc() {
auto src_data = reinterpret_cast<const uint8_t *>(parentEdge->getMemoryPtr()->GetPtr());
auto dst_data = reinterpret_cast<uint8_t *>(childEdge->getMemoryPtr()->GetPtr());
const size_t stride0 = DIM1 * DIM2 * DIM3 * DIM4;
const size_t src_batch_stride = DIM1 * DIM2 * DIM3 * DIM4;
const size_t dst_batch_stride = dstStrides[0];
const size_t dst_channel_stride = dstStrides[ndims-2];
const size_t stride1 = DIM2 * DIM3 * DIM4;
const size_t stride2 = DIM2 * DIM3;
parallel_for3d(DIM0, DIM1, stride2, [&](size_t dim0, size_t dim1, size_t j) {
size_t src_off = dim0 * stride0 + j * DIM4 + dim1 * stride1;
size_t dst_off = dim0 * stride0 + j * DIM4 * DIM1 + dim1;
size_t src_off = dim0 * src_batch_stride + j * DIM4 + dim1 * stride1;
size_t dst_off = dim0 * dst_batch_stride + j * DIM4 * dst_channel_stride + dim1;
for (size_t dim4 = 0; dim4 < DIM4; ++dim4) {
dst_data[dst_off] = src_data[src_off];
src_off++;
dst_off += DIM1;
dst_off += dst_channel_stride;
}
});
}
@ -248,15 +282,17 @@ void MKLDNNReorderNode::optimizedNspc2Ncsp() {
auto src_data = reinterpret_cast<const float *>(parentEdge->getMemoryPtr()->GetPtr());
auto dst_data = reinterpret_cast<float *>(childEdge->getMemoryPtr()->GetPtr());
const size_t stride1 = DIM2 * DIM3 * DIM4;
const size_t stride0 = stride1 * DIM1;
parallel_for2d(DIM0, stride1, [&](size_t b, size_t j) {
auto src_off = b*stride0 + j*DIM1;
auto dst_off = b*stride0 + j;
const auto dstStrides = childEdge->getMemoryPtr()->GetDescWithType<BlockedMemoryDesc>()->getStrides();
const size_t block_size = DIM2 * DIM3 * DIM4;
const size_t src_batch_stride = block_size * DIM1;
const size_t dst_batch_stride = dstStrides[0];
parallel_for2d(DIM0, block_size, [&](size_t b, size_t j) {
auto src_off = b * src_batch_stride + j * DIM1;
auto dst_off = b * dst_batch_stride + j;
for (size_t dim1 = 0; dim1 < DIM1; ++dim1) {
dst_data[dst_off] = src_data[src_off];
src_off++;
dst_off += stride1;
dst_off += block_size;
}
});
}

View File

@ -73,6 +73,7 @@ private:
bool isOptimized = false;
bool isNspc2NcspCase = false;
bool isNcsp2NspcCase = false;
bool canUseNspc2Ncsp = false;
bool canUseNcsp2Nspc = false;

View File

@ -9,6 +9,8 @@ addIeTargetTest(
ROOT ${CMAKE_CURRENT_SOURCE_DIR}
INCLUDES
${IE_MAIN_SOURCE_DIR}/src/mkldnn_plugin
${IE_MAIN_SOURCE_DIR}/src/mkldnn_plugin/nodes
$<TARGET_PROPERTY:openvino::conditional_compilation,INTERFACE_INCLUDE_DIRECTORIES>
OBJECT_FILES
$<TARGET_OBJECTS:MKLDNNPlugin_obj>
LINK_LIBRARIES

View File

@ -0,0 +1,262 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <gtest/gtest.h>
#include <ie_common.h>
#include "mkldnn_reorder_node.h"
#include "mkldnn_input_node.h"
#include "mkldnn_edge.h"
#include "mkldnn_node.h"
/*
* Test MKLDNNReorderNode::optimizedNcsp2Nspc() and MKLDNNReorderNode::optimizedNspc2Ncsp() for
* inPlace and non-inPlace cases. Specifically, the test checks that dst batch strides are
* correctly taken into account by the custom impls (the case when the reorder is followed by an inplace concat).
*/
// Test parameters for the custom-reorder checks:
//   srcDims      — logical input shape (N, C, H, W)
//   forceInplace — emulate an inplace consumer on the reorder output
using ReorderCustomImplTestParamSet = std::tuple<
        std::vector<size_t>,    // srcDims
        bool>;                  // forceInplace
// Common harness for exercising MKLDNNReorderNode's custom NSPC<->NCSP reorder
// implementations in isolation: builds a minimal Input -> Reorder -> Output node
// chain over caller-provided raw buffers and runs the reorder primitive once.
class ReorderCustomImplTestBase: public ::testing::Test {
public:
// Builds a readable test name from the parameter tuple, e.g. "IS:(2.16.8.8)_InPlace:1".
static std::string getTestCaseName(const testing::TestParamInfo<ReorderCustomImplTestParamSet> &obj) {
std::vector<size_t> srcDims;
bool inPlace;
std::tie(srcDims, inPlace) = obj.param;
std::ostringstream result;
result << "IS:(";
for (const auto s : srcDims)
result << s << ".";
// Overwrite the trailing '.' separator with the closing parenthesis.
result.seekp(-1, result.cur);
result << ")";
result << "_InPlace:" << inPlace;
return result.str();
}
protected:
// Wires up Input -> Reorder -> Output nodes over the given raw buffers and executes
// the reorder. srcData/dstData must match srcDims/dstDims and prec set by SetUp().
void executeReorderNode(const void* srcData, void* dstData) {
// Permute dims according to order (layout permutation helper).
auto getBlockedDims = [](const std::vector<size_t>& dims, const std::vector<size_t>& order){
std::vector<size_t> result;
result.reserve(order.size());
for (auto i : order)
result.push_back(dims[i]);
return result;
};
// Dense row-major strides for the given (already permuted) dims.
auto getStrides = [](const std::vector<size_t>& dims){
std::vector<size_t> result(dims.size());
result[dims.size() - 1] = 1;
for (int i = dims.size() - 2; i >= 0; --i) {
result[i] = result[i+1] * dims[i+1];
}
return result;
};
const mkldnn::engine cpuEngine(dnnl::engine::kind::cpu, 0);
MKLDNNPlugin::MKLDNNWeightsSharing::Ptr weightsCache;
auto inputNode = std::make_shared<MKLDNNPlugin::MKLDNNInputNode>(MKLDNNPlugin::Shape(srcDims),
prec,
"Reorder_Input", "Input",
cpuEngine, weightsCache);
auto reorderNode = std::make_shared<MKLDNNPlugin::MKLDNNReorderNode>("Reorder", cpuEngine, weightsCache);
auto outputNode = std::make_shared<MKLDNNPlugin::MKLDNNInputNode>(MKLDNNPlugin::Shape(dstDims),
prec,
"Reorder_Output", "Output",
cpuEngine, weightsCache);
auto parentEdge = std::make_shared<MKLDNNPlugin::MKLDNNEdge>(inputNode, reorderNode, 0, 0);
auto childEdge = std::make_shared<MKLDNNPlugin::MKLDNNEdge>(reorderNode, outputNode, 0, 0);
parentEdge->changeStatus(MKLDNNPlugin::MKLDNNEdge::Status::NeedAllocation);
childEdge->changeStatus(MKLDNNPlugin::MKLDNNEdge::Status::NeedAllocation);
reorderNode->addEdge(parentEdge);
reorderNode->addEdge(childEdge);
const std::vector<size_t> srcBlockedDims = getBlockedDims(srcDims, srcOrder);
const std::vector<size_t> srcStrides = getStrides(srcBlockedDims);
const std::vector<size_t> offsetPaddingToData(srcDims.size(), 0);
const std::vector<size_t> dstBlockedDims = getBlockedDims(dstDims, dstOrder);
// dst strides are computed from dstDims (channel-enlarged in the inPlace case),
// so the destination layout can be channel-strided relative to the logical shape.
const std::vector<size_t> dstStrides = getStrides(dstBlockedDims);
const MKLDNNPlugin::CpuBlockedMemoryDesc inputDesc(prec, MKLDNNPlugin::Shape(srcDims),
srcBlockedDims, srcOrder,
0, offsetPaddingToData, srcStrides);
// The output desc keeps the logical srcDims shape but carries dstOrder and the
// (possibly non-dense) dstStrides computed above.
const MKLDNNPlugin::CpuBlockedMemoryDesc outputDesc(prec, MKLDNNPlugin::Shape(srcDims),
getBlockedDims(srcDims, dstOrder), dstOrder,
0, offsetPaddingToData, dstStrides);
auto parentMemory = std::make_shared<MKLDNNPlugin::MKLDNNMemory>(cpuEngine);
auto childMemory = std::make_shared<MKLDNNPlugin::MKLDNNMemory>(cpuEngine);
parentMemory->Create(inputDesc, srcData);
childMemory->Create(outputDesc, dstData);
parentEdge->reuse(parentMemory);
childEdge->reuse(childMemory);
reorderNode->setDescs(inputDesc, outputDesc);
std::vector<std::shared_ptr<MKLDNNPlugin::MKLDNNNode>> nodes {inputNode, reorderNode, outputNode};
for (auto &n : nodes) {
n->init();
n->getSupportedDescriptors();
n->initSupportedPrimitiveDescriptors();
n->selectPrimitiveDescriptorByIndex(0);
}
// Optionally mark the consumer's input as inPlace so the reorder must honor the
// strided destination (emulates the reorder being followed by an inplace concat).
auto config = outputNode->getSelectedPrimitiveDescriptor()->getConfig();
config.inConfs.resize(1);
config.inConfs[0].inPlace = forceInplace ? 0 : -1;
outputNode->getSelectedPrimitiveDescriptor()->setConfig(config);
reorderNode->createPrimitive();
mkldnn::stream strm(cpuEngine);
reorderNode->execute(strm);
return;
}
// Fills src/dst via the subclass hook, runs the reorder, and checks the result.
template<typename T>
void Run(const std::vector<T>& srcData, std::vector<T>& dstData) {
fillData();
executeReorderNode(srcData.data(), dstData.data());
EXPECT_TRUE(resultIsCorrect(dstData));
}
// Fill srcData so that the results of NSPC2NCSP and NCSP2NSPC reorders are incremental numbers 0,1,2,...
// Fill dstData with zeros
virtual void fillData() = 0;
// Checks dstData block by block: even-indexed blocks must hold the expected
// incremental sequence; odd-indexed blocks (the channel-stride gaps created when
// forceInplace doubled dstDims[1]) must remain zero, i.e. untouched by the reorder.
template<typename T>
bool resultIsCorrect(const std::vector<T>& dstData) {
const size_t numElems = getNumElems(dstDims);
auto b = dstData.begin();
std::vector<T> expectedData(blockSize);
for (int i = 0; i < numElems / blockSize; i++, b += blockSize) {
if (i % 2 == 0) {
std::iota(expectedData.begin(), expectedData.end(), i / 2 * blockSize);
if (!std::equal(b, b + blockSize, expectedData.begin()))
return false;
} else if (!std::all_of(b, b + blockSize, [](T x){return x == 0;})) {
return false;
}
}
return true;
}
// Product of all dims (total element count).
size_t getNumElems(const std::vector<size_t>& dims) {
size_t result = 1;
for (auto d : dims)
result *= d;
return result;
}
std::vector<size_t> srcDims;        // logical input shape
std::vector<size_t> srcOrder;       // input layout order
std::vector<size_t> dstDims;        // output shape (channels doubled when forceInplace)
std::vector<size_t> dstOrder;       // output layout order
InferenceEngine::Precision prec;    // element precision (FP32 or U8, set by subclass)
bool forceInplace;                  // emulate an inplace consumer on the output
size_t blockSize;                   // length of one contiguously-checked block in dst
};
// Exercises the custom NSPC (channels-last) -> NCSP (planar) reorder implementation.
class ReorderNSPC2NCSPTest: public testing::WithParamInterface<ReorderCustomImplTestParamSet>,
public ReorderCustomImplTestBase{
protected:
void SetUp() override {
std::tie(srcDims, forceInplace) = this->GetParam();
// The custom NSPC2NCSP impl is used only if an input shape complies with:
assert(srcDims[1] <= 64 && srcDims[1] >= 16 && (getNumElems(srcDims) / srcDims[1]) >= 128);
// The custom NSPC2NCSP impl is used only for FP32
prec = InferenceEngine::Precision::FP32;
srcOrder = std::vector<size_t> {0, 2, 3, 1};
dstOrder = std::vector<size_t> {0, 1, 2, 3};
dstDims = srcDims;
blockSize = getNumElems(srcDims);
// Create channel-strided dst layout for the inPlace case
// Other dstDims could also be supported, but fillData() and resultIsCorrect() should be updated accordingly.
if (forceInplace) {
// Doubling the channel count leaves a zero gap after each batch;
// one checked block then covers a single batch instead of the whole tensor.
dstDims[1] *= 2;
blockSize /= srcDims[0];
}
}
void Run() {
ReorderCustomImplTestBase::Run(srcData, dstData);
}
// Fill src (stored in NSPC order) so that enumerating it in NCSP (planar) order
// yields 0,1,2,...; zero the destination buffer.
void fillData() override {
dstData.resize(getNumElems(dstDims));
std::fill(dstData.begin(), dstData.end(), 0);
srcData.resize(getNumElems(srcDims));
const int numChannels = srcDims[1];
const int spBlockSize = srcDims[2] * srcDims[3];
const int batchSize = spBlockSize * numChannels;
int i = 0;
// Walk batches, then spatial positions, then channels (NSPC memory order),
// assigning values that enumerate the planar (NCSP) order.
for (int n = 0; n < getNumElems(srcDims); n += batchSize) {
for (int sp = n; sp < n + spBlockSize; sp++) {
for (int c = sp; c < sp + batchSize; c += spBlockSize) {
srcData[i++] = static_cast<float>(c);
}
}
}
}
std::vector<float> dstData;
std::vector<float> srcData;
};
// Exercises the custom NCSP (planar) -> NSPC (channels-last) reorder implementation.
class ReorderNCSP2NSPCTest: public testing::WithParamInterface<ReorderCustomImplTestParamSet>,
public ReorderCustomImplTestBase{
protected:
void SetUp() override {
std::tie(srcDims, forceInplace) = this->GetParam();
// Avoid uint8_t overflow or modify fillData() and resultIsCorrect()
assert(getNumElems(srcDims) <= 256);
srcOrder = std::vector<size_t> {0, 1, 2, 3};
dstOrder = std::vector<size_t> {0, 2, 3, 1};
// The custom NCSP2NSPC impl is used only for U8
prec = InferenceEngine::Precision::U8;
dstDims = srcDims;
blockSize = getNumElems(srcDims);
// Create channel-strided dst layout for the inPlace case
// Other dstDims could also be supported, but fillData() and resultIsCorrect() should be updated accordingly.
if (forceInplace) {
// Doubling the channel count leaves a zero gap after each pixel's channels;
// one checked block is then a single channel vector.
dstDims[1] *= 2;
blockSize = srcDims[1];
}
}
void Run() {
ReorderCustomImplTestBase::Run(srcData, dstData);
}
// Fill src (stored in NCSP order) so that enumerating it in NSPC (channels-last)
// order yields 0,1,2,...; zero the destination buffer.
void fillData() override {
dstData.resize(getNumElems(dstDims));
std::fill(dstData.begin(), dstData.end(), 0);
srcData.resize(getNumElems(srcDims));
const int numChannels = srcDims[1];
const int batchSize = srcDims[2] * srcDims[3] * numChannels;
int i = 0;
// Walk batches, then channels, then spatial positions (NCSP memory order),
// assigning values that enumerate the channels-last (NSPC) order.
for (int n = 0; n < getNumElems(srcDims); n += batchSize) {
for (int c = n; c < n + numChannels; c ++) {
for (int sp = c; sp < c + batchSize; sp += numChannels) {
srcData[i++] = static_cast<uint8_t>(sp);
}
}
}
}
std::vector<uint8_t> dstData;
std::vector<uint8_t> srcData;
};
// Parameterized entry points: each runs one reorder direction over the
// (srcDims, forceInplace) combinations instantiated below.
TEST_P(ReorderNSPC2NCSPTest, NSPC2NCSP) {
Run();
}
TEST_P(ReorderNCSP2NSPCTest, NCSP2NSPC) {
Run();
}
// Each custom impl is exercised both with and without an inplace (channel-strided) consumer.
const std::vector<bool> forceInplace {false, true};
const auto NSPC2NCSPparams =::testing::Combine(
::testing::Values(std::vector<size_t> {2, 16, 8, 8}),    // satisfies the custom-impl shape constraints asserted in SetUp()
::testing::ValuesIn(forceInplace));
INSTANTIATE_TEST_SUITE_P(smoke_ReorderTestCustomNSPC, ReorderNSPC2NCSPTest, NSPC2NCSPparams,
ReorderCustomImplTestBase::getTestCaseName);
const auto NCSP2NSPCparams =::testing::Combine(
::testing::Values(std::vector<size_t> {2, 8, 4, 4}),    // small enough to keep uint8_t values from overflowing
::testing::ValuesIn(forceInplace));
INSTANTIATE_TEST_SUITE_P(smoke_ReorderTestCustomNCSP, ReorderNCSP2NSPCTest, NCSP2NSPCparams,
ReorderCustomImplTestBase::getTestCaseName);