[CPU] Split supported PD initialization refactoring plus tests for multiple port connections. (#4521)

Maksim Kutakov 2021-04-26 15:18:26 +03:00 committed by GitHub
parent ed5313b2e1
commit 9f2a3d0edc
2 changed files with 92 additions and 100 deletions

View File

@@ -4,6 +4,7 @@
#include "mkldnn_split_node.h"
#include "common/cpu_memcpy.h"
#include "common/tensor_desc_creator.h"
#include <legacy/ie_layers.h>
#include <vector>
#include <mkldnn_types.h>
@@ -16,47 +17,6 @@ using namespace mkldnn;
using namespace MKLDNNPlugin;
using namespace InferenceEngine;
static TensorDesc makePlainTensorDesc(const Precision& precision, const SizeVector& srcDims) {
SizeVector order(srcDims.size());
std::iota(order.begin(), order.end(), 0);
return TensorDesc(precision, srcDims, {srcDims, order});
}
static TensorDesc makePerChannelTensorDesc(const Precision& precision, const SizeVector& srcDims) {
constexpr size_t channelsPos = 1lu;
SizeVector order(srcDims.size());
std::iota(order.begin(), order.end(), 0);
SizeVector blkDims = srcDims;
if (srcDims.size() > 2) {
auto moveElementBack = [](SizeVector& vector, size_t indx) {
auto itr = vector.begin() + indx;
std::rotate(itr, itr + 1, vector.end());
};
moveElementBack(order, channelsPos);
moveElementBack(blkDims, channelsPos);
}
return TensorDesc(precision, srcDims, {blkDims, order});
}
static TensorDesc makeChannelBlockedTensorDesc(const Precision& precision, const SizeVector& srcDims, size_t blockSize) {
if (srcDims.size() < 2) {
IE_THROW() << "Can't create blocked tensor descriptor!";
}
constexpr size_t channelsPos = 1lu;
SizeVector order(srcDims.size());
std::iota(order.begin(), order.end(), 0);
order.push_back(channelsPos);
SizeVector blkDims = srcDims;
blkDims[1] = blkDims[1] / blockSize + (blkDims[1] % blockSize ? 1 : 0);
blkDims.push_back(blockSize);
return TensorDesc(precision, srcDims, {blkDims, order});
}
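
For context, the removed makeChannelBlockedTensorDesc helper only computed blocked dims and a permutation order by hand. Below is a minimal standalone sketch of that arithmetic, using an assumed NCHW shape of {3, 24, 24, 9} and block size 8 (nCsp8c); it is not part of the commit.

#include <iostream>
#include <numeric>
#include <vector>

// Worked example of the blocked-layout arithmetic performed by the removed
// makeChannelBlockedTensorDesc helper (assumed shape {3, 24, 24, 9}, block 8).
int main() {
    const std::vector<size_t> srcDims{3, 24, 24, 9};
    const size_t blockSize = 8;
    const size_t channelsPos = 1;

    std::vector<size_t> order(srcDims.size());
    std::iota(order.begin(), order.end(), static_cast<size_t>(0)); // {0, 1, 2, 3}
    order.push_back(channelsPos);                                  // {0, 1, 2, 3, 1}

    std::vector<size_t> blkDims = srcDims;                         // {3, 24, 24, 9}
    blkDims[channelsPos] = blkDims[channelsPos] / blockSize +
                           (blkDims[channelsPos] % blockSize ? 1 : 0); // 24 -> 3
    blkDims.push_back(blockSize);                                  // {3, 3, 24, 9, 8}

    for (auto d : blkDims) std::cout << d << ' ';
    std::cout << '\n';
    return 0;
}
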
MKLDNNSplitNode::MKLDNNSplitNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache) :
MKLDNNNode(layer, eng, cache) {}
@@ -77,7 +37,6 @@ void MKLDNNSplitNode::getSupportedDescriptors() {
}
void MKLDNNSplitNode::initSupportedPrimitiveDescriptors() {
using TensorDescFactory = std::function<TensorDesc(const Precision&, const SizeVector&)>;
constexpr size_t channelsPos = 1lu;
if (!supportedPrimitiveDescriptors.empty())
@@ -113,24 +72,49 @@ void MKLDNNSplitNode::initSupportedPrimitiveDescriptors() {
if (dstFirstDims.size() != srcDims.size())
THROW_ERROR << "sizes of input blob and sum of output blobs are not equal.";
InferenceEngine::Precision inpPrecision = inpData->getPrecision();
auto outPrecision = inpPrecision; // the split layer doesn't convert precisions
// make primitive descriptor factory function for different configurations
bool dynBatchSupport = true;
if (axis < 1) {
dynBatchSupport = false;
}
auto makePdInfo = [dynBatchSupport](TensorDescFactory getTensorDesc, const Precision& precision, const MKLDNNDims& srcDims,
const std::vector<MKLDNNDims>& outDims, impl_desc_type type) -> PrimitiveDescInfo {
//Set plain and tailC formats
std::vector<TensorDescCreatorTypes> tdCreatorTypes{ TensorDescCreatorTypes::ncsp, TensorDescCreatorTypes::nspc };
//Support channel blocked format
if (srcDims.ndims() > 2) {
for (auto item : { std::make_pair(8lu, TensorDescCreatorTypes::nCsp8c), std::make_pair(16lu, TensorDescCreatorTypes::nCsp16c) }) {
SizeVector blkDims = srcDims.ToSizeVector();
if (blkDims[channelsPos] % item.first)
continue;
bool blocked = true;
for (size_t i = 0; i < outDims.size(); i++) {
if (outDims[i].ToSizeVector()[channelsPos] % item.first) {
blocked = false;
break;
}
}
if (blocked) {
tdCreatorTypes.push_back(item.second);
}
}
}
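
A hedged illustration of the eligibility check above (the shapes are assumed and the helper name is hypothetical): a channel-blocked layout is offered only when the channel dimension of the input and of every output is divisible by the block size.

#include <iostream>
#include <vector>

// Hypothetical helper mirroring the check above: a blocked layout with the given
// block size is usable only if the channel dim (index 1) of the input shape and
// of every output shape is a multiple of the block size.
static bool blockedLayoutApplicable(const std::vector<size_t>& srcDims,
                                    const std::vector<std::vector<size_t>>& outDims,
                                    size_t blockSize) {
    constexpr size_t channelsPos = 1;
    if (srcDims.size() <= channelsPos || srcDims[channelsPos] % blockSize)
        return false;
    for (const auto& o : outDims)
        if (o[channelsPos] % blockSize)
            return false;
    return true;
}

int main() {
    // Splitting an assumed {3, 24, 24, 9} input into channel chunks of 8 and 16:
    std::vector<std::vector<size_t>> outs{{3, 8, 24, 9}, {3, 16, 24, 9}};
    std::cout << blockedLayoutApplicable({3, 24, 24, 9}, outs, 8)  << '\n'; // 1: all divisible by 8
    std::cout << blockedLayoutApplicable({3, 24, 24, 9}, outs, 16) << '\n'; // 0: 24 % 16 != 0
    return 0;
}
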
std::vector<size_t> pdIndexesToReuse;
auto& creatorsMap = TensorDescCreator::getCommonCreators();
auto itrRange = TensorDescCreator::makeFilteredRange(creatorsMap, static_cast<unsigned>(srcDims.ndims()), tdCreatorTypes);
for (auto itr = itrRange.first; itr != itrRange.second; ++itr) {
InferenceEngine::LayerConfig config;
config.dynBatchSupport = dynBatchSupport;
config.inConfs.resize(1);
config.inConfs[0].inPlace = -1;
config.inConfs[0].constant = false;
config.inConfs[0].desc = getTensorDesc(precision, srcDims.ToSizeVector());
config.inConfs[0].desc = itr->second->createDesc(inpPrecision, srcDims.ToSizeVector());
config.outConfs.resize(outDims.size());
std::vector<memory::format_tag> outFormats;
@@ -140,49 +124,22 @@ void MKLDNNSplitNode::initSupportedPrimitiveDescriptors() {
config.outConfs[i].inPlace = -1;
config.outConfs[i].constant = false;
config.outConfs[i].desc = getTensorDesc(precision, o_Dims.ToSizeVector());
config.outConfs[i].desc = itr->second->createDesc(inpPrecision, o_Dims.ToSizeVector());
outFormats.push_back(MKLDNNMemoryDesc(config.outConfs[i].desc).getFormat());
}
return {config, type, outFormats};
};
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, outFormats);
//Set plain format
supportedPrimitiveDescriptors.push_back(makePdInfo(&makePlainTensorDesc, inpPrecision, srcDims, outDims, impl_desc_type::ref));
//Set per channel format.
supportedPrimitiveDescriptors.push_back(makePdInfo(&makePerChannelTensorDesc, inpPrecision, srcDims, outDims, impl_desc_type::ref));
//Support channel blocked format
std::vector<size_t> blockedPdIndexes;
if (srcDims.ndims() > channelsPos) {
for (size_t sizeS : {8lu, 16lu}) {
SizeVector blkDims = srcDims.ToSizeVector();
if (blkDims[channelsPos] % sizeS)
continue;
bool blocked = true;
for (size_t i = 0; i < outDims.size(); i++) {
if (outDims[i].ToSizeVector()[channelsPos] % sizeS) {
blocked = false;
break;
}
}
if (blocked) {
using std::placeholders::_1;
using std::placeholders::_2;
supportedPrimitiveDescriptors.push_back(makePdInfo(std::bind(&makeChannelBlockedTensorDesc, _1, _2, sizeS),
inpPrecision, srcDims, outDims, impl_desc_type::ref));
blockedPdIndexes.push_back(supportedPrimitiveDescriptors.size() - 1);
if (itr->first == TensorDescCreatorTypes::ncsp) {
// at least the plain layout can be optimized inplace.
pdIndexesToReuse.emplace_back(supportedPrimitiveDescriptors.size() - 1);
} else if (itr->first == TensorDescCreatorTypes::nCsp8c || itr->first == TensorDescCreatorTypes::nCsp16c) {
if (axis < 2) {
pdIndexesToReuse.emplace_back(supportedPrimitiveDescriptors.size() - 1);
}
}
}
// Optimized inplace case
std::vector<size_t> pdIndexesToReuse(1, 0); // at least the first plain layout can be optimized inplace.
if (axis < 2) {
pdIndexesToReuse.insert(pdIndexesToReuse.end(), blockedPdIndexes.begin(), blockedPdIndexes.end());
}
for (auto refPdIndex : pdIndexesToReuse) {
const auto& refConfig = supportedPrimitiveDescriptors[refPdIndex].getConfig();
auto config = refConfig;
@@ -220,12 +177,26 @@ void MKLDNNSplitNode::initSupportedPrimitiveDescriptors() {
// Special nspc -> ncsp case when splitting channels
if (axis == 1 && (dstFirstDims.ndims() == 4 || dstFirstDims.ndims() == 5)) {
auto plain = makePdInfo(&makePlainTensorDesc, inpPrecision, srcDims, outDims, impl_desc_type::ref);
auto perChannel = makePdInfo(&makePerChannelTensorDesc, inpPrecision, srcDims, outDims, impl_desc_type::ref);
InferenceEngine::LayerConfig config;
plain.getConfig().inConfs[0].desc = perChannel.getConfig().inConfs[0].desc;
config.dynBatchSupport = dynBatchSupport;
config.inConfs.resize(1);
config.inConfs[0].inPlace = -1;
config.inConfs[0].constant = false;
config.inConfs[0].desc = creatorsMap.at(TensorDescCreatorTypes::nspc)->createDesc(inpPrecision, srcDims.ToSizeVector());
config.outConfs.resize(outDims.size());
supportedPrimitiveDescriptors.push_back(plain);
std::vector<memory::format_tag> outFormats;
for (size_t i = 0; i < outDims.size(); i++) {
auto o_Dims = outDims[i];
config.outConfs[i].inPlace = -1;
config.outConfs[i].constant = false;
config.outConfs[i].desc = creatorsMap.at(TensorDescCreatorTypes::ncsp)->createDesc(inpPrecision, o_Dims.ToSizeVector());
outFormats.push_back(MKLDNNMemoryDesc(config.outConfs[i].desc).getFormat());
}
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, outFormats);
}
}
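
As a rough sketch of the special configuration built above (shapes are assumed and the snippet is not part of the commit): the input descriptor is channels-last (nspc) while the outputs are plain (ncsp), which is the layout pair later handled by optimizedNspc2Ncsp().

#include <ie_layouts.h>
#include <iostream>

using namespace InferenceEngine;

int main() {
    // Assumed shapes: a {3, 28, 24, 9} input split along axis 1 into two halves.
    TensorDesc inDesc(Precision::FP32, {3, 28, 24, 9}, Layout::NHWC);  // nspc input
    TensorDesc outDesc(Precision::FP32, {3, 14, 24, 9}, Layout::NCHW); // ncsp output

    // The blocking orders make the difference visible: {0, 2, 3, 1} vs {0, 1, 2, 3}.
    for (auto d : inDesc.getBlockingDesc().getOrder()) std::cout << d << ' ';
    std::cout << '\n';
    for (auto d : outDesc.getBlockingDesc().getOrder()) std::cout << d << ' ';
    std::cout << '\n';
    return 0;
}
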
@@ -362,9 +333,19 @@ void MKLDNNSplitNode::initOptimalPrimitiveDescriptor() {
}
void MKLDNNSplitNode::selectOptimalPrimitiveDescriptor() {
if (implPriorities.size() > 0 && implPriorities[0] == impl_desc_type::ref) {
selectPrimitiveDescriptorByIndex(0);
return;
// Enforce the reference implementation for the planar layout if it is specified in the impl priorities list.
// This is needed mostly for testing purposes: since Split always works in place for the planar layout, the reference
// implementation has to be enforced explicitly when a test selects it, so that this code path gets exercised.
if (!implPriorities.empty() && implPriorities[0] == impl_desc_type::ref) {
auto plain = PartialBlkDesc::makePlain(getParentEdgeAt(0)->getDims().ToSizeVector());
for (size_t i = 0; i < supportedPrimitiveDescriptors.size(); ++i) {
auto& pd = supportedPrimitiveDescriptors[i];
if (PartialBlkDesc::extractFrom(pd.getConfig().inConfs[0].desc) == plain &&
impl_desc_type::ref == pd.getImplementationType()) {
selectPrimitiveDescriptorByIndex(static_cast<int>(i));
return;
}
}
}
//check the descriptors and select the ones that have the same data format as the input
@@ -504,6 +485,7 @@ void MKLDNNSplitNode::prepareOptimizedParams() {
optimizedParams.srcDataOffsets[i] = optimizedParams.srcDataOffsets[i - 1] + optimizedParams.dataSize[i - 1];
}
}
void MKLDNNSplitNode::optimizedNspc2Ncsp(size_t MB) {
auto parentEdge = getParentEdgeAt(0);
const int ndims = parentEdge->getDims().ndims();

View File

@@ -69,8 +69,14 @@ protected:
auto split = std::dynamic_pointer_cast<ngraph::opset5::Split>(ngraph::builder::makeSplit(paramOuts[0],
ngPrc, numSplits, axis));
ngraph::ResultVector results;
for (int i = 0; i < outIndices.size(); i++) {
results.push_back(std::make_shared<ngraph::opset5::Result>(split->output(outIndices[i])));
// This WA is necessary because Result nodes connected to the same output of the Split node (or any other node) are
// deduplicated at the CNNNetwork level. It may become unnecessary once the CPU plugin moves completely to nGraph.
// This is still a single-layer test, since the Ceiling nodes are added only as part of the WA.
auto fakeMultiplication = std::make_shared<ngraph::opset5::Ceiling>(split->output(outIndices[i]));
results.push_back(std::make_shared<ngraph::opset5::Result>(fakeMultiplication));
}
split->get_rt_info() = getCPUInfo();
function = std::make_shared<ngraph::Function>(results, params, "split");
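
A condensed, hedged sketch of the graph pattern the test constructs (shape, precision, and the helper name are assumed): the intermediate Ceiling nodes keep two Result nodes fed by the same Split output from being merged.

#include <memory>
#include <vector>

#include <ngraph/ngraph.hpp>
#include <ngraph/opsets/opset5.hpp>

using namespace ngraph;

// Hypothetical builder: a Split whose output 1 is consumed twice. Without the
// Ceiling nodes the two Result nodes attached to that output would collapse
// into one when the function is converted to a CNNNetwork.
std::shared_ptr<Function> makeSplitWithDuplicatedOutputs() {
    auto param = std::make_shared<opset5::Parameter>(element::f32, Shape{3, 24, 24, 9});
    auto axis = opset5::Constant::create(element::i64, Shape{}, {1});
    auto split = std::make_shared<opset5::Split>(param, axis, 3);

    std::vector<size_t> outIndices{0, 1, 1, 2};
    ResultVector results;
    for (auto idx : outIndices) {
        auto ceilNode = std::make_shared<opset5::Ceiling>(split->output(idx));
        results.push_back(std::make_shared<opset5::Result>(ceilNode));
    }
    return std::make_shared<Function>(results, ParameterVector{param}, "split");
}
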
@@ -117,13 +123,17 @@ const std::vector<Precision> netPrecisions = {
Precision::BF16
};
const std::vector<std::vector<size_t>> outIndices3 = {{0, 1, 2}, {0, 1, 1, 0, 2}, {0, 0, 0, 2}};
const std::vector<std::vector<size_t>> outIndices4 = {{0, 1, 2, 3}, {0, 1, 1, 0, 2, 3}, {0, 0, 0, 2, 3}};
INSTANTIATE_TEST_CASE_P(smoke_Split4D_CPU_Nspc2NcspSpecial, SplitLayerCPUTest,
::testing::Combine(
::testing::Values(4),
::testing::Values(1),
::testing::ValuesIn(netPrecisions),
::testing::Values(std::vector<size_t>({3, 28, 24, 9})),
::testing::Values(std::vector<size_t>({})),
::testing::ValuesIn(outIndices4),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(perChannelsToPlanar_4D)),
SplitLayerCPUTest::getTestCaseName);
@@ -134,7 +144,7 @@ INSTANTIATE_TEST_CASE_P(smoke_Split5D_CPU_Nspc2NcspSpecial, SplitLayerCPUTest,
::testing::Values(1),
::testing::ValuesIn(netPrecisions),
::testing::Values(std::vector<size_t>({3, 21, 24, 9, 15})),
::testing::Values(std::vector<size_t>({})),
::testing::ValuesIn(outIndices3),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(perChannelsToPlanar_5D)),
SplitLayerCPUTest::getTestCaseName);
@@ -145,7 +155,7 @@ INSTANTIATE_TEST_CASE_P(smoke_Split4D_CPU_Block8inPlace, SplitLayerCPUTest,
::testing::Values(0, 1),
::testing::ValuesIn(netPrecisions),
::testing::Values(std::vector<size_t>({3, 24, 24, 9})),
::testing::Values(std::vector<size_t>({})),
::testing::ValuesIn(outIndices3),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(planar_4D, planar_4D_ref, perChannels_4D, blocked8_4D)),
SplitLayerCPUTest::getTestCaseName);
@@ -156,7 +166,7 @@ INSTANTIATE_TEST_CASE_P(smoke_Split4D_CPU_Block8, SplitLayerCPUTest,
::testing::Values(2, 3),
::testing::ValuesIn(netPrecisions),
::testing::Values(std::vector<size_t>({3, 24, 24, 9})),
::testing::Values(std::vector<size_t>({})),
::testing::ValuesIn(outIndices3),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(planar_4D, planar_4D_ref, perChannels_4D, blocked8_4D_ref)),
SplitLayerCPUTest::getTestCaseName);
@@ -167,7 +177,7 @@ INSTANTIATE_TEST_CASE_P(smoke_Split4D_CPU_Block16inPlace, SplitLayerCPUTest,
::testing::Values(0, 1),
::testing::ValuesIn(netPrecisions),
::testing::Values(std::vector<size_t>({4, 64, 32, 12})),
::testing::Values(std::vector<size_t>({})),
::testing::ValuesIn(outIndices3),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(blocked16_4D)),
SplitLayerCPUTest::getTestCaseName);
@@ -178,7 +188,7 @@ INSTANTIATE_TEST_CASE_P(smoke_Split4D_CPU_Block16, SplitLayerCPUTest,
::testing::Values(2, 3),
::testing::ValuesIn(netPrecisions),
::testing::Values(std::vector<size_t>({4, 64, 32, 12})),
::testing::Values(std::vector<size_t>({})),
::testing::ValuesIn(outIndices4),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(blocked16_4D_ref)),
SplitLayerCPUTest::getTestCaseName);
@@ -189,7 +199,7 @@ INSTANTIATE_TEST_CASE_P(smoke_Split5D_CPU_Block8inPlace, SplitLayerCPUTest,
::testing::Values(0, 1),
::testing::ValuesIn(netPrecisions),
::testing::Values(std::vector<size_t>({3, 24, 24, 9, 15})),
::testing::Values(std::vector<size_t>({})),
::testing::ValuesIn(outIndices3),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(planar_5D, planar_5D_ref, perChannels_5D, blocked8_5D)),
SplitLayerCPUTest::getTestCaseName);
@@ -200,7 +210,7 @@ INSTANTIATE_TEST_CASE_P(smoke_Split5D_CPU_Block8, SplitLayerCPUTest,
::testing::Values(2, 3, 4),
::testing::ValuesIn(netPrecisions),
::testing::Values(std::vector<size_t>({3, 24, 24, 9, 15})),
::testing::Values(std::vector<size_t>({})),
::testing::ValuesIn(outIndices3),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(planar_5D, planar_5D_ref, perChannels_5D, blocked8_5D_ref)),
SplitLayerCPUTest::getTestCaseName);
@@ -211,7 +221,7 @@ INSTANTIATE_TEST_CASE_P(smoke_Split5D_CPU_Block16inPlace, SplitLayerCPUTest,
::testing::Values(0, 1),
::testing::ValuesIn(netPrecisions),
::testing::Values(std::vector<size_t>({4, 64, 32, 12, 20})),
::testing::Values(std::vector<size_t>({})),
::testing::ValuesIn(outIndices4),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(blocked16_5D)),
SplitLayerCPUTest::getTestCaseName);
@@ -222,7 +232,7 @@ INSTANTIATE_TEST_CASE_P(smoke_Split5D_CPU_Block16, SplitLayerCPUTest,
::testing::Values(2, 3, 4),
::testing::ValuesIn(netPrecisions),
::testing::Values(std::vector<size_t>({4, 64, 32, 12, 20})),
::testing::Values(std::vector<size_t>({})),
::testing::ValuesIn(outIndices4),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(blocked16_5D_ref)),
SplitLayerCPUTest::getTestCaseName);