[CPU] Split supported PD initialization refactoring plus tests for multiple port connections. (#4521)
This commit is contained in:
parent
ed5313b2e1
commit
9f2a3d0edc
@ -4,6 +4,7 @@
|
||||
|
||||
#include "mkldnn_split_node.h"
|
||||
#include "common/cpu_memcpy.h"
|
||||
#include "common/tensor_desc_creator.h"
|
||||
#include <legacy/ie_layers.h>
|
||||
#include <vector>
|
||||
#include <mkldnn_types.h>
|
||||
@ -16,47 +17,6 @@ using namespace mkldnn;
|
||||
using namespace MKLDNNPlugin;
|
||||
using namespace InferenceEngine;
|
||||
|
||||
// Creates a tensor descriptor whose blocked dimensions equal the logical
// dimensions and whose dimension order is the identity permutation
// (0, 1, ..., rank - 1), i.e. no dimension is reordered or split.
//
// precision - element precision stored in the descriptor.
// srcDims   - logical dimensions of the tensor.
static TensorDesc makePlainTensorDesc(const Precision& precision, const SizeVector& srcDims) {
    SizeVector identityOrder(srcDims.size());
    for (size_t axisIdx = 0; axisIdx < identityOrder.size(); ++axisIdx) {
        identityOrder[axisIdx] = axisIdx;
    }

    // The braced pair is the blocking description: {blocked dims, order}.
    return TensorDesc(precision, srcDims, {srcDims, identityOrder});
}
|
||||
|
||||
// Creates a tensor descriptor in which the dimension at index 1 (channels)
// is placed last in both the blocked dimensions and the order vector.
// Tensors of rank 2 or lower are left in the identity order, so the result
// then matches a plain descriptor.
//
// precision - element precision stored in the descriptor.
// srcDims   - logical dimensions of the tensor.
static TensorDesc makePerChannelTensorDesc(const Precision& precision, const SizeVector& srcDims) {
    constexpr size_t channelsPos = 1lu;
    const size_t rank = srcDims.size();

    // Start from the identity permutation and an unmodified copy of the dims.
    SizeVector order(rank);
    for (size_t axisIdx = 0; axisIdx < rank; ++axisIdx) {
        order[axisIdx] = axisIdx;
    }
    SizeVector blkDims(srcDims);

    if (rank > 2) {
        // Removes the entry at `pos` and re-appends it at the end, shifting
        // every following entry one position towards the front.
        const auto sendToBack = [](SizeVector& seq, size_t pos) {
            const auto moved = seq[pos];
            seq.erase(seq.begin() + pos);
            seq.push_back(moved);
        };

        sendToBack(order, channelsPos);
        sendToBack(blkDims, channelsPos);
    }

    return TensorDesc(precision, srcDims, {blkDims, order});
}
|
||||
|
||||
// Creates a tensor descriptor in which the dimension at index 1 (channels)
// is split into blocks of `blockSize` elements:
//   * blocked dims: srcDims with dims[1] replaced by ceil(dims[1] / blockSize),
//                   followed by one extra trailing dimension equal to blockSize;
//   * order:        the identity permutation followed by the channel index (1).
//
// precision - element precision stored in the descriptor.
// srcDims   - logical dimensions of the tensor; must have rank >= 2.
// blockSize - number of channels per block; must be non-zero.
//
// Throws (via IE_THROW) when srcDims has fewer than two dimensions or when
// blockSize is zero.
static TensorDesc makeChannelBlockedTensorDesc(const Precision& precision, const SizeVector& srcDims, size_t blockSize) {
    constexpr size_t channelsPos = 1lu;

    if (srcDims.size() < 2) {
        IE_THROW() << "Can't create blocked tensor descriptor!";
    }

    // Dividing by a zero block size below would be undefined behaviour, so
    // reject it explicitly with the same exception mechanism.
    if (blockSize == 0) {
        IE_THROW() << "Can't create blocked tensor descriptor: block size must be non-zero!";
    }

    SizeVector order(srcDims.size());
    std::iota(order.begin(), order.end(), 0);
    // The inner block iterates over channels, hence the repeated channel index.
    order.push_back(channelsPos);

    SizeVector blkDims = srcDims;
    // Round the number of channel blocks up so a partial last block is counted.
    blkDims[channelsPos] = blkDims[channelsPos] / blockSize + (blkDims[channelsPos] % blockSize ? 1 : 0);
    blkDims.push_back(blockSize);

    return TensorDesc(precision, srcDims, {blkDims, order});
}
|
||||
|
||||
// Constructs the Split node by forwarding the layer, the engine and the
// weights-sharing cache to the MKLDNNNode base class; the constructor body
// itself performs no additional work.
MKLDNNSplitNode::MKLDNNSplitNode(const InferenceEngine::CNNLayerPtr& layer,
                                 const mkldnn::engine& eng,
                                 MKLDNNWeightsSharing::Ptr &cache)
    : MKLDNNNode(layer, eng, cache) {
}
|
||||
|
||||
@ -77,7 +37,6 @@ void MKLDNNSplitNode::getSupportedDescriptors() {
|
||||
}
|
||||
|
||||
void MKLDNNSplitNode::initSupportedPrimitiveDescriptors() {
|
||||
using TensorDescFactory = std::function<TensorDesc(const Precision&, const SizeVector&)>;
|
||||
constexpr size_t channelsPos = 1lu;
|
||||
|
||||
if (!supportedPrimitiveDescriptors.empty())
|
||||
@ -113,24 +72,49 @@ void MKLDNNSplitNode::initSupportedPrimitiveDescriptors() {
|
||||
if (dstFirstDims.size() != srcDims.size())
|
||||
THROW_ERROR << "sizes of input blob and sum of output blobs are not equal.";
|
||||
|
||||
|
||||
InferenceEngine::Precision inpPrecision = inpData->getPrecision();
|
||||
auto outPrecision = inpPrecision; // the split layer doesn't convert precisions
|
||||
|
||||
// make primitive descriptor factory function for different configurations
|
||||
bool dynBatchSupport = true;
|
||||
if (axis < 1) {
|
||||
dynBatchSupport = false;
|
||||
}
|
||||
auto makePdInfo = [dynBatchSupport](TensorDescFactory getTensorDesc, const Precision& precision, const MKLDNNDims& srcDims,
|
||||
const std::vector<MKLDNNDims>& outDims, impl_desc_type type) -> PrimitiveDescInfo {
|
||||
|
||||
//Set plain and tailC formats
|
||||
std::vector<TensorDescCreatorTypes> tdCreatorTypes{ TensorDescCreatorTypes::ncsp, TensorDescCreatorTypes::nspc };
|
||||
|
||||
//Support channel blocked format
|
||||
if (srcDims.ndims() > 2) {
|
||||
for (auto item : { std::make_pair(8lu, TensorDescCreatorTypes::nCsp8c), std::make_pair(16lu, TensorDescCreatorTypes::nCsp16c) }) {
|
||||
SizeVector blkDims = srcDims.ToSizeVector();
|
||||
if (blkDims[channelsPos] % item.first)
|
||||
continue;
|
||||
|
||||
bool blocked = true;
|
||||
for (size_t i = 0; i < outDims.size(); i++) {
|
||||
if (outDims[i].ToSizeVector()[channelsPos] % item.first) {
|
||||
blocked = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (blocked) {
|
||||
tdCreatorTypes.push_back(item.second);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<size_t> pdIndexesToReuse;
|
||||
|
||||
auto& creatorsMap = TensorDescCreator::getCommonCreators();
|
||||
auto itrRange = TensorDescCreator::makeFilteredRange(creatorsMap, static_cast<unsigned>(srcDims.ndims()), tdCreatorTypes);
|
||||
for (auto itr = itrRange.first; itr != itrRange.second; ++itr) {
|
||||
InferenceEngine::LayerConfig config;
|
||||
|
||||
config.dynBatchSupport = dynBatchSupport;
|
||||
config.inConfs.resize(1);
|
||||
config.inConfs[0].inPlace = -1;
|
||||
config.inConfs[0].constant = false;
|
||||
config.inConfs[0].desc = getTensorDesc(precision, srcDims.ToSizeVector());
|
||||
config.inConfs[0].desc = itr->second->createDesc(inpPrecision, srcDims.ToSizeVector());
|
||||
config.outConfs.resize(outDims.size());
|
||||
|
||||
std::vector<memory::format_tag> outFormats;
|
||||
@ -140,49 +124,22 @@ void MKLDNNSplitNode::initSupportedPrimitiveDescriptors() {
|
||||
|
||||
config.outConfs[i].inPlace = -1;
|
||||
config.outConfs[i].constant = false;
|
||||
config.outConfs[i].desc = getTensorDesc(precision, o_Dims.ToSizeVector());
|
||||
config.outConfs[i].desc = itr->second->createDesc(inpPrecision, o_Dims.ToSizeVector());
|
||||
outFormats.push_back(MKLDNNMemoryDesc(config.outConfs[i].desc).getFormat());
|
||||
}
|
||||
return {config, type, outFormats};
|
||||
};
|
||||
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, outFormats);
|
||||
|
||||
//Set plain format
|
||||
supportedPrimitiveDescriptors.push_back(makePdInfo(&makePlainTensorDesc, inpPrecision, srcDims, outDims, impl_desc_type::ref));
|
||||
|
||||
//Set per channel format.
|
||||
supportedPrimitiveDescriptors.push_back(makePdInfo(&makePerChannelTensorDesc, inpPrecision, srcDims, outDims, impl_desc_type::ref));
|
||||
|
||||
//Support channel blocked format
|
||||
std::vector<size_t> blockedPdIndexes;
|
||||
if (srcDims.ndims() > channelsPos) {
|
||||
for (size_t sizeS : {8lu, 16lu}) {
|
||||
SizeVector blkDims = srcDims.ToSizeVector();
|
||||
if (blkDims[channelsPos] % sizeS)
|
||||
continue;
|
||||
|
||||
bool blocked = true;
|
||||
for (size_t i = 0; i < outDims.size(); i++) {
|
||||
if (outDims[i].ToSizeVector()[channelsPos] % sizeS) {
|
||||
blocked = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (blocked) {
|
||||
using std::placeholders::_1;
|
||||
using std::placeholders::_2;
|
||||
supportedPrimitiveDescriptors.push_back(makePdInfo(std::bind(&makeChannelBlockedTensorDesc, _1, _2, sizeS),
|
||||
inpPrecision, srcDims, outDims, impl_desc_type::ref));
|
||||
blockedPdIndexes.push_back(supportedPrimitiveDescriptors.size() - 1);
|
||||
if (itr->first == TensorDescCreatorTypes::ncsp) {
|
||||
// at least the plain layout can be optimized inplace.
|
||||
pdIndexesToReuse.emplace_back(supportedPrimitiveDescriptors.size() - 1);
|
||||
} else if (itr->first == TensorDescCreatorTypes::nCsp8c || itr->first == TensorDescCreatorTypes::nCsp16c) {
|
||||
if (axis < 2) {
|
||||
pdIndexesToReuse.emplace_back(supportedPrimitiveDescriptors.size() - 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Optimized inplace case
|
||||
std::vector<size_t> pdIndexesToReuse(1, 0); // at least the first plain layout can be optimized inplace.
|
||||
if (axis < 2) {
|
||||
pdIndexesToReuse.insert(pdIndexesToReuse.end(), blockedPdIndexes.begin(), blockedPdIndexes.end());
|
||||
}
|
||||
|
||||
for (auto refPdIndex : pdIndexesToReuse) {
|
||||
const auto& refConfig = supportedPrimitiveDescriptors[refPdIndex].getConfig();
|
||||
auto config = refConfig;
|
||||
@ -220,12 +177,26 @@ void MKLDNNSplitNode::initSupportedPrimitiveDescriptors() {
|
||||
|
||||
// Special nspc -> ncsp case when splitting channels
|
||||
if (axis == 1 && (dstFirstDims.ndims() == 4 || dstFirstDims.ndims() == 5)) {
|
||||
auto plain = makePdInfo(&makePlainTensorDesc, inpPrecision, srcDims, outDims, impl_desc_type::ref);
|
||||
auto perChannel = makePdInfo(&makePerChannelTensorDesc, inpPrecision, srcDims, outDims, impl_desc_type::ref);
|
||||
InferenceEngine::LayerConfig config;
|
||||
|
||||
plain.getConfig().inConfs[0].desc = perChannel.getConfig().inConfs[0].desc;
|
||||
config.dynBatchSupport = dynBatchSupport;
|
||||
config.inConfs.resize(1);
|
||||
config.inConfs[0].inPlace = -1;
|
||||
config.inConfs[0].constant = false;
|
||||
config.inConfs[0].desc = creatorsMap.at(TensorDescCreatorTypes::nspc)->createDesc(inpPrecision, srcDims.ToSizeVector());
|
||||
config.outConfs.resize(outDims.size());
|
||||
|
||||
supportedPrimitiveDescriptors.push_back(plain);
|
||||
std::vector<memory::format_tag> outFormats;
|
||||
|
||||
for (size_t i = 0; i < outDims.size(); i++) {
|
||||
auto o_Dims = outDims[i];
|
||||
|
||||
config.outConfs[i].inPlace = -1;
|
||||
config.outConfs[i].constant = false;
|
||||
config.outConfs[i].desc = creatorsMap.at(TensorDescCreatorTypes::ncsp)->createDesc(inpPrecision, o_Dims.ToSizeVector());
|
||||
outFormats.push_back(MKLDNNMemoryDesc(config.outConfs[i].desc).getFormat());
|
||||
}
|
||||
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, outFormats);
|
||||
}
|
||||
}
|
||||
|
||||
@ -362,9 +333,19 @@ void MKLDNNSplitNode::initOptimalPrimitiveDescriptor() {
|
||||
}
|
||||
|
||||
void MKLDNNSplitNode::selectOptimalPrimitiveDescriptor() {
|
||||
if (implPriorities.size() > 0 && implPriorities[0] == impl_desc_type::ref) {
|
||||
selectPrimitiveDescriptorByIndex(0);
|
||||
return;
|
||||
// Enforce the reference implementation for the planar layout if the implementation is in the impl priorities list.
|
||||
// This is needed mostly for the testing purposes, since for the planar layout Split works always in place, we need to enforce
|
||||
// the reference implementation when it is selected in a test to test that piece of code.
|
||||
if (!implPriorities.empty() && implPriorities[0] == impl_desc_type::ref) {
|
||||
auto plain = PartialBlkDesc::makePlain(getParentEdgeAt(0)->getDims().ToSizeVector());
|
||||
for (size_t i = 0; i < supportedPrimitiveDescriptors.size(); ++i) {
|
||||
auto& pd = supportedPrimitiveDescriptors[i];
|
||||
if (PartialBlkDesc::extractFrom(pd.getConfig().inConfs[0].desc) == plain &&
|
||||
impl_desc_type::ref == pd.getImplementationType()) {
|
||||
selectPrimitiveDescriptorByIndex(static_cast<int>(i));
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//check the descriptors and select the ones that have the same data format as the input
|
||||
@ -504,6 +485,7 @@ void MKLDNNSplitNode::prepareOptimizedParams() {
|
||||
optimizedParams.srcDataOffsets[i] = optimizedParams.srcDataOffsets[i - 1] + optimizedParams.dataSize[i - 1];
|
||||
}
|
||||
}
|
||||
|
||||
void MKLDNNSplitNode::optimizedNspc2Ncsp(size_t MB) {
|
||||
auto parentEdge = getParentEdgeAt(0);
|
||||
const int ndims = parentEdge->getDims().ndims();
|
||||
|
@ -69,8 +69,14 @@ protected:
|
||||
auto split = std::dynamic_pointer_cast<ngraph::opset5::Split>(ngraph::builder::makeSplit(paramOuts[0],
|
||||
ngPrc, numSplits, axis));
|
||||
ngraph::ResultVector results;
|
||||
|
||||
for (int i = 0; i < outIndices.size(); i++) {
|
||||
results.push_back(std::make_shared<ngraph::opset5::Result>(split->output(outIndices[i])));
|
||||
// This WA is necessary because result nodes connected to the same output of the split node (or any node) are deduplicated
|
||||
// on the CNNNetwork level. It might not be needed when the CPU plugin moves completely to nGraph.
|
||||
// This is still a single layer test since the Ceiling nodes are added only as a WA.
|
||||
|
||||
auto fakeMultiplication = std::make_shared<ngraph::opset5::Ceiling>(split->output(outIndices[i]));
|
||||
results.push_back(std::make_shared<ngraph::opset5::Result>(fakeMultiplication));
|
||||
}
|
||||
split->get_rt_info() = getCPUInfo();
|
||||
function = std::make_shared<ngraph::Function>(results, params, "split");
|
||||
@ -117,13 +123,17 @@ const std::vector<Precision> netPrecisions = {
|
||||
Precision::BF16
|
||||
};
|
||||
|
||||
// Sets of Split output indices fed to the test instantiations through
// ::testing::ValuesIn(...). Each inner vector lists the outputs that get a
// consumer attached in the test graph (one consumer per entry, taken from
// split->output(index)), so a repeated index connects several consumers to the
// same output port and an omitted index leaves that port without a consumer.
// NOTE(review): presumably outIndices3 is meant for 3-way splits and outIndices4
// for 4-way splits (their largest indices are 2 and 3) - confirm against the
// numSplits value of each instantiation.
const std::vector<std::vector<size_t>> outIndices3 = {{0, 1, 2}, {0, 1, 1, 0, 2}, {0, 0, 0, 2}};
|
||||
const std::vector<std::vector<size_t>> outIndices4 = {{0, 1, 2, 3}, {0, 1, 1, 0, 2, 3}, {0, 0, 0, 2, 3}};
|
||||
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(smoke_Split4D_CPU_Nspc2NcspSpecial, SplitLayerCPUTest,
|
||||
::testing::Combine(
|
||||
::testing::Values(4),
|
||||
::testing::Values(1),
|
||||
::testing::ValuesIn(netPrecisions),
|
||||
::testing::Values(std::vector<size_t>({3, 28, 24, 9})),
|
||||
::testing::Values(std::vector<size_t>({})),
|
||||
::testing::ValuesIn(outIndices4),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU),
|
||||
::testing::Values(perChannelsToPlanar_4D)),
|
||||
SplitLayerCPUTest::getTestCaseName);
|
||||
@ -134,7 +144,7 @@ INSTANTIATE_TEST_CASE_P(smoke_Split5D_CPU_Nspc2NcspSpecial, SplitLayerCPUTest,
|
||||
::testing::Values(1),
|
||||
::testing::ValuesIn(netPrecisions),
|
||||
::testing::Values(std::vector<size_t>({3, 21, 24, 9, 15})),
|
||||
::testing::Values(std::vector<size_t>({})),
|
||||
::testing::ValuesIn(outIndices3),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU),
|
||||
::testing::Values(perChannelsToPlanar_5D)),
|
||||
SplitLayerCPUTest::getTestCaseName);
|
||||
@ -145,7 +155,7 @@ INSTANTIATE_TEST_CASE_P(smoke_Split4D_CPU_Block8inPlace, SplitLayerCPUTest,
|
||||
::testing::Values(0, 1),
|
||||
::testing::ValuesIn(netPrecisions),
|
||||
::testing::Values(std::vector<size_t>({3, 24, 24, 9})),
|
||||
::testing::Values(std::vector<size_t>({})),
|
||||
::testing::ValuesIn(outIndices3),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU),
|
||||
::testing::Values(planar_4D, planar_4D_ref, perChannels_4D, blocked8_4D)),
|
||||
SplitLayerCPUTest::getTestCaseName);
|
||||
@ -156,7 +166,7 @@ INSTANTIATE_TEST_CASE_P(smoke_Split4D_CPU_Block8, SplitLayerCPUTest,
|
||||
::testing::Values(2, 3),
|
||||
::testing::ValuesIn(netPrecisions),
|
||||
::testing::Values(std::vector<size_t>({3, 24, 24, 9})),
|
||||
::testing::Values(std::vector<size_t>({})),
|
||||
::testing::ValuesIn(outIndices3),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU),
|
||||
::testing::Values(planar_4D, planar_4D_ref, perChannels_4D, blocked8_4D_ref)),
|
||||
SplitLayerCPUTest::getTestCaseName);
|
||||
@ -167,7 +177,7 @@ INSTANTIATE_TEST_CASE_P(smoke_Split4D_CPU_Block16inPlace, SplitLayerCPUTest,
|
||||
::testing::Values(0, 1),
|
||||
::testing::ValuesIn(netPrecisions),
|
||||
::testing::Values(std::vector<size_t>({4, 64, 32, 12})),
|
||||
::testing::Values(std::vector<size_t>({})),
|
||||
::testing::ValuesIn(outIndices3),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU),
|
||||
::testing::Values(blocked16_4D)),
|
||||
SplitLayerCPUTest::getTestCaseName);
|
||||
@ -178,7 +188,7 @@ INSTANTIATE_TEST_CASE_P(smoke_Split4D_CPU_Block16, SplitLayerCPUTest,
|
||||
::testing::Values(2, 3),
|
||||
::testing::ValuesIn(netPrecisions),
|
||||
::testing::Values(std::vector<size_t>({4, 64, 32, 12})),
|
||||
::testing::Values(std::vector<size_t>({})),
|
||||
::testing::ValuesIn(outIndices4),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU),
|
||||
::testing::Values(blocked16_4D_ref)),
|
||||
SplitLayerCPUTest::getTestCaseName);
|
||||
@ -189,7 +199,7 @@ INSTANTIATE_TEST_CASE_P(smoke_Split5D_CPU_Block8inPlace, SplitLayerCPUTest,
|
||||
::testing::Values(0, 1),
|
||||
::testing::ValuesIn(netPrecisions),
|
||||
::testing::Values(std::vector<size_t>({3, 24, 24, 9, 15})),
|
||||
::testing::Values(std::vector<size_t>({})),
|
||||
::testing::ValuesIn(outIndices3),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU),
|
||||
::testing::Values(planar_5D, planar_5D_ref, perChannels_5D, blocked8_5D)),
|
||||
SplitLayerCPUTest::getTestCaseName);
|
||||
@ -200,7 +210,7 @@ INSTANTIATE_TEST_CASE_P(smoke_Split5D_CPU_Block8, SplitLayerCPUTest,
|
||||
::testing::Values(2, 3, 4),
|
||||
::testing::ValuesIn(netPrecisions),
|
||||
::testing::Values(std::vector<size_t>({3, 24, 24, 9, 15})),
|
||||
::testing::Values(std::vector<size_t>({})),
|
||||
::testing::ValuesIn(outIndices3),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU),
|
||||
::testing::Values(planar_5D, planar_5D_ref, perChannels_5D, blocked8_5D_ref)),
|
||||
SplitLayerCPUTest::getTestCaseName);
|
||||
@ -211,7 +221,7 @@ INSTANTIATE_TEST_CASE_P(smoke_Split5D_CPU_Block16inPlace, SplitLayerCPUTest,
|
||||
::testing::Values(0, 1),
|
||||
::testing::ValuesIn(netPrecisions),
|
||||
::testing::Values(std::vector<size_t>({4, 64, 32, 12, 20})),
|
||||
::testing::Values(std::vector<size_t>({})),
|
||||
::testing::ValuesIn(outIndices4),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU),
|
||||
::testing::Values(blocked16_5D)),
|
||||
SplitLayerCPUTest::getTestCaseName);
|
||||
@ -222,7 +232,7 @@ INSTANTIATE_TEST_CASE_P(smoke_Split5D_CPU_Block16, SplitLayerCPUTest,
|
||||
::testing::Values(2, 3, 4),
|
||||
::testing::ValuesIn(netPrecisions),
|
||||
::testing::Values(std::vector<size_t>({4, 64, 32, 12, 20})),
|
||||
::testing::Values(std::vector<size_t>({})),
|
||||
::testing::ValuesIn(outIndices4),
|
||||
::testing::Values(CommonTestUtils::DEVICE_CPU),
|
||||
::testing::Values(blocked16_5D_ref)),
|
||||
SplitLayerCPUTest::getTestCaseName);
|
||||
|
Loading…
Reference in New Issue
Block a user