[CPU] Extend Concat node logic to avoid fallback on slow ref implementation. (#4129)

Maksim Kutakov 2021-05-31 18:49:57 +03:00 committed by GitHub
parent 315c8d4eec
commit 7fb9bac24a
7 changed files with 411 additions and 339 deletions

View File

@@ -134,6 +134,16 @@ PartialBlkDesc PartialBlkDesc::makeCBlocked(const InferenceEngine::SizeVector &d
return res;
}
PartialBlkDesc PartialBlkDesc::makeTailC(const InferenceEngine::SizeVector &dims) {
PartialBlkDesc res = makePlain(dims);
if (dims.size() > 2) {
auto itr = res.outer_order.begin() + 1;
std::rotate(itr, itr + 1, res.outer_order.end());
}
return res;
}
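A minimal standalone sketch (not part of this commit) of what the std::rotate call in makeTailC produces for a 4-D shape: the plain order {0, 1, 2, 3} becomes the channels-last order {0, 2, 3, 1}, i.e. NHWC.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
    // Plain (NCHW-like) dimension order of a 4-D tensor.
    std::vector<size_t> order = {0, 1, 2, 3};
    // Move the channel index (position 1) to the end, as makeTailC does.
    auto itr = order.begin() + 1;
    std::rotate(itr, itr + 1, order.end());
    for (size_t d : order)
        std::cout << d << ' ';  // prints: 0 2 3 1 (tail-C / NHWC order)
    std::cout << '\n';
    return 0;
}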
PartialBlkDesc PartialBlkDesc::extractFrom(const InferenceEngine::TensorDesc &desc) {
if (desc.getLayout() == InferenceEngine::ANY)
IE_THROW() << "Cannot extract partial blocked descriptor for `ANY` layout";

View File

@@ -59,6 +59,9 @@ public:
/** Construct blocked Channel PartialBlkDesc based on dims information */
static PartialBlkDesc makeCBlocked(const InferenceEngine::SizeVector &dims, size_t block_size);
/** Construct channels-last (tail-C) PartialBlkDesc based on dims information */
static PartialBlkDesc makeTailC(const InferenceEngine::SizeVector &dims);
/** Comparison operators. Allow using it as a key for std::map */
bool operator == (const PartialBlkDesc& it) const;
bool operator < (const PartialBlkDesc& it) const;

View File

@@ -21,11 +21,15 @@
#include "mkldnn_eltwise_node.h"
#include <limits>
#include "common/cpu_memcpy.h"
#include "common/tensor_desc_creator.h"
using namespace mkldnn;
using namespace MKLDNNPlugin;
using namespace InferenceEngine;
namespace {
constexpr size_t channelAxis = 1lu;
}
bool MKLDNNConcatNode::isSupportedOperation(const std::shared_ptr<const ngraph::Node>& op, std::string& errorMessage) noexcept {
try {
@@ -89,308 +93,120 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() {
}
}
// MKLDNN doesn't support different precision on inputs so fallback on FP32 in such case
// Concat doesn't support different precision on inputs so fallback on FP32 in such case
if (isMixedPrecision)
inputPrecision = Precision::FP32;
// Concat node supports int8 implementations only for NHWC and NDHWC layouts
if (inputPrecision == Precision::U8 || inputPrecision == Precision::I8) {
int ndims = getChildEdgeAt(0)->getDims().ndims();
if (ndims != 2 && ndims != 4 && ndims != 5)
inputPrecision = Precision::FP32;
}
// MKLDNN supports only equal precisions for inputs and output
// Concat supports only equal precisions for inputs and output
outputPrecision = inputPrecision;
auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(inputPrecision);
auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(outputPrecision);
auto& dstDims = getChildEdgeAt(0)->getDims();
std::vector<TensorDescCreatorTypes> tdCreatorTypes = {TensorDescCreatorTypes::ncsp, TensorDescCreatorTypes::nspc};
MKLDNNDims dstDims = getChildEdgeAt(0)->getDims();
InferenceEngine::LayerConfig config;
config.dynBatchSupport = true;
// check if blocked layouts are available: the channels size should be evenly divisible by the block size, otherwise we fall back on the slow oneDNN ref implementation
if (dstDims.ndims() > channelAxis) {
for (auto item : { std::make_pair(8lu, TensorDescCreatorTypes::nCsp8c), std::make_pair(16lu, TensorDescCreatorTypes::nCsp16c)}) {
SizeVector blkDims = dstDims.ToSizeVector();
if (blkDims[channelAxis] % item.first)
continue;
for (size_t i = 0; i < getParentEdges().size(); i++) {
auto parentEdge = getParentEdgeAt(i);
InferenceEngine::DataConfig dataConfig;
dataConfig.inPlace = -1;
dataConfig.constant = false;
auto fmt = (inputPrecision == Precision::U8 || inputPrecision == Precision::I8) ? parentEdge->getDims().ndims() == 2 ? memory::format_tag::nc :
parentEdge->getDims().ndims() == 4 ? memory::format_tag::nhwc :
memory::format_tag::ndhwc
: memory::format_tag::any;
dataConfig.desc = MKLDNNExtensionUtils::getUninitTensorDesc(MKLDNNMemoryDesc(parentEdge->getDims(), inputDataType, fmt));
config.inConfs.push_back(dataConfig);
}
auto dims = getChildEdgeAt(0)->getDims();
config.outConfs.resize(1);
config.outConfs[0].inPlace = -1;
config.outConfs[0].constant = false;
if ((!isMixedPrecision && outputPrecision != Precision::U8 && outputPrecision != Precision::I8) || axis != 1) {
auto fmt = (inputPrecision == Precision::U8 || inputPrecision == Precision::I8) ? dims.ndims() == 2 ? memory::format_tag::nc :
dims.ndims() == 4 ? memory::format_tag::nhwc :
memory::format_tag::ndhwc
: MKLDNNMemory::GetPlainFormat(dims);
config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(MKLDNNMemoryDesc(dims, outputDataType, fmt));
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, fmt);
if (inputPrecision != Precision::U8 && inputPrecision != Precision::I8) {
if (dims.ndims() == 4) {
if (dims[1] % 8 == 0) {
config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(
MKLDNNMemoryDesc(dims, outputDataType, memory::format_tag::nChw8c));
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, memory::format_tag::nChw8c);
if (dims[1] % 16 == 0) {
config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(
MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::format_tag::nChw16c));
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, mkldnn::memory::format_tag::nChw16c);
}
}
} else if (dims.ndims() == 5) {
if (dims[1] % 8 == 0) {
config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(
MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::format_tag::nCdhw8c));
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, mkldnn::memory::format_tag::nCdhw8c);
if (dims[1] % 16 == 0) {
config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(
MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::format_tag::nCdhw16c));
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, mkldnn::memory::format_tag::nCdhw16c);
}
bool blocked = true;
for (size_t i = 0; i < getParentEdges().size(); i++) {
auto& srcDims = getParentEdgeAt(i)->getDims();
if (srcDims[channelAxis] % item.first) {
blocked = false;
break;
}
}
if (blocked) {
tdCreatorTypes.push_back(item.second);
}
}
}
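The loop above only adds the nCsp8c / nCsp16c creator types when the channel dimension of the output and of every input is a multiple of the block size; otherwise oneDNN would have to fall back on its slow reference concat. A hedged standalone sketch of that rule (the helper name and the plain std::vector shapes are illustrative, not the plugin's API):

#include <cstddef>
#include <vector>

// Illustrative only: true when a channel-blocked layout with the given block size
// can be offered for concat without hitting the slow ref path.
static bool canUseChannelBlocked(const std::vector<std::vector<size_t>>& inputShapes,
                                 const std::vector<size_t>& outputShape,
                                 size_t blockSize, size_t channelAxis = 1) {
    if (outputShape.size() <= channelAxis || outputShape[channelAxis] % blockSize)
        return false;
    for (const auto& shape : inputShapes) {
        if (shape.size() <= channelAxis || shape[channelAxis] % blockSize)
            return false;
    }
    return true;
}

// For example, with the test shapes {1, 8, 3, 5} and {1, 16, 3, 5} concatenated into
// {1, 24, 3, 5}: block size 8 passes, block size 16 does not (8 % 16 != 0).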
if (axis != 1)
std::vector<size_t> pdIndexesToReuse;
auto& creatorsMap = TensorDescCreator::getCommonCreators();
auto itrRange = TensorDescCreator::makeFilteredRange(creatorsMap, static_cast<unsigned>(dstDims.ndims()), tdCreatorTypes);
for (auto itr = itrRange.first; itr != itrRange.second; ++itr) {
InferenceEngine::LayerConfig config;
config.dynBatchSupport = true;
config.outConfs.resize(1);
config.outConfs[0].inPlace = -1;
config.outConfs[0].constant = false;
config.outConfs[0].desc = itr->second->createDesc(outputPrecision, dstDims.ToSizeVector());
memory::format_tag outFmt = MKLDNNMemoryDesc(config.outConfs[0].desc).getFormat();
config.inConfs.resize(getParentEdges().size());
for (size_t i = 0; i < getParentEdges().size(); ++i) {
config.inConfs[i].inPlace = -1;
config.inConfs[i].constant = false;
config.inConfs[i].desc = MKLDNNExtensionUtils::getUninitTensorDesc(
itr->second->createDesc(inputPrecision, getParentEdgeAt(i)->getDims().ToSizeVector()));
}
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, outFmt);
if (itr->first != TensorDescCreatorTypes::nspc) {
pdIndexesToReuse.push_back(supportedPrimitiveDescriptors.size() - 1);
}
}
if (axis != channelAxis)
return;
auto numOfDim = static_cast<size_t>(dstDims.ndims());
// Optimized inplace case
SizeVector order(numOfDim);
SizeVector offsets(numOfDim, 0lu);
size_t offset = (std::numeric_limits<size_t>::max)();
for (size_t i = 0; i < numOfDim; i++) {
order[i] = i;
}
for (auto refPdIndex : pdIndexesToReuse) {
const auto& refConfig = supportedPrimitiveDescriptors[refPdIndex].getConfig();
auto config = refConfig;
if (outputPrecision == Precision::I8 || outputPrecision == Precision::U8) {
if (numOfDim == 4) {
// Here we assume NHWC layout (channels are the last)
const auto& order = refConfig.outConfs[0].desc.getBlockingDesc().getOrder();
const auto& blkDims = refConfig.outConfs[0].desc.getBlockingDesc().getBlockDims();
auto numOfDim = blkDims.size();
order = {0, 2, 3, 1};
offsets = {0, 0, 0, 0};
SizeVector offsets(numOfDim, 0lu);
SizeVector strides(numOfDim);
strides.back() = 1lu;
size_t offset = (std::numeric_limits<size_t>::max)();
SizeVector blkDims = dstDims.ToSizeVector();
blkDims = { blkDims[0], blkDims[2], blkDims[3], blkDims[1] };
SizeVector strides(numOfDim);
strides.resize(numOfDim);
// C is the last in NHWC, so all strides are max()
for (size_t i = 0; i < numOfDim; i++) {
strides[i] = (std::numeric_limits<size_t>::max)();
}
config.outConfs[0].desc = TensorDesc(outputPrecision,
dstDims.ToSizeVector(),
{ blkDims, order, offset, offsets, strides });
for (size_t i = 0; i < getParentEdges().size(); i++) {
auto parentEdge = getParentEdgeAt(i);
SizeVector blkDims = parentEdge->getDims().ToSizeVector();
blkDims = { blkDims[0], blkDims[2], blkDims[3], blkDims[1] };
config.inConfs[i].inPlace = -1; // Change to 0 here if inplace concat is supported for NHWC in mkldnn
config.inConfs[i].desc = TensorDesc(inputPrecision, parentEdge->getDims().ToSizeVector(),
{blkDims, order, offset, offsets, strides});
}
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, mkldnn::memory::format_tag::nhwc);
return;
} else if (numOfDim == 5) {
// Here we assume NDHWC layout (channels are the last)
order = {0, 2, 3, 4, 1};
offsets = {0, 0, 0, 0, 0};
SizeVector blkDims = dstDims.ToSizeVector();
blkDims = { blkDims[0], blkDims[2], blkDims[3], blkDims[4], blkDims[1] };
SizeVector strides(numOfDim);
strides.resize(numOfDim);
// C is the last in NDHWC, so all strides are max()
for (size_t i = 0; i < numOfDim; i++) {
strides[i] = (std::numeric_limits<size_t>::max)();
}
config.outConfs[0].desc = TensorDesc(outputPrecision,
dstDims.ToSizeVector(),
{ blkDims, order, offset, offsets, strides });
for (size_t i = 0; i < getParentEdges().size(); i++) {
auto parentEdge = getParentEdgeAt(i);
SizeVector blkDims = parentEdge->getDims().ToSizeVector();
blkDims = { blkDims[0], blkDims[2], blkDims[3], blkDims[4], blkDims[1] };
config.inConfs[i].inPlace = -1; // Change to 0 here if inplace concat is supported for NDHWC in mkldnn
config.inConfs[i].desc = TensorDesc(inputPrecision, parentEdge->getDims().ToSizeVector(),
{blkDims, order, offset, offsets, strides});
}
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, mkldnn::memory::format_tag::ndhwc);
return;
}
}
SizeVector strides(numOfDim);
strides[numOfDim - 1] = 1;
for (size_t i = 2; i <= numOfDim; i++) {
if (numOfDim - i < axis) {
strides[numOfDim - i] = (std::numeric_limits<size_t>::max)();
} else {
strides[numOfDim - i] = strides[numOfDim - i + 1] * dstDims[numOfDim - i + 1];
}
}
config.outConfs[0].desc = TensorDesc(
MKLDNNExtensionUtils::DataTypeToIEPrecision(outputDataType),
dstDims.ToSizeVector(),
{dstDims.ToSizeVector(), order, offset, offsets, strides});
for (size_t i = 0; i < getParentEdges().size(); i++) {
auto parentEdge = getParentEdgeAt(i);
config.inConfs[i].inPlace = 0;
config.inConfs[i].desc = TensorDesc(MKLDNNExtensionUtils::DataTypeToIEPrecision(inputDataType), parentEdge->getDims().ToSizeVector(),
{parentEdge->getDims().ToSizeVector(), order, offset, offsets, strides});
}
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown, MKLDNNMemory::Convert(config.outConfs[0].desc.getLayout()));
if (numOfDim == 4lu || numOfDim == 5lu) {
size_t blkDimsLen = numOfDim + 1;
order.resize(blkDimsLen);
for (size_t i = 0; i < numOfDim; i++) {
order[i] = i;
}
order[numOfDim] = 1lu;
offsets = SizeVector(blkDimsLen, 0lu);
// nChw8c, nChw16c, nCdhw8c, nCdhw16c
for (size_t sizeS : {8lu, 16lu}) {
SizeVector blkDims = dstDims.ToSizeVector();
if (blkDims[1] % sizeS)
continue;
blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 1lu : 0lu);
blkDims.push_back(sizeS);
strides.resize(blkDimsLen);
strides[blkDimsLen - 1] = 1;
for (size_t i = 2lu; i <= blkDimsLen; i++) {
if (blkDimsLen - i < axis) {
strides[blkDimsLen - i] = (std::numeric_limits<size_t>::max)();
} else {
strides[blkDimsLen - i] = strides[blkDimsLen - i + 1] * blkDims[blkDimsLen - i + 1];
}
}
config.outConfs[0].desc = TensorDesc(
MKLDNNExtensionUtils::DataTypeToIEPrecision(outputDataType),
dstDims.ToSizeVector(), {blkDims, order, offset, offsets, strides});
bool canInplace = true;
for (size_t i = 0lu; canInplace && i < getParentEdges().size(); i++) {
auto parentEdge = getParentEdgeAt(i);
blkDims = parentEdge->getDims().ToSizeVector();
if (blkDims[1] % sizeS)
canInplace = false;
blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 1lu : 0lu);
blkDims.push_back(sizeS);
config.inConfs[i].desc = TensorDesc(MKLDNNExtensionUtils::DataTypeToIEPrecision(inputDataType), parentEdge->getDims().ToSizeVector(),
{blkDims, order, offset, offsets, strides});
}
if (canInplace) {
auto dstFormat = numOfDim == 4lu ? sizeS == 8lu ? mkldnn::memory::format_tag::nChw8c : mkldnn::memory::format_tag::nChw16c
: sizeS == 8lu ? mkldnn::memory::format_tag::nCdhw8c : mkldnn::memory::format_tag::nCdhw16c;
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown, dstFormat);
for (size_t i = 2; i <= numOfDim; i++) {
if (numOfDim - i < axis) {
strides[numOfDim - i] = (std::numeric_limits<size_t>::max)();
} else {
strides[numOfDim - i] = strides[numOfDim - i + 1] * blkDims[numOfDim - i + 1];
}
}
config.outConfs[0].desc = TensorDesc(outputPrecision, dstDims.ToSizeVector(), {blkDims, order, offset, offsets, strides});
memory::format_tag outFmt = MKLDNNMemoryDesc(config.outConfs[0].desc).getFormat();
for (size_t i = 0; i < getParentEdges().size(); i++) {
const auto& srcBlkDims = refConfig.inConfs[i].desc.getBlockingDesc().getBlockDims();
const auto& dims = refConfig.inConfs[i].desc.getDims();
config.inConfs[i].inPlace = 0;
config.inConfs[i].desc = TensorDesc(inputPrecision, dims, {srcBlkDims, order, offset, offsets, strides});
}
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown, outFmt);
}
}
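For reference, a small standalone sketch (an assumed helper, not part of this commit) of the channel-blocked dims that the nChw8c / nChw16c / nCdhw8c / nCdhw16c descriptors above describe: the channel dim is divided by the block size (rounded up) and the block size is appended as an extra innermost dim.

#include <cstddef>
#include <vector>

// Illustrative: plain NC[D]HW dims -> channel-blocked dims (nChw8c/16c, nCdhw8c/16c style).
static std::vector<size_t> makeChannelBlockedDims(std::vector<size_t> dims, size_t blockSize) {
    const size_t channelAxis = 1;
    dims[channelAxis] = dims[channelAxis] / blockSize + (dims[channelAxis] % blockSize ? 1 : 0);
    dims.push_back(blockSize);  // the extra innermost dim holds one channel block
    return dims;
}

// Example: makeChannelBlockedDims({2, 32, 3, 5}, 16) yields {2, 2, 3, 5, 16}.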
void MKLDNNConcatNode::selectOptimalPrimitiveDescriptor() {
bool hasUnknown = false;
std::vector<size_t> canSelectPrimitive;
for (size_t i = 0; i < supportedPrimitiveDescriptors.size(); i++) {
bool hasAny = true;
auto &primDescInfo = supportedPrimitiveDescriptors[i];
if (primDescInfo.getImplementationType() != impl_desc_type::unknown ||
primDescInfo.getConfig().inConfs[0].inPlace < 0)
continue;
hasUnknown = true;
for (auto iInfo : primDescInfo.getConfig().inConfs) {
if (iInfo.desc.getLayout() != InferenceEngine::Layout::ANY) {
hasAny = false;
break;
}
}
if (hasAny) {
for (auto oInfo : primDescInfo.getConfig().outConfs) {
if (oInfo.desc.getLayout() != InferenceEngine::Layout::ANY) {
hasAny = false;
break;
}
}
}
if (!hasAny) {
canSelectPrimitive.push_back(i);
}
}
bool hasDoubleConnection = false;
for (int i = 0; i < getParentEdges().size(); i++) {
for (int j = i + 1; j < getParentEdges().size(); j++) {
if (getParentEdgeAt(i) == getParentEdgeAt(j)) hasDoubleConnection = true;
}
}
if (hasDoubleConnection) {
// The double connection marks that some tensor should
// be replicated. Inplace approach is not applicable
// for that case. Descriptor with index 0 is pure copy
// implementation
selectPrimitiveDescriptorByIndex(0);
return;
}
bool canOptimize = true;
for (size_t i = 0; canOptimize && i < getParentEdges().size(); i++) {
const auto& parent = getParentEdgeAt(i)->getParent();
for (size_t j = 0; canOptimize && j < parent->getChildEdges().size(); j++) {
const auto& child = parent->getChildEdgeAt(j)->getChild();
const auto* childConcat = dynamic_cast<MKLDNNConcatNode *>(child.get());
if (!childConcat || childConcat == this)
continue;
if (childConcat->isOptimized())
canOptimize = false;
// A double connection means the same tensor feeds more than one input, so it
// has to be replicated. The in-place approach is not applicable in that case.
for (int i = 0; i < getParentEdges().size(); i++) {
for (int j = i + 1; j < getParentEdges().size(); j++) {
if (getParentEdgeAt(i) == getParentEdgeAt(j)) canOptimize = false;
}
}
if (hasUnknown && axis == 1) {
if (canSelectPrimitive.size() == 1) {
selectPrimitiveDescriptorByIndex(static_cast<int>(canSelectPrimitive[0]));
return;
}
} else {
if (axis != channelAxis) {
canOptimize = false;
}
@@ -432,44 +248,57 @@ void MKLDNNConcatNode::selectOptimalPrimitiveDescriptor() {
}
size_t maxCount = 0;
auto convertTo = PartialBlkDesc::makePlain(getChildEdgeAt(0)->getDims().ToSizeVector());
auto outDims = getChildEdgeAt(0)->getDims().ToSizeVector();
auto convertTo = PartialBlkDesc::makePlain(outDims);
for (auto &it : formatFrequency) {
if (it.second > maxCount) {
maxCount = it.second;
convertTo = it.first;
} else if (it.second == maxCount) {
if (isInQuantizedGraph && it.first == PartialBlkDesc::makeTailC(outDims)) {
convertTo = it.first;
} else if (it.first == PartialBlkDesc::makeCBlocked(outDims, 8) || it.first == PartialBlkDesc::makeCBlocked(outDims, 16)) {
convertTo = it.first;
}
}
}
if (canOptimize && convertTo.isAutoExtendedWith(getChildEdgeAt(0)->getDims().ToSizeVector()))
convertTo = PartialBlkDesc::makePlain(getChildEdgeAt(0)->getDims().ToSizeVector());
for (size_t i = 0; canOptimize && i < getParentEdges().size(); i++) {
if (convertTo.isAutoExtendedWith(outDims))
convertTo = PartialBlkDesc::makePlain(outDims);
for (size_t i = 0; i < getParentEdges().size(); i++) {
if (convertTo.isAutoExtendedWith(getParentEdgeAt(i)->getDims().ToSizeVector()))
convertTo = PartialBlkDesc::makePlain(getChildEdgeAt(0)->getDims().ToSizeVector());
convertTo = PartialBlkDesc::makePlain(outDims);
}
for (auto supportedPdIndex : canSelectPrimitive) {
if (PartialBlkDesc::extractFrom(supportedPrimitiveDescriptors[supportedPdIndex].getConfig().inConfs[0].desc) == convertTo) {
selectPrimitiveDescriptorByIndex(static_cast<int>(supportedPdIndex));
for (size_t i = 0; i < supportedPrimitiveDescriptors.size(); ++i) {
if (PartialBlkDesc::extractFrom(supportedPrimitiveDescriptors[i].getConfig().outConfs[0].desc) == convertTo) {
if (IMPLICATION(supportedPrimitiveDescriptors[i].getImplementationType() == impl_desc_type::unknown, canOptimize)) {
canSelectPrimitive.push_back(i);
}
}
}
if (canSelectPrimitive.size() == 1) {
selectPrimitiveDescriptorByIndex(static_cast<int>(canSelectPrimitive[0]));
return;
}
// if there is more than one PD with similar data layouts, select the optimized one
for (auto indx : canSelectPrimitive) {
if (supportedPrimitiveDescriptors[indx].getImplementationType() == impl_desc_type::unknown) {
selectPrimitiveDescriptorByIndex(static_cast<int>(indx));
return;
}
}
// if there are no matching data layouts, select first optimized implementation
for (size_t i = 0; i < supportedPrimitiveDescriptors.size(); i++) {
auto &primDescInfo = supportedPrimitiveDescriptors[i];
if (primDescInfo.getImplementationType() == impl_desc_type::unknown)
continue;
if (convertTo == PartialBlkDesc::extractFrom(supportedPrimitiveDescriptors[i].getConfig().outConfs[0].desc)) {
size_t num = 0;
for (num = 0; num < getParentEdges().size(); num++) {
if (convertTo.isAutoExtendedWith(getParentEdgeAt(num)->getDims().ToSizeVector()))
break;
}
if (num == getParentEdges().size()) {
selectPrimitiveDescriptorByIndex(i);
return;
}
if (canOptimize && supportedPrimitiveDescriptors[i].getImplementationType() == impl_desc_type::unknown) {
selectPrimitiveDescriptorByIndex(static_cast<int>(i));
return;
}
}
selectPrimitiveDescriptorByIndex(0);
}
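The selection above counts how often each partial blocked descriptor occurs among the neighbouring edges, keeps the most frequent one, and on a tie prefers tail-C (nspc) in quantized graphs or a channel-blocked layout otherwise. A rough standalone sketch of that voting scheme, with descriptors reduced to plain strings for illustration (not the plugin's actual types):

#include <map>
#include <string>
#include <vector>

// Illustrative only: pick the layout most frequent among the neighbours, using the
// tie-break described above.
static std::string pickLayout(const std::vector<std::string>& neighbourLayouts,
                              bool isInQuantizedGraph) {
    std::map<std::string, size_t> formatFrequency;
    for (const auto& layout : neighbourLayouts)
        ++formatFrequency[layout];

    std::string convertTo = "plain";
    size_t maxCount = 0;
    for (const auto& it : formatFrequency) {
        if (it.second > maxCount) {
            maxCount = it.second;
            convertTo = it.first;
        } else if (it.second == maxCount) {
            if (isInQuantizedGraph && it.first == "tailC")
                convertTo = it.first;   // prefer nspc on ties in int8 graphs
            else if (it.first == "blocked8" || it.first == "blocked16")
                convertTo = it.first;   // otherwise prefer channel-blocked layouts
        }
    }
    return convertTo;
}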
@@ -491,6 +320,12 @@ void MKLDNNConcatNode::createPrimitive() {
if (getSelectedPrimitiveDescriptor() == nullptr)
IE_THROW() << "Preferable primitive descriptor is not set.";
// check if the selected tensor descriptor has the nspc layout and the concat axis is C
if (axis == channelAxis && getChildEdgeAt(0)->getMemory().GetDesc().isTailCFormat()) {
canOptimizeNspc = true;
return;
}
std::vector<memory::desc> srcs_d;
for (size_t i = 0; i < getParentEdges().size(); i++) {
@@ -540,7 +375,7 @@ void MKLDNNConcatNode::initOptimalPrimitiveDescriptor() {
if (!isInitConfig(config)) {
for (size_t i = 0; i < config.inConfs.size(); i++) {
config.inConfs[i].desc = getConfiguredInputDesc(config, i);
// MKLDNN doesn't support different precision on inputs
// Concat doesn't support different precision on inputs
config.inConfs[i].desc.setPrecision(inputPrecision);
}
@@ -560,8 +395,7 @@ void MKLDNNConcatNode::initOptimalPrimitiveDescriptor() {
return;
for (size_t i = 0; i < config.outConfs.size(); i++) {
if (config.outConfs[i].desc.getLayout() == InferenceEngine::Layout::ANY ||
!isUninitTensorDesc(config.outConfs[i].desc))
if (!isUninitTensorDesc(config.outConfs[i].desc))
continue;
int num = getChildEdgeAt(i)->getOutputNum();
@@ -621,49 +455,53 @@ void MKLDNNConcatNode::execute(mkldnn::stream strm) {
return;
}
const MKLDNNMemory& dst_memory = getChildEdgeAt(0)->getMemory();
const mkldnn::memory::data_type data_type = dst_memory.GetDataType();
const size_t num_src = getParentEdges().size();
const bool isInt8 = (data_type == mkldnn_s8 || data_type == mkldnn_u8);
if (isInt8) {
uint8_t* dst_ptr = reinterpret_cast<uint8_t*>(dst_memory.GetData());
std::vector<size_t> channels;
size_t channels_size = 0;
std::vector<const uint8_t*> src_ptrs;
std::vector<uint8_t*> dst_ptrs;
for (size_t i = 0; i < num_src; i++) {
const MKLDNNMemory& src_mem = getParentEdgeAt(i)->getMemory();
const size_t num_channels = src_mem.GetDims()[1];
channels.push_back(num_channels);
src_ptrs.push_back(reinterpret_cast<const uint8_t*>(src_mem.GetData()));
dst_ptrs.push_back(dst_ptr + channels_size);
channels_size += num_channels;
}
const size_t iter_count = getParentEdgeAt(0)->getMemory().GetSize() / channels[0];
parallel_for(iter_count, [&](int i) {
const size_t dst_off = i * channels_size;
for (int j = 0; j < num_src; j++) {
cpu_memcpy(dst_ptrs[j] + dst_off, src_ptrs[j] + i * channels[j], channels[j]);
}
});
} else {
std::unordered_map<int, memory> mem_ags {{DNNL_ARG_DST, dst_memory.GetPrimitive()}};
for (int i = 0; i < num_src; i++)
mem_ags[DNNL_ARG_MULTIPLE_SRC + i] = getParentEdgeAt(i)->getMemory().GetPrimitive();
(*prim).execute(strm, mem_ags);
if (canOptimizeNspc) {
execNspcSpecCase();
return;
}
const MKLDNNMemory& dst_memory = getChildEdgeAt(0)->getMemory();
const size_t num_src = getParentEdges().size();
std::unordered_map<int, memory> mem_ags {{DNNL_ARG_DST, dst_memory.GetPrimitive()}};
for (int i = 0; i < num_src; i++)
mem_ags[DNNL_ARG_MULTIPLE_SRC + i] = getParentEdgeAt(i)->getMemory().GetPrimitive();
(*prim).execute(strm, mem_ags);
}
InferenceEngine::Precision MKLDNNConcatNode::getRuntimePrecision() const {
return MKLDNNExtensionUtils::getMaxPrecision(getInputPrecisions());
}
void MKLDNNConcatNode::execNspcSpecCase() {
const MKLDNNMemory& dst_memory = getChildEdgeAt(0)->getMemory();
const size_t num_src = getParentEdges().size();
uint8_t* dst_ptr = reinterpret_cast<uint8_t*>(dst_memory.GetData());
const size_t dataSize = MKLDNNExtensionUtils::sizeOfDataType(dst_memory.GetDataType());
std::vector<size_t> channelsDataSize;
size_t channels_size = 0;
std::vector<const uint8_t*> src_ptrs;
std::vector<uint8_t*> dst_ptrs;
for (size_t i = 0; i < num_src; i++) {
const MKLDNNMemory& src_mem = getParentEdgeAt(i)->getMemory();
const size_t num_channels = src_mem.GetDims()[channelAxis];
channelsDataSize.push_back(num_channels * dataSize);
src_ptrs.push_back(reinterpret_cast<const uint8_t*>(src_mem.GetData()));
dst_ptrs.push_back(dst_ptr + channels_size);
channels_size += num_channels * dataSize;
}
const size_t iter_count = getParentEdgeAt(0)->getMemory().GetSize() / channelsDataSize[0];
parallel_for(iter_count, [&](int i) {
const size_t dst_off = i * channels_size;
for (int j = 0; j < num_src; j++) {
cpu_memcpy(dst_ptrs[j] + dst_off, src_ptrs[j] + i * channelsDataSize[j], channelsDataSize[j]);
}
});
}
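execNspcSpecCase boils down to: for every outer position (batch times spatial point) copy each input's contiguous channel slab into its offset inside the output row. A simplified standalone sketch of the same copy pattern for float channels-last buffers, using plain std::memcpy and sequential loops instead of cpu_memcpy / parallel_for:

#include <cstddef>
#include <cstring>
#include <vector>

// Illustrative only: concat along C for channels-last (NHWC-style) float buffers.
// srcs[i] holds outerCount * srcChannels[i] contiguous values.
static void concatChannelsLast(const std::vector<const float*>& srcs,
                               const std::vector<size_t>& srcChannels,
                               size_t outerCount,  // N * D * H * W
                               float* dst) {
    size_t totalChannels = 0;
    std::vector<size_t> dstOffsets;
    for (size_t c : srcChannels) {
        dstOffsets.push_back(totalChannels);
        totalChannels += c;
    }
    for (size_t i = 0; i < outerCount; ++i) {
        for (size_t j = 0; j < srcs.size(); ++j) {
            std::memcpy(dst + i * totalChannels + dstOffsets[j],
                        srcs[j] + i * srcChannels[j],
                        srcChannels[j] * sizeof(float));
        }
    }
}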
REG_MKLDNN_PRIM_FOR(MKLDNNConcatNode, Concatenation);

View File

@@ -30,8 +30,10 @@ public:
private:
size_t axis = 0;
bool canOptimizeNspc = false;
size_t inverseOrder(const InferenceEngine::SizeVector& order, size_t axis);
void execNspcSpecCase();
InferenceEngine::Precision inputPrecision = InferenceEngine::Precision::FP32;
InferenceEngine::Precision outputPrecision = InferenceEngine::Precision::FP32;

View File

@@ -94,11 +94,15 @@ void MKLDNNShuffleChannelsNode::initSupportedPrimitiveDescriptors() {
impl_type = impl_desc_type::ref;
}
addSupportedPrimDesc({{TensorDescCreatorTypes::nspc, precision}},
{{TensorDescCreatorTypes::nspc, precision}},
// use ncsp as default for non-quantized networks and nspc for quantized
auto firstCreatorType = isInQuantizedGraph ? TensorDescCreatorTypes::nspc : TensorDescCreatorTypes::ncsp;
auto secondCreatorType = isInQuantizedGraph ? TensorDescCreatorTypes::ncsp : TensorDescCreatorTypes::nspc;
addSupportedPrimDesc({{firstCreatorType, precision}},
{{firstCreatorType, precision}},
impl_type, supportDynamicBatch_);
addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, precision}},
{{TensorDescCreatorTypes::ncsp, precision}},
addSupportedPrimDesc({{secondCreatorType, precision}},
{{secondCreatorType, precision}},
impl_type, supportDynamicBatch_);
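The ordering above, reduced to a sketch (the enum and helper are illustrative, not the plugin's API): registering the preferred layout first effectively biases the default descriptor selection, so quantized graphs put nspc ahead of ncsp and non-quantized graphs do the opposite.

#include <array>

enum class LayoutKind { ncsp, nspc };

// Illustrative only: the order in which layout creators are registered; the
// first entry is the one preferred when several descriptors match.
static std::array<LayoutKind, 2> creatorOrder(bool isInQuantizedGraph) {
    return isInQuantizedGraph ? std::array<LayoutKind, 2>{LayoutKind::nspc, LayoutKind::ncsp}
                              : std::array<LayoutKind, 2>{LayoutKind::ncsp, LayoutKind::nspc};
}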
// canUseBlocked
if (axis_ != 1) {

View File

@@ -0,0 +1,214 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "ngraph_functions/builders.hpp"
#include "test_utils/cpu_test_utils.hpp"
using namespace InferenceEngine;
using namespace CPUTestUtils;
namespace CPULayerTestsDefinitions {
typedef std::tuple<
size_t, // Concat axis
std::vector<std::vector<size_t>>, // Input shapes
InferenceEngine::Precision, // Network precision
std::string, // Device name
CPUSpecificParams
> concatCPUTestParams;
class ConcatLayerCPUTest : public testing::WithParamInterface<concatCPUTestParams>,
virtual public LayerTestsUtils::LayerTestsCommon, public CPUTestsBase {
public:
static std::string getTestCaseName(testing::TestParamInfo<concatCPUTestParams> obj) {
int axis;
std::vector<std::vector<size_t>> inputShapes;
InferenceEngine::Precision netPrecision;
std::string targetName;
CPUSpecificParams cpuParams;
std::tie(axis, inputShapes, netPrecision, targetName, cpuParams) = obj.param;
std::ostringstream result;
result << "IS=" << CommonTestUtils::vec2str(inputShapes) << "_";
result << "axis=" << axis << "_";
result << "netPRC=" << netPrecision.name() << "_";
result << "trgDev=" << targetName << "_";
result << CPUTestsBase::getTestCaseName(cpuParams);
return result.str();
}
protected:
void SetUp() override {
int axis;
std::vector<std::vector<size_t>> inputShape;
InferenceEngine::Precision netPrecision;
CPUSpecificParams cpuParams;
std::tie(axis, inputShape, netPrecision, targetDevice, cpuParams) = this->GetParam();
inPrc = outPrc = netPrecision;
std::tie(inFmts, outFmts, priority, selectedType) = cpuParams;
selectedType += std::string("_") + inPrc.name();
auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
auto params = ngraph::builder::makeParams(ngPrc, inputShape);
auto paramOuts = ngraph::helpers::convert2OutputVector(
ngraph::helpers::castOps2Nodes<ngraph::op::Parameter>(params));
auto concat = std::make_shared<ngraph::opset1::Concat>(paramOuts, axis);
function = makeNgraphFunction(ngPrc, params, concat, "concat");
}
};
TEST_P(ConcatLayerCPUTest, CompareWithRefs) {
SKIP_IF_CURRENT_TEST_IS_DISABLED()
Run();
CheckPluginRelatedResults(executableNetwork, "Concatenation");
}
namespace {
const auto planar_4D_ref = CPUSpecificParams{{nchw}, {nchw}, {"ref"}, "ref"};
const auto planar_5D_ref = CPUSpecificParams{{ncdhw}, {ncdhw}, {"ref"}, "ref"};
const auto planar_4D = CPUSpecificParams{{nchw}, {nchw}, {}, "unknown"};
const auto planar_5D = CPUSpecificParams{{ncdhw}, {ncdhw}, {}, "unknown"};
const auto planarChannels_4D = CPUSpecificParams{{nhwc}, {nhwc}, {}, "ref"};
const auto planarChannels_5D = CPUSpecificParams{{ndhwc}, {ndhwc}, {}, "ref"};
const auto blocked8_4D = CPUSpecificParams{{nChw8c}, {nChw8c}, {}, "unknown"};
const auto blocked8_5D = CPUSpecificParams{{nCdhw8c}, {nCdhw8c}, {}, "unknown"};
const auto blocked8_4D_ref = CPUSpecificParams{{nChw8c}, {nChw8c}, {}, "ref"};
const auto blocked8_5D_ref = CPUSpecificParams{{nCdhw8c}, {nCdhw8c}, {}, "ref"};
const auto blocked16_4D = CPUSpecificParams{{nChw16c}, {nChw16c}, {}, "unknown"};
const auto blocked16_5D = CPUSpecificParams{{nCdhw16c}, {nCdhw16c}, {}, "unknown"};
const auto blocked16_4D_ref = CPUSpecificParams{{nChw16c}, {nChw16c}, {}, "ref"};
const auto blocked16_5D_ref = CPUSpecificParams{{nCdhw16c}, {nCdhw16c}, {}, "ref"};
// List of precisions natively supported by mkldnn.
const std::vector<Precision> netPrecisions = {
Precision::I8,
Precision::I32,
Precision::FP32,
Precision::BF16
};
INSTANTIATE_TEST_CASE_P(concat_Concat4D_CPU_Block8inPlace, ConcatLayerCPUTest,
::testing::Combine(
::testing::Values(1),
::testing::Values(std::vector<std::vector<size_t>>{{1, 8, 3, 5},
{1, 16, 3, 5}}),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(planar_4D, planarChannels_4D, blocked8_4D)),
ConcatLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_Concat4D_CPU_Block8, ConcatLayerCPUTest,
::testing::Combine(
::testing::Values(0, 2, 3),
::testing::Values(std::vector<std::vector<size_t>>{{2, 16, 3, 5},
{2, 16, 3, 5}}),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(planar_4D_ref, planarChannels_4D, blocked8_4D_ref)),
ConcatLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_Concat4D_CPU_Block16inPlace, ConcatLayerCPUTest,
::testing::Combine(
::testing::Values(1),
::testing::Values(std::vector<std::vector<size_t>>{{2, 16, 3, 5},
{2, 32, 3, 5}}),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(blocked16_4D)),
ConcatLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_Concat4D_CPU_Block16, ConcatLayerCPUTest,
::testing::Combine(
::testing::Values(0, 2, 3),
::testing::Values(std::vector<std::vector<size_t>>{{2, 32, 3, 5},
{2, 32, 3, 5}}),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(blocked16_4D_ref)),
ConcatLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(concat_Concat5D_CPU_Block8inPlace, ConcatLayerCPUTest,
::testing::Combine(
::testing::Values(1),
::testing::Values(std::vector<std::vector<size_t>>{{1, 8, 3, 5, 7},
{1, 16, 3, 5, 7}}),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(planar_5D, planarChannels_5D, blocked8_5D)),
ConcatLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_Concat5D_CPU_Block8, ConcatLayerCPUTest,
::testing::Combine(
::testing::Values(0, 2, 3, 4),
::testing::Values(std::vector<std::vector<size_t>>{{2, 16, 3, 5, 7},
{2, 16, 3, 5, 7}}),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(planar_5D_ref, planarChannels_5D, blocked8_5D_ref)),
ConcatLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_Concat5D_CPU_Block16inPlace, ConcatLayerCPUTest,
::testing::Combine(
::testing::Values(1),
::testing::Values(std::vector<std::vector<size_t>>{{2, 16, 3, 5, 7},
{2, 32, 3, 5, 7}}),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(blocked16_5D)),
ConcatLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_Concat5D_CPU_Block16, ConcatLayerCPUTest,
::testing::Combine(
::testing::Values(0, 2, 3, 4),
::testing::Values(std::vector<std::vector<size_t>>{{2, 32, 3, 5, 7},
{2, 32, 3, 5, 7}}),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(blocked16_5D_ref)),
ConcatLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_Concat_inPlace, ConcatLayerCPUTest,
::testing::Combine(
::testing::Values(1),
::testing::Values(std::vector<std::vector<size_t>>{{2, 3, 5},
{2, 4, 5}},
std::vector<std::vector<size_t>>{{2, 3},
{2, 4}}),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(CPUSpecificParams{{}, {}, {}, "unknown"})),
ConcatLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_Concat3D, ConcatLayerCPUTest,
::testing::Combine(
::testing::Values(0, 2),
::testing::Values(std::vector<std::vector<size_t>>{{2, 4, 5},
{2, 4, 5}}),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(CPUSpecificParams{{}, {}, {}, "ref"})),
ConcatLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_Concat_1D_2D, ConcatLayerCPUTest,
::testing::Combine(
::testing::Values(0),
::testing::Values(std::vector<std::vector<size_t>>{{2, 4},
{3, 4}},
std::vector<std::vector<size_t>>{{2}, {3}}),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(CPUSpecificParams{{}, {}, {}, "ref"})),
ConcatLayerCPUTest::getTestCaseName);
} // namespace
} // namespace CPULayerTestsDefinitions

View File

@@ -222,6 +222,7 @@ void FuseTransposeAndReorderTest2::CreateGraph() {
transpose2->get_rt_info() = makeCPUInfo({memFmt2}, {memFmt2}, {});
auto concat = ngraph::builder::makeConcat({transpose1, transpose2}, 1);
concat->get_rt_info() = makeCPUInfo({memFmt1, memFmt1}, {memFmt1}, {});
ngraph::ResultVector results{std::make_shared<ngraph::opset5::Result>(concat)};
function = std::make_shared<ngraph::Function>(results, params, "Transpose_Transpose_Concat");