[CPU] Extend Concat node logic to avoid fallback on slow ref implementation. (#4129)
This commit is contained in:
parent 315c8d4eec
commit 7fb9bac24a
@@ -134,6 +134,16 @@ PartialBlkDesc PartialBlkDesc::makeCBlocked(const InferenceEngine::SizeVector &d
return res;
}

PartialBlkDesc PartialBlkDesc::makeTailC(const InferenceEngine::SizeVector &dims) {
PartialBlkDesc res = makePlain(dims);
if (dims.size() > 2) {
auto itr = res.outer_order.begin() + 1;
std::rotate(itr, itr + 1, res.outer_order.end());
}
return res;
}

PartialBlkDesc PartialBlkDesc::extractFrom(const InferenceEngine::TensorDesc &desc) {
if (desc.getLayout() == InferenceEngine::ANY)
IE_THROW() << "Cannot extract partial blocked descriptor for `ANY` layout";
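Not part of the diff: a minimal, self-contained sketch of what the std::rotate in makeTailC does to the outer dimension order. Only the vector name mirrors the member used above; everything else is illustrative.

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    // Plain 4D order as makePlain() would produce it: {0, 1, 2, 3} (NCHW-like).
    std::vector<size_t> outer_order = {0, 1, 2, 3};
    auto itr = outer_order.begin() + 1;              // points at the channel dimension
    std::rotate(itr, itr + 1, outer_order.end());    // move the channel dim to the tail
    for (size_t d : outer_order)
        std::printf("%zu ", d);                      // prints: 0 2 3 1  (tail-C, i.e. NHWC-like)
    return 0;
}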
@@ -59,6 +59,9 @@ public:
/** Construct blocked Channel PartialBlkDesc based on dims information */
static PartialBlkDesc makeCBlocked(const InferenceEngine::SizeVector &dims, size_t block_size);

/** Construct per Channel PartialBlkDesc based on dims information */
static PartialBlkDesc makeTailC(const InferenceEngine::SizeVector &dims);

/** Compare operators. Allow to use it as key for std::map */
bool operator == (const PartialBlkDesc& it) const;
bool operator < (const PartialBlkDesc& it) const;
@@ -21,11 +21,15 @@
#include "mkldnn_eltwise_node.h"
#include <limits>
#include "common/cpu_memcpy.h"
#include "common/tensor_desc_creator.h"

using namespace mkldnn;
using namespace MKLDNNPlugin;
using namespace InferenceEngine;

namespace {
constexpr size_t channelAxis = 1lu;
}

bool MKLDNNConcatNode::isSupportedOperation(const std::shared_ptr<const ngraph::Node>& op, std::string& errorMessage) noexcept {
try {
@@ -89,308 +93,120 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() {
}
}

// MKLDNN doesn't support different precision on inputs so fallback on FP32 in such case
// Concat doesn't support different precision on inputs so fallback on FP32 in such case
if (isMixedPrecision)
inputPrecision = Precision::FP32;

// Concat node supports int8 implementations only for NHWC and NDHWC layouts
if (inputPrecision == Precision::U8 || inputPrecision == Precision::I8) {
int ndims = getChildEdgeAt(0)->getDims().ndims();
if (ndims != 2 && ndims != 4 && ndims != 5)
inputPrecision = Precision::FP32;
}

// MKLDNN supports only equal precisions for inputs and output
// Concat supports only equal precisions for inputs and output
outputPrecision = inputPrecision;

auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(inputPrecision);
auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(outputPrecision);
auto& dstDims = getChildEdgeAt(0)->getDims();
std::vector<TensorDescCreatorTypes> tdCreatorTypes = {TensorDescCreatorTypes::ncsp, TensorDescCreatorTypes::nspc};

MKLDNNDims dstDims = getChildEdgeAt(0)->getDims();
InferenceEngine::LayerConfig config;
config.dynBatchSupport = true;
// check if blocked layouts are available; the channels size should be evenly divisible by the block size to avoid the slow oneDNN ref implementation
if (dstDims.ndims() > channelAxis) {
for (auto item : { std::make_pair(8lu, TensorDescCreatorTypes::nCsp8c), std::make_pair(16lu, TensorDescCreatorTypes::nCsp16c)}) {
SizeVector blkDims = dstDims.ToSizeVector();
if (blkDims[channelAxis] % item.first)
continue;

for (size_t i = 0; i < getParentEdges().size(); i++) {
auto parentEdge = getParentEdgeAt(i);

InferenceEngine::DataConfig dataConfig;
dataConfig.inPlace = -1;
dataConfig.constant = false;
auto fmt = (inputPrecision == Precision::U8 || inputPrecision == Precision::I8) ? parentEdge->getDims().ndims() == 2 ? memory::format_tag::nc :
parentEdge->getDims().ndims() == 4 ? memory::format_tag::nhwc :
memory::format_tag::ndhwc
: memory::format_tag::any;

dataConfig.desc = MKLDNNExtensionUtils::getUninitTensorDesc(MKLDNNMemoryDesc(parentEdge->getDims(), inputDataType, fmt));
config.inConfs.push_back(dataConfig);
}

auto dims = getChildEdgeAt(0)->getDims();

config.outConfs.resize(1);
config.outConfs[0].inPlace = -1;
config.outConfs[0].constant = false;
if ((!isMixedPrecision && outputPrecision != Precision::U8 && outputPrecision != Precision::I8) || axis != 1) {
auto fmt = (inputPrecision == Precision::U8 || inputPrecision == Precision::I8) ? dims.ndims() == 2 ? memory::format_tag::nc :
dims.ndims() == 4 ? memory::format_tag::nhwc :
memory::format_tag::ndhwc
: MKLDNNMemory::GetPlainFormat(dims);

config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(MKLDNNMemoryDesc(dims, outputDataType, fmt));
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, fmt);

if (inputPrecision != Precision::U8 && inputPrecision != Precision::I8) {
if (dims.ndims() == 4) {
if (dims[1] % 8 == 0) {
config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(
MKLDNNMemoryDesc(dims, outputDataType, memory::format_tag::nChw8c));
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, memory::format_tag::nChw8c);

if (dims[1] % 16 == 0) {
config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(
MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::format_tag::nChw16c));
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, mkldnn::memory::format_tag::nChw16c);
}
}
} else if (dims.ndims() == 5) {
if (dims[1] % 8 == 0) {
config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(
MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::format_tag::nCdhw8c));
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, mkldnn::memory::format_tag::nCdhw8c);

if (dims[1] % 16 == 0) {
config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(
MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::format_tag::nCdhw16c));
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, mkldnn::memory::format_tag::nCdhw16c);
}
bool blocked = true;
for (size_t i = 0; i < getParentEdges().size(); i++) {
auto& srcDims = getParentEdgeAt(i)->getDims();
if (srcDims[channelAxis] % item.first) {
blocked = false;
break;
}
}
if (blocked) {
tdCreatorTypes.push_back(item.second);
}
}
}

if (axis != 1)
std::vector<size_t> pdIndexesToReuse;

auto& creatorsMap = TensorDescCreator::getCommonCreators();
auto itrRange = TensorDescCreator::makeFilteredRange(creatorsMap, static_cast<unsigned>(dstDims.ndims()), tdCreatorTypes);
for (auto itr = itrRange.first; itr != itrRange.second; ++itr) {
InferenceEngine::LayerConfig config;

config.dynBatchSupport = true;
config.outConfs.resize(1);
config.outConfs[0].inPlace = -1;
config.outConfs[0].constant = false;
config.outConfs[0].desc = itr->second->createDesc(outputPrecision, dstDims.ToSizeVector());
memory::format_tag outFmt = MKLDNNMemoryDesc(config.outConfs[0].desc).getFormat();

config.inConfs.resize(getParentEdges().size());

for (size_t i = 0; i < getParentEdges().size(); ++i) {
config.inConfs[i].inPlace = -1;
config.inConfs[i].constant = false;
config.inConfs[i].desc = MKLDNNExtensionUtils::getUninitTensorDesc(
itr->second->createDesc(inputPrecision, getParentEdgeAt(i)->getDims().ToSizeVector()));
}
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, outFmt);
if (itr->first != TensorDescCreatorTypes::nspc) {
pdIndexesToReuse.push_back(supportedPrimitiveDescriptors.size() - 1);
}
}

if (axis != channelAxis)
return;

auto numOfDim = static_cast<size_t>(dstDims.ndims());
// Optimized inplace case

SizeVector order(numOfDim);
SizeVector offsets(numOfDim, 0lu);
size_t offset = (std::numeric_limits<size_t>::max)();
for (size_t i = 0; i < numOfDim; i++) {
order[i] = i;
}
for (auto refPdIndex : pdIndexesToReuse) {
const auto& refConfig = supportedPrimitiveDescriptors[refPdIndex].getConfig();
auto config = refConfig;

if (outputPrecision == Precision::I8 || outputPrecision == Precision::U8) {
if (numOfDim == 4) {
// Here we assume NHWC layout (channels are the last)
const auto& order = refConfig.outConfs[0].desc.getBlockingDesc().getOrder();
const auto& blkDims = refConfig.outConfs[0].desc.getBlockingDesc().getBlockDims();
auto numOfDim = blkDims.size();

order = {0, 2, 3, 1};
offsets = {0, 0, 0, 0};
SizeVector offsets(numOfDim, 0lu);
SizeVector strides(numOfDim);
strides.back() = 1lu;
size_t offset = (std::numeric_limits<size_t>::max)();

SizeVector blkDims = dstDims.ToSizeVector();
blkDims = { blkDims[0], blkDims[2], blkDims[3], blkDims[1] };

SizeVector strides(numOfDim);
strides.resize(numOfDim);
// C is the last in NHWC, so all strides are max()
for (size_t i = 0; i < numOfDim; i++) {
strides[i] = (std::numeric_limits<size_t>::max)();
}

config.outConfs[0].desc = TensorDesc(outputPrecision,
dstDims.ToSizeVector(),
{ blkDims, order, offset, offsets, strides });
for (size_t i = 0; i < getParentEdges().size(); i++) {
auto parentEdge = getParentEdgeAt(i);

SizeVector blkDims = parentEdge->getDims().ToSizeVector();
blkDims = { blkDims[0], blkDims[2], blkDims[3], blkDims[1] };

config.inConfs[i].inPlace = -1; // Change to 0 here if inplace concat is supported for NHWC in mkldnn

config.inConfs[i].desc = TensorDesc(inputPrecision, parentEdge->getDims().ToSizeVector(),
{blkDims, order, offset, offsets, strides});
}

supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, mkldnn::memory::format_tag::nhwc);

return;
} else if (numOfDim == 5) {
// Here we assume NDHWC layout (channels are the last)

order = {0, 2, 3, 4, 1};
offsets = {0, 0, 0, 0, 0};

SizeVector blkDims = dstDims.ToSizeVector();
blkDims = { blkDims[0], blkDims[2], blkDims[3], blkDims[4], blkDims[1] };

SizeVector strides(numOfDim);
strides.resize(numOfDim);
// C is the last in NDHWC, so all strides are max()
for (size_t i = 0; i < numOfDim; i++) {
strides[i] = (std::numeric_limits<size_t>::max)();
}

config.outConfs[0].desc = TensorDesc(outputPrecision,
dstDims.ToSizeVector(),
{ blkDims, order, offset, offsets, strides });
for (size_t i = 0; i < getParentEdges().size(); i++) {
auto parentEdge = getParentEdgeAt(i);

SizeVector blkDims = parentEdge->getDims().ToSizeVector();
blkDims = { blkDims[0], blkDims[2], blkDims[3], blkDims[4], blkDims[1] };

config.inConfs[i].inPlace = -1; // Change to 0 here if inplace concat is supported for NDHWC in mkldnn

config.inConfs[i].desc = TensorDesc(inputPrecision, parentEdge->getDims().ToSizeVector(),
{blkDims, order, offset, offsets, strides});
}

supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, mkldnn::memory::format_tag::ndhwc);

return;
}
}

SizeVector strides(numOfDim);
strides[numOfDim - 1] = 1;
for (size_t i = 2; i <= numOfDim; i++) {
if (numOfDim - i < axis) {
strides[numOfDim - i] = (std::numeric_limits<size_t>::max)();
} else {
strides[numOfDim - i] = strides[numOfDim - i + 1] * dstDims[numOfDim - i + 1];
}
}

config.outConfs[0].desc = TensorDesc(
MKLDNNExtensionUtils::DataTypeToIEPrecision(outputDataType),
dstDims.ToSizeVector(),
{dstDims.ToSizeVector(), order, offset, offsets, strides});
for (size_t i = 0; i < getParentEdges().size(); i++) {
auto parentEdge = getParentEdgeAt(i);
config.inConfs[i].inPlace = 0;
config.inConfs[i].desc = TensorDesc(MKLDNNExtensionUtils::DataTypeToIEPrecision(inputDataType), parentEdge->getDims().ToSizeVector(),
{parentEdge->getDims().ToSizeVector(), order, offset, offsets, strides});
}

supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown, MKLDNNMemory::Convert(config.outConfs[0].desc.getLayout()));

if (numOfDim == 4lu || numOfDim == 5lu) {
size_t blkDimsLen = numOfDim + 1;
order.resize(blkDimsLen);
for (size_t i = 0; i < numOfDim; i++) {
order[i] = i;
}
order[numOfDim] = 1lu;
offsets = SizeVector(blkDimsLen, 0lu);

// nChw8c, nChw16c, nCdhw8c, nCdhw16c
for (size_t sizeS : {8lu, 16lu}) {
SizeVector blkDims = dstDims.ToSizeVector();
if (blkDims[1] % sizeS)
continue;
blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 1lu : 0lu);
blkDims.push_back(sizeS);

strides.resize(blkDimsLen);
strides[blkDimsLen - 1] = 1;
for (size_t i = 2lu; i <= blkDimsLen; i++) {
if (blkDimsLen - i < axis) {
strides[blkDimsLen - i] = (std::numeric_limits<size_t>::max)();
} else {
strides[blkDimsLen - i] = strides[blkDimsLen - i + 1] * blkDims[blkDimsLen - i + 1];
}
}
config.outConfs[0].desc = TensorDesc(
MKLDNNExtensionUtils::DataTypeToIEPrecision(outputDataType),
dstDims.ToSizeVector(), {blkDims, order, offset, offsets, strides});

bool canInplace = true;
for (size_t i = 0lu; canInplace && i < getParentEdges().size(); i++) {
auto parentEdge = getParentEdgeAt(i);
blkDims = parentEdge->getDims().ToSizeVector();
if (blkDims[1] % sizeS)
canInplace = false;

blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 1lu : 0lu);
blkDims.push_back(sizeS);
config.inConfs[i].desc = TensorDesc(MKLDNNExtensionUtils::DataTypeToIEPrecision(inputDataType), parentEdge->getDims().ToSizeVector(),
{blkDims, order, offset, offsets, strides});
}
if (canInplace) {
auto dstFormat = numOfDim == 4lu ? sizeS == 8lu ? mkldnn::memory::format_tag::nChw8c : mkldnn::memory::format_tag::nChw16c
: sizeS == 8lu ? mkldnn::memory::format_tag::nCdhw8c : mkldnn::memory::format_tag::nCdhw16c;
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown, dstFormat);
for (size_t i = 2; i <= numOfDim; i++) {
if (numOfDim - i < axis) {
strides[numOfDim - i] = (std::numeric_limits<size_t>::max)();
} else {
strides[numOfDim - i] = strides[numOfDim - i + 1] * blkDims[numOfDim - i + 1];
}
}

config.outConfs[0].desc = TensorDesc(outputPrecision, dstDims.ToSizeVector(), {blkDims, order, offset, offsets, strides});
memory::format_tag outFmt = MKLDNNMemoryDesc(config.outConfs[0].desc).getFormat();

for (size_t i = 0; i < getParentEdges().size(); i++) {
const auto& srcBlkDims = refConfig.inConfs[i].desc.getBlockingDesc().getBlockDims();
const auto& dims = refConfig.inConfs[i].desc.getDims();

config.inConfs[i].inPlace = 0;
config.inConfs[i].desc = TensorDesc(inputPrecision, dims, {srcBlkDims, order, offset, offsets, strides});
}
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown, outFmt);
}
}

void MKLDNNConcatNode::selectOptimalPrimitiveDescriptor() {
bool hasUnknown = false;
std::vector<size_t> canSelectPrimitive;
for (size_t i = 0; i < supportedPrimitiveDescriptors.size(); i++) {
bool hasAny = true;
auto &primDescInfo = supportedPrimitiveDescriptors[i];
if (primDescInfo.getImplementationType() != impl_desc_type::unknown ||
primDescInfo.getConfig().inConfs[0].inPlace < 0)
continue;
hasUnknown = true;
for (auto iInfo : primDescInfo.getConfig().inConfs) {
if (iInfo.desc.getLayout() != InferenceEngine::Layout::ANY) {
hasAny = false;
break;
}
}

if (hasAny) {
for (auto oInfo : primDescInfo.getConfig().outConfs) {
if (oInfo.desc.getLayout() != InferenceEngine::Layout::ANY) {
hasAny = false;
break;
}
}
}

if (!hasAny) {
canSelectPrimitive.push_back(i);
}
}

bool hasDoubleConnection = false;
for (int i = 0; i < getParentEdges().size(); i++) {
for (int j = i + 1; j < getParentEdges().size(); j++) {
if (getParentEdgeAt(i) == getParentEdgeAt(j)) hasDoubleConnection = true;
}
}

if (hasDoubleConnection) {
// The double connection marks that some tensor should
// be replicated. Inplace approach is not applicable
// for that case. Descriptor with index 0 is pure copy
// implementation
selectPrimitiveDescriptorByIndex(0);
return;
}

bool canOptimize = true;
for (size_t i = 0; canOptimize && i < getParentEdges().size(); i++) {
const auto& parent = getParentEdgeAt(i)->getParent();
for (size_t j = 0; canOptimize && j < parent->getChildEdges().size(); j++) {
const auto& child = parent->getChildEdgeAt(j)->getChild();
const auto* childConcat = dynamic_cast<MKLDNNConcatNode *>(child.get());
if (!childConcat || childConcat == this)
continue;
if (childConcat->isOptimized())
canOptimize = false;

// The double connection marks that some tensor should
// be replicated. Inplace approach is not applicable
// for that case.
for (int i = 0; i < getParentEdges().size(); i++) {
for (int j = i + 1; j < getParentEdges().size(); j++) {
if (getParentEdgeAt(i) == getParentEdgeAt(j)) canOptimize = false;
}
}
if (hasUnknown && axis == 1) {
if (canSelectPrimitive.size() == 1) {
selectPrimitiveDescriptorByIndex(static_cast<int>(canSelectPrimitive[0]));
return;
}
} else {

if (axis != channelAxis) {
canOptimize = false;
}

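Not part of the diff: a small sketch of the gating rule the new code applies before offering blocked layouts, written against plain std::vector shapes rather than the plugin's MKLDNNDims/SizeVector types (the function name is hypothetical).

#include <cstddef>
#include <vector>

// A channel-blocked layout (block size 8 or 16) is only worth offering when the
// output channels and every input's channels divide evenly by the block size;
// otherwise oneDNN would fall back to its slow reference concat implementation.
bool canUseBlockedLayout(const std::vector<std::vector<std::size_t>>& srcDims,
                         const std::vector<std::size_t>& dstDims,
                         std::size_t blockSize) {
    const std::size_t channelAxis = 1;
    if (dstDims.size() <= channelAxis || dstDims[channelAxis] % blockSize != 0)
        return false;
    for (const auto& dims : srcDims) {
        if (dims.size() <= channelAxis || dims[channelAxis] % blockSize != 0)
            return false;
    }
    return true;
}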
@@ -432,44 +248,57 @@ void MKLDNNConcatNode::selectOptimalPrimitiveDescriptor() {
}

size_t maxCount = 0;
auto convertTo = PartialBlkDesc::makePlain(getChildEdgeAt(0)->getDims().ToSizeVector());
auto outDims = getChildEdgeAt(0)->getDims().ToSizeVector();
auto convertTo = PartialBlkDesc::makePlain(outDims);
for (auto &it : formatFrequency) {
if (it.second > maxCount) {
maxCount = it.second;
convertTo = it.first;
} else if (it.second == maxCount) {
if (isInQuantizedGraph && it.first == PartialBlkDesc::makeTailC(outDims)) {
convertTo = it.first;
} else if (it.first == PartialBlkDesc::makeCBlocked(outDims, 8) || it.first == PartialBlkDesc::makeCBlocked(outDims, 16)) {
convertTo = it.first;
}
}
}

if (canOptimize && convertTo.isAutoExtendedWith(getChildEdgeAt(0)->getDims().ToSizeVector()))
convertTo = PartialBlkDesc::makePlain(getChildEdgeAt(0)->getDims().ToSizeVector());
for (size_t i = 0; canOptimize && i < getParentEdges().size(); i++) {
if (convertTo.isAutoExtendedWith(outDims))
convertTo = PartialBlkDesc::makePlain(outDims);
for (size_t i = 0; i < getParentEdges().size(); i++) {
if (convertTo.isAutoExtendedWith(getParentEdgeAt(i)->getDims().ToSizeVector()))
convertTo = PartialBlkDesc::makePlain(getChildEdgeAt(0)->getDims().ToSizeVector());
convertTo = PartialBlkDesc::makePlain(outDims);
}

for (auto supportedPdIndex : canSelectPrimitive) {
if (PartialBlkDesc::extractFrom(supportedPrimitiveDescriptors[supportedPdIndex].getConfig().inConfs[0].desc) == convertTo) {
selectPrimitiveDescriptorByIndex(static_cast<int>(supportedPdIndex));
for (size_t i = 0; i < supportedPrimitiveDescriptors.size(); ++i) {
if (PartialBlkDesc::extractFrom(supportedPrimitiveDescriptors[i].getConfig().outConfs[0].desc) == convertTo) {
if (IMPLICATION(supportedPrimitiveDescriptors[i].getImplementationType() == impl_desc_type::unknown, canOptimize)) {
canSelectPrimitive.push_back(i);
}
}
}

if (canSelectPrimitive.size() == 1) {
selectPrimitiveDescriptorByIndex(static_cast<int>(canSelectPrimitive[0]));
return;
}

// if there is more than one PD with similar data layouts - select the optimized one
for (auto indx : canSelectPrimitive) {
if (supportedPrimitiveDescriptors[indx].getImplementationType() == impl_desc_type::unknown) {
selectPrimitiveDescriptorByIndex(static_cast<int>(indx));
return;
}
}

// if there are no matching data layouts, select first optimized implementation
for (size_t i = 0; i < supportedPrimitiveDescriptors.size(); i++) {
auto &primDescInfo = supportedPrimitiveDescriptors[i];
if (primDescInfo.getImplementationType() == impl_desc_type::unknown)
continue;
if (convertTo == PartialBlkDesc::extractFrom(supportedPrimitiveDescriptors[i].getConfig().outConfs[0].desc)) {
size_t num = 0;
for (num = 0; num < getParentEdges().size(); num++) {
if (convertTo.isAutoExtendedWith(getParentEdgeAt(num)->getDims().ToSizeVector()))
break;
}
if (num == getParentEdges().size()) {
selectPrimitiveDescriptorByIndex(i);
return;
}
if (canOptimize && supportedPrimitiveDescriptors[i].getImplementationType() == impl_desc_type::unknown) {
selectPrimitiveDescriptorByIndex(static_cast<int>(i));
return;
}
}

selectPrimitiveDescriptorByIndex(0);
}

@@ -491,6 +320,12 @@ void MKLDNNConcatNode::createPrimitive() {
if (getSelectedPrimitiveDescriptor() == nullptr)
IE_THROW() << "Preferable primitive descriptor is not set.";

//check if selected Tensor descriptor has nspc layout and concat axis is C
if (axis == channelAxis && getChildEdgeAt(0)->getMemory().GetDesc().isTailCFormat()) {
canOptimizeNspc = true;
return;
}

std::vector<memory::desc> srcs_d;

for (size_t i = 0; i < getParentEdges().size(); i++) {
@@ -540,7 +375,7 @@ void MKLDNNConcatNode::initOptimalPrimitiveDescriptor() {
if (!isInitConfig(config)) {
for (size_t i = 0; i < config.inConfs.size(); i++) {
config.inConfs[i].desc = getConfiguredInputDesc(config, i);
// MKLDNN doesn't support different precision on inputs
// Concat doesn't support different precision on inputs
config.inConfs[i].desc.setPrecision(inputPrecision);
}

@@ -560,8 +395,7 @@ void MKLDNNConcatNode::initOptimalPrimitiveDescriptor() {
return;

for (size_t i = 0; i < config.outConfs.size(); i++) {
if (config.outConfs[i].desc.getLayout() == InferenceEngine::Layout::ANY ||
!isUninitTensorDesc(config.outConfs[i].desc))
if (!isUninitTensorDesc(config.outConfs[i].desc))
continue;

int num = getChildEdgeAt(i)->getOutputNum();
@@ -621,49 +455,53 @@ void MKLDNNConcatNode::execute(mkldnn::stream strm) {
return;
}

const MKLDNNMemory& dst_memory = getChildEdgeAt(0)->getMemory();
const mkldnn::memory::data_type data_type = dst_memory.GetDataType();
const size_t num_src = getParentEdges().size();

const bool isInt8 = (data_type == mkldnn_s8 || data_type == mkldnn_u8);

if (isInt8) {
uint8_t* dst_ptr = reinterpret_cast<uint8_t*>(dst_memory.GetData());

std::vector<size_t> channels;
size_t channels_size = 0;
std::vector<const uint8_t*> src_ptrs;
std::vector<uint8_t*> dst_ptrs;

for (size_t i = 0; i < num_src; i++) {
const MKLDNNMemory& src_mem = getParentEdgeAt(i)->getMemory();
const size_t num_channels = src_mem.GetDims()[1];

channels.push_back(num_channels);
src_ptrs.push_back(reinterpret_cast<const uint8_t*>(src_mem.GetData()));
dst_ptrs.push_back(dst_ptr + channels_size);
channels_size += num_channels;
}

const size_t iter_count = getParentEdgeAt(0)->getMemory().GetSize() / channels[0];

parallel_for(iter_count, [&](int i) {
const size_t dst_off = i * channels_size;
for (int j = 0; j < num_src; j++) {
cpu_memcpy(dst_ptrs[j] + dst_off, src_ptrs[j] + i * channels[j], channels[j]);
}
});
} else {
std::unordered_map<int, memory> mem_ags {{DNNL_ARG_DST, dst_memory.GetPrimitive()}};
for (int i = 0; i < num_src; i++)
mem_ags[DNNL_ARG_MULTIPLE_SRC + i] = getParentEdgeAt(i)->getMemory().GetPrimitive();

(*prim).execute(strm, mem_ags);
if (canOptimizeNspc) {
execNspcSpecCase();
return;
}

const MKLDNNMemory& dst_memory = getChildEdgeAt(0)->getMemory();
const size_t num_src = getParentEdges().size();
std::unordered_map<int, memory> mem_ags {{DNNL_ARG_DST, dst_memory.GetPrimitive()}};
for (int i = 0; i < num_src; i++)
mem_ags[DNNL_ARG_MULTIPLE_SRC + i] = getParentEdgeAt(i)->getMemory().GetPrimitive();

(*prim).execute(strm, mem_ags);
}

InferenceEngine::Precision MKLDNNConcatNode::getRuntimePrecision() const {
return MKLDNNExtensionUtils::getMaxPrecision(getInputPrecisions());
}

void MKLDNNConcatNode::execNspcSpecCase() {
const MKLDNNMemory& dst_memory = getChildEdgeAt(0)->getMemory();
const size_t num_src = getParentEdges().size();
uint8_t* dst_ptr = reinterpret_cast<uint8_t*>(dst_memory.GetData());
const size_t dataSize = MKLDNNExtensionUtils::sizeOfDataType(dst_memory.GetDataType());

std::vector<size_t> channelsDataSize;
size_t channels_size = 0;
std::vector<const uint8_t*> src_ptrs;
std::vector<uint8_t*> dst_ptrs;

for (size_t i = 0; i < num_src; i++) {
const MKLDNNMemory& src_mem = getParentEdgeAt(i)->getMemory();
const size_t num_channels = src_mem.GetDims()[channelAxis];

channelsDataSize.push_back(num_channels * dataSize);
src_ptrs.push_back(reinterpret_cast<const uint8_t*>(src_mem.GetData()));
dst_ptrs.push_back(dst_ptr + channels_size);
channels_size += num_channels * dataSize;
}

const size_t iter_count = getParentEdgeAt(0)->getMemory().GetSize() / channelsDataSize[0];

parallel_for(iter_count, [&](int i) {
const size_t dst_off = i * channels_size;
for (int j = 0; j < num_src; j++) {
cpu_memcpy(dst_ptrs[j] + dst_off, src_ptrs[j] + i * channelsDataSize[j], channelsDataSize[j]);
}
});
}

REG_MKLDNN_PRIM_FOR(MKLDNNConcatNode, Concatenation);
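Not part of the diff: a compact illustration of why concat along C collapses to one contiguous copy per batch/spatial point in a channels-last (nspc) layout, which is the pattern execNspcSpecCase exploits. The helper below is hypothetical and uses plain std::memcpy instead of the plugin's cpu_memcpy.

#include <cstddef>
#include <cstring>
#include <vector>

// Two channels-last inputs with ca and cb channels and the same number of
// "points" (N * spatial size) are concatenated along C by copying two
// contiguous channel chunks per point into the interleaved destination row.
std::vector<float> concatChannelsLast(const std::vector<float>& a, std::size_t ca,
                                      const std::vector<float>& b, std::size_t cb,
                                      std::size_t points) {
    std::vector<float> dst(points * (ca + cb));
    for (std::size_t p = 0; p < points; ++p) {
        std::memcpy(&dst[p * (ca + cb)],      &a[p * ca], ca * sizeof(float));
        std::memcpy(&dst[p * (ca + cb) + ca], &b[p * cb], cb * sizeof(float));
    }
    return dst;
}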
@@ -30,8 +30,10 @@ public:

private:
size_t axis = 0;
bool canOptimizeNspc = false;

size_t inverseOrder(const InferenceEngine::SizeVector& order, size_t axis);
void execNspcSpecCase();

InferenceEngine::Precision inputPrecision = InferenceEngine::Precision::FP32;
InferenceEngine::Precision outputPrecision = InferenceEngine::Precision::FP32;
@@ -94,11 +94,15 @@ void MKLDNNShuffleChannelsNode::initSupportedPrimitiveDescriptors() {
impl_type = impl_desc_type::ref;
}

addSupportedPrimDesc({{TensorDescCreatorTypes::nspc, precision}},
{{TensorDescCreatorTypes::nspc, precision}},
// use ncsp as default for non-quantized networks and nspc for quantized
auto firstCreatorType = isInQuantizedGraph ? TensorDescCreatorTypes::nspc : TensorDescCreatorTypes::ncsp;
auto secondCreatorType = isInQuantizedGraph ? TensorDescCreatorTypes::ncsp : TensorDescCreatorTypes::nspc;

addSupportedPrimDesc({{firstCreatorType, precision}},
{{firstCreatorType, precision}},
impl_type, supportDynamicBatch_);
addSupportedPrimDesc({{TensorDescCreatorTypes::ncsp, precision}},
{{TensorDescCreatorTypes::ncsp, precision}},
addSupportedPrimDesc({{secondCreatorType, precision}},
{{secondCreatorType, precision}},
impl_type, supportDynamicBatch_);
// canUseBlocked
if (axis_ != 1) {
@@ -0,0 +1,214 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "ngraph_functions/builders.hpp"
#include "test_utils/cpu_test_utils.hpp"

using namespace InferenceEngine;
using namespace CPUTestUtils;

namespace CPULayerTestsDefinitions {

typedef std::tuple<
size_t, // Concat axis
std::vector<std::vector<size_t>>, // Input shapes
InferenceEngine::Precision, // Network precision
std::string, // Device name
CPUSpecificParams
> concatCPUTestParams;

class ConcatLayerCPUTest : public testing::WithParamInterface<concatCPUTestParams>,
virtual public LayerTestsUtils::LayerTestsCommon, public CPUTestsBase {
public:
static std::string getTestCaseName(testing::TestParamInfo<concatCPUTestParams> obj) {
int axis;
std::vector<std::vector<size_t>> inputShapes;
InferenceEngine::Precision netPrecision;
std::string targetName;
CPUSpecificParams cpuParams;
std::tie(axis, inputShapes, netPrecision, targetName, cpuParams) = obj.param;

std::ostringstream result;
result << "IS=" << CommonTestUtils::vec2str(inputShapes) << "_";
result << "axis=" << axis << "_";
result << "netPRC=" << netPrecision.name() << "_";
result << "trgDev=" << targetName << "_";
result << CPUTestsBase::getTestCaseName(cpuParams);
return result.str();
}
protected:
void SetUp() override {
int axis;
std::vector<std::vector<size_t>> inputShape;
InferenceEngine::Precision netPrecision;
CPUSpecificParams cpuParams;
std::tie(axis, inputShape, netPrecision, targetDevice, cpuParams) = this->GetParam();
inPrc = outPrc = netPrecision;

std::tie(inFmts, outFmts, priority, selectedType) = cpuParams;
selectedType += std::string("_") + inPrc.name();

auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
auto params = ngraph::builder::makeParams(ngPrc, inputShape);
auto paramOuts = ngraph::helpers::convert2OutputVector(
ngraph::helpers::castOps2Nodes<ngraph::op::Parameter>(params));
auto concat = std::make_shared<ngraph::opset1::Concat>(paramOuts, axis);

function = makeNgraphFunction(ngPrc, params, concat, "concat");
}
};

TEST_P(ConcatLayerCPUTest, CompareWithRefs) {
SKIP_IF_CURRENT_TEST_IS_DISABLED()

Run();
CheckPluginRelatedResults(executableNetwork, "Concatenation");
}

namespace {
const auto planar_4D_ref = CPUSpecificParams{{nchw}, {nchw}, {"ref"}, "ref"};
const auto planar_5D_ref = CPUSpecificParams{{ncdhw}, {ncdhw}, {"ref"}, "ref"};

const auto planar_4D = CPUSpecificParams{{nchw}, {nchw}, {}, "unknown"};
const auto planar_5D = CPUSpecificParams{{ncdhw}, {ncdhw}, {}, "unknown"};

const auto planarChannels_4D = CPUSpecificParams{{nhwc}, {nhwc}, {}, "ref"};
const auto planarChannels_5D = CPUSpecificParams{{ndhwc}, {ndhwc}, {}, "ref"};

const auto blocked8_4D = CPUSpecificParams{{nChw8c}, {nChw8c}, {}, "unknown"};
const auto blocked8_5D = CPUSpecificParams{{nCdhw8c}, {nCdhw8c}, {}, "unknown"};

const auto blocked8_4D_ref = CPUSpecificParams{{nChw8c}, {nChw8c}, {}, "ref"};
const auto blocked8_5D_ref = CPUSpecificParams{{nCdhw8c}, {nCdhw8c}, {}, "ref"};

const auto blocked16_4D = CPUSpecificParams{{nChw16c}, {nChw16c}, {}, "unknown"};
const auto blocked16_5D = CPUSpecificParams{{nCdhw16c}, {nCdhw16c}, {}, "unknown"};

const auto blocked16_4D_ref = CPUSpecificParams{{nChw16c}, {nChw16c}, {}, "ref"};
const auto blocked16_5D_ref = CPUSpecificParams{{nCdhw16c}, {nCdhw16c}, {}, "ref"};

// List of precisions natively supported by mkldnn.
const std::vector<Precision> netPrecisions = {
Precision::I8,
Precision::I32,
Precision::FP32,
Precision::BF16
};

INSTANTIATE_TEST_CASE_P(concat_Concat4D_CPU_Block8inPlace, ConcatLayerCPUTest,
::testing::Combine(
::testing::Values(1),
::testing::Values(std::vector<std::vector<size_t>>{{1, 8, 3, 5},
{1, 16, 3, 5}}),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(planar_4D, planarChannels_4D, blocked8_4D)),
ConcatLayerCPUTest::getTestCaseName);

INSTANTIATE_TEST_CASE_P(smoke_Concat4D_CPU_Block8, ConcatLayerCPUTest,
::testing::Combine(
::testing::Values(0, 2, 3),
::testing::Values(std::vector<std::vector<size_t>>{{2, 16, 3, 5},
{2, 16, 3, 5}}),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(planar_4D_ref, planarChannels_4D, blocked8_4D_ref)),
ConcatLayerCPUTest::getTestCaseName);

INSTANTIATE_TEST_CASE_P(smoke_Concat4D_CPU_Block16inPlace, ConcatLayerCPUTest,
::testing::Combine(
::testing::Values(1),
::testing::Values(std::vector<std::vector<size_t>>{{2, 16, 3, 5},
{2, 32, 3, 5}}),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(blocked16_4D)),
ConcatLayerCPUTest::getTestCaseName);

INSTANTIATE_TEST_CASE_P(smoke_Concat4D_CPU_Block16, ConcatLayerCPUTest,
::testing::Combine(
::testing::Values(0, 2, 3),
::testing::Values(std::vector<std::vector<size_t>>{{2, 32, 3, 5},
{2, 32, 3, 5}}),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(blocked16_4D_ref)),
ConcatLayerCPUTest::getTestCaseName);

INSTANTIATE_TEST_CASE_P(concat_Concat5D_CPU_Block8inPlace, ConcatLayerCPUTest,
::testing::Combine(
::testing::Values(1),
::testing::Values(std::vector<std::vector<size_t>>{{1, 8, 3, 5, 7},
{1, 16, 3, 5, 7}}),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(planar_5D, planarChannels_5D, blocked8_5D)),
ConcatLayerCPUTest::getTestCaseName);

INSTANTIATE_TEST_CASE_P(smoke_Concat5D_CPU_Block8, ConcatLayerCPUTest,
::testing::Combine(
::testing::Values(0, 2, 3, 4),
::testing::Values(std::vector<std::vector<size_t>>{{2, 16, 3, 5, 7},
{2, 16, 3, 5, 7}}),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(planar_5D_ref, planarChannels_5D, blocked8_5D_ref)),
ConcatLayerCPUTest::getTestCaseName);

INSTANTIATE_TEST_CASE_P(smoke_Concat5D_CPU_Block16inPlace, ConcatLayerCPUTest,
::testing::Combine(
::testing::Values(1),
::testing::Values(std::vector<std::vector<size_t>>{{2, 16, 3, 5, 7},
{2, 32, 3, 5, 7}}),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(blocked16_5D)),
ConcatLayerCPUTest::getTestCaseName);

INSTANTIATE_TEST_CASE_P(smoke_Concat5D_CPU_Block16, ConcatLayerCPUTest,
::testing::Combine(
::testing::Values(0, 2, 3, 4),
::testing::Values(std::vector<std::vector<size_t>>{{2, 32, 3, 5, 7},
{2, 32, 3, 5, 7}}),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(blocked16_5D_ref)),
ConcatLayerCPUTest::getTestCaseName);

INSTANTIATE_TEST_CASE_P(smoke_Concat_inPlace, ConcatLayerCPUTest,
::testing::Combine(
::testing::Values(1),
::testing::Values(std::vector<std::vector<size_t>>{{2, 3, 5},
{2, 4, 5}},
std::vector<std::vector<size_t>>{{2, 3},
{2, 4}}),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(CPUSpecificParams{{}, {}, {}, "unknown"})),
ConcatLayerCPUTest::getTestCaseName);

INSTANTIATE_TEST_CASE_P(smoke_Concat3D, ConcatLayerCPUTest,
::testing::Combine(
::testing::Values(0, 2),
::testing::Values(std::vector<std::vector<size_t>>{{2, 4, 5},
{2, 4, 5}}),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(CPUSpecificParams{{}, {}, {}, "ref"})),
ConcatLayerCPUTest::getTestCaseName);

INSTANTIATE_TEST_CASE_P(smoke_Concat_1D_2D, ConcatLayerCPUTest,
::testing::Combine(
::testing::Values(0),
::testing::Values(std::vector<std::vector<size_t>>{{2, 4},
{3, 4}},
std::vector<std::vector<size_t>>{{2}, {3}}),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(CPUSpecificParams{{}, {}, {}, "ref"})),
ConcatLayerCPUTest::getTestCaseName);

} // namespace
} // namespace CPULayerTestsDefinitions
@@ -222,6 +222,7 @@ void FuseTransposeAndReorderTest2::CreateGraph() {
transpose2->get_rt_info() = makeCPUInfo({memFmt2}, {memFmt2}, {});

auto concat = ngraph::builder::makeConcat({transpose1, transpose2}, 1);
concat->get_rt_info() = makeCPUInfo({memFmt1, memFmt1}, {memFmt1}, {});

ngraph::ResultVector results{std::make_shared<ngraph::opset5::Result>(concat)};
function = std::make_shared<ngraph::Function>(results, params, "Transpose_Transpose_Concat");