[CPU] Fixed Concat node performance (#14216)
This commit is contained in:
parent
eb9946bd25
commit
ead91d7c2f
@ -340,8 +340,10 @@ inline bool parallel_it_step() {
|
||||
template <typename Q, typename R, typename... Args>
|
||||
inline bool parallel_it_step(Q& x, const R& X, Args&&... tuple) {
|
||||
if (parallel_it_step(static_cast<Args>(tuple)...)) {
|
||||
x = (x + 1) % X;
|
||||
return x == 0;
|
||||
if (++x - X == 0) {
|
||||
x = 0;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
@ -57,7 +57,6 @@ Concat::Concat(const std::shared_ptr<ngraph::Node>& op, const dnnl::engine& eng,
|
||||
}
|
||||
|
||||
const auto inRank = getInputShapeAtPort(0).getRank();
|
||||
canExecRef = inRank <= 6;
|
||||
auto concatOp = ngraph::as_type_ptr<ngraph::op::v0::Concat>(op);
|
||||
auto axis = concatOp->get_axis();
|
||||
if (axis < 0) {
|
||||
@ -94,9 +93,6 @@ void Concat::getSupportedDescriptors() {
|
||||
if (std::all_of(childDims.begin(), childDims.begin() + axis, [](size_t dim) { return dim == 1; }))
|
||||
canBeInPlace = true;
|
||||
}
|
||||
nelemToCopy.resize(getParentEdges().size(), 0);
|
||||
dstOffset.resize(getParentEdges().size());
|
||||
inputStrides.resize(getParentEdges().size());
|
||||
}
|
||||
|
||||
void Concat::initSupportedPrimitiveDescriptors() {
|
||||
@ -379,6 +375,12 @@ void Concat::prepareParams() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
const auto& outputShape = dstMemDesc->getBlockDims();
|
||||
for (size_t i = 0; i < reorderedAxis; i++) {
|
||||
if (outputShape[i] != 1) {
|
||||
hasOuterLoop = true;
|
||||
}
|
||||
}
|
||||
std::vector<memory::desc> srcs_d;
|
||||
for (size_t i = 0; i < getParentEdges().size(); i++) {
|
||||
const auto& srcMemPtr = getParentEdgesAtPort(i)[0]->getMemoryPtr();
|
||||
@ -391,7 +393,11 @@ void Concat::prepareParams() {
|
||||
if (canExecRef) {
|
||||
const auto srcMemDesc = srcMemPtr->getDescPtr()->as<BlockedMemoryDesc>();
|
||||
const auto& inputShape = srcMemDesc->getBlockDims();
|
||||
inputStrides[i] = srcMemDesc->getStrides();
|
||||
const auto& strides = srcMemDesc->getStrides();
|
||||
inputStrides[i].resize(MAX_RANK_REF, 0);
|
||||
std::transform(strides.begin(), strides.end(), inputStrides[i].begin(), [&elemSize](const Dim& i) {
|
||||
return i * elemSize;
|
||||
});
|
||||
size_t nElem = 1;
|
||||
for (size_t j = reorderedAxis; j < inputShape.size(); j++) {
|
||||
nElem *= inputShape[j];
|
||||
@ -510,7 +516,18 @@ void Concat::initOptimalPrimitiveDescriptor() {
|
||||
}
|
||||
initDescriptor(config);
|
||||
}
|
||||
|
||||
//block layout may have axis greater than rank, disable ref_concat
|
||||
auto primDesc = getSelectedPrimitiveDescriptor();
|
||||
auto memDesc = primDesc->getConfig().outConfs[0].getMemDesc()->as<BlockedMemoryDesc>();
|
||||
auto rank = memDesc->getShape().getRank();
|
||||
bool isBlocked = rank != memDesc->getBlockDims().size();
|
||||
if (!isBlocked && rank <= MAX_RANK_REF) {
|
||||
canExecRef = true;
|
||||
nelemToCopy.resize(getParentEdges().size(), 0);
|
||||
dstOffset.resize(getParentEdges().size());
|
||||
inputStrides.resize(getParentEdges().size());
|
||||
srcPtrs.resize(getParentEdges().size());
|
||||
}
|
||||
// check if selected Tensor descriptor has nspc layout and concat axis is C
|
||||
canOptimizeNspc = axis == channelAxis && getSelectedPrimitiveDescriptor()->getConfig().outConfs.front().getMemDesc()->hasLayoutType(LayoutType::nspc);
|
||||
}
|
||||
@ -592,7 +609,6 @@ void Concat::execNspcSpecCase() {
|
||||
|
||||
void Concat::execRef() {
|
||||
const size_t numSrc = getParentEdges().size();
|
||||
std::vector<const uint8_t*> srcPtrs;
|
||||
const Memory& dstMemory = getChildEdgeAt(0)->getMemory();
|
||||
const size_t elemSize = DnnlExtensionUtils::sizeOfDataType(dstMemory.GetDataType());
|
||||
const auto dstMemBlkDesc = dstMemory.getDescPtr()->as<BlockedMemoryDesc>();
|
||||
@ -600,15 +616,13 @@ void Concat::execRef() {
|
||||
uint8_t* dstPtr = reinterpret_cast<uint8_t*>(dstMemory.GetData());
|
||||
for (size_t i = 0; i < numSrc; i++) {
|
||||
const Memory& srcMem = getParentEdgesAtPort(i)[0]->getMemory();
|
||||
srcPtrs.push_back(reinterpret_cast<const uint8_t*>(srcMem.GetPtr()));
|
||||
}
|
||||
const auto& outputStrides = dstMemBlkDesc->getStrides();
|
||||
bool hasOuterLoop = false;
|
||||
for (size_t i = 0; i < reorderedAxis; i++) {
|
||||
if (outputShape[i] != 1) {
|
||||
hasOuterLoop = true;
|
||||
}
|
||||
srcPtrs[i] = reinterpret_cast<const uint8_t*>(srcMem.GetPtr());
|
||||
}
|
||||
size_t outputStrides[MAX_RANK_REF] = {0};
|
||||
const auto strides = dstMemBlkDesc->getStrides();
|
||||
std::transform(strides.begin(), strides.end(), outputStrides, [&elemSize](const Dim& i) {
|
||||
return i * elemSize;
|
||||
});
|
||||
if (!hasOuterLoop) {
|
||||
int nthr = parallel_get_max_threads();
|
||||
if (nthr == 1) {
|
||||
@ -644,8 +658,8 @@ void Concat::execRef() {
|
||||
+ inputStrides[a][3] * n3 + inputStrides[a][4] * n4;
|
||||
size_t outOff = outputStrides[0] * n0 + outputStrides[1] * n1 + outputStrides[2] * n2
|
||||
+ outputStrides[3] * n3 + outputStrides[4] * n4;
|
||||
const uint8_t *i = srcPtrs[a] + inOff * elemSize;
|
||||
uint8_t *o = dstPtr + dstOffset[a] + outOff * elemSize;
|
||||
const uint8_t *i = &srcPtrs[a][inOff];
|
||||
uint8_t *o = &dstPtr[dstOffset[a] + outOff];
|
||||
|
||||
#if defined(__GNUC__)
|
||||
// Heuristic:
|
||||
|
@ -45,9 +45,12 @@ private:
|
||||
std::vector<VectorDims> inputStrides;
|
||||
std::vector<size_t> nelemToCopy; // byte moved in each iter
|
||||
std::vector<size_t> dstOffset; // dst offset for each input
|
||||
std::vector<const uint8_t*> srcPtrs;
|
||||
bool hasOuterLoop = false;
|
||||
InferenceEngine::Precision inputPrecision = InferenceEngine::Precision::FP32;
|
||||
InferenceEngine::Precision outputPrecision = InferenceEngine::Precision::FP32;
|
||||
bool canExecRef = false;
|
||||
static constexpr size_t MAX_RANK_REF = 6;
|
||||
};
|
||||
|
||||
} // namespace node
|
||||
|
Loading…
Reference in New Issue
Block a user