openvino/inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "mkldnn_graph_optimizer.h"
#include "mkldnn_extension_utils.h"
#include "nodes/mkldnn_reshape_node.h"
#include "nodes/mkldnn_pooling_node.h"
#include "nodes/mkldnn_eltwise_node.h"
#include "nodes/mkldnn_concat_node.h"
#include "nodes/mkldnn_reorder_node.h"
#include "nodes/mkldnn_conv_node.h"
#include "nodes/mkldnn_bin_conv_node.h"
#include "nodes/mkldnn_fake_quantize_node.h"
#include "nodes/mkldnn_mvn_node.h"
#include <nodes/mkldnn_transpose_node.h>
#include "nodes/mkldnn_interpolate_node.h"
#include "nodes/mkldnn_input_node.h"
#include "nodes/mkldnn_rnn.h"
#include "nodes/common/cpu_convert.h"
#include "mkldnn/ie_mkldnn.h"
#include <blob_factory.hpp>
#include "utils/general_utils.h"
#include "utils/cpu_utils.hpp"
#include <ngraph/opsets/opset1.hpp>
#include <ie_ngraph_utils.hpp>
// WA for xbyak.h
#ifdef _WIN32
# ifndef _WINSOCKAPI_
# define _WINSOCKAPI_
# endif
# ifndef _WINSOCK2API_
# define _WINSOCK2API_
#endif
#endif
#include <cpu/x64/cpu_isa_traits.hpp>
#include <string>
#include <list>
#include <memory>
#include <set>
#include <algorithm>
#include "mkldnn_itt.h"
#include "memory_desc/cpu_memory_desc_utils.h"
using namespace mkldnn;
using namespace MKLDNNPlugin;
using namespace InferenceEngine;
MKLDNNGraphOptimizer::MKLDNNGraphOptimizer() {}
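
// Runs the target-independent fusion passes in a fixed order. Each pass marks the fused nodes as dropped,
// so the graph is cleaned up with RemoveDroppedNodes() after every step and re-sorted topologically
// where the subsequent passes rely on an up-to-date node order.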
void MKLDNNGraphOptimizer::ApplyCommonGraphOptimizations(MKLDNNGraph &graph) {
OV_ITT_SCOPE_CHAIN(FIRST_INFERENCE, taskChain, itt::domains::MKLDNN_LT, "ApplyCommonGraphOptimizations", "FuseConvolutionAndBias");
FuseConvolutionAndBias(graph);
graph.RemoveDroppedNodes();
OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseMultiplyAndAdd");
FuseMultiplyAndAdd(graph);
graph.RemoveDroppedNodes();
OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseDeconvolutionAndSimpleOperation");
FuseDeconvolutionAndSimpleOperation(graph);
graph.RemoveDroppedNodes();
OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseBroadcastAndEltwise");
FuseBroadcastAndEltwise(graph);
graph.RemoveDroppedNodes();
OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseClampAndFakeQuantize");
FuseClampAndFakeQuantize(graph);
graph.RemoveDroppedNodes();
OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FusePerformedAsScaleShiftAndFakeQuantize");
FusePerformedAsScaleShiftAndFakeQuantize(graph);
graph.RemoveDroppedNodes();
OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseConvolutionAndZeroPoints");
FuseConvolutionAndZeroPoints(graph);
graph.RemoveDroppedNodes();
OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseConvolutionAndSimpleOperationThroughMaxPool");
FuseConvolutionAndSimpleOperationThroughMaxPool(graph);
graph.RemoveDroppedNodes();
OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseConvolutionAndSimpleOperation");
FuseConvolutionAndSimpleOperation(graph);
graph.RemoveDroppedNodes();
OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "RemoveDroppedEdges");
graph.SortTopologically();
graph.RemoveDroppedEdges();
OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FusePoolingAndFakeQuantize");
FusePoolingAndFakeQuantize(graph);
graph.RemoveDroppedNodes();
OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "RemoveDroppedEdges");
graph.SortTopologically();
graph.RemoveDroppedEdges();
OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseConvolutionAndDWConvolution");
FuseConvolutionAndDWConvolution(graph);
graph.RemoveDroppedNodes();
OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseConvolutionSumAndConvolutionSumActivation");
FuseConvolutionSumAndConvolutionSumActivation(graph);
graph.RemoveDroppedNodes();
OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseConvolutionAndSimpleOperation");
FuseConvolutionAndSimpleOperation(graph);
graph.RemoveDroppedNodes();
OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseFullyConnectedAndSimpleOperation");
FuseFullyConnectedAndSimpleOperation(graph);
graph.RemoveDroppedNodes();
OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseMatMulAndSimpleOperation");
FuseMatMulAndSimpleOperation(graph);
graph.RemoveDroppedNodes();
OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseMVNAndSimpleOperation");
FuseMVNAndSimpleOperation(graph);
graph.RemoveDroppedNodes();
OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseInterpolateAndSimpleOperation");
FuseInterpolateAndSimpleOperation(graph);
graph.RemoveDroppedNodes();
OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseNormalizeL2AndSimpleOperation");
FuseNormalizeL2AndSimpleOperation(graph);
graph.RemoveDroppedNodes();
OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseEltwiseAndSimple");
FuseEltwiseAndSimple(graph);
graph.RemoveDroppedNodes();
OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "reshapeRnnSeq");
reshapeRnnSeq(graph);
graph.RemoveDroppedNodes();
OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "RemoveDroppedEdges");
graph.RemoveDroppedEdges();
}
void MKLDNNGraphOptimizer::ApplyImplSpecificGraphOptimizations(MKLDNNGraph &graph) {
OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::MKLDNN_LT, "MKLDNNGraphOptimizer::ApplyImplSpecificGraphOptimizations");
DropDoubleReorders(graph);
graph.RemoveDroppedNodes();
MergeTransposeAndReorder(graph);
graph.RemoveDroppedNodes();
graph.RemoveDroppedEdges();
}
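
// Fuses a Convolution followed by an EltwiseAdd with a constant per-channel bias into the convolution itself:
// the Add node is dropped, its constant input is re-attached to the convolution as an extra (bias) input,
// and the bias shape is normalized to a 1D vector of output channels.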
void MKLDNNGraphOptimizer::FuseConvolutionAndBias(MKLDNNGraph &graph) {
auto& graphNodes = graph.GetNodes();
auto isSuitableParentNode = [](MKLDNNNodePtr node) {
return node->getType() == Convolution &&
node->getChildEdges().size() == 1 &&
node->getParentEdges().size() == 2 &&
node->getFusedWith().empty();
};
auto isSuitableChildNode = [&](MKLDNNNodePtr parentNode, MKLDNNNodePtr childNode) {
if (childNode->getAlgorithm() != EltwiseAdd || !childNode->getFusedWith().empty() || childNode->getParentEdges().size() != 2)
return false;
auto biasNode = childNode->getParentEdgesAtPort(1)[0]->getParent();
if (biasNode->getType() != Input || !biasNode->isConstant() || biasNode->getChildEdges().size() != 1)
return false;
auto convOutDims = parentNode->getOutputShapeAtPort(0).getDims();
auto biasDims = getNormalizedDimsBySize(biasNode->getOutputShapeAtPort(0).getDims(),
convOutDims.size());
// TODO [NM]: Legacy ConvBias fusion transformation supports both per-tensor (via explicit broadcasting) and per-channel cases.
// Most real models contain a per-channel bias, so we need to re-evaluate the need to support the per-tensor variant.
if (convOutDims.size() != biasDims.size() || biasDims.size() < 2)
return false;
if (biasDims[0] != 1 || !dimsEqualStrong(biasDims[1], convOutDims[1]))
return false;
for (int i = 2; i < biasDims.size(); i++) {
if (biasDims[i] != 1)
return false;
}
return true;
};
auto parent = graphNodes.begin();
while (parent != graphNodes.end()) {
auto parentNode = *parent;
if (!isSuitableParentNode(parentNode)) {
parent++;
continue;
}
auto childNode = parentNode->getChildEdgeAt(0)->getChild();
if (!isSuitableChildNode(parentNode, childNode)) {
parent++;
continue;
}
auto childs = childNode->childEdges;
auto parents = childNode->parentEdges;
for (size_t i = 0; i < parents.size(); i++) {
auto p_edge = parents[i].lock();
if (!p_edge) continue;
auto parent = p_edge->getParent();
if (!parent) continue;
if (parent == parentNode) {
for (size_t j = 0; j < childs.size(); j++) {
if (!childs[j].lock())
continue;
auto child = childs[j].lock()->getChild();
if (!child)
continue;
MKLDNNEdgePtr &remEdge = p_edge;
int inNum = 0;
if (remEdge) {
inNum = remEdge->getInputNum();
remEdge->drop();
graph.RemoveEdge(remEdge);
}
remEdge = childs[j].lock();
int outNum = 0;
if (remEdge) {
outNum = remEdge->getOutputNum();
remEdge->drop();
graph.RemoveEdge(remEdge);
}
MKLDNNEdgePtr newEdge(new MKLDNNEdge(parent, child, inNum, outNum));
auto &graphEdges = graph.GetEdges();
graphEdges.push_back(newEdge);
parent->addEdge(newEdge);
}
} else {
MKLDNNEdgePtr &remEdge = p_edge;
int inNum = 0;
if (remEdge) {
inNum = remEdge->getInputNum();
remEdge->drop();
graph.RemoveEdge(remEdge);
}
auto parentEltwise = parentNode;
MKLDNNEdgePtr newEdge(new MKLDNNEdge(parent, parentEltwise, inNum, parentEltwise->getParentEdges().size()));
auto &graphEdges = graph.GetEdges();
graphEdges.push_back(newEdge);
parent->addEdge(newEdge);
parent->outputShapes[inNum] = Shape(VectorDims{parentEltwise->outputShapes[0].getStaticDims()[1]});
parentEltwise->inputShapes.push_back(parent->outputShapes[0]);
}
}
graph.DropNode(childNode);
parentNode->addOriginalLayer(childNode->getOriginalLayers());
parentNode->addOriginalInputPrecision(childNode->getOriginalInputPrecisionAtPort(1));
}
}
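
// Generic "node + simple operation" fusion: when a Deconvolution has exactly one consumer and that consumer
// can be fused (canFuse), the child is folded into the parent as a post-operation and dropped from the graph.
// The FuseFullyConnected/MatMul/MVN/Interpolate/NormalizeL2AndSimpleOperation passes below follow the same pattern.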
void MKLDNNGraphOptimizer::FuseDeconvolutionAndSimpleOperation(MKLDNNGraph &graph) {
auto& graphNodes = graph.GetNodes();
auto isSuitableParentNode = [](MKLDNNNodePtr node) {
return node->getType() == Deconvolution && node->getChildEdges().size() == 1;
};
auto parent = graphNodes.begin();
while (parent != graphNodes.end()) {
auto parentNode = *parent;
if (!isSuitableParentNode(parentNode)) {
parent++;
continue;
}
auto childNode = parentNode->getChildEdgeAt(0)->getChild();
if (!parentNode->canFuse(childNode)) {
parent++;
continue;
}
childNode->fuseInto(parentNode);
auto parentEdges = childNode->parentEdges;
for (auto &parentEdge : parentEdges) {
auto p_edge = parentEdge.lock();
if (p_edge->getParent()->getType() == Deconvolution)
continue;
graph.RemoveEdge(p_edge);
}
graph.DropNode(childNode);
}
}
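
// Fuses an EltwiseMultiply by a constant per-channel factor followed by an EltwiseAdd of a constant
// per-channel term into a single EltwiseMulAdd node; the constant input of the Add is re-attached
// to the merged node so the scale and shift are applied in one pass.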
void MKLDNNGraphOptimizer::FuseMultiplyAndAdd(MKLDNNGraph &graph) {
auto& graphNodes = graph.GetNodes();
auto isSuitableSecondInput = [](MKLDNNNodePtr node, VectorDims dataDims) {
if (node->getType() != Input || !node->isConstant())
return false;
auto secondInputDims = node->getOutputShapeAtPort(0).getStaticDims();
if (secondInputDims.size() != dataDims.size() || secondInputDims.size() < 2)
return false;
if (secondInputDims[0] != 1 || !dimsEqualWeak(secondInputDims[1], dataDims[1]))
return false;
for (size_t i = 2; i < secondInputDims.size(); i++) {
if (secondInputDims[i] != 1)
return false;
}
return true;
};
auto isSuitableParentNode = [&](MKLDNNNodePtr node) {
if (node->getAlgorithm() != EltwiseMultiply || !node->getFusedWith().empty() ||
node->getParentEdges().size() != 2 || node->getChildEdges().size() != 1)
return false;
return isSuitableSecondInput(node->getParentEdgesAtPort(1)[0]->getParent(), node->getInputShapeAtPort(0).getDims());
};
auto isSuitableChildNode = [&](MKLDNNNodePtr parentNode, MKLDNNNodePtr childNode) {
if (childNode->getAlgorithm() != EltwiseAdd || !childNode->getFusedWith().empty() || childNode->getParentEdges().size() != 2)
return false;
return isSuitableSecondInput(childNode->getParentEdgesAtPort(1)[0]->getParent(), childNode->getInputShapeAtPort(0).getDims()) &&
parentNode->canFuse(childNode);
};
auto parent = graphNodes.begin();
while (parent != graphNodes.end()) {
auto parentNode = *parent;
if (!isSuitableParentNode(parentNode)) {
parent++;
continue;
}
auto childNode = parentNode->getChildEdgeAt(0)->getChild();
if (!isSuitableChildNode(parentNode, childNode)) {
parent++;
continue;
}
auto childs = childNode->childEdges;
auto parents = childNode->parentEdges;
for (size_t i = 0; i < parents.size(); i++) {
auto p_edge = parents[i].lock();
if (!p_edge) continue;
auto parent = p_edge->getParent();
if (!parent) continue;
if (parent == parentNode) {
for (size_t j = 0; j < childs.size(); j++) {
if (!childs[j].lock())
continue;
auto child = childs[j].lock()->getChild();
if (!child)
continue;
MKLDNNEdgePtr &remEdge = p_edge;
int inNum = 0;
if (remEdge) {
inNum = remEdge->getInputNum();
remEdge->drop();
graph.RemoveEdge(remEdge);
}
remEdge = childs[j].lock();
int outNum = 0;
if (remEdge) {
outNum = remEdge->getOutputNum();
remEdge->drop();
graph.RemoveEdge(remEdge);
}
MKLDNNEdgePtr newEdge(new MKLDNNEdge(parent, child, inNum, outNum));
auto &graphEdges = graph.GetEdges();
graphEdges.push_back(newEdge);
parent->addEdge(newEdge);
}
} else {
MKLDNNEdgePtr &remEdge = p_edge;
int inNum = 0;
if (remEdge) {
inNum = remEdge->getInputNum();
remEdge->drop();
graph.RemoveEdge(remEdge);
}
auto parentEltwise = parentNode;
MKLDNNEdgePtr newEdge(new MKLDNNEdge(parent, parentEltwise, inNum, parentEltwise->getParentEdges().size()));
auto &graphEdges = graph.GetEdges();
graphEdges.push_back(newEdge);
parent->addEdge(newEdge);
parentEltwise->inputShapes.push_back(parent->getOutputShapeAtPort(0));
}
}
parentNode->addOriginalInputPrecision(childNode->getOriginalInputPrecisionAtPort(1));
parentNode->setAlgorithm(EltwiseMulAdd);
parentNode->setTypeStr("MulAdd");
parentNode->addOriginalLayer(childNode->getOriginalLayers());
graph.DropNode(childNode);
}
}
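
// Looks for the asymmetric-quantization pattern Subtract(activations, constant zero points) -> Convolution
// with constant int8 weights. The Subtract node is removed, the zero points are stored on the convolution
// (inputZeroPoints), and a per-output-channel compensation term is precomputed from the weights so the
// int8 kernel can account for the subtracted zero points.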
void MKLDNNGraphOptimizer::FuseConvolutionAndZeroPoints(MKLDNNGraph &graph) {
auto& graphNodes = graph.GetNodes();
auto isSuitableConvNode = [](MKLDNNNodePtr node) {
bool retVal = false;
if (node->getType() == Convolution) {
if (auto convNode = std::dynamic_pointer_cast<MKLDNNConvolutionNode>(node)) {
auto rank = convNode->getInputShapeAtPort(0).getRank();
// int8 depthwise convolution does not support fusing zero points in the 3D case
if (implication(convNode->isDepthWise(), rank == 4)) {
retVal = true;
}
}
}
return retVal;
};
auto initializeInputZeroPoints = [](MKLDNNNodePtr node, MKLDNNNodePtr parent0, MKLDNNNodePtr parent1) {
auto* convNode = dynamic_cast<MKLDNNConvolutionNode*>(node.get());
if (convNode == nullptr)
IE_THROW() << "Cannot get convolution node " << node->getName();
int IC = node->getInputShapeAtPort(0).getDims()[1];
int OC = node->getOutputShapeAtPort(0).getDims()[1];
if (Shape::UNDEFINED_DIM == IC || Shape::UNDEFINED_DIM == OC) {
return false;
}
if (parent0->getType() == Eltwise) {
if (!parent0->getFusedWith().empty() || !parent1->getFusedWith().empty())
return false;
// The plug-in doesn't support FP32 convolution with input/weights zero points.
// If the weights are in FP32 (or there are zero points on the weights, which the INT8 convolution does not support), we cannot use
// the INT8 implementation, so input zero point fusing has to be disabled as well.
if (parent1->getType() != Input || !parent1->isConstant() || parent1->getOriginalOutputPrecisionAtPort(0) != Precision::I8) {
return false;
}
if (parent0->getAlgorithm() != Algorithm::EltwiseSubtract)
return false;
if (parent0->getParentEdges().size() != 2)
return false;
auto arg0 = parent0->getParentEdgesAtPort(1)[0]->getParent();
if (arg0->getType() == Input && arg0->isConstant()) {
if (arg0->getOriginalOutputPrecisionAtPort(0) != Precision::U8)
return false;
if (parent0->getInputShapeAtPort(1).getRank() < 2) {
return false;
}
auto zpDims = parent0->getInputShapeAtPort(1).getDims();
if (zpDims[0] != 1 || !dimsEqualStrong(zpDims[1], IC))
return false;
for (int i = 2; i < zpDims.size(); i++) {
if (zpDims[i] != 1)
return false;
}
auto arg1 = parent0->getParentEdgesAtPort(0)[0]->getParent();
if (arg1->getOriginalOutputPrecisionAtPort(0) != Precision::U8)
return false;
auto zeroPointsConstant = dynamic_cast<MKLDNNInputNode*>(arg0.get());
if (zeroPointsConstant == nullptr)
IE_THROW() << "Cannot cast to Input node";
auto zeroPointsBlob = zeroPointsConstant->getMemoryPtr();
if (zeroPointsBlob == nullptr)
IE_THROW() << "Cannot cast to TBlob internal zero points blob";
auto zeroPointsData = static_cast<const uint8_t*>(zeroPointsBlob->GetPtr());
if (zeroPointsData == nullptr)
IE_THROW() << "zeroPointsBlob has not allocated buffer";
auto zeroPointDataSize = parent0->getInputShapeAtPort(1).getDims()[1];
if (Shape::UNDEFINED_DIM == zeroPointDataSize) {
return false;
}
for (int j = 0; j < zeroPointDataSize; j++) {
convNode->inputZeroPoints.push_back(zeroPointsData[j]);
}
} else {
return false;
}
} else {
return false;
}
if (convNode->outputCompensation.empty()) {
convNode->outputCompensation.resize(OC);
}
return true;
};
auto initializeOutputCompensation = [](MKLDNNNodePtr node) {
auto* convNode = dynamic_cast<MKLDNNConvolutionNode*>(node.get());
if (convNode == nullptr)
IE_THROW() << "Cannot get convolution node " << node->getName();
if (convNode->inputZeroPoints.empty())
return;
auto weightsConstant = dynamic_cast<MKLDNNInputNode*>(convNode->getParentEdgesAtPort(1)[0]->getParent().get());
if (!weightsConstant || !weightsConstant->isConstant())
return;
auto weightsBlob = weightsConstant->getMemoryPtr();
if (weightsBlob == nullptr)
IE_THROW() << "Cannot cast to TBlob internal weights blob";
auto weightsPtr = static_cast<const int8_t*>(weightsBlob->GetPtr());
if (weightsPtr == nullptr)
IE_THROW() << "weightsBlob has not allocated buffer";
ptrdiff_t G = convNode->getGroupNum();
const int groupOffset = convNode->getAlgorithm() == ConvolutionGrouped ? 1 : 0;
auto& weightsConstantDims = weightsConstant->outputShapes[0].getStaticDims();
ptrdiff_t OC = weightsConstantDims[0 + groupOffset];
ptrdiff_t IC = weightsConstantDims[1 + groupOffset];
ptrdiff_t KD = weightsConstantDims.size() == (5 + groupOffset) ? weightsConstantDims[weightsConstantDims.size() - 3] : 1;
ptrdiff_t KH = weightsConstantDims[weightsConstantDims.size() - 2];
ptrdiff_t KW = weightsConstantDims[weightsConstantDims.size() - 1];
for (size_t g = 0; g < G; g++) {
for (size_t oc = 0; oc < OC; oc++) {
int32_t a = 0;
for (size_t ic = 0; ic < IC; ic++) {
for (size_t kd = 0; kd < KD; kd++) {
for (size_t kh = 0; kh < KH; kh++) {
for (size_t kw = 0; kw < KW; kw++) {
size_t widx = g * OC * IC * KD * KH * KW +
oc * IC * KD * KH * KW +
ic * KD * KH * KW +
kd * KH * KW +
kh * KW +
kw;
auto w = static_cast<int32_t>(weightsPtr[widx]);
auto izp = !convNode->inputZeroPoints.empty() ? static_cast<int32_t>(convNode->inputZeroPoints[g * IC + ic]) : 0;
a += w * izp;
auto wzp = !convNode->weightsZeroPoints.empty() ? static_cast<int32_t>(convNode->weightsZeroPoints[g * OC + oc]) : 0;
a -= wzp * izp;
}
}
}
}
convNode->outputCompensation[g * OC + oc] = -a;
}
}
};
for (int i = 0; i < graphNodes.size(); i++) {
auto conv = graphNodes[i];
if (!isSuitableConvNode(conv)) continue;
auto dataEltwise = conv->getParentEdgesAtPort(0)[0]->getParent();
auto weightsEltwise = conv->getParentEdgesAtPort(1)[0]->getParent();
if (initializeInputZeroPoints(conv, dataEltwise, weightsEltwise)) {
auto p_edge = dataEltwise->getParentEdgesAtPort(1)[0];
graph.RemoveEdge(p_edge);
graph.DropNode(dataEltwise);
}
initializeOutputCompensation(conv);
}
}
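
// Returns true when the child is a FakeQuantize and either side of the pattern produces BF16;
// such fusings are skipped by the callers below.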
static bool BF16QuantizeNodeFusing(MKLDNNNodePtr parentNode, MKLDNNNodePtr childNode) {
return childNode->getType() == FakeQuantize &&
one_of(Precision::BF16,
parentNode->getOriginalOutputPrecisionAtPort(0),
childNode->getOriginalOutputPrecisionAtPort(0));
}
void MKLDNNGraphOptimizer::FuseFullyConnectedAndSimpleOperation(MKLDNNGraph &graph) {
auto& graphNodes = graph.GetNodes();
auto isSuitableParentNode = [](MKLDNNNodePtr node) {
return node->getType() == FullyConnected && node->getChildEdges().size() == 1 && node->getInputShapeAtPort(0).getRank() != 3;
};
auto parent = graphNodes.begin();
while (parent != graphNodes.end()) {
auto parentNode = *parent;
if (!isSuitableParentNode(parentNode)) {
parent++;
continue;
}
auto childNode = parentNode->getChildEdgeAt(0)->getChild();
if (!parentNode->canFuse(childNode)) {
parent++;
continue;
}
// Fusing with a FakeQuantize layer is skipped for BF16 (see BF16QuantizeNodeFusing)
if (BF16QuantizeNodeFusing(parentNode, childNode)) {
parent++;
continue;
}
childNode->fuseInto(parentNode);
if (childNode->getType() == FakeQuantize || childNode->getType() == Eltwise) {
auto parentEdges = childNode->parentEdges;
for (auto &parentEdge : parentEdges) {
auto p_edge = parentEdge.lock();
if (p_edge->getParent()->getType() == FullyConnected)
continue;
graph.RemoveEdge(p_edge);
}
}
graph.DropNode(childNode);
}
}
void MKLDNNGraphOptimizer::FuseMatMulAndSimpleOperation(MKLDNNGraph &graph) {
auto& graphNodes = graph.GetNodes();
auto isSuitableParentNode = [](const MKLDNNNodePtr& node) {
return node->getType() == MatMul && node->getChildEdges().size() == 1;
};
auto parent = graphNodes.begin();
while (parent != graphNodes.end()) {
auto parentNode = *parent;
if (!isSuitableParentNode(parentNode)) {
parent++;
continue;
}
auto childNode = parentNode->getChildEdgeAt(0)->getChild();
if (!parentNode->canFuse(childNode)) {
parent++;
continue;
}
childNode->fuseInto(parentNode);
if (childNode->getType() == FakeQuantize || childNode->getType() == Eltwise) {
auto parentEdges = childNode->parentEdges;
for (auto &parentEdge : parentEdges) {
auto p_edge = parentEdge.lock();
if (p_edge->getParent()->getType() == MatMul)
continue;
graph.RemoveEdge(p_edge);
}
}
graph.DropNode(childNode);
}
}
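
// Fuses a 1x1 FP32 convolution with a following 3x3 depthwise convolution (stride 1 or 2) into a single node.
// The fusion is applied only on AVX2 (and not AVX-512) targets and only when the depthwise convolution's
// input plus output footprint exceeds half of the L3 cache, i.e. where keeping the data fused is expected to pay off.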
void MKLDNNGraphOptimizer::FuseConvolutionAndDWConvolution(MKLDNNGraph &graph) {
auto& graphNodes = graph.GetNodes();
auto isConvolutionNode = [](const MKLDNNNodePtr &node) {
return node->getType() == Convolution;
};
auto is1x1Convolution = [](const std::shared_ptr<MKLDNNConvolutionNode> &conv) {
const auto weightRank = conv->getWeightDims().size();
return conv->getWeightDims()[weightRank - 1] == 1 && conv->getWeightDims()[weightRank - 2] == 1;
};
auto isSuitableParentConvolution = [&](MKLDNNNodePtr node) {
if (node->isDropped())
return false;
const auto conv = std::dynamic_pointer_cast<MKLDNNConvolutionNode>(node);
if (conv == nullptr)
IE_THROW() << "Cannot cast to convolution node " << node->getName();
if (!conv->weightsZeroPoints.empty())
return false;
const auto &strides = conv->getStride();
const auto &paddings = conv->getPaddingL();
const auto &inDims = node->getInputShapeAtPort(0).getDims();
const auto &outDims = node->getOutputShapeAtPort(0).getDims();
bool isSupportedParams = conv->getGroupNum() == 1 &&
inDims.size() == 4 &&
dimsEqualStrong(inDims[inDims.size() - 1], outDims[outDims.size() - 1]) &&
dimsEqualStrong(inDims[inDims.size() - 2], outDims[outDims.size() - 2]) &&
is1x1Convolution(conv) && // TODO [oneDNN] : fusing is permitted only with 1x1 convolutions
everyone_is(1, strides[strides.size() - 1], strides[strides.size() - 2]) &&
everyone_is(0, paddings[paddings.size() - 1], paddings[paddings.size() - 2]) &&
!conv->canBeExecutedInInt8();
if (!isSupportedParams) return false;
return node->getChildEdges().size() == 1 && isConvolutionNode(node->getChildEdgeAt(0)->getChild());
};
auto isSuitableChildConvolution = [&](const MKLDNNNodePtr &parentNode, const MKLDNNNodePtr &childNode) {
if (parentNode->isDropped() || childNode->isDropped())
return false;
const auto convChild = std::dynamic_pointer_cast<MKLDNNConvolutionNode>(childNode);
if (convChild == nullptr)
IE_THROW() << "Cannot cast to convolution node " << childNode->getName();
const auto convParent = std::dynamic_pointer_cast<MKLDNNConvolutionNode>(parentNode);
if (convParent == nullptr)
IE_THROW() << "Cannot cast to convolution node " << parentNode->getName();
if (!everyone_is(Precision::FP32, convParent->getOriginalOutputPrecisionAtPort(0), convChild->getOriginalInputPrecisionAtPort(0),
convChild->getOriginalOutputPrecisionAtPort(0)))
return false;
auto parentOutputPrecision = !parentNode->fusedWith.empty()
? parentNode->fusedWith[parentNode->fusedWith.size() - 1]->getOriginalOutputPrecisionAtPort(0)
: parentNode->getOriginalOutputPrecisionAtPort(0);
auto childOutputPrecision = !childNode->fusedWith.empty()
? childNode->fusedWith[childNode->fusedWith.size() - 1]->getOriginalOutputPrecisionAtPort(0)
: childNode->getOriginalOutputPrecisionAtPort(0);
if (!everyone_is(Precision::FP32, parentOutputPrecision, childOutputPrecision))
return false;
if (!convChild->inputZeroPoints.empty() || !convChild->weightsZeroPoints.empty())
return false;
bool withBias = convChild->getOriginalInputPrecisions().size() == 3;
const auto weightRank = convChild->getWeightDims().size();
const auto stridesSize = convChild->getStride().size();
bool isSupportedParams = dimsEqualStrong(convChild->outputShapes[0].getDims()[1], convChild->getGroupNum()) &&
convChild->outputShapes[0].getDims()[1] != 1 &&
everyone_is(3, convChild->getWeightDims()[weightRank - 1], convChild->getWeightDims()[weightRank - 2]) &&
everyone_is(1, convChild->getPaddingL()[stridesSize - 1], convChild->getPaddingL()[stridesSize - 2]) &&
everyone_is(1, convChild->getPaddingR()[stridesSize - 1], convChild->getPaddingR()[stridesSize - 2]) &&
everyone_is(1, convChild->getDilation()[stridesSize - 1] + 1, convChild->getDilation()[stridesSize - 2] + 1) &&
convChild->getStride()[stridesSize - 1] == convChild->getStride()[stridesSize - 2] &&
withBias &&
one_of(convChild->getStride()[stridesSize - 1], 1, 2) &&
childNode->getOutputShapeAtPort(0).getRank() == 4;
return isSupportedParams;
};
auto isFusingWorthwhile = [&](const MKLDNNNodePtr &parentNode, const MKLDNNNodePtr &childNode) {
if (!childNode->inputShapes[0].isStatic() || !childNode->outputShapes[0].isStatic()) {
return false;
}
auto inDims = childNode->inputShapes[0].getStaticDims();
auto outDims = childNode->outputShapes[0].getStaticDims();
int elemSize = childNode->getOriginalOutputPrecisionAtPort(0).size();
int L3_cache_size = utils::get_cache_size(3, false);
int dw_conv_input_size = inDims[0] * inDims[1] * inDims[2] * inDims[3] * elemSize;
int dw_conv_output_size = outDims[0] * outDims[1]* outDims[2] * outDims[3] * elemSize;
auto parentConvolutionNode = std::dynamic_pointer_cast<MKLDNNConvolutionNode>(parentNode);
if (parentConvolutionNode == nullptr)
IE_THROW() << "Cannot get convolution node " << parentNode->getName();
if (!impl::cpu::x64::mayiuse(impl::cpu::x64::avx2) || impl::cpu::x64::mayiuse(impl::cpu::x64::avx512_common))
return false;
return (dw_conv_input_size + dw_conv_output_size > L3_cache_size / 2);
};
for (int i = 0; i < graphNodes.size(); i++) {
if (!isConvolutionNode(graphNodes[i])) continue;
auto parentConvNode = graphNodes[i];
if (!isSuitableParentConvolution(parentConvNode)) continue;
auto childConvNode = parentConvNode->getChildEdgeAt(0)->getChild();
if (!isSuitableChildConvolution(parentConvNode, childConvNode)) continue;
if (!isFusingWorthwhile(parentConvNode, childConvNode)) continue;
parentConvNode->addFusedNode(childConvNode);
for (auto node : childConvNode->getFusedWith()) {
parentConvNode->addFusedNode(node);
}
childConvNode->clearFusedWith();
graph.DropDWConvNode(childConvNode);
}
}
// TODO [NM]: unite with FuseConvolutionAndSimpleOperation
void MKLDNNGraphOptimizer::FuseConvolutionAndSimpleOperationThroughMaxPool(MKLDNNGraph &graph) {
auto& graphNodes = graph.GetNodes();
auto isSuitableParentNode = [](MKLDNNNodePtr node) {
return (node->getType() == Convolution || node->getType() == BinaryConvolution) && node->getChildEdges().size() == 1 &&
node->getOriginalOutputPrecisionAtPort(0) == Precision::FP32;
};
auto parent = graphNodes.begin();
while (parent != graphNodes.end()) {
auto parentNode = *parent;
if (!isSuitableParentNode(parentNode)) {
parent++;
continue;
}
auto childNode = parentNode->getChildEdgeAt(0)->getChild();
if (childNode->getAlgorithm() != PoolingMax || childNode->getChildEdges().size() != 1) {
parent++;
continue;
}
auto fuseCandidate = childNode->getChildEdgeAt(0)->getChild();
if (parentNode->getType() == BinaryConvolution && !parentNode->canFuse(fuseCandidate)) {
parent++;
continue;
}
if (!one_of(fuseCandidate->getAlgorithm(), EltwiseRelu, EltwiseGelu, EltwiseElu, EltwiseSigmoid, EltwiseClamp, EltwiseTanh,
EltwiseSwish, EltwiseHswish, EltwiseMish, EltwiseHsigmoid, EltwiseRoundHalfToEven,
EltwiseRoundHalfAwayFromZero, EltwiseAbs, EltwiseSqrt, EltwiseSoftRelu)) {
parent++;
continue;
}
parentNode->addFusedNode(fuseCandidate);
parentNode->addOriginalLayer(fuseCandidate->getOriginalLayers());
auto parentEdges = fuseCandidate->parentEdges;
for (auto &parentEdge : parentEdges) {
auto p_edge = parentEdge.lock();
if (p_edge->getParent() == childNode)
continue;
graph.RemoveEdge(p_edge);
}
graph.DropNode(fuseCandidate);
}
}
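
// Same "node + simple operation" pattern for Convolution and BinaryConvolution: an eligible single consumer
// (activation, FakeQuantize, etc., as reported by canFuse) is folded into the convolution as a post-op.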
void MKLDNNGraphOptimizer::FuseConvolutionAndSimpleOperation(MKLDNNGraph &graph) {
auto& graphNodes = graph.GetNodes();
auto isSuitableParentNode = [](MKLDNNNodePtr node) {
return (node->getType() == Convolution || node->getType() == BinaryConvolution) && node->getChildEdges().size() == 1;
};
auto parent = graphNodes.begin();
while (parent != graphNodes.end()) {
auto parentNode = *parent;
if (!isSuitableParentNode(parentNode)) {
parent++;
continue;
}
const auto parentNodeType = parentNode->getType();
auto childNode = parentNode->getChildEdgeAt(0)->getChild();
if (!parentNode->canFuse(childNode)) {
parent++;
continue;
}
// Fusing with a FakeQuantize layer is skipped for BF16 (see BF16QuantizeNodeFusing)
if (BF16QuantizeNodeFusing(parentNode, childNode)) {
parent++;
continue;
}
childNode->fuseInto(parentNode);
if (childNode->getType() == FakeQuantize || childNode->getType() == Eltwise) {
auto parentEdges = childNode->parentEdges;
for (auto &parentEdge : parentEdges) {
auto p_edge = parentEdge.lock();
if (p_edge->getParent()->getType() == parentNodeType)
continue;
graph.RemoveEdge(p_edge);
}
}
graph.DropNode(childNode);
}
}
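
// Fuses an int8 average pooling with a following FakeQuantize (except the binarization flavour),
// so the requantization is performed as a post-op of the pooling primitive.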
void MKLDNNGraphOptimizer::FusePoolingAndFakeQuantize(MKLDNNGraph &graph) {
auto& graphNodes = graph.GetNodes();
auto isSuitableParentNode = [](MKLDNNNodePtr node) {
if (node->getType() == Pooling) {
if (!one_of(node->getOriginalInputPrecisionAtPort(0), Precision::U8, Precision::I8))
return false;
return node->getChildEdges().size() == 1 && node->getAlgorithm() == Algorithm::PoolingAvg;
}
return false;
};
auto isSuitableChildNode = [](MKLDNNNodePtr node) {
return node->getType() == FakeQuantize && node->getAlgorithm() != Algorithm::FQBinarization;
};
for (int i = 0; i < graphNodes.size(); i++) {
auto parent = graphNodes[i];
if (!isSuitableParentNode(parent)) continue;
auto child = parent->getChildEdgeAt(0)->getChild();
if (!isSuitableChildNode(child)) continue;
child->fuseInto(parent);
auto parents = child->parentEdges;
for (size_t i = 0; i < parents.size(); i++) {
auto p_edge = parents[i].lock();
if (p_edge->getParent()->getType() == Pooling)
continue;
graph.RemoveEdge(p_edge);
}
graph.DropNode(child);
}
}
/**
 * Check whether there is a data dependency between parent and child:
 * BFS starting from parent, comparing each visited node with child.
 *
 * @param parent head of the BFS
 * @param child node we are trying to find
 * @return true if child is reachable from parent through data edges
 */
static bool is_data_dependency(const std::shared_ptr<MKLDNNNode> &parent,
const std::shared_ptr<MKLDNNNode> &child) {
std::set<MKLDNNNode*> visited;
std::list<MKLDNNNode*> nextLayers {parent.get()};
for (; !nextLayers.empty();) {
auto layer = *nextLayers.begin();
if (layer == child.get()) return true;
for (auto oe : layer->getChildEdges()) {
auto nn = oe.lock()->getChild();
if (visited.find(nn.get()) == visited.end()) {
nextLayers.push_back(nn.get());
visited.insert(nn.get());
}
}
nextLayers.pop_front();
}
return false;
}
/*
* Before:
*
* *** *** *** ***
* | | | |
* +========+ +========+ +========+ +========+
* | any | | conv 2 | | any | | conv 2 |
* +========+ +========+ +========+ +========+
* | | | |
* +=====================+ +=====================+
* | Sum | or | Sum |
* +=====================+ +=====================+
* | |
* +===============+ ***
* | Relu |
* +===============+
* |
* ***
*
* After:
*
* *** ***
* | |
* +========+ +========+
* | any |-------| |
* +========+ | conv2 |
* | + |
* | sum |
* | + |
* | [relu] |
* | |
* +========+
* |
* +-------+
* |
* ***
*/
void MKLDNNGraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(MKLDNNGraph &graph) {
auto &graphNodes = graph.GetNodes();
auto isFusingSupported = [&](MKLDNNNodePtr conv, MKLDNNNodePtr child) {
return child->getType() == Eltwise &&
one_of(child->getAlgorithm(), EltwiseRelu, EltwiseElu, EltwiseSigmoid, EltwiseClamp, EltwiseSwish, EltwiseHswish,
EltwiseMish, EltwiseHsigmoid, EltwiseRoundHalfToEven, EltwiseRoundHalfAwayFromZero, EltwiseSoftRelu);
};
for (auto &graphNode : graphNodes) {
// TODO [DS]: at the moment this transformation is not allowed for the dynamic case
if (graphNode->getType() != Eltwise || graphNode->getAlgorithm() != EltwiseAdd || graphNode->isDynamicNode() ||
std::dynamic_pointer_cast<MKLDNNEltwiseNode>(graphNode)->isWithBroadcast())
continue;
// TODO: extend to several inputs
bool isSuitableNode = graphNode->getParentEdges().size() == 2;
if (!isSuitableNode)
continue;
auto parent1 = graphNode->getParentEdgesAtPort(0)[0]->getParent();
auto parent2 = graphNode->getParentEdgesAtPort(1)[0]->getParent();
bool isSuitableParent1 = parent1->getType() == Convolution || parent1->getType() == BinaryConvolution;
bool isSuitableParent2 = parent2->getType() == Convolution || parent2->getType() == BinaryConvolution;
auto canFuseSum = [](MKLDNNBinaryConvolutionNode *binConv, MKLDNNNodePtr fuseCandidate) {
if (binConv->getImplType() == impl_desc_type::ref)
return false;
if (binConv->isFusedWith(FakeQuantize))
return false;
if (fuseCandidate->getAlgorithm() == EltwiseAdd) {
for (auto& fusedNode : binConv->fusedWith) {
const auto eltwise = std::dynamic_pointer_cast<MKLDNNEltwiseNode>(fusedNode);
if (eltwise && eltwise->isSpecialConvolutionAddFusing()) {
return false;
}
}
return true;
}
return false;
};
auto* binConvNode1 = dynamic_cast<MKLDNNBinaryConvolutionNode *>(parent1.get());
if (binConvNode1) {
isSuitableParent1 = isSuitableParent1 && canFuseSum(binConvNode1, graphNode);
}
auto* binConvNode2 = dynamic_cast<MKLDNNBinaryConvolutionNode *>(parent2.get());
if (binConvNode2) {
isSuitableParent2 = isSuitableParent2 && canFuseSum(binConvNode2, graphNode);
}
auto* convNode1 = dynamic_cast<MKLDNNConvolutionNode *>(parent1.get());
if (convNode1) {
if (!convNode1->canBeExecutedInInt8()) {
isSuitableParent1 = isSuitableParent1 && convNode1->getFusedWith().empty();
}
}
auto* convNode2 = dynamic_cast<MKLDNNConvolutionNode *>(parent2.get());
if (convNode2) {
if (!convNode2->canBeExecutedInInt8()) {
isSuitableParent2 = isSuitableParent2 && convNode2->getFusedWith().empty();
}
}
if (!isSuitableParent1 && !isSuitableParent2)
continue;
auto mergedConv = isSuitableParent1 ? parent1 : parent2;
auto peerNode = isSuitableParent1 ? parent2 : parent1;
if (isSuitableParent1 && isSuitableParent2) {
if ((peerNode->getType() == Convolution || peerNode->getType() == BinaryConvolution) &&
mergedConv->getChildEdges().size() != 1) {
mergedConv = parent2;
peerNode = parent1;
}
}
if (peerNode->isConstant())
continue;
auto sum = graphNode;
if (mergedConv->isConstant() && !sum->isConstant())
continue;
auto lastNode = sum;
bool fuse_allowed = mergedConv->getChildEdges().size() == 1;
for (size_t j = 0; fuse_allowed && j < mergedConv->getParentEdges().size(); j++)
if (mergedConv->getParentEdgesAtPort(j)[0]->getParent() == peerNode)
fuse_allowed = false;
// The fused Conv+Sum primitive will be executed in-place, which means the input blob will
// be overwritten. We have to verify that all other consumers have already read it, so that
// it is safe to spoil the input data.
// TODO: rewrite once we add an "Inplace" reporting mechanism
for (auto & edge : peerNode->getChildEdges()) {
if (!fuse_allowed)
break;
fuse_allowed &= is_data_dependency(edge.lock()->getChild(), sum);
}
if (!fuse_allowed) continue;
if (graphNode->getChildEdges().size() == 1 &&
isFusingSupported(graphNode, graphNode->getChildEdgeAt(0)->getChild())) {
auto relu_shared = graphNode->getChildEdgeAt(0)->getChild();
lastNode = relu_shared;
if (mergedConv->isConstant() && !lastNode->isConstant())
continue;
sum->fuseInto(mergedConv);
}
lastNode->fuseInto(mergedConv);
if (mergedConv->fusedWith.size() > 0 &&
(mergedConv->fusedWith[0]->getType() == Convolution || mergedConv->fusedWith[0]->getType() == BinaryConvolution)) {
// Merged with DW_conv. Shape may change
mergedConv->inputShapes.push_back(mergedConv->fusedWith[0]->outputShapes[0]);
} else {
mergedConv->inputShapes.push_back(mergedConv->outputShapes[0]);
}
size_t childIdx = 0lu;
for (; childIdx < peerNode->getChildEdges().size(); childIdx++) {
if (peerNode->getChildEdgeAt(childIdx)->getChild() == sum) {
break;
}
}
int peer_port = peerNode->getChildEdgeAt(childIdx)->getInputNum();
peerNode->getChildEdgeAt(childIdx)->drop();
int childPort = 1;
auto* mergedConvNode = dynamic_cast<MKLDNNConvolutionNode*>(mergedConv.get());
if (mergedConvNode != nullptr)
childPort = mergedConvNode->getParentEdges().size();
auto* mergedBinConvNode = dynamic_cast<MKLDNNBinaryConvolutionNode*>(mergedConv.get());
if (mergedBinConvNode != nullptr)
childPort = mergedBinConvNode->getParentEdges().size();
MKLDNNEdgePtr edgePtr(new MKLDNNEdge(peerNode, mergedConv, peer_port, childPort));
graph.GetEdges().push_back(edgePtr);
mergedConv->addEdge(edgePtr);
std::vector<MKLDNNEdgeWeakPtr> edges_to_reconnect = lastNode->getChildEdges();
for (auto &edge_w : edges_to_reconnect) {
auto edge = edge_w.lock();
auto child = edge->getChild();
int idxParent = edge->getInputNum();
int idxChild = edge->getOutputNum();
// reconnect after activation/sum. Port index must be 0
IE_ASSERT(idxParent == 0);
edge->drop();
MKLDNNEdgePtr newEdge(new MKLDNNEdge(mergedConv, child, idxParent, idxChild));
graph.GetEdges().push_back(newEdge);
child->addEdge(newEdge);
}
if (lastNode != sum) {
lastNode->remove();
}
sum->remove();
}
}
void MKLDNNGraphOptimizer::FuseMVNAndSimpleOperation(MKLDNNGraph &graph) {
auto& graphNodes = graph.GetNodes();
auto isSuitableParentNode = [](MKLDNNNodePtr node) {
return (node->getType() == MVN) && (node->getChildEdges().size() == 1);
};
auto parent = graphNodes.begin();
while (parent != graphNodes.end()) {
auto parentNode = *parent;
if (!isSuitableParentNode(parentNode)) {
parent++;
continue;
}
auto childNode = parentNode->getChildEdgeAt(0)->getChild();
if (!parentNode->canFuse(childNode)) {
parent++;
continue;
}
childNode->fuseInto(parentNode);
if (childNode->getType() == FakeQuantize || childNode->getType() == Eltwise) {
auto parentEdges = childNode->parentEdges;
for (auto &parentEdge : parentEdges) {
auto p_edge = parentEdge.lock();
if (p_edge->getParent()->getType() == MVN)
continue;
graph.RemoveEdge(p_edge);
}
}
graph.DropNode(childNode);
}
}
void MKLDNNGraphOptimizer::FuseInterpolateAndSimpleOperation(MKLDNNGraph &graph) {
auto& graphNodes = graph.GetNodes();
auto isSuitableParentNode = [](MKLDNNNodePtr node) {
return node->getType() == Interpolate && node->getChildEdges().size() == 1;
};
auto isSuitableChildNode = [&](MKLDNNNodePtr parentNode, MKLDNNNodePtr childNode) {
// Avoid cyclic dependencies
for (auto &childParentEdge : childNode->getParentEdges()) {
for (auto &parentParentEdge : parentNode->getParentEdges()) {
if (childParentEdge.lock()->getParent() == parentParentEdge.lock()->getParent())
return false;
}
}
if (!childNode->getFusedWith().empty())
return false;
auto interpolateNode = dynamic_cast<MKLDNNInterpolateNode*>(parentNode.get());
return interpolateNode->canFuse(childNode);
};
auto parent = graphNodes.begin();
while (parent != graphNodes.end()) {
auto parentNode = *parent;
if (!isSuitableParentNode(parentNode)) {
parent++;
continue;
}
auto childNode = parentNode->getChildEdgeAt(0)->getChild();
if (!isSuitableChildNode(parentNode, childNode)) {
parent++;
continue;
}
childNode->fuseInto(parentNode);
if (childNode->getType() == FakeQuantize || childNode->getType() == Eltwise) {
auto parentEdges = childNode->parentEdges;
for (auto &parentEdge : parentEdges) {
auto p_edge = parentEdge.lock();
if (p_edge->getParent()->getType() == Interpolate)
continue;
graph.RemoveEdge(p_edge);
}
}
graph.DropNode(childNode);
}
}
void MKLDNNGraphOptimizer::FuseNormalizeL2AndSimpleOperation(MKLDNNGraph &graph) {
auto& graphNodes = graph.GetNodes();
auto isSuitableParentNode = [](MKLDNNNodePtr node) {
return node->getType() == NormalizeL2 && node->getChildEdges().size() == 1;
};
auto parent = graphNodes.begin();
while (parent != graphNodes.end()) {
auto parentNode = *parent;
if (!isSuitableParentNode(parentNode)) {
parent++;
continue;
}
auto childNode = parentNode->getChildEdgeAt(0)->getChild();
if (!parentNode->canFuse(childNode)) {
parent++;
continue;
}
childNode->fuseInto(parentNode);
if (childNode->getType() == FakeQuantize || childNode->getType() == Eltwise) {
auto parentEdges = childNode->parentEdges;
for (auto &parentEdge : parentEdges) {
auto p_edge = parentEdge.lock();
if (p_edge->getParent()->getType() == NormalizeL2)
continue;
graph.RemoveEdge(p_edge);
}
}
graph.DropNode(childNode);
}
}
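
// Fuses chains of Eltwise nodes (and Eltwise + FakeQuantize) into a single node. For an Eltwise child,
// the extra inputs of the child are re-wired to the merged parent, preserving the input order for MulAdd.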
void MKLDNNGraphOptimizer::FuseEltwiseAndSimple(MKLDNNGraph &graph) {
auto& graphNodes = graph.GetNodes();
auto isSuitableParentNode = [](MKLDNNNodePtr node) {
return node->getType() == Eltwise && node->getChildEdges().size() == 1;
};
auto isSuitableChildNode = [&](MKLDNNNodePtr parentNode, MKLDNNNodePtr childNode) {
if (parentNode->isConstant() && !childNode->isConstant())
return false;
for (auto &childParentEdge : childNode->getParentEdges()) {
// Workaround to prevent an "unsupported reorder" exception in some cases
if (childParentEdge.lock()->getParent()->getType() == Split) {
return false;
}
// Avoid cyclic dependencies
for (auto &parentParentEdge : parentNode->getParentEdges()) {
if (childParentEdge.lock()->getParent() == parentParentEdge.lock()->getParent())
return false;
}
}
if (!childNode->getFusedWith().empty())
return false;
return parentNode->canFuse(childNode);
};
auto parent = graphNodes.begin();
while (parent != graphNodes.end()) {
auto parentNode = *parent;
if (!isSuitableParentNode(parentNode)) {
parent++;
continue;
}
auto childNode = parentNode->getChildEdgeAt(0)->getChild();
if (!isSuitableChildNode(parentNode, childNode)) {
parent++;
continue;
}
childNode->fuseInto(parentNode);
if (childNode->getType() == FakeQuantize) {
auto parentEdges = childNode->parentEdges;
for (auto &parentEdge : parentEdges) {
auto p_edge = parentEdge.lock();
if (p_edge->getParent()->getType() == Eltwise)
continue;
graph.RemoveEdge(p_edge);
}
graph.DropNode(childNode);
} else if (childNode->getType() == Eltwise) {
auto children = childNode->childEdges;
auto parents = childNode->parentEdges;
auto initialParentInNum = parentNode->getParentEdges().size();
for (size_t i = 0; i < parents.size(); i++) {
auto p_edge = parents[i].lock();
if (!p_edge) continue;
auto parent = p_edge->getParent();
if (!parent) continue;
if (parent == parentNode) {
for (size_t j = 0; j < children.size(); j++) {
if (!children[j].lock())
continue;
auto child = children[j].lock()->getChild();
if (!child)
continue;
MKLDNNEdgePtr &remEdge = p_edge;
int inNum = 0;
if (remEdge) {
inNum = remEdge->getInputNum();
remEdge->drop();
graph.RemoveEdge(remEdge);
}
remEdge = children[j].lock();
int outNum = 0;
if (remEdge) {
outNum = remEdge->getOutputNum();
remEdge->drop();
graph.RemoveEdge(remEdge);
}
MKLDNNEdgePtr newEdge(new MKLDNNEdge(parent, child, inNum, outNum));
auto &graphEdges = graph.GetEdges();
graphEdges.push_back(newEdge);
parent->addEdge(newEdge);
parent->outputShapes[inNum] = child->inputShapes[outNum];
}
} else {
MKLDNNEdgePtr &remEdge = p_edge;
int inNum = 0;
int outNum = parentNode->getParentEdges().size();
if (remEdge) {
inNum = remEdge->getInputNum();
// Need to keep order for MulAdd
if (childNode->getAlgorithm() == EltwiseMulAdd) {
outNum = initialParentInNum + remEdge->getOutputNum() - 1;
}
remEdge->drop();
graph.RemoveEdge(remEdge);
}
MKLDNNEdgePtr newEdge(new MKLDNNEdge(parent, parentNode, inNum, outNum));
auto &graphEdges = graph.GetEdges();
graphEdges.push_back(newEdge);
parent->addEdge(newEdge);
parentNode->inputShapes.push_back(parent->outputShapes[0]);
}
}
graph.DropNode(childNode);
} else {
graph.DropNode(childNode);
}
}
}
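
// Collapses Reorder -> Reorder chains: both reorders are dropped and a single reorder converting directly
// from the first input descriptor to the final output descriptor is inserted instead.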
void MKLDNNGraphOptimizer::DropDoubleReorders(MKLDNNGraph &graph) {
std::set<MKLDNNNodePtr> processed;
int graphNodesSize = graph.GetNodes().size();
for (int i = 0; i < graphNodesSize; i++) {
MKLDNNNodePtr& node = graph.GetNodes()[i];
if (processed.find(node) == processed.end() && node->getType() == Reorder
&& node->getChildEdges().size() == 1
&& node->getChildEdgeAt(0)->getChild()->getType() == Reorder ) {
auto nextNode = node->getChildEdgeAt(0)->getChild();
MKLDNNReorderNode* n = dynamic_cast<MKLDNNReorderNode*>(node.get());
if (n == nullptr)
IE_THROW() << "Cannot get reorder layer " << node->getName();
MKLDNNReorderNode* nn = dynamic_cast<MKLDNNReorderNode*>(nextNode.get());
if (nn == nullptr)
IE_THROW() << "Cannot get reorder layer " << nextNode->getName();
MKLDNNNodePtr p = n->getParentEdgesAtPort(0)[0]->getParent();
MKLDNNNodePtr c = nn->getChildEdgesAtPort(0)[0]->getChild();
auto oldEdgeNum = n->getParentEdgesAtPort(0)[0]->getInputNum();
graph.DropNode(node);
graph.DropNode(nextNode);
processed.insert(node);
processed.insert(nextNode);
MKLDNNEdgePtr edge;
for (auto cur : p->getChildEdgesAtPort(oldEdgeNum)) {
if (cur->getChild() == c)
edge = cur;
}
if (!edge) IE_THROW() << "Inappropriate graph processing";
std::string layerName = edge->getParent()->getName() + "_ScaleReorder_" + edge->getChild()->getName();
graph.InsertReorder(edge, layerName, n->getInput(), nn->getOutput(), false);
graph.GetEdges().erase(std::remove(graph.GetEdges().begin(), graph.GetEdges().end(), edge), graph.GetEdges().end());
}
}
}
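
// Drops an explicit Broadcast (Generic) node that feeds an Eltwise: the Eltwise input shape is reset to the
// pre-broadcast shape and the broadcasting is left to the Eltwise implementation; the Broadcast's constant
// shape inputs are removed from the graph as well.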
void MKLDNNGraphOptimizer::FuseBroadcastAndEltwise(MKLDNNGraph &graph) {
auto& graphNodes = graph.GetNodes();
for (auto &graphNode : graphNodes) {
if (graphNode->getType() != Generic
|| graphNode->getTypeStr() != "Broadcast"
|| graphNode->getChildEdges().size() != 1lu
|| graphNode->getChildEdgeAt(0)->getChild()->getType() != Eltwise)
continue;
MKLDNNNodePtr& broadcastNode = graphNode;
MKLDNNNodePtr eltwiseNode = broadcastNode->getChildEdgeAt(0)->getChild();
eltwiseNode->inputShapes[broadcastNode->getChildEdgeAt(0)->getOutputNum()]
= broadcastNode->getInputShapeAtPort(0);
auto& edges = graph.GetEdges();
for (size_t i = 1lu; i < broadcastNode->getParentEdges().size(); i++) {
auto constParent = broadcastNode->getParentEdgesAtPort(i)[0]->getParent();
for (auto it = edges.begin(); it != edges.end(); it++) {
if ((*it) == constParent->getChildEdgeAt(0)) {
edges.erase(it);
constParent->remove();
break;
}
}
}
graph.DropNode(broadcastNode);
}
}
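
// Folds a Clamp into a following FakeQuantize by tightening the FakeQuantize crop bounds with the
// clamp's alpha/beta limits; the Clamp node is then dropped.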
void MKLDNNGraphOptimizer::FuseClampAndFakeQuantize(MKLDNNGraph &graph) {
auto& graphNodes = graph.GetNodes();
auto isSuitableClampNode = [](MKLDNNNodePtr node) {
return node->getType() == Eltwise && node->getChildEdges().size() == 1 && node->getAlgorithm() == EltwiseClamp;
};
auto isSuitableFakeQuantizeNode = [](MKLDNNNodePtr node) {
return node->getType() == FakeQuantize && node->getAlgorithm() != FQBinarization;
};
auto fuseClampAndFakeQuantizeNodes = [](MKLDNNNodePtr parent, MKLDNNNodePtr child) {
auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(parent.get());
if (eltwiseNode == nullptr)
IE_THROW() << "Cannot cast " << parent->getName() << " to Eltwise node";
auto* fakeQuantizeNode = dynamic_cast<MKLDNNFakeQuantizeNode*>(child.get());
if (fakeQuantizeNode == nullptr)
IE_THROW() << "Cannot cast " << child->getName() << " to FakeQuantize node";
const std::vector<float>& cropLowData = fakeQuantizeNode->getCropLow();
const std::vector<float>& cropHighData = fakeQuantizeNode->getCropHigh();
std::vector<float> newCropLow(cropLowData.size());
std::vector<float> newCropHigh(cropHighData.size());
for (int i = 0; i < cropLowData.size(); i++)
newCropLow[i] = std::max(cropLowData[i], eltwiseNode->getAlpha());
for (int i = 0; i < cropHighData.size(); i++)
newCropHigh[i] = std::min(cropHighData[i], eltwiseNode->getBeta());
fakeQuantizeNode->setCropLow(newCropLow);
fakeQuantizeNode->setCropHigh(newCropHigh);
return true;
};
for (int i = 0; i < graphNodes.size(); i++) {
auto parent = graphNodes[i];
if (!isSuitableClampNode(parent)) continue;
auto child = parent->getChildEdgeAt(0)->getChild();
if (!isSuitableFakeQuantizeNode(child)) continue;
if (fuseClampAndFakeQuantizeNodes(parent, child)) {
graph.DropNode(parent);
}
}
}
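
// Folds an eltwise that can be performed as a per-channel scale-shift (Add/Subtract/Multiply/Divide/MulAdd
// with a constant input) into a following FakeQuantize by recomputing its crop, input scale and input shift
// values; subnormal results are flushed to zero to keep the quantization parameters well formed.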
void MKLDNNGraphOptimizer::FusePerformedAsScaleShiftAndFakeQuantize(MKLDNNGraph &graph) {
auto& graphNodes = graph.GetNodes();
auto getConstPort = [](const MKLDNNNodePtr node) -> int {
if (node->getParentEdgesAtPort(0)[0]->getParent()->getType() == Input && node->getParentEdgesAtPort(0)[0]->getParent()->isConstant()) {
return 0;
} else if (node->getParentEdgesAtPort(1)[0]->getParent()->getType() == Input && node->getParentEdgesAtPort(1)[0]->getParent()->isConstant()) {
return 1;
} else {
return -1;
}
};
auto isSuitableScaleShiftNode = [getConstPort](MKLDNNNodePtr node) {
if (one_of(node->getAlgorithm(), EltwiseAdd, EltwiseSubtract, EltwiseMultiply, EltwiseDivide, EltwiseMulAdd)) {
MKLDNNNode *parent = nullptr;
if (node->getAlgorithm() != EltwiseMulAdd) {
const auto constPort = getConstPort(node);
if (constPort == -1) {
return false;
}
parent = node->getParentEdgesAtPort(1 - constPort)[0]->getParent().get();
}
return node->getType() == Eltwise && node->getChildEdges().size() == 1 && node->canBePerformedAsScaleShift(parent);
}
return false;
};
auto isSuitableFakeQuantizeNode = [](MKLDNNNodePtr node) {
return node->getType() == FakeQuantize && node->getAlgorithm() != FQBinarization;
};
auto fuseScaleShiftAndFakeQuantizeNodes = [getConstPort](MKLDNNNodePtr parent, MKLDNNNodePtr child) {
auto fakeQuantizeNode = std::dynamic_pointer_cast<MKLDNNFakeQuantizeNode>(child);
if (fakeQuantizeNode == nullptr)
IE_THROW() << "Cannot cast " << child->getName() << " to FakeQuantize node";
std::vector<float> scalesBuffer;
std::vector<float> shiftsBuffer;
auto parentEltwise = std::dynamic_pointer_cast<MKLDNNEltwiseNode>(parent);
if (!parentEltwise) {
IE_THROW() << "Cannot cast " << parent->getName() << " to Eltwise node";
}
std::tie(scalesBuffer, shiftsBuffer) = parentEltwise->getScalesAndShifts(parent->getParentEdgesAtPort(1 - getConstPort(parent))[0]->getParent().get());
const auto &outputShape = child->getOutputShapeAtPort(0);
VectorDims outputDims = outputShape.getDims();
const size_t channelPos = outputDims.size() > 1 ? 1 : 0;
if (outputShape.isDynamic()) {
if (outputDims[channelPos] == Shape::UNDEFINED_DIM) {
if (scalesBuffer.size() > 1) {
outputDims[channelPos] = scalesBuffer.size();
} else if (shiftsBuffer.size() > 1) {
outputDims[channelPos] = shiftsBuffer.size();
} else {
return false;
}
}
}
scalesBuffer = makeAlignedBuffer(outputDims[channelPos], scalesBuffer, 1);
shiftsBuffer = makeAlignedBuffer(outputDims[channelPos], shiftsBuffer, 1);
for (int i = 0; i < scalesBuffer.size(); i++)
if (scalesBuffer[i] == 0.f)
return false;
const std::vector<float>& cropLowData = fakeQuantizeNode->getCropLow();
const std::vector<float>& cropHighData = fakeQuantizeNode->getCropHigh();
const std::vector<float>& inputScaleData = fakeQuantizeNode->getInputScale();
const std::vector<float>& inputShiftData = fakeQuantizeNode->getInputShift();
std::vector<float> newCropLow(scalesBuffer.size());
std::vector<float> newCropHigh(scalesBuffer.size());
std::vector<float> newInputScale(scalesBuffer.size());
std::vector<float> newInputShift(scalesBuffer.size());
for (int i = 0; i < newCropLow.size(); i++) {
float cl = cropLowData.size() == 1 ? cropLowData[0] : cropLowData[i];
float ch = cropHighData.size() == 1 ? cropHighData[0] : cropHighData[i];
float newCL = (cl - shiftsBuffer[i]) / scalesBuffer[i];
float newCH = (ch - shiftsBuffer[i]) / scalesBuffer[i];
newCropLow[i] = std::min(newCL, newCH);
newCropHigh[i] = std::max(newCL, newCH);
if (std::isinf(newCropLow[i])) {
newCropLow[i] = std::numeric_limits<float>::lowest();
}
if (std::isinf(newCropHigh[i])) {
newCropHigh[i] = std::numeric_limits<float>::max();
}
}
std::vector<float> zeroShift(newInputScale.size(), 0.f);
const auto isSubnormal = [](const float value) {
const uint32_t *u32data = reinterpret_cast<const uint32_t*>(&value);
return (*u32data) && (((*u32data) & (0xFF << 23)) == 0);
};
for (int i = 0; i < newInputScale.size(); i++) {
float isc = inputScaleData.size() == 1 ? inputScaleData[0] : inputScaleData[i];
newInputScale[i] = isc * scalesBuffer[i];
if (isSubnormal(newInputScale[i])) {
newInputScale[i] = 0.f;
// the zero value has to be shifted if it is not in the input range
float cl = cropLowData.size() == 1 ? cropLowData[0] : cropLowData[i];
float ch = cropHighData.size() == 1 ? cropHighData[0] : cropHighData[i];
if (0.f < cl) {
zeroShift[i] = isc * cl;
}
if (ch < 0.f) {
zeroShift[i] = isc * ch;
}
}
}
for (int i = 0; i < newInputShift.size(); i++) {
float isc = inputScaleData.size() == 1 ? inputScaleData[0] : inputScaleData[i];
float ish = inputShiftData.size() == 1 ? inputShiftData[0] : inputShiftData[i];
newInputShift[i] = ish + shiftsBuffer[i] * isc + zeroShift[i];
if (isSubnormal(newInputShift[i])) {
newInputShift[i] = 0.f;
}
}
fakeQuantizeNode->setCropLow(newCropLow);
fakeQuantizeNode->setCropHigh(newCropHigh);
fakeQuantizeNode->setInputScale(newInputScale);
fakeQuantizeNode->setInputShift(newInputShift);
return true;
};
for (int i = 0; i < graphNodes.size(); i++) {
auto parent = graphNodes[i];
if (!isSuitableScaleShiftNode(parent)) continue;
auto child = parent->getChildEdgeAt(0)->getChild();
if (!isSuitableFakeQuantizeNode(child)) continue;
if (fuseScaleShiftAndFakeQuantizeNodes(parent, child)) {
auto parentEdges = parent->parentEdges;
for (auto &parentEdge : parentEdges) {
auto p_edge = parentEdge.lock();
if (!p_edge->getParent()->isConstant())
continue;
graph.RemoveEdge(p_edge);
}
graph.DropNode(parent);
}
}
}
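
// Replaces a Transpose+Reorder pair that amounts to an identity permutation of the memory with a single
// "fake" optimized Reorder (plus an extra converting Reorder when the precisions differ); see the detailed
// comments inside the function.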
void MKLDNNGraphOptimizer::MergeTransposeAndReorder(MKLDNNGraph &graph) {
auto& graphNodes = graph.GetNodes();
auto isSuitableParentNode = [](MKLDNNNodePtr node) {
return node->getType() == Transpose && node->getChildEdges().size() == 1;
};
auto isSuitableChildNode = [](MKLDNNNodePtr node) {
return node->getType() == Reorder && node->getChildEdges().size() == 1;
};
// Method checkAscendingSummaryOrder() checks that after the sequential execution of the Transpose and Reorder nodes,
// the order of the elements in memory does not change. In other words, that the Transpose+Reorder pair is an identity permutation.
auto checkAscendingSummaryOrder = [](std::shared_ptr<MKLDNNNode> &parentNode, std::shared_ptr<MKLDNNNode> &childNode) -> bool {
auto* transposeNode = dynamic_cast<MKLDNNTransposeNode*>(parentNode.get());
auto* reorderNode = dynamic_cast<MKLDNNReorderNode*>(childNode.get());
if (!transposeNode || !reorderNode) {
return false;
}
auto& transposeOrder = transposeNode->getOrder();
auto layoutOrder = transposeNode->getSelectedPrimitiveDescriptor()->getConfig().outConfs[0].desc->as<BlockedMemoryDesc>()->getOrder();
auto inBlockedDesc = reorderNode->getSelectedPrimitiveDescriptor()->getConfig().inConfs[0].desc->as<BlockedMemoryDesc>();
auto outBlockedDesc = reorderNode->getSelectedPrimitiveDescriptor()->getConfig().outConfs[0].desc->as<BlockedMemoryDesc>();
auto& inOrder = inBlockedDesc->getOrder();
auto& outOrder = outBlockedDesc->getOrder();
if (transposeOrder.size() != layoutOrder.size() || layoutOrder.size() != inOrder.size() || inOrder.size() != outOrder.size()) {
return false;
}
// revLayoutOrder - reverse permutation for layoutOrder
auto revLayoutOrder = VectorDims(layoutOrder.size());
for (int i = 0; i < revLayoutOrder.size(); i++) {
revLayoutOrder[layoutOrder[i]] = i;
}
// newTransposeOrder - Transpose layout-aware permutation
auto newTransposeOrder = VectorDims(transposeOrder.size());
for (int i = 0; i < newTransposeOrder.size(); i++) {
newTransposeOrder[i] = layoutOrder[transposeOrder[revLayoutOrder[i]]];
}
// reorderOrder - Reorder layout-aware permutation
auto reorderOrder = VectorDims(outOrder.size());
for (int i = 0; i < reorderOrder.size(); i++) {
for (int j = 0; j < reorderOrder.size(); j++) {
if (outOrder[i] == inOrder[j]) {
reorderOrder[i] = j;
continue;
}
}
}
// summaryOrder - resulting Transpose+Reorder permutation
auto summaryOrder = VectorDims(transposeOrder.size());
for (int i = 0; i < summaryOrder.size(); i++) {
summaryOrder[i] = reorderOrder[newTransposeOrder[i]];
}
// check that Transpose+Reorder is the identity permutation
for (int i = 0; i < summaryOrder.size(); i++) {
if (summaryOrder[i] != i) {
return false;
}
}
return true;
};
// Transpose and Reorder perform opposite permutations of each other.
// Example:
//      a chain [physical layout: NCHW, logical layout: NCHW] -> Transpose(order=0312) -> [physical layout: NWCH, logical layout: NCHW] ->
//      Reorder(nchw->nhwc) -> [physical layout: NCHW, logical layout: NHWC] can be replaced with Reorder(nchw->nhwc; isOptimized=true),
//      which just reinterprets the layout without physically changing the memory.
// Two cases are possible:
//      1) inPrec == outPrec
//         In this case, we replace the Transpose+Reorder pattern with a new Reorder that does nothing.
//      2) inPrec != outPrec
//         As in the first case, we replace the Transpose+Reorder pattern with a new Reorder.
//         Additionally, we insert another Reorder that performs the conversion from the input precision (inPrec)
//         to the output precision (outPrec).
auto mergeTransposeAndReorder = [&](std::shared_ptr<MKLDNNNode>& parentNode, std::shared_ptr<MKLDNNNode>& childNode) {
auto parentParentNode = parentNode->getParentEdgesAtPort(0)[0]->getParent();
auto parentParentConstNode = parentNode->getParentEdgesAtPort(1)[0]->getParent();
auto childChildNode = childNode->getChildEdgeAt(0)->getChild();
auto &remEdge = parentParentConstNode->getChildEdgeAt(0);
remEdge->drop();
auto& edges = graph.GetEdges();
for (auto it = edges.begin(); it != edges.end(); it++) {
if ((*it) == remEdge) {
edges.erase(it);
parentParentConstNode->remove();
break;
}
}
graph.DropNode(parentNode);
graph.DropNode(childNode);
auto& inDesc = parentNode->getSelectedPrimitiveDescriptor()->getConfig().inConfs[0].desc;
auto& outDesc = childNode->getSelectedPrimitiveDescriptor()->getConfig().outConfs[0].desc;
auto inPrec = inDesc->getPrecision();
auto outPrec = outDesc->getPrecision();
auto reorderInDesc = inDesc;
auto reorderOutDesc = outDesc->cloneWithNewPrecision(inPrec);
std::string reorderlayerName = parentParentNode->getName() + "_" +
MKLDNNReorderNode::getReorderArgs(*reorderInDesc, *reorderOutDesc) + "_" + "fake";
MKLDNNEdgePtr edge;
for (auto &childEdge : parentParentNode->getChildEdges()) {
if (childEdge.lock()->getChild() == childChildNode) {
edge = childEdge.lock();
break;
}
}
if (!edge) {
IE_THROW() << "Transpose node '" << parentNode->getName() << "' has invalid edges.";
}
auto reorderNode = graph.InsertReorder(edge, reorderlayerName, *reorderInDesc, *reorderOutDesc, true);
// case 2
if (inPrec != outPrec) {
auto reorderInDesc2 = reorderOutDesc;
auto reorderOutDesc2 = outDesc;
std::string reorderLayerName2 = reorderNode->getName() + "_" +
MKLDNNReorderNode::getReorderArgs(*reorderInDesc2, *reorderOutDesc2) + "_" + childChildNode->getName();
graph.InsertReorder(reorderNode->getChildEdgeAt(0), reorderLayerName2, *reorderInDesc2, *reorderOutDesc2, false);
}
};
for (int i = 0; i < graphNodes.size(); i++) {
auto parentNode = graphNodes[i];
if (!isSuitableParentNode(parentNode)) {
continue;
}
auto childNode = parentNode->getChildEdgeAt(0)->getChild();
if (!isSuitableChildNode(childNode)) {
continue;
}
if (checkAscendingSummaryOrder(parentNode, childNode)) {
mergeTransposeAndReorder(parentNode, childNode);
}
}
}
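
// For an RNNSeq node whose layout is not the native order and whose 4D output has a second dimension equal to 1,
// this pass removes that dimension from the node's output shape and inserts a Reshape (built from an ngraph
// Unsqueeze) on every output edge, so consumers still receive the original 4D shape.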
void MKLDNNGraphOptimizer::reshapeRnnSeq(MKLDNNGraph &graph) {
auto& graphNodes = graph.GetNodes();
auto isSuitableParentNode = [](MKLDNNNodePtr node) {
if (node->type != RNNSeq)
return false;
auto rnnNode = std::dynamic_pointer_cast<MKLDNNRNN>(node);
return rnnNode && !rnnNode->hasNativeOrder() && node->outputShapes[0].getRank() == 4 && node->outputShapes[0].getDims()[1] == 1;
};
for (int i = 0; i < graphNodes.size(); i++) {
auto& parentNode = graphNodes[i];
if (!isSuitableParentNode(parentNode)) {
continue;
}
auto childrenEdges = parentNode->getChildEdgesAtPort(0);
std::vector<ov::Dimension> origShape = static_cast<std::vector<ov::Dimension>>(parentNode->getOutputShapeAtPort(0).toPartialShape());
origShape.erase(origShape.begin() + 1);
const auto newShape = Shape(origShape);
parentNode->outputShapes[0] = newShape;
for (size_t i = 0; i < childrenEdges.size(); i++) {
auto edge = childrenEdges[i];
auto childNode = edge->getChild();
const auto secondInput = std::make_shared<ngraph::opset1::Constant>(ov::element::i32, ngraph::Shape{1}, std::vector<int>{1});
const auto unsqueeze = std::make_shared<ngraph::opset1::Unsqueeze>(
std::make_shared<ngraph::opset1::Parameter>(details::convertPrecision(parentNode->getOriginalOutputPrecisionAtPort(0)),
parentNode->getOutputShapeAtPort(0).toPartialShape()), secondInput);
unsqueeze->set_friendly_name(parentNode->getName() + "_abc_a1bc_" + std::to_string(i));
const auto cpuUnsqueeze = std::make_shared<MKLDNNReshapeNode>(unsqueeze, graph.getEngine(), graph.weightsCache);
graph.InsertNode(parentNode, childNode, cpuUnsqueeze, edge->getInputNum(), edge->getOutputNum(), false);
const auto cpuConstant = std::make_shared<MKLDNNInputNode>(secondInput, graph.getEngine(), graph.weightsCache);
MKLDNNEdgePtr newEdge(new MKLDNNEdge(cpuConstant, cpuUnsqueeze, 0, 1));
cpuUnsqueeze->addEdge(newEdge);
auto &graphEdges = graph.GetEdges();
graphEdges.push_back(newEdge);
graphNodes.push_back(cpuConstant);
edge->drop();
graph.RemoveEdge(edge);
}
}
}