[CPU] Execute constants in order with the create primitives calls (#16795)
This commit is contained in:
@@ -386,7 +386,7 @@ void Graph::InitGraph() {
|
||||
if (node->isDynamicNode()) {
|
||||
haveDynNodes = true;
|
||||
if (node->outputShapeDataDependency() ||
|
||||
// WA: for convolution plus summ(broadcast). Due to the fact that a convolution with sum use the same memory for second sum term and the output
|
||||
// WA: for convolution plus sum(broadcast). Due to the fact that a convolution with sum use the same memory for second sum term and the output
|
||||
// tensors (inPlace) resizing the output tensor, may lead to reallocation of this second term memory and possible data loss. The reallocation
|
||||
// may happen when the second term shape is broadcasted to the output tensor shape. To avoid the data loss, we have a special processing for
|
||||
// such cases inside the convolution node, but it works properly only when dynamic shapes inference, preparation and execution are called
|
||||
@@ -398,25 +398,25 @@ void Graph::InitGraph() {
|
||||
}
|
||||
|
||||
// In case of dynamic shapes, tensors may be resized due to the shapes variations.
|
||||
// If the input tensor is included to memory reuse, that means its memory manager is shared with other tensors in the graph, which in turn may cause data
|
||||
// loss when one of the tensor dow the graph requested mem resize, while the input data have not been yet read by the consumers. To avoid such situations
|
||||
// we disalbe io mem reuse for the case of dynamic shapes.
|
||||
// If the input tensor is included to memory reuse, it means that its memory manager is shared with other tensors in the graph, which in turn may cause data
|
||||
// loss when one of the tensors down the graph requests mem resize, while the input data have not been yet read by the consumers. To avoid such situations
|
||||
// we disable io mem reuse for the case of dynamic shapes.
|
||||
if (haveDynNodes) {
|
||||
this->reuse_io_tensors = false;
|
||||
}
|
||||
|
||||
Allocate();
|
||||
|
||||
CreatePrimitives();
|
||||
CreatePrimitivesAndExecConstants();
|
||||
|
||||
#ifndef CPU_DEBUG_CAPS
|
||||
for (auto &graphNode : graphNodes) {
|
||||
graphNode->cleanup();
|
||||
}
|
||||
#endif
|
||||
ExtractConstantAndExecutableNodes();
|
||||
|
||||
ExecuteConstantNodesOnly();
|
||||
ExtractExecutableNodes();
|
||||
|
||||
status = haveDynNodes ? Status::ReadyDynamic : Status::ReadyStatic;
|
||||
}
|
||||
|
||||
@@ -483,12 +483,10 @@ void Graph::InitOptimalPrimitiveDescriptors() {
|
||||
}
|
||||
}
|
||||
|
||||
void Graph::ExtractConstantAndExecutableNodes() {
|
||||
OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "Graph::ExtractConstantAndExecutableNodes");
|
||||
void Graph::ExtractExecutableNodes() {
|
||||
OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "Graph::ExtractExecutableNodes");
|
||||
for (const auto& graphNode : graphNodes) {
|
||||
if (graphNode->isConstant()) {
|
||||
constantGraphNodes.emplace_back(graphNode);
|
||||
} else if (CPU_DEBUG_CAPS_ALWAYS_TRUE(graphNode->isExecutable()) || graphNode->isDynamicNode()) {
|
||||
if ((!graphNode->isConstant() && CPU_DEBUG_CAPS_ALWAYS_TRUE(graphNode->isExecutable())) || graphNode->isDynamicNode()) {
|
||||
/* @todo
|
||||
* Revise implementation.
|
||||
* With current way it is possible that with debug_caps enabled
|
||||
@@ -503,8 +501,8 @@ void Graph::ExtractConstantAndExecutableNodes() {
|
||||
}
|
||||
}
|
||||
|
||||
void Graph::ExecuteConstantNodesOnly() const {
|
||||
OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "Graph::ExecuteConstantNodesOnly");
|
||||
void Graph::CreatePrimitivesAndExecConstants() const {
|
||||
OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "Graph::CreatePrimitivesAndExecConstants");
|
||||
dnnl::stream stream(getEngine());
|
||||
|
||||
using shared_memory_ptr = WeightsSharing::SharedMemory::Ptr;
|
||||
@@ -531,7 +529,17 @@ void Graph::ExecuteConstantNodesOnly() const {
|
||||
return std::make_tuple(hasExternalInvalidEdges, hasLocalAllocatedEdges, outputs);
|
||||
};
|
||||
|
||||
for (const auto &node : constantGraphNodes) {
|
||||
for (const auto &node : graphNodes) {
|
||||
{
|
||||
OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, node->profiling.createPrimitive);
|
||||
DEBUG_LOG(*node);
|
||||
node->createPrimitive();
|
||||
}
|
||||
|
||||
if (!node->isConstant()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (context->getWeightsCache()) {
|
||||
auto sharedOutputs = acquireSharedOutputs(node);
|
||||
|
||||
@@ -884,15 +892,6 @@ void Graph::Allocate() {
|
||||
for (auto& edge : graphEdges) edge->validate();
|
||||
}
|
||||
|
||||
// Calls createPrimitive() on every node in graphNodes, in the container's iteration order.
// The whole pass is wrapped in an ITT task named "Graph::CreatePrimitives"; each node's call
// is additionally wrapped in an ITT scope keyed by node->profiling.createPrimitive and is
// preceded by a DEBUG_LOG of the node.
// NOTE(review): this listing appears to be the pre-change version of the pass; the same
// per-node sequence also shows up inside Graph::CreatePrimitivesAndExecConstants — confirm
// against the repository which of the two is current.
void Graph::CreatePrimitives() {
|
||||
OV_ITT_SCOPED_TASK(itt::domains::intel_cpu, "Graph::CreatePrimitives");
|
||||
for (auto& node : graphNodes) {
|
||||
OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, node->profiling.createPrimitive);
|
||||
DEBUG_LOG(*node);
|
||||
node->createPrimitive();
|
||||
}
|
||||
}
|
||||
|
||||
void Graph::PushInputData(const std::string& name, const InferenceEngine::Blob::Ptr &in) {
|
||||
if (!IsReady()) IE_THROW()<< "Wrong state. Topology not ready.";
|
||||
|
||||
|
||||
@@ -230,10 +230,9 @@ protected:
|
||||
void InitEdges();
|
||||
void Allocate();
|
||||
void AllocateWithReuse();
|
||||
void CreatePrimitives();
|
||||
void ExtractConstantAndExecutableNodes();
|
||||
void ExtractExecutableNodes();
|
||||
void ExecuteNode(const NodePtr& node, const dnnl::stream& stream) const;
|
||||
void ExecuteConstantNodesOnly() const;
|
||||
void CreatePrimitivesAndExecConstants() const;
|
||||
void InferStatic(InferRequestBase* request);
|
||||
void InferDynamic(InferRequestBase* request);
|
||||
|
||||
@@ -248,9 +247,8 @@ private:
|
||||
std::map<std::string, NodePtr> outputNodesMap;
|
||||
|
||||
// these node pointers (from graphNodes) are to avoid regular checking for
|
||||
// constantness of nodes in ExecuteConstantNodesOnly, Infer methods and calls of
|
||||
// constantness of nodes in Infer methods and calls of
|
||||
// non-executable (optimized out) nodes, such as Input, Reshape, etc.
|
||||
std::vector<NodePtr> constantGraphNodes;
|
||||
std::vector<NodePtr> executableGraphNodes;
|
||||
|
||||
std::unordered_map<Node*, size_t> syncNodesInds;
|
||||
|
||||
@@ -1501,7 +1501,7 @@ void Convolution::prepareParams() {
|
||||
auto it = primArgs.find(DNNL_ARG_WEIGHTS);
|
||||
if (it == primArgs.end() || !prevExecPtr ||
|
||||
!execPtr->getWeightDesc()->isCompatible(*(prevExecPtr->getWeightDesc()))) {
|
||||
pendingConstWeightReorder = true;
|
||||
primArgs[DNNL_ARG_WEIGHTS] = prepareWeightMemory(execPtr->getWeightDesc())->GetPrimitive();
|
||||
}
|
||||
} else {
|
||||
// non-const weight will be reordered by executor on every exec
|
||||
@@ -1558,11 +1558,6 @@ void Convolution::execute(dnnl::stream strm) {
|
||||
IE_THROW() << "Can't execute Convolution node with name: " << getName() << ", because executor is not compiled";
|
||||
}
|
||||
|
||||
if (pendingConstWeightReorder) {
|
||||
primArgs[DNNL_ARG_WEIGHTS] = prepareWeightMemory(execPtr->getWeightDesc())->GetPrimitive();
|
||||
pendingConstWeightReorder = false;
|
||||
}
|
||||
|
||||
execPtr->exec(primArgs, strm);
|
||||
}
|
||||
|
||||
|
||||
@@ -98,7 +98,6 @@ private:
|
||||
const dnnl::engine& engine,
|
||||
bool constWeight);
|
||||
};
|
||||
bool pendingConstWeightReorder = false;
|
||||
|
||||
void prepareParams() override;
|
||||
void execute(dnnl::stream strm) override;
|
||||
|
||||
Reference in New Issue
Block a user