[CPU] Reduced overheads in inference stage (#6794)
commit 802b5bcfbb (parent 3ab533a89c)
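This change trims per-infer-request overhead in the CPU (MKLDNN) plugin. The hunks below fall into three themes: repeated map lookups and getter chains such as `input->second->getChildEdgeAt(0)` and `inputBlob->getTensorDesc()` are hoisted into local variables; `changeDefaultPtr()` replaces its linear scan over `GetOutputNodesMap()` with a map `find()` and walks child edges through the `getChildEdges()` collection; and `MKLDNNNode::isInplace()` becomes `isInPlace()`, memoized in a new tri-state `InPlaceType` member that is reset whenever a primitive descriptor is selected. `PullOutputData()` additionally moves the dynamic-batch `size_to_copy` computation into the branch that actually copies data.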
@@ -722,8 +722,13 @@ void MKLDNNGraph::PushInputData(const std::string& name, const InferenceEngine::

     auto input = inputNodesMap.find(name);
     if (input != inputNodesMap.end()) {
+        auto& inTensorDesc = in->getTensorDesc();
+        auto node = input->second;
+        auto childEdge = node->getChildEdgeAt(0);
+        const auto& outDims = node->getOutputShapeAtPort(0);
+
         const void *ext_data_ptr = in->cbuffer();
-        void *inter_data_ptr = input->second->getChildEdgeAt(0)->getMemory().GetData();
+        void *inter_data_ptr = childEdge->getMemory().GetData();

         if (ext_data_ptr != inter_data_ptr) {
             auto ext_tdesc = MemoryDescUtils::convertToDnnlBlockedMemoryDesc(in->getTensorDesc());
@@ -731,17 +736,16 @@ void MKLDNNGraph::PushInputData(const std::string& name, const InferenceEngine::
             auto ext_mem = MKLDNNMemory(eng);
             ext_mem.Create(ext_tdesc, ext_data_ptr, false);

-            input->second->getChildEdgeAt(0)->getMemory().SetData(ext_mem, 0, false);
+            childEdge->getMemory().SetData(ext_mem, 0, false);
         }

         // todo: make sure 'name' exists in this map...
         if (_normalizePreprocMap.find(name) != _normalizePreprocMap.end()) {
-            if (in->getTensorDesc().getPrecision() == InferenceEngine::Precision::FP32) {
-                _normalizePreprocMap[name].NormalizeImage(input->second->getOutputShapeAtPort(0),
-                                                          reinterpret_cast<float *>(inter_data_ptr),
-                                                          in->getTensorDesc().getLayout());
+            if (inTensorDesc.getPrecision() == InferenceEngine::Precision::FP32) {
+                _normalizePreprocMap[name].NormalizeImage(outDims, reinterpret_cast<float *>(inter_data_ptr),
+                                                          inTensorDesc.getLayout());
             } else {
-                IE_THROW() << "Mean image of type " << in->getTensorDesc().getPrecision().name() << " is unsupported";
+                IE_THROW() << "Mean image of type " << inTensorDesc.getPrecision().name() << " is unsupported";
             }
         }
     } else {
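The pattern in the hunk above — look the input node up once, then cache `node`, `childEdge`, `inTensorDesc`, and `outDims` instead of re-evaluating `input->second->getChildEdgeAt(0)` and `in->getTensorDesc()` on every use — is the main micro-optimization of this commit. A minimal standalone sketch of the same idea (all names here are hypothetical, not plugin API):

```cpp
// Sketch of the hoisting pattern: one map lookup, then cached sub-objects
// reused by every later statement. Node/Edge/demo() are illustrative only.
#include <cassert>
#include <map>
#include <memory>
#include <string>

struct Edge { void* data = nullptr; };
struct Node {
    std::shared_ptr<Edge> childEdge = std::make_shared<Edge>();
    std::shared_ptr<Edge> getChildEdgeAt(size_t) const { return childEdge; }
};

void demo(const std::map<std::string, std::shared_ptr<Node>>& inputNodesMap,
          const std::string& name) {
    auto input = inputNodesMap.find(name);   // one lookup...
    if (input == inputNodesMap.end()) return;
    auto node = input->second;               // ...and cached results reused
    auto childEdge = node->getChildEdgeAt(0);
    // later accesses go through childEdge, not a repeated
    // input->second->getChildEdgeAt(0) chain per statement
    assert(childEdge == node->getChildEdgeAt(0));
}

int main() {
    std::map<std::string, std::shared_ptr<Node>> m{{"in0", std::make_shared<Node>()}};
    demo(m, "in0");
}
```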
@@ -756,15 +760,17 @@ void MKLDNNGraph::PullOutputData(BlobMap &out) {
     for (auto &outputMap : outputNodesMap) {
         auto name = outputMap.first;
         auto node = outputMap.second;
-        const MKLDNNMemory& intr_blob = node->getParentEdgeAt(0)->getMemory();
+        auto parentEdge = node->getParentEdgeAt(0);
+        const MKLDNNMemory& intr_blob = parentEdge->getMemory();

-        auto ext_blob = out.find(name);
-        if (ext_blob == out.end()) {
+        const auto ext_blob_map = out.find(name);
+        const auto ext_blob = ext_blob_map->second;
+        if (ext_blob_map == out.end()) {
             IE_THROW(Unexpected) << "The network outputs do not contain mkldnn graph output node name: \"" << name << "\"";
         }

         const auto actualDesc = MemoryDescUtils::convertToTensorDesc(intr_blob.getDesc());
-        auto &expectedDesc = ext_blob->second->getTensorDesc();
+        auto &expectedDesc = ext_blob->getTensorDesc();

         // TODO [NM]: need to create universal reorder which will be detect cases when we really need to use it
         // WA: for cases when output shape after transformation will be 1x1x1x1 but model output is scalar
@@ -797,27 +803,16 @@ void MKLDNNGraph::PullOutputData(BlobMap &out) {
         auto srcPrec = actualDesc.getPrecision();
         auto dstPrec = expectedDesc.getPrecision();

-        if (srcPrec == dstPrec && ext_blob->second->byteSize() != intr_blob.GetSize())
+        if (srcPrec == dstPrec && ext_blob->byteSize() != intr_blob.GetSize())
             IE_THROW() << "Output blob byte size is not equal network output byte size ("
-                       << ext_blob->second->byteSize() << "!=" << intr_blob.GetSize() << ").";
+                       << ext_blob->byteSize() << "!=" << intr_blob.GetSize() << ").";

-        void *ext_blob_ptr = ext_blob->second->buffer();
+        void *ext_blob_ptr = ext_blob->buffer();
         void *intr_blob_ptr = intr_blob.GetData();

         // That is the same memory. No need to copy
         if (ext_blob_ptr == intr_blob_ptr) continue;

-        size_t size_to_copy = intr_blob.GetDescWithType<BlockedMemoryDesc>()->getPaddedElementsCount();
-        // TODO: Should we support InferenceEngine::PluginConfigParams::KEY_DYN_BATCH_LIMIT???
-        // TODO [DS]: phase 2: should we support this behaviour? Looks obsolete in the dynamic shapes paradigm
-        if (config.batchLimit) {
-            if (node->isDynamicNode()) {
-                IE_THROW(NotImplemented) << "[DS] not implemented dynamic batch for node with dynamic shape";
-            }
-            int MB_to_process = node->batchToProcess();
-            size_to_copy = std::accumulate(outDims.begin() + 1, outDims.end(), (size_t)1, std::multiplies<size_t>()) * MB_to_process;
-        }
-
         if (actualDesc.getBlockingDesc() != expectedDesc.getBlockingDesc() && !isScalarOutput) {
             auto outBlobDesc = MemoryDescUtils::convertToDnnlBlockedMemoryDesc(expectedDesc);
             auto outBloMem = MKLDNNMemory(eng);
@@ -825,6 +820,17 @@ void MKLDNNGraph::PullOutputData(BlobMap &out) {

             outBloMem.SetData(intr_blob, 0, false);
         } else {
+            size_t size_to_copy = intr_blob.GetDescWithType<BlockedMemoryDesc>()->getPaddedElementsCount();
+            // TODO: Should we support InferenceEngine::PluginConfigParams::KEY_DYN_BATCH_LIMIT???
+            // TODO [DS]: phase 2: should we support this behaviour? Looks obsolete in the dynamic shapes paradigm
+            if (config.batchLimit) {
+                if (node->isDynamicNode()) {
+                    IE_THROW(NotImplemented) << "[DS] not implemented dynamic batch for node with dynamic shape";
+                }
+                int MB_to_process = node->batchToProcess();
+                size_to_copy = std::accumulate(outDims.begin() + 1, outDims.end(), (size_t)1, std::multiplies<size_t>()) * MB_to_process;
+            }
+
             cpu_convert(intr_blob_ptr, ext_blob_ptr, srcPrec, dstPrec, size_to_copy);
         }
     }

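Note on the `PullOutputData()` hunks above: `size_to_copy` (including the `config.batchLimit` recalculation) is now computed only in the `else` branch, where `cpu_convert` consumes it; the reorder branch (`outBloMem.SetData`) never used the value, so the old placement paid for `getPaddedElementsCount()` and the `std::accumulate` product on every output. A generic sketch of that move, with hypothetical names (`expensiveCount`, `pull`) rather than plugin API:

```cpp
// Compute a value only on the branch that consumes it.
// expensiveCount() models getPaddedElementsCount() + the accumulate product.
#include <cstdio>
#include <functional>
#include <numeric>
#include <vector>

static size_t expensiveCount(const std::vector<size_t>& dims) {
    return std::accumulate(dims.begin(), dims.end(), size_t{1}, std::multiplies<size_t>());
}

static void pull(bool needsReorder, const std::vector<size_t>& dims) {
    if (needsReorder) {
        std::puts("reorder path");                   // size never needed here
    } else {
        size_t size_to_copy = expensiveCount(dims);  // computed only when the copy happens
        std::printf("convert %zu elements\n", size_to_copy);
    }
}

int main() {
    pull(true, {1, 3, 224, 224});
    pull(false, {1, 3, 224, 224});
}
```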
@@ -84,27 +84,27 @@ MKLDNNPlugin::MKLDNNInferRequest::~MKLDNNInferRequest() {
 }

 void MKLDNNPlugin::MKLDNNInferRequest::pushInput(const std::string& inputName, InferenceEngine::Blob::Ptr& inputBlob, InferenceEngine::Precision inPrec) {
-    bool needConvert = inPrec != inputBlob->getTensorDesc().getPrecision();
+    auto& tensorDesc = inputBlob->getTensorDesc();
+    bool needConvert = inPrec != tensorDesc.getPrecision();

-    if (inputBlob->cbuffer().as<const void *>() == nullptr) {
+    const void* srcData = inputBlob->cbuffer().as<const void *>();
+    if (srcData == nullptr) {
         IE_THROW() << "Input blob has no allocated memory";
     }

     InferenceEngine::Blob::Ptr iconv;
     if (needConvert) {
-        iconv = make_blob_with_precision(inPrec, InferenceEngine::TensorDesc(inPrec, inputBlob->getTensorDesc().getDims(),
-                                                                             inputBlob->getTensorDesc().getLayout()));
+        iconv = make_blob_with_precision(inPrec, InferenceEngine::TensorDesc(inPrec, tensorDesc.getDims(), tensorDesc.getLayout()));
         iconv->allocate();
         if (inputBlob->size() != iconv->size())
             IE_THROW() << "Can't copy tensor: input and converted tensors have different number of elements: " << inputBlob->size() << " and "
                        << iconv->size();

-        void *srcData = inputBlob->cbuffer().as<void *>();
         void *dstData = iconv->buffer().as<void *>();
         if (dstData == nullptr) {
             IE_THROW() << "Converted input blob has no allocated memory";
         }
-        cpu_convert(srcData, dstData, inputBlob->getTensorDesc().getPrecision(), iconv->getTensorDesc().getPrecision(), iconv->size());
+        cpu_convert(srcData, dstData, tensorDesc.getPrecision(), iconv->getTensorDesc().getPrecision(), iconv->size());
     }

     graph->PushInputData(inputName, needConvert ? iconv : inputBlob);
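`pushInput()` above keeps its staging-conversion flow: when the user blob's precision differs from what the graph expects, it converts once into a blob of the target precision and pushes that instead. A compilable sketch of that flow under simplified assumptions; the types and the `convert_buffer()` helper are hypothetical stand-ins for IE blobs and `cpu_convert`:

```cpp
// Staging conversion: convert once into a target-precision buffer, then feed
// the graph either the converted buffer or the original, untouched one.
#include <cstdint>
#include <iostream>
#include <vector>

// hypothetical stand-in for cpu_convert(): element-wise widen u8 -> f32
static void convert_buffer(const uint8_t* src, float* dst, size_t n) {
    for (size_t i = 0; i < n; ++i) dst[i] = static_cast<float>(src[i]);
}

int main() {
    std::vector<uint8_t> userBlob{0, 128, 255};   // user-provided precision (U8)
    bool needConvert = true;                      // target precision differs (FP32)

    std::vector<float> staging;
    if (needConvert) {
        staging.resize(userBlob.size());          // same element count, new precision
        convert_buffer(userBlob.data(), staging.data(), userBlob.size());
    }
    // push the staging buffer when conversion happened, else the original
    const void* pushed = needConvert ? static_cast<const void*>(staging.data())
                                     : static_cast<const void*>(userBlob.data());
    std::cout << (pushed == staging.data() ? "pushed converted blob\n"
                                           : "pushed original blob\n");
}
```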
@@ -112,27 +112,30 @@ void MKLDNNPlugin::MKLDNNInferRequest::pushInput(const std::string& inputName, I

 void MKLDNNPlugin::MKLDNNInferRequest::PushInputData() {
     for (auto input : _inputs) {
-        if (!_networkInputs[input.first]) {
-            IE_THROW() << "Input blobs map contains not registered during IInferencePlugin::LoadNetwork blob with name " << input.first;
+        auto inputName = input.first;
+        if (!_networkInputs[inputName]) {
+            IE_THROW() << "Input blobs map contains not registered during IInferencePlugin::LoadNetwork blob with name " << inputName;
         }
-        auto inPrec = input.second->getTensorDesc().getPrecision();
-        if (graph->hasMeanImageFor(input.first) && one_of(inPrec, InferenceEngine::Precision::U8, InferenceEngine::Precision::BOOL)) {
+        auto inputBlob = input.second;
+        auto& inputTensorDesc = inputBlob->getTensorDesc();
+        auto inPrec = inputTensorDesc.getPrecision();
+        if (graph->hasMeanImageFor(inputName) && one_of(inPrec, InferenceEngine::Precision::U8, InferenceEngine::Precision::BOOL)) {
             inPrec = InferenceEngine::Precision::FP32;
         } else {
             inPrec = normalizeToSupportedPrecision(inPrec);
         }

         if (inPrec == InferenceEngine::Precision::UNSPECIFIED) {
-            IE_THROW() << "Unsupported input precision " << input.second->getTensorDesc().getPrecision();
+            IE_THROW() << "Unsupported input precision " << inputTensorDesc.getPrecision();
         }

         // User can initialize input via setBlob API using tensorDesc with default (ANY) layout.
         // Currently IE doesn't specify behavior in such scenario, so we assume real layout is equal to the network input.
-        if (input.second->getTensorDesc().getLayout() == InferenceEngine::ANY) {
-            input.second->getTensorDesc().setLayout(_networkInputs[input.first]->getLayout());
+        if (inputTensorDesc.getLayout() == InferenceEngine::ANY) {
+            inputTensorDesc.setLayout(_networkInputs[inputName]->getLayout());
         }

-        pushInput(input.first, input.second, inPrec);
+        pushInput(inputName, inputBlob, inPrec);
     }
 }

@@ -502,71 +505,104 @@ static inline void changeEdgePtr(const MKLDNNPlugin::MKLDNNEdgePtr &edge, void *

 void MKLDNNPlugin::MKLDNNInferRequest::changeDefaultPtr() {
     for (auto& it : externalPtr) {
-        auto input = graph->GetInputNodesMap().find(it.first);
-        if (input != graph->GetInputNodesMap().end()) {
-            if (input->second->getChildEdgeAt(0)->getMemory().GetPrimitive().get_data_handle() == it.second)
+        const auto& inputNodesMap = graph->GetInputNodesMap();
+        auto input = inputNodesMap.find(it.first);
+        if (input != inputNodesMap.end()) {
+            MKLDNNNodePtr inputNodePtr = input->second;
+            if (inputNodePtr->getChildEdgeAt(0)->getMemory().GetPrimitive().get_data_handle() == it.second)
                 continue;
+            auto& childEdges = inputNodePtr->getChildEdges();
             // Input cannot be in-place with other primitives
             bool canBeInPlace = true;
-            for (size_t i = 0; canBeInPlace && i < input->second->getChildEdges().size(); i++) {
-                auto& child = input->second->getChildEdgeAt(i)->getChild();
-                if (child->isConstant())
-                    canBeInPlace = false;
+            for (auto& childEdge : childEdges) {
+                auto ce = childEdge.lock();
+                if (!ce)
+                    IE_THROW() << "Node " << inputNodePtr->getName() << " contains empty child edge";

-                auto* concat = dynamic_cast<MKLDNNConcatNode *>(child.get());
-                if (canBeInPlace && concat && concat->isOptimized())
-                    canBeInPlace = false;
+                auto& child = ce->getChild();

-                // Cannot be in-place before split because split is using different ptrs without offsets
-                auto* split = dynamic_cast<MKLDNNSplitNode *>(child.get());
-                if (canBeInPlace && split)
-                    canBeInPlace = false;
-
-                if (child->isInplace())
-                    canBeInPlace = false;
-                for (size_t j = 0; canBeInPlace && j < child->getChildEdges().size(); j++) {
-                    if (child->getChildEdgeAt(j)->getMemory().GetPrimitive().get_data_handle() ==
-                        input->second->getChildEdgeAt(i)->getMemory().GetPrimitive().get_data_handle())
-                        canBeInPlace = false;
-                }
-            }
-            for (size_t i = 0; canBeInPlace && i < input->second->getChildEdges().size(); i++) {
-                changeEdgePtr(input->second->getChildEdgeAt(i), it.second);
-            }
-            continue;
-        }
-
-        MKLDNNNodePtr output;
-        for (auto& out : graph->GetOutputNodesMap()) {
-            if (out.first == it.first) {
-                output = out.second;
-                break;
-            }
-        }
-        if (output) {
-            if (output->getParentEdgeAt(0)->getMemory().GetPrimitive().get_data_handle() == it.second)
-                continue;
-            bool canBeInPlace = true;
-            void * defaultPtr = output->getParentEdgeAt(0)->getMemory().GetPrimitivePtr()->get_data_handle();
-            // Cannot be in-place after concat because concat is using different ptrs without offsets
-            auto parent = output->getParentEdgeAt(0)->getParent();
-            MKLDNNNodePtr previousParent;
-            do {
-                previousParent = parent;
-                if (parent->getChildEdges().size() != 1 || parent->isConstant() || parent->isInplace()) {
+                if (child->isConstant()) {
                     canBeInPlace = false;
                     break;
                 }

-                for (size_t i = 0; i < parent->getParentEdges().size(); i++) {
-                    if (parent->getParentEdgeAt(i)->getMemory().GetPrimitivePtr()->get_data_handle() == defaultPtr) {
-                        parent = parent->getParentEdgeAt(i)->getParent();
+                if (child->getType() == Concatenation && dynamic_cast<MKLDNNConcatNode*>(child.get())->isOptimized()) {
+                    canBeInPlace = false;
+                    break;
+                }
+
+                // Cannot be in-place before split because split is using different ptrs without offsets
+                if (child->getType() == Split) {
+                    canBeInPlace = false;
+                    break;
+                }
+
+                if (child->isInPlace()) {
+                    canBeInPlace = false;
+                    break;
+                }
+
+                auto& edges = child->getChildEdges();
+                for (auto& edge : edges) {
+                    auto e = edge.lock();
+                    if (!e)
+                        IE_THROW() << "Node " << child->getName() << " contains empty child edge";
+
+                    if (e->getMemory().GetPrimitive().get_data_handle() == ce->getMemory().GetPrimitive().get_data_handle()) {
+                        canBeInPlace = false;
+                        break;
+                    }
+                }
+
+                if (!canBeInPlace)
+                    break;
+            }
+            if (canBeInPlace) {
+                for (auto& edge : childEdges) {
+                    auto e = edge.lock();
+                    if (!e)
+                        IE_THROW() << "Node " << inputNodePtr->getName() << " contains empty child edge";
+
+                    changeEdgePtr(e, it.second);
+                }
+            }
+
+            continue;
+        }
+
+        const auto& outputNodesMap = graph->GetOutputNodesMap();
+        auto output = outputNodesMap.find(it.first);
+        if (output != outputNodesMap.end()) {
+            auto parentEdge = output->second->getParentEdgeAt(0);
+            if (parentEdge->getMemory().GetPrimitive().get_data_handle() == it.second)
+                continue;
+
+            bool canBeInPlace = true;
+            void* defaultPtr = parentEdge->getMemory().GetPrimitivePtr()->get_data_handle();
+            // Cannot be in-place after concat because concat is using different ptrs without offsets
+            auto parent = parentEdge->getParent();
+            MKLDNNNodePtr previousParent;
+            do {
+                previousParent = parent;
+                if (parent->getChildEdges().size() != 1 || parent->isConstant() || parent->isInPlace()) {
+                    canBeInPlace = false;
+                    break;
+                }
+
+                auto& parentEdges = parent->getParentEdges();
+                for (auto& edge : parentEdges) {
+                    auto e = edge.lock();
+                    if (!e)
+                        IE_THROW() << "Node " << parent->getName() << " contains empty parent edge";
+
+                    if (e->getMemory().GetPrimitivePtr()->get_data_handle() == defaultPtr) {
+                        parent = e->getParent();
                         break;
                     }
                 }
             } while (previousParent != parent);
             if (canBeInPlace)
-                changeEdgePtr(output->getParentEdgeAt(0), it.second);
+                changeEdgePtr(parentEdge, it.second);
             continue;
         }
         IE_THROW() << "Cannot find input/output blob: " << it.first;

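Two things change inside `changeDefaultPtr()` beyond the map `find()`: child edges are walked through the `getChildEdges()` weak_ptr collection, with each entry `lock()`ed and validated, instead of repeated `getChildEdgeAt(i)` calls; and the Concat/Split detection uses cheap `getType() == Concatenation` / `getType() == Split` tag tests, with the `dynamic_cast` performed only after the tag matches. A compilable sketch of the weak_ptr walk, where `Edge` and `Node` are hypothetical stand-ins for MKLDNNEdge/MKLDNNNode:

```cpp
// Walk a weak_ptr edge collection once, promoting each entry and failing
// loudly on an expired edge -- the shape of the loops added above.
#include <iostream>
#include <memory>
#include <stdexcept>
#include <vector>

struct Edge { int id; explicit Edge(int i) : id(i) {} };

struct Node {
    // nodes hold weak references to their child edges, as MKLDNNNode does
    std::vector<std::weak_ptr<Edge>> childEdges;
};

int main() {
    auto e0 = std::make_shared<Edge>(0);
    auto e1 = std::make_shared<Edge>(1);
    Node node;
    node.childEdges = {e0, e1};

    for (auto& childEdge : node.childEdges) {
        auto ce = childEdge.lock();      // promote the weak reference once
        if (!ce)
            throw std::runtime_error("node contains empty child edge");
        std::cout << "edge " << ce->id << '\n';
    }
}
```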
@@ -770,15 +770,29 @@ void MKLDNNNode::prepareMemory(const NodeDesc *selected_pd, mkldnn::primitive_de
     }
 }

-bool MKLDNNNode::isInplace() const {
-    auto selected_pd = getSelectedPrimitiveDescriptor();
-    if (selected_pd == nullptr)
-        IE_THROW() << "Preferable primitive descriptor is not set.";
-    auto config = selected_pd->getConfig();
+bool MKLDNNNode::isInPlace() {
+    if (inplace == InPlaceType::Unknown) {
+        auto selected_pd = getSelectedPrimitiveDescriptor();
+        if (selected_pd == nullptr)
+            IE_THROW() << "Preferable primitive descriptor is not set.";

-    for (auto &in : config.inConfs) if (in.inPlace >= 0) return true;
-    for (auto &out : config.outConfs) if (out.inPlace >= 0) return true;
-    return false;
+        inplace = InPlaceType::NoInPlace;
+        auto config = selected_pd->getConfig();
+        for (auto &in : config.inConfs) {
+            if (in.inPlace >= 0) {
+                inplace = InPlaceType::InPlace;
+                break;
+            }
+        }
+        for (auto &out : config.outConfs) {
+            if (out.inPlace >= 0) {
+                inplace = InPlaceType::InPlace;
+                break;
+            }
+        }
+    }
+
+    return inplace == InPlaceType::InPlace;
 }

 bool MKLDNNNode::isConstant() {

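The new `isInPlace()` computes the answer once per selected primitive descriptor and caches it in the tri-state `InPlaceType` member declared in the header hunks below; the descriptor setter (the `@@ -336` hunk) resets the cache to `Unknown`. The same memoization shape, reduced to a standalone sketch where `expensiveScan()` is a hypothetical stand-in for walking the descriptor configs:

```cpp
// Tri-state memoization: Unknown until first queried, then cached until the
// cache is explicitly invalidated.
#include <iostream>

class Node {
public:
    bool isInPlace() {
        if (inplace == InPlaceType::Unknown) {       // compute once...
            inplace = expensiveScan() ? InPlaceType::InPlace
                                      : InPlaceType::NoInPlace;
        }
        return inplace == InPlaceType::InPlace;      // ...then serve the cache
    }
    void selectPrimitiveDescriptor() {
        inplace = InPlaceType::Unknown;              // new descriptor invalidates the cache
    }
private:
    enum class InPlaceType { Unknown, InPlace, NoInPlace };
    InPlaceType inplace = InPlaceType::Unknown;

    bool expensiveScan() const {
        std::cout << "scanning configs...\n";        // visible side effect: runs once per reset
        return true;
    }
};

int main() {
    Node n;
    n.isInPlace();                    // triggers the scan
    n.isInPlace();                    // served from cache, no scan
    n.selectPrimitiveDescriptor();    // invalidate
    n.isInPlace();                    // scans again
}
```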
@@ -195,6 +195,8 @@ public:
         return engine;
     }

+    bool isInPlace();
+
     // must be called only after MKLDNNGraph::InitEdges()
     virtual bool isExecutable() const {
         return true;
@@ -202,8 +204,6 @@ public:

     bool isConstant();

-    bool isInplace() const;
-
     bool isFusedWith(Type type) const;

     void addFusedNode(const MKLDNNNodePtr &fusingNode) {
@@ -336,6 +336,10 @@ public:
             selectedPrimitiveDescriptorIndex = -1;
         else
             selectedPrimitiveDescriptorIndex = index;
+
+        // Each primitive descriptor has its own InPlace status. So after new primitive descriptor selection
+        // we should reset InPlace type to definite new status for node using MKLDNNNode::isInPlace()
+        inplace = InPlaceType::Unknown;
     }

     std::string getPrimitiveDescriptorType();
@@ -616,11 +620,17 @@ protected:
     bool permanent = false;
     bool temporary = false;
     int dynBatchLim = 0;
+    enum class InPlaceType {
+        Unknown,
+        InPlace,
+        NoInPlace
+    };
     enum class ConstantType {
         Unknown,
         Const,
         NoConst
     };
+    InPlaceType inplace = InPlaceType::Unknown;
     ConstantType constant = ConstantType::Unknown;
     std::vector<InferenceEngine::Blob::Ptr> internalBlobs;
     std::vector<MKLDNNMemoryPtr> internalBlobMemory;