[CPU] Reduced overheads in inference stage (#6794)
This commit is contained in:
parent 3ab533a89c
commit 802b5bcfbb
@@ -722,8 +722,13 @@ void MKLDNNGraph::PushInputData(const std::string& name, const InferenceEngine::
     auto input = inputNodesMap.find(name);
     if (input != inputNodesMap.end()) {
+        auto& inTensorDesc = in->getTensorDesc();
+        auto node = input->second;
+        auto childEdge = node->getChildEdgeAt(0);
+        const auto& outDims = node->getOutputShapeAtPort(0);
+
         const void *ext_data_ptr = in->cbuffer();
-        void *inter_data_ptr = input->second->getChildEdgeAt(0)->getMemory().GetData();
+        void *inter_data_ptr = childEdge->getMemory().GetData();
 
         if (ext_data_ptr != inter_data_ptr) {
             auto ext_tdesc = MemoryDescUtils::convertToDnnlBlockedMemoryDesc(in->getTensorDesc());
@@ -731,17 +736,16 @@ void MKLDNNGraph::PushInputData(const std::string& name, const InferenceEngine::
             auto ext_mem = MKLDNNMemory(eng);
             ext_mem.Create(ext_tdesc, ext_data_ptr, false);
 
-            input->second->getChildEdgeAt(0)->getMemory().SetData(ext_mem, 0, false);
+            childEdge->getMemory().SetData(ext_mem, 0, false);
         }
 
         // todo: make sure 'name' exists in this map...
         if (_normalizePreprocMap.find(name) != _normalizePreprocMap.end()) {
-            if (in->getTensorDesc().getPrecision() == InferenceEngine::Precision::FP32) {
-                _normalizePreprocMap[name].NormalizeImage(input->second->getOutputShapeAtPort(0),
-                                                          reinterpret_cast<float *>(inter_data_ptr),
-                                                          in->getTensorDesc().getLayout());
+            if (inTensorDesc.getPrecision() == InferenceEngine::Precision::FP32) {
+                _normalizePreprocMap[name].NormalizeImage(outDims, reinterpret_cast<float *>(inter_data_ptr),
+                                                          inTensorDesc.getLayout());
             } else {
-                IE_THROW() << "Mean image of type " << in->getTensorDesc().getPrecision().name() << " is unsupported";
+                IE_THROW() << "Mean image of type " << inTensorDesc.getPrecision().name() << " is unsupported";
             }
         }
     } else {
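Both hunks above follow one pattern: results of repeated getter chains (`in->getTensorDesc()`, `input->second`, `getChildEdgeAt(0)`, `getOutputShapeAtPort(0)`) are hoisted into locals once per call instead of being re-evaluated at every use. A minimal standalone sketch of the idea, with hypothetical `Node`/`Edge` types standing in for the MKLDNN classes:

```cpp
#include <cstddef>
#include <memory>
#include <vector>

struct Edge {
    char* data() const { return buf; }
    char* buf = nullptr;
};

struct Node {
    // In the plugin, getChildEdgeAt() validates the index and resolves a
    // weak_ptr, so repeating the call multiplies that cost.
    std::shared_ptr<Edge> getChildEdgeAt(size_t i) const { return edges.at(i); }
    std::vector<std::shared_ptr<Edge>> edges;
};

// Before: the same edge is looked up twice.
char* interDataBefore(const Node& node) {
    if (node.getChildEdgeAt(0) == nullptr)
        return nullptr;
    return node.getChildEdgeAt(0)->data();
}

// After: the lookup result is hoisted into a local, as the commit does.
char* interDataAfter(const Node& node) {
    auto childEdge = node.getChildEdgeAt(0);
    if (childEdge == nullptr)
        return nullptr;
    return childEdge->data();
}
```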
@@ -756,15 +760,17 @@ void MKLDNNGraph::PullOutputData(BlobMap &out) {
     for (auto &outputMap : outputNodesMap) {
         auto name = outputMap.first;
         auto node = outputMap.second;
-        const MKLDNNMemory& intr_blob = node->getParentEdgeAt(0)->getMemory();
+        auto parentEdge = node->getParentEdgeAt(0);
+        const MKLDNNMemory& intr_blob = parentEdge->getMemory();
 
-        auto ext_blob = out.find(name);
-        if (ext_blob == out.end()) {
+        const auto ext_blob_map = out.find(name);
+        const auto ext_blob = ext_blob_map->second;
+        if (ext_blob_map == out.end()) {
             IE_THROW(Unexpected) << "The network outputs do not contain mkldnn graph output node name: \"" << name << "\"";
         }
 
         const auto actualDesc = MemoryDescUtils::convertToTensorDesc(intr_blob.getDesc());
-        auto &expectedDesc = ext_blob->second->getTensorDesc();
+        auto &expectedDesc = ext_blob->getTensorDesc();
 
         // TODO [NM]: need to create universal reorder which will be detect cases when we really need to use it
         // WA: for cases when output shape after transformation will be 1x1x1x1 but model output is scalar
@@ -797,27 +803,16 @@ void MKLDNNGraph::PullOutputData(BlobMap &out) {
         auto srcPrec = actualDesc.getPrecision();
         auto dstPrec = expectedDesc.getPrecision();
 
-        if (srcPrec == dstPrec && ext_blob->second->byteSize() != intr_blob.GetSize())
+        if (srcPrec == dstPrec && ext_blob->byteSize() != intr_blob.GetSize())
             IE_THROW() << "Output blob byte size is not equal network output byte size ("
-                       << ext_blob->second->byteSize() << "!=" << intr_blob.GetSize() << ").";
+                       << ext_blob->byteSize() << "!=" << intr_blob.GetSize() << ").";
 
-        void *ext_blob_ptr = ext_blob->second->buffer();
+        void *ext_blob_ptr = ext_blob->buffer();
         void *intr_blob_ptr = intr_blob.GetData();
 
         // That is the same memory. No need to copy
         if (ext_blob_ptr == intr_blob_ptr) continue;
 
-        size_t size_to_copy = intr_blob.GetDescWithType<BlockedMemoryDesc>()->getPaddedElementsCount();
-        // TODO: Should we support InferenceEngine::PluginConfigParams::KEY_DYN_BATCH_LIMIT???
-        // TODO [DS]: phase 2: should we support this behaviour? Looks obsolete in the dynamic shapes paradigm
-        if (config.batchLimit) {
-            if (node->isDynamicNode()) {
-                IE_THROW(NotImplemented) << "[DS] not implemented dynamic batch for node with dynamic shape";
-            }
-            int MB_to_process = node->batchToProcess();
-            size_to_copy = std::accumulate(outDims.begin() + 1, outDims.end(), (size_t)1, std::multiplies<size_t>()) * MB_to_process;
-        }
-
         if (actualDesc.getBlockingDesc() != expectedDesc.getBlockingDesc() && !isScalarOutput) {
             auto outBlobDesc = MemoryDescUtils::convertToDnnlBlockedMemoryDesc(expectedDesc);
             auto outBloMem = MKLDNNMemory(eng);
@@ -825,6 +820,17 @@ void MKLDNNGraph::PullOutputData(BlobMap &out) {
 
             outBloMem.SetData(intr_blob, 0, false);
         } else {
+            size_t size_to_copy = intr_blob.GetDescWithType<BlockedMemoryDesc>()->getPaddedElementsCount();
+            // TODO: Should we support InferenceEngine::PluginConfigParams::KEY_DYN_BATCH_LIMIT???
+            // TODO [DS]: phase 2: should we support this behaviour? Looks obsolete in the dynamic shapes paradigm
+            if (config.batchLimit) {
+                if (node->isDynamicNode()) {
+                    IE_THROW(NotImplemented) << "[DS] not implemented dynamic batch for node with dynamic shape";
+                }
+                int MB_to_process = node->batchToProcess();
+                size_to_copy = std::accumulate(outDims.begin() + 1, outDims.end(), (size_t)1, std::multiplies<size_t>()) * MB_to_process;
+            }
+
             cpu_convert(intr_blob_ptr, ext_blob_ptr, srcPrec, dstPrec, size_to_copy);
         }
     }
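Note what moved where in the two hunks above: the `size_to_copy` computation (including the `std::accumulate` over `outDims` for the dynamic-batch case) now lives inside the `else` branch, so it is evaluated only on the `cpu_convert` path and skipped entirely when the reorder path runs. A hedged sketch of that "compute only on the path that needs it" move, with made-up names:

```cpp
#include <cstddef>
#include <functional>
#include <numeric>
#include <vector>

// Stand-in for the padded-element-count query on the internal blob.
size_t paddedElementsCount(const std::vector<size_t>& dims) {
    return std::accumulate(dims.begin(), dims.end(), static_cast<size_t>(1),
                           std::multiplies<size_t>());
}

void pullOutput(bool needReorder, const std::vector<size_t>& outDims) {
    if (needReorder) {
        // Reorder path: copies via the memory object, never needs the count.
    } else {
        // Convert path: compute the count only here, as the commit does.
        size_t sizeToCopy = paddedElementsCount(outDims);
        (void)sizeToCopy;  // would be passed on to cpu_convert(...)
    }
}
```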
@@ -84,27 +84,27 @@ MKLDNNPlugin::MKLDNNInferRequest::~MKLDNNInferRequest() {
 }
 
 void MKLDNNPlugin::MKLDNNInferRequest::pushInput(const std::string& inputName, InferenceEngine::Blob::Ptr& inputBlob, InferenceEngine::Precision inPrec) {
-    bool needConvert = inPrec != inputBlob->getTensorDesc().getPrecision();
+    auto& tensorDesc = inputBlob->getTensorDesc();
+    bool needConvert = inPrec != tensorDesc.getPrecision();
 
-    if (inputBlob->cbuffer().as<const void *>() == nullptr) {
+    const void* srcData = inputBlob->cbuffer().as<const void *>();
+    if (srcData == nullptr) {
         IE_THROW() << "Input blob has no allocated memory";
     }
 
     InferenceEngine::Blob::Ptr iconv;
     if (needConvert) {
-        iconv = make_blob_with_precision(inPrec, InferenceEngine::TensorDesc(inPrec, inputBlob->getTensorDesc().getDims(),
-                                                                             inputBlob->getTensorDesc().getLayout()));
+        iconv = make_blob_with_precision(inPrec, InferenceEngine::TensorDesc(inPrec, tensorDesc.getDims(), tensorDesc.getLayout()));
         iconv->allocate();
         if (inputBlob->size() != iconv->size())
             IE_THROW() << "Can't copy tensor: input and converted tensors have different number of elements: " << inputBlob->size() << " and "
                        << iconv->size();
 
-        void *srcData = inputBlob->cbuffer().as<void *>();
         void *dstData = iconv->buffer().as<void *>();
         if (dstData == nullptr) {
             IE_THROW() << "Converted input blob has no allocated memory";
         }
-        cpu_convert(srcData, dstData, inputBlob->getTensorDesc().getPrecision(), iconv->getTensorDesc().getPrecision(), iconv->size());
+        cpu_convert(srcData, dstData, tensorDesc.getPrecision(), iconv->getTensorDesc().getPrecision(), iconv->size());
     }
 
     graph->PushInputData(inputName, needConvert ? iconv : inputBlob);
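A detail worth noting in this hunk: `inputBlob->cbuffer()` is now read exactly once into a `const void* srcData` that serves both the null check and the conversion, removing the second, non-const `cbuffer()` fetch. A sketch with hypothetical names:

```cpp
#include <cstddef>
#include <cstring>

struct Blob {
    const void* cbuffer() const { return data; }  // stand-in for Blob::cbuffer()
    const void* data = nullptr;
};

// Sketch: fetch the source pointer once, keep it const, and reuse it for
// both the validity check and the conversion, as the hunk above does.
bool pushInput(const Blob& inputBlob, void* dst, size_t bytes) {
    const void* srcData = inputBlob.cbuffer();
    if (srcData == nullptr)
        return false;                  // "Input blob has no allocated memory"
    std::memcpy(dst, srcData, bytes);  // stands in for cpu_convert(...)
    return true;
}
```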
@@ -112,27 +112,30 @@ void MKLDNNPlugin::MKLDNNInferRequest::pushInput(const std::string& inputName, I
 
 void MKLDNNPlugin::MKLDNNInferRequest::PushInputData() {
     for (auto input : _inputs) {
-        if (!_networkInputs[input.first]) {
-            IE_THROW() << "Input blobs map contains not registered during IInferencePlugin::LoadNetwork blob with name " << input.first;
+        auto inputName = input.first;
+        if (!_networkInputs[inputName]) {
+            IE_THROW() << "Input blobs map contains not registered during IInferencePlugin::LoadNetwork blob with name " << inputName;
         }
-        auto inPrec = input.second->getTensorDesc().getPrecision();
-        if (graph->hasMeanImageFor(input.first) && one_of(inPrec, InferenceEngine::Precision::U8, InferenceEngine::Precision::BOOL)) {
+
+        auto inputBlob = input.second;
+        auto& inputTensorDesc = inputBlob->getTensorDesc();
+        auto inPrec = inputTensorDesc.getPrecision();
+        if (graph->hasMeanImageFor(inputName) && one_of(inPrec, InferenceEngine::Precision::U8, InferenceEngine::Precision::BOOL)) {
             inPrec = InferenceEngine::Precision::FP32;
         } else {
             inPrec = normalizeToSupportedPrecision(inPrec);
         }
 
         if (inPrec == InferenceEngine::Precision::UNSPECIFIED) {
-            IE_THROW() << "Unsupported input precision " << input.second->getTensorDesc().getPrecision();
+            IE_THROW() << "Unsupported input precision " << inputTensorDesc.getPrecision();
         }
 
         // User can initialize input via setBlob API using tensorDesc with default (ANY) layout.
         // Currently IE doesn't specify behavior in such scenario, so we assume real layout is equal to the network input.
-        if (input.second->getTensorDesc().getLayout() == InferenceEngine::ANY) {
-            input.second->getTensorDesc().setLayout(_networkInputs[input.first]->getLayout());
+        if (inputTensorDesc.getLayout() == InferenceEngine::ANY) {
+            inputTensorDesc.setLayout(_networkInputs[inputName]->getLayout());
         }
 
-        pushInput(input.first, input.second, inPrec);
+        pushInput(inputName, inputBlob, inPrec);
     }
 }
@@ -502,71 +505,104 @@ static inline void changeEdgePtr(const MKLDNNPlugin::MKLDNNEdgePtr &edge, void *
 
 void MKLDNNPlugin::MKLDNNInferRequest::changeDefaultPtr() {
     for (auto& it : externalPtr) {
-        auto input = graph->GetInputNodesMap().find(it.first);
-        if (input != graph->GetInputNodesMap().end()) {
-            if (input->second->getChildEdgeAt(0)->getMemory().GetPrimitive().get_data_handle() == it.second)
+        const auto& inputNodesMap = graph->GetInputNodesMap();
+        auto input = inputNodesMap.find(it.first);
+        if (input != inputNodesMap.end()) {
+            MKLDNNNodePtr inputNodePtr = input->second;
+            if (inputNodePtr->getChildEdgeAt(0)->getMemory().GetPrimitive().get_data_handle() == it.second)
                 continue;
+            auto& childEdges = inputNodePtr->getChildEdges();
             // Input cannot be in-place with other primitives
             bool canBeInPlace = true;
-            for (size_t i = 0; canBeInPlace && i < input->second->getChildEdges().size(); i++) {
-                auto& child = input->second->getChildEdgeAt(i)->getChild();
-                if (child->isConstant())
-                    canBeInPlace = false;
+            for (auto& childEdge : childEdges) {
+                auto ce = childEdge.lock();
+                if (!ce)
+                    IE_THROW() << "Node " << inputNodePtr->getName() << " contains empty child edge";
 
-                auto* concat = dynamic_cast<MKLDNNConcatNode *>(child.get());
-                if (canBeInPlace && concat && concat->isOptimized())
-                    canBeInPlace = false;
+                auto& child = ce->getChild();
+
+                if (child->isConstant()) {
+                    canBeInPlace = false;
+                    break;
+                }
+
+                if (child->getType() == Concatenation && dynamic_cast<MKLDNNConcatNode*>(child.get())->isOptimized()) {
+                    canBeInPlace = false;
+                    break;
+                }
 
                 // Cannot be in-place before split because split is using different ptrs without offsets
-                auto* split = dynamic_cast<MKLDNNSplitNode *>(child.get());
-                if (canBeInPlace && split)
-                    canBeInPlace = false;
+                if (child->getType() == Split) {
+                    canBeInPlace = false;
+                    break;
+                }
 
-                if (child->isInplace())
-                    canBeInPlace = false;
-                for (size_t j = 0; canBeInPlace && j < child->getChildEdges().size(); j++) {
-                    if (child->getChildEdgeAt(j)->getMemory().GetPrimitive().get_data_handle() ==
-                        input->second->getChildEdgeAt(i)->getMemory().GetPrimitive().get_data_handle())
-                        canBeInPlace = false;
+                if (child->isInPlace()) {
+                    canBeInPlace = false;
+                    break;
+                }
+
+                auto& edges = child->getChildEdges();
+                for (auto& edge : edges) {
+                    auto e = edge.lock();
+                    if (!e)
+                        IE_THROW() << "Node " << child->getName() << " contains empty child edge";
+
+                    if (e->getMemory().GetPrimitive().get_data_handle() == ce->getMemory().GetPrimitive().get_data_handle()) {
+                        canBeInPlace = false;
+                        break;
+                    }
                 }
+
+                if (!canBeInPlace)
+                    break;
             }
-            for (size_t i = 0; canBeInPlace && i < input->second->getChildEdges().size(); i++) {
-                changeEdgePtr(input->second->getChildEdgeAt(i), it.second);
+            if (canBeInPlace) {
+                for (auto& edge : childEdges) {
+                    auto e = edge.lock();
+                    if (!e)
+                        IE_THROW() << "Node " << inputNodePtr->getName() << " contains empty child edge";
+
+                    changeEdgePtr(e, it.second);
+                }
             }
+
             continue;
         }
 
-        MKLDNNNodePtr output;
-        for (auto& out : graph->GetOutputNodesMap()) {
-            if (out.first == it.first) {
-                output = out.second;
-                break;
-            }
-        }
-        if (output) {
-            if (output->getParentEdgeAt(0)->getMemory().GetPrimitive().get_data_handle() == it.second)
+        const auto& outputNodesMap = graph->GetOutputNodesMap();
+        auto output = outputNodesMap.find(it.first);
+        if (output != outputNodesMap.end()) {
+            auto parentEdge = output->second->getParentEdgeAt(0);
+            if (parentEdge->getMemory().GetPrimitive().get_data_handle() == it.second)
                 continue;
+
             bool canBeInPlace = true;
-            void * defaultPtr = output->getParentEdgeAt(0)->getMemory().GetPrimitivePtr()->get_data_handle();
+            void* defaultPtr = parentEdge->getMemory().GetPrimitivePtr()->get_data_handle();
             // Cannot be in-place after concat because concat is using different ptrs without offsets
-            auto parent = output->getParentEdgeAt(0)->getParent();
+            auto parent = parentEdge->getParent();
             MKLDNNNodePtr previousParent;
             do {
                 previousParent = parent;
-                if (parent->getChildEdges().size() != 1 || parent->isConstant() || parent->isInplace()) {
+                if (parent->getChildEdges().size() != 1 || parent->isConstant() || parent->isInPlace()) {
                     canBeInPlace = false;
                     break;
                 }
 
-                for (size_t i = 0; i < parent->getParentEdges().size(); i++) {
-                    if (parent->getParentEdgeAt(i)->getMemory().GetPrimitivePtr()->get_data_handle() == defaultPtr) {
-                        parent = parent->getParentEdgeAt(i)->getParent();
+                auto& parentEdges = parent->getParentEdges();
+                for (auto& edge : parentEdges) {
+                    auto e = edge.lock();
+                    if (!e)
+                        IE_THROW() << "Node " << parent->getName() << " contains empty parent edge";
+
+                    if (e->getMemory().GetPrimitivePtr()->get_data_handle() == defaultPtr) {
+                        parent = e->getParent();
                         break;
                    }
                 }
             } while (previousParent != parent);
             if (canBeInPlace)
-                changeEdgePtr(output->getParentEdgeAt(0), it.second);
+                changeEdgePtr(parentEdge, it.second);
             continue;
         }
         IE_THROW() << "Cannot find input/output blob: " << it.first;
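Two cost reductions stand out in this hunk: child edges are now walked once through `getChildEdges()` with a single `weak_ptr::lock()` per edge instead of repeated `getChildEdgeAt(i)` lookups, and the `dynamic_cast`-based Concat/Split detection is replaced with a comparison against the node's stored type tag. A sketch of the second point, with hypothetical types (the real code compares against the plugin's `Concatenation`/`Split` enumerators):

```cpp
#include <memory>

enum class Type { Generic, Split, Concatenation };

struct Node {
    virtual ~Node() = default;
    Type getType() const { return type; }
    Type type = Type::Generic;
};

struct SplitNode : Node {
    SplitNode() { type = Type::Split; }
};

// Before (sketch): RTTI has to walk the class hierarchy on every check.
bool isSplitViaRtti(const std::shared_ptr<Node>& n) {
    return dynamic_cast<SplitNode*>(n.get()) != nullptr;
}

// After (sketch): a stored type tag turns the check into a plain integer
// compare, mirroring `child->getType() == Split` in the commit.
bool isSplitViaTag(const std::shared_ptr<Node>& n) {
    return n->getType() == Type::Split;
}
```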
@@ -770,15 +770,29 @@ void MKLDNNNode::prepareMemory(const NodeDesc *selected_pd, mkldnn::primitive_de
     }
 }
 
-bool MKLDNNNode::isInplace() const {
-    auto selected_pd = getSelectedPrimitiveDescriptor();
-    if (selected_pd == nullptr)
-        IE_THROW() << "Preferable primitive descriptor is not set.";
-    auto config = selected_pd->getConfig();
+bool MKLDNNNode::isInPlace() {
+    if (inplace == InPlaceType::Unknown) {
+        auto selected_pd = getSelectedPrimitiveDescriptor();
+        if (selected_pd == nullptr)
+            IE_THROW() << "Preferable primitive descriptor is not set.";
 
-    for (auto &in : config.inConfs) if (in.inPlace >= 0) return true;
-    for (auto &out : config.outConfs) if (out.inPlace >= 0) return true;
-    return false;
+        inplace = InPlaceType::NoInPlace;
+        auto config = selected_pd->getConfig();
+        for (auto &in : config.inConfs) {
+            if (in.inPlace >= 0) {
+                inplace = InPlaceType::InPlace;
+                break;
+            }
+        }
+        for (auto &out : config.outConfs) {
+            if (out.inPlace >= 0) {
+                inplace = InPlaceType::InPlace;
+                break;
+            }
+        }
+    }
+
+    return inplace == InPlaceType::InPlace;
 }
 
 bool MKLDNNNode::isConstant() {
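`isInplace()` used to rescan the selected primitive descriptor's port configs on every call; the renamed `isInPlace()` computes the answer once and memoizes it in a tri-state `InPlaceType` (declared in the header hunks below), which `setSelectedPrimitiveDescriptor()` resets to `Unknown` whenever a new descriptor is chosen. A self-contained sketch of the caching scheme, with simplified stand-in types:

```cpp
#include <vector>

// Tri-state cache: Unknown until the first query, then a sticky verdict.
enum class InPlaceType { Unknown, InPlace, NoInPlace };

struct PortConfig { int inPlace = -1; };  // >= 0 means in-place with that port

struct Node {
    std::vector<PortConfig> inConfs, outConfs;
    InPlaceType inplace = InPlaceType::Unknown;

    bool isInPlace() {
        if (inplace == InPlaceType::Unknown) {  // scan the configs only once
            inplace = InPlaceType::NoInPlace;
            for (auto& c : inConfs)
                if (c.inPlace >= 0) { inplace = InPlaceType::InPlace; break; }
            for (auto& c : outConfs)
                if (c.inPlace >= 0) { inplace = InPlaceType::InPlace; break; }
        }
        return inplace == InPlaceType::InPlace;
    }

    // Selecting a new primitive descriptor invalidates the cached verdict,
    // as setSelectedPrimitiveDescriptor() does in the commit.
    void selectPrimitiveDescriptor() { inplace = InPlaceType::Unknown; }
};
```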
@@ -195,6 +195,8 @@ public:
         return engine;
     }
 
+    bool isInPlace();
+
     // must be called only after MKLDNNGraph::InitEdges()
     virtual bool isExecutable() const {
         return true;
@@ -202,8 +204,6 @@ public:
 
     bool isConstant();
 
-    bool isInplace() const;
-
     bool isFusedWith(Type type) const;
 
     void addFusedNode(const MKLDNNNodePtr &fusingNode) {
@@ -336,6 +336,10 @@ public:
             selectedPrimitiveDescriptorIndex = -1;
         else
             selectedPrimitiveDescriptorIndex = index;
+
+        // Each primitive descriptor has its own InPlace status. So after new primitive descriptor selection
+        // we should reset InPlace type to definite new status for node using MKLDNNNode::isInPlace()
+        inplace = InPlaceType::Unknown;
     }
 
     std::string getPrimitiveDescriptorType();
@@ -616,11 +620,17 @@ protected:
     bool permanent = false;
     bool temporary = false;
     int dynBatchLim = 0;
+    enum class InPlaceType {
+        Unknown,
+        InPlace,
+        NoInPlace
+    };
     enum class ConstantType {
         Unknown,
         Const,
         NoConst
     };
+    InPlaceType inplace = InPlaceType::Unknown;
     ConstantType constant = ConstantType::Unknown;
     std::vector<InferenceEngine::Blob::Ptr> internalBlobs;
     std::vector<MKLDNNMemoryPtr> internalBlobMemory;