Constant nodes outputs sharing between streams (#4561)

2021-03-03 16:57:15 +03:00 · 2021-03-03 16:57:15 +03:00 · e0573e7e7b
commit e0573e7e7b
parent 430adbc191
8 changed files with 253 additions and 86 deletions
--- a/inference-engine/src/mkldnn_plugin/mkldnn_edge.cpp
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_edge.cpp
@ -27,6 +27,10 @@ const MKLDNNNodePtr MKLDNNEdge::getChild() const {
    return childPtr;
 }

+bool MKLDNNEdge::isUseExternalMemory() const {
+    return externalMemoryPtr;
+}
+
 bool MKLDNNEdge::isDropped() {
    bool not_in_parent = true;
    bool not_in_child = true;
@ -164,6 +168,31 @@ void MKLDNNEdge::allocate(const void* mem_ptr) {
    status = Status::Allocated;
 }

+std::string MKLDNNEdge::name() const {
+    auto childPtr = getChild();
+    auto parentPtr = getParent();
+    return childPtr->getName() + "<->" + parentPtr->getName();
+}
+
+void MKLDNNEdge::externalAllocate(MKLDNNWeightsSharing::Ptr weightsCache) {
+    if (status != Status::NeedAllocation)
+        return;
+
+    if (weightsCache) {
+        auto alloc = [this] () {
+            allocate();
+            return memoryPtr;
+        };
+
+        auto ptr = weightsCache->findOrCreate(name(), alloc, false);
+        memoryPtr = *ptr;
+        externalMemoryPtr = true;
+        status = Status::Allocated;
+    } else {
+        allocate();
+    }
+}
+
 void MKLDNNEdge::changeStatus(MKLDNNEdge::Status state) {
    if (state == Status::NotAllocated) {
        THROW_IE_EXCEPTION << "Incorrect behaviour! Use method sharedMemFrom()";
@ -570,6 +599,7 @@ void MKLDNNEdge::validate() {
    getParent();
    getChild();
    getDims();
+
    if (status != Status::Allocated) {
        THROW_IE_EXCEPTION << "Error memory is not allocated!";
    }
@ -585,6 +615,10 @@ MKLDNNEdgePtr MKLDNNEdge::getSharedEdge() const {
    return memoryFromEdgePtr;
 }

+MKLDNNEdgePtr MKLDNNEdge::getSharedEdge(std::nothrow_t) const {
+    return memoryFromEdge.lock();
+}
+
 void MKLDNNEdge::init() {
    if (status != Status::NeedAllocation && status != Status::Uninitialized)
        return;
--- a/inference-engine/src/mkldnn_plugin/mkldnn_edge.h
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_edge.h
@ -8,6 +8,7 @@

 #include "mkldnn_memory.h"
 #include "mkldnn_dims.h"
+#include "mkldnn_weights_cache.hpp"
 #include "mkldnn/ie_mkldnn.h"

 #include <map>
@ -42,9 +43,10 @@ public:

    void changeStatus(Status state);

-    virtual void init();
-    virtual void allocate(const void* mem_ptr = nullptr);
-    virtual void validate();
+    void init();
+    void allocate(const void* mem_ptr = nullptr);
+    void externalAllocate(MKLDNNWeightsSharing::Ptr weightsCache);
+    void validate();
    void drop();

    const std::shared_ptr<MKLDNNNode> getParent() const;
@ -59,12 +61,17 @@ public:

    bool needReorder();
    bool isDropped();
+    bool isUseExternalMemory() const;

    int getInputNum();
    int getOutputNum();

    void sharedMemFrom(const MKLDNNEdgePtr& edge);
    MKLDNNEdgePtr getSharedEdge() const;
+    MKLDNNEdgePtr getSharedEdge(std::nothrow_t) const;
+
+private:
+    std::string name() const;

 private:
    std::weak_ptr<MKLDNNNode> parent;
@ -72,6 +79,7 @@ private:
    int parent_port;
    int child_port;

+    bool externalMemoryPtr = false;
    MKLDNNEdgeWeakPtr memoryFromEdge;
    MKLDNNDims dims;
    MKLDNNMemoryPtr memoryPtr;
--- a/inference-engine/src/mkldnn_plugin/mkldnn_graph.cpp
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_graph.cpp
@ -64,6 +64,9 @@ using namespace MKLDNNPlugin;
 using namespace InferenceEngine;
 using namespace InferenceEngine::details;

+typedef std::unordered_set<MKLDNNEdgePtr> edge_cluster_t;
+typedef std::vector<edge_cluster_t> edge_clusters_t;
+
 template<typename NET>
 void MKLDNNGraph::ApplyUnrollPasses(NET &net) {
    OV_ITT_SCOPED_TASK(itt::domains::MKLDNNPlugin, "MKLDNNGraph::ApplyUnrollPasses");
@ -445,10 +448,40 @@ void MKLDNNGraph::InitOptimalPrimitiveDescriptors() {
 void MKLDNNGraph::ExecuteConstantNodesOnly() {
    OV_ITT_SCOPED_TASK(itt::domains::MKLDNN_LT, "MKLDNNGraph::ExecuteConstantNodesOnly");
    mkldnn::stream stream(eng);
+
+    using shared_memory_ptr = MKLDNNWeightsSharing::MKLDNNSharedMemory::Ptr;
+
+    auto acquireSharedOutputs = [this](MKLDNNNodePtr & graphNode) {
+        std::vector<shared_memory_ptr> outputs;
+
+        for (size_t i = 0; i < graphNode->getChildEdges().size(); ++i) {
+            auto edgePtr = graphNode->getChildEdgeAt(i);
+            if (edgePtr && edgePtr->isUseExternalMemory()) {
+                outputs.emplace_back(weightsCache->get(edgePtr->name()));
+            }
+        }
+
+        return outputs;
+    };
+
    for (auto &graphNode : graphNodes) {
        if (!graphNode->isConstant())
            continue;
-        graphNode->execute(stream);
+
+        if (weightsCache) {
+            auto sharedOutputs = acquireSharedOutputs(graphNode);
+
+            if (std::find_if(sharedOutputs.begin(), sharedOutputs.end(),
+                [](const shared_memory_ptr & ptr) {
+                    return !ptr->isValid();
+                }) != sharedOutputs.end()) {
+                graphNode->execute(stream);
+                for (auto & output : sharedOutputs)
+                    output->valid(true);
+            }
+        } else {
+            graphNode->execute(stream);
+        }
    }
 }

@ -526,81 +559,85 @@ static inline bool isConstOutput(MKLDNNEdgePtr edge) {
    return edge->getParent()->isConstant() && !edge->getChild()->isConstant();
 }

+static edge_clusters_t findEdgeClusters(const std::vector<MKLDNNEdgePtr> & graphEdges) {
+    typedef std::unordered_map<MKLDNNEdgePtr, size_t> edge_cluster_idx_map_t;
+
+    edge_clusters_t edge_clusters;
+    edge_cluster_idx_map_t edge_cluster_indices;
+
+    for (auto &edge : graphEdges) {
+        auto edge_it = edge_cluster_indices.find(edge);
+
+        if (edge_it != edge_cluster_indices.end())
+            continue;   // edge is visited
+
+        size_t cluster_idx = edge_clusters.size();
+        MKLDNNEdgePtr last_shared_edge = nullptr;
+
+        // find cluster index
+        for (auto shared_edge = edge->getSharedEdge(std::nothrow);
+            shared_edge;
+            shared_edge = shared_edge->getSharedEdge(std::nothrow)) {
+            auto shared_edge_it = edge_cluster_indices.find(shared_edge);
+            if (shared_edge_it != edge_cluster_indices.end()) {
+                cluster_idx = shared_edge_it->second;
+                last_shared_edge = shared_edge;
+                break;
+            }
+        }
+
+        // add shared edges to cluster
+        edge_cluster_indices.emplace(edge, cluster_idx);
+
+        if (cluster_idx == edge_clusters.size())
+            edge_clusters.emplace_back(edge_cluster_t { edge });
+        else
+            edge_clusters[cluster_idx].emplace(edge);
+
+        for (auto shared_edge = edge->getSharedEdge(std::nothrow);
+            shared_edge != last_shared_edge;
+            shared_edge = shared_edge->getSharedEdge(std::nothrow)) {
+            edge_cluster_indices.emplace(shared_edge, cluster_idx);
+            edge_clusters[cluster_idx].emplace(shared_edge);
+        }
+    }
+
+    return edge_clusters;
+}
+
 void MKLDNNGraph::AllocateWithReuse() {
-    std::vector<std::vector<MKLDNNEdgePtr>> edge_clasters;
+    edge_clusters_t edge_clusters = findEdgeClusters(graphEdges);

-    // detect edge clusters which are view on one.
-    for (auto &edge : graphEdges) {
-        MKLDNNEdgePtr par = (edge->getStatus() == MKLDNNEdge::Status::NotAllocated)
-                            ? edge->getSharedEdge()
-                            : nullptr;
-        if (par) {
-            bool found = false;
-            for (auto &claster : edge_clasters) {
-                for (auto &element : claster) {
-                    if (element == par) {
-                        if (std::find(claster.begin(), claster.end(), edge) == claster.end())
-                            claster.push_back(edge);
-                        found = true;
-                        break;
-                    }
-                }
+    size_t edge_clusters_count = edge_clusters.size();
+
+    for (size_t i = 0; i < edge_clusters_count;) {
+        auto &cluster = edge_clusters[i];
+        bool erase = false;
+        for (auto &edge : cluster) {
+            if (edge->getStatus() == MKLDNNEdge::Status::NeedAllocation
+                && edge->getParent()->isConstant()) {
+                edge->externalAllocate(weightsCache);
+                erase = true;
            }
-            if (!found)
-                edge_clasters.push_back({par, edge});
+        }
+
+        if (erase) {
+            std::swap(edge_clusters[i], edge_clusters[edge_clusters_count - 1]);
+            --edge_clusters_count;
        } else {
-            bool found = false;
-            for (auto &claster : edge_clasters) {
-                for (auto &element : claster) {
-                    if (element == edge) {
-                        found = true;
-                        break;
-                    }
-                }
-            }
-            if (!found)
-                edge_clasters.push_back({edge});
+            ++i;
        }
    }

-    //======= WA. getSharedEdge() returns not identical edges ============
-    //  Will try to merge clasters with matched edges
-    for (auto &edge : graphEdges) {
-        std::vector<decltype(&edge_clasters[0])> to_merge;
-
-        for (auto &claster : edge_clasters)
-            if (std::find(claster.begin(), claster.end(), edge) != claster.end())
-                to_merge.push_back(&claster);
-
-        if (to_merge.size() > 1) {
-            // Merge clasters
-            auto base_classter = to_merge[0];
-            for (int i = 1; i < to_merge.size(); i++) {
-                base_classter->insert(base_classter->end(),
-                                      to_merge[i]->begin(), to_merge[i]->end());
-                to_merge[i]->clear();
-            }
-
-            // remove duplicates in merged claster
-            std::sort(base_classter->begin(), base_classter->end());
-            base_classter->erase(std::unique(base_classter->begin(), base_classter->end()),
-                                 base_classter->end() );
-
-            // remove empty clasters
-            edge_clasters.erase(std::remove_if(edge_clasters.begin(), edge_clasters.end(),
-                                               [] ( std::vector<MKLDNNEdgePtr> &cls) { return cls.empty(); }),
-                                edge_clasters.end());
-        }
-    }
-    //======= End of WA ============
+    edge_clusters.resize(edge_clusters_count);

    const int64_t alignment = 32;  // 32 bytes

-    std::vector<MemorySolver::Box> boxes(edge_clasters.size());
-    for (int i = 0; i < edge_clasters.size(); i++) {
+    std::vector<MemorySolver::Box> boxes(edge_clusters.size());
+    for (int i = 0; i < edge_clusters.size(); i++) {
        MemorySolver::Box &box = boxes[i];
        box = { std::numeric_limits<int>::max(), 0, 0, i };
-        for (auto &edge : edge_clasters[i]) {
+        for (auto &edge : edge_clusters[i]) {
            int e_start = edge->getParent()->execIndex;
            int e_finish = edge->getChild()->execIndex;

@ -630,7 +667,7 @@ void MKLDNNGraph::AllocateWithReuse() {
        // So we need it untouchable during all execution time
        // -1 is a place holder for a max timestamp.
        bool isConst = false, isOutput = false, isInput = false;
-        for (auto &edge : edge_clasters[i]) {
+        for (auto &edge : edge_clusters[i]) {
            isConst  |= isConstOutput(edge);
            isOutput |= edge->getChild()->getType() == Output;
            isInput  |= edge->getParent()->getType() == Input;
@ -654,11 +691,15 @@ void MKLDNNGraph::AllocateWithReuse() {

    memWorkspace = std::make_shared<MKLDNNMemory>(eng);
    memWorkspace->Create(MKLDNNMemoryDesc(TensorDesc(Precision::I8, {total_size}, Layout::C)));
+
+    if (edge_clusters.empty())
+        return;
+
    auto* workspace_ptr = static_cast<int8_t*>(memWorkspace->GetData());

-    for (int i = 0; i < edge_clasters.size(); i++) {
+    for (int i = 0; i < edge_clusters.size(); i++) {
        int count = 0;
-        for (auto &edge : edge_clasters[i]) {
+        for (auto &edge : edge_clusters[i]) {
            if (edge->getStatus() == MKLDNNEdge::Status::NeedAllocation) {
                int64_t offset = memSolver.getOffset(i);
                // !! Fallback to individual memory allocation !!
--- a/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_node.cpp
@ -766,7 +766,7 @@ void MKLDNNNode::prepareMemory(const PrimitiveDescInfo *selected_pd, mkldnn::pri
                                            + "_" + std::to_string(internalBlob->byteSize())
                                            + "_" + std::to_string(data_hash);

-            ptr = weightCache->findOrCreate(string_hash, create);
+            ptr = *weightCache->findOrCreate(string_hash, create);
        } else {
            ptr = create();
        }
--- a/inference-engine/src/mkldnn_plugin/mkldnn_weights_cache.cpp
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_weights_cache.cpp
@ -11,6 +11,66 @@ namespace MKLDNNPlugin {

 const SimpleDataHash MKLDNNWeightsSharing::simpleCRC;

+MKLDNNWeightsSharing::MKLDNNSharedMemory::MKLDNNSharedMemory(
+        std::unique_lock<std::mutex> && lock,
+        const MKLDNNMemoryInfo::Ptr & memory,
+        MKLDNNMemoryPtr newPtr)
+    : lock(std::move(lock))
+    , memory(memory)
+    , newPtr(newPtr)
+{}
+
+MKLDNNWeightsSharing::MKLDNNSharedMemory::operator MKLDNNMemoryPtr() const {
+    return memory->sharedMemory.lock();
+}
+
+bool MKLDNNWeightsSharing::MKLDNNSharedMemory::isValid() const {
+    return memory->valid;
+}
+
+void MKLDNNWeightsSharing::MKLDNNSharedMemory::valid(bool b) {
+    memory->valid = b;
+}
+
+MKLDNNWeightsSharing::MKLDNNSharedMemory::Ptr MKLDNNWeightsSharing::findOrCreate(
+                            const std::string& key,
+                            std::function<MKLDNNMemoryPtr(void)> create,
+                            bool valid) {
+    std::unique_lock<std::mutex> lock(guard);
+    auto found = sharedWeights.find(key);
+
+    MKLDNNMemoryInfo::Ptr ptr;
+    MKLDNNMemoryPtr newPtr;
+
+    if (found == sharedWeights.end()
+        || !(ptr = found->second)
+        || ptr->sharedMemory.expired()) {
+        newPtr = create();
+        ptr = std::make_shared<MKLDNNMemoryInfo>(newPtr, valid);
+        sharedWeights[key] = ptr;
+    }
+
+    return std::make_shared<MKLDNNSharedMemory>(ptr->valid
+                                                ? std::unique_lock<std::mutex>(ptr->guard, std::defer_lock)
+                                                : std::unique_lock<std::mutex>(ptr->guard), ptr, newPtr);
+}
+
+MKLDNNWeightsSharing::MKLDNNSharedMemory::Ptr MKLDNNWeightsSharing::get(const std::string& key) const {
+    std::unique_lock<std::mutex> lock(guard);
+    auto found = sharedWeights.find(key);
+
+    MKLDNNMemoryInfo::Ptr ptr;
+
+    if (found == sharedWeights.end()
+        || !(ptr = found->second)
+        || ptr->sharedMemory.expired())
+        THROW_IE_EXCEPTION << "Unknown shared memory with key " << key;
+
+    return std::make_shared<MKLDNNSharedMemory>(ptr->valid
+                                                ? std::unique_lock<std::mutex>(ptr->guard, std::defer_lock)
+                                                : std::unique_lock<std::mutex>(ptr->guard), ptr);
+}
+
 NumaNodesWeights::NumaNodesWeights() {
    for (auto numa_id : InferenceEngine::getAvailableNUMANodes())
        _cache_map[numa_id] = std::make_shared<MKLDNNWeightsSharing>();
--- a/inference-engine/src/mkldnn_plugin/mkldnn_weights_cache.hpp
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_weights_cache.hpp
@ -52,25 +52,51 @@ protected:
 * Is a thread safe
 */
 class MKLDNNWeightsSharing {
+    struct MKLDNNMemoryInfo {
+        typedef std::shared_ptr<MKLDNNMemoryInfo> Ptr;
+
+        MKLDNNMemoryInfo(MKLDNNMemoryPtr memoryPtr, bool valid)
+            : sharedMemory(memoryPtr)
+            , valid(valid)
+        {}
+
+        std::mutex guard;
+        std::weak_ptr<MKLDNNMemory> sharedMemory;
+        bool valid;
+    };
+
 public:
    typedef std::shared_ptr<MKLDNNWeightsSharing> Ptr;
-    MKLDNNMemoryPtr findOrCreate(const std::string& name_hash,
-                             std::function<MKLDNNMemoryPtr(void)> create) {
-        std::unique_lock<std::mutex> lock(guard);
-        auto found = sharedWeights.find(name_hash);

-        MKLDNNMemoryPtr ptr;
-        if (found == sharedWeights.end() || !(ptr = found->second.lock())) {
-            ptr = create();
-            sharedWeights[name_hash] = ptr;
-        }
-        return ptr;
-    }
+    class MKLDNNSharedMemory {
+    public:
+        typedef std::shared_ptr<MKLDNNSharedMemory> Ptr;
+
+        MKLDNNSharedMemory(std::unique_lock<std::mutex> && lock,
+                           const MKLDNNMemoryInfo::Ptr & memory,
+                           MKLDNNMemoryPtr newPtr = nullptr);
+
+        operator MKLDNNMemoryPtr() const;
+        bool isValid() const;
+        void valid(bool b);
+
+    private:
+        std::unique_lock<std::mutex> lock;
+        MKLDNNMemoryInfo::Ptr memory;
+        MKLDNNMemoryPtr newPtr;
+    };
+
+    MKLDNNSharedMemory::Ptr findOrCreate(const std::string& key,
+                                         std::function<MKLDNNMemoryPtr(void)> create,
+                                         bool valid = true);
+
+    MKLDNNSharedMemory::Ptr get(const std::string& key) const;
+
    static const SimpleDataHash& GetHashFunc () { return simpleCRC; }

 protected:
-    std::unordered_map<std::string, std::weak_ptr<MKLDNNMemory>> sharedWeights;
-    std::mutex guard;
+    mutable std::mutex guard;
+    std::unordered_map<std::string, MKLDNNMemoryInfo::Ptr> sharedWeights;
    static const SimpleDataHash simpleCRC;
 };

--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.h
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.h
@ -33,4 +33,3 @@ private:
 };

 }  // namespace MKLDNNPlugin
-
--- a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.h
+++ b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.h
@ -64,4 +64,3 @@ private:
 };

 }  // namespace MKLDNNPlugin
-