Dynamic shape mem reuse solution (#11667)

* Dynamic shape memory reuse solution

* Fix Split node to properly work with dyn mem

* Fix race condition for Memory mgrHandle

* Avoid Memory race condition between GetData and SetDataHandle

Adding a lock for the race condition between ov::intel_cpu::Memory::GetData() and ov::intel_cpu::Memory::SetDataHandle() is not a good solution,
as it would impact inference performance. We found that it is unnecessary to get the edge DataPtr in InferRequest::SetBlob or GetBlob, which
only need the tensorDesc; so we can fetch only the tensorDesc instead of the dataPtr to avoid this race condition.

* Resolve reviewer's comments

* Avoid performance impact due to frequent reset of MemMngrHandle

If the MemMngrHandle has already been assigned an external buffer, it can be reused.
Otherwise a new one needs to be created.
This commit is contained in:
River Li
2022-06-01 18:49:47 +08:00
committed by GitHub
parent 8b1ed3d5b2
commit 042bd7274a
11 changed files with 154 additions and 106 deletions

View File

@@ -63,33 +63,31 @@ public:
int64_t id;
};
explicit MemorySolver(const std::vector<Box>& boxes) : _boxes(boxes) {
/** @brief Performs in-place normalization of the input boxes
@return lifespan of all boxes
*/
static int normalizeBoxes(std::vector<Box>& boxes) {
int max_ts = 0;
// TODO: add validation of data correctness:
// 1. Box.start >= 0 and Box.finish >= -1
// 2. Box.finish >= Box.start (except Box.finish == -1)
// 3. Box.size > 0 (or == 0 ?)
// 4. Box.id == any unique value
for (const Box& box : _boxes)
for (const Box& box : boxes)
max_ts = std::max(std::max(max_ts, box.start), box.finish);
for (Box& box : _boxes)
for (Box& box : boxes)
if (box.finish == -1)
box.finish = max_ts;
// sort by start and finish ts
std::sort(_boxes.begin(), _boxes.end(), [](const Box& l, const Box& r) -> bool {
std::sort(boxes.begin(), boxes.end(), [](const Box& l, const Box& r) -> bool {
return l.start < r.start || (l.start == r.start && l.finish < r.finish);
});
// remove unused timestamps (not a begin of some box)
// each ts should start a box
std::vector<bool> ts_exist(max_ts + 1);
for (const Box& b : _boxes)
for (const Box& b : boxes)
ts_exist[b.start] = true;
int rm_ts_s = 0, rm_ts_f = 0;
int ts_s = 0, ts_f = 0;
for (Box& b : _boxes) {
for (Box& b : boxes) {
while (ts_s < b.start)
if (!ts_exist[ts_s++])
rm_ts_s++;
@@ -105,7 +103,16 @@ public:
b.start -= rm_ts_s;
b.finish -= rm_ts_f;
}
_time_duration = ts_f - rm_ts_f;
return ts_f - rm_ts_f;
}
/** @brief Constructs the solver: copies the boxes and computes the time duration
 *         by normalizing the copy in place via normalizeBoxes(). */
explicit MemorySolver(const std::vector<Box>& boxes) : _boxes(boxes) {
// TODO: add validation of data correctness:
// 1. Box.start >= 0 and Box.finish >= -1
// 2. Box.finish >= Box.start (except Box.finish == -1)
// 3. Box.size > 0 (or == 0 ?)
// 4. Box.id == any unique value
_time_duration = normalizeBoxes(_boxes);
}
inline bool popupTogetherWith(MemorySolver::Box& box_new, const MemorySolver::Box& box_old) {

View File

@@ -136,6 +136,12 @@ DnnlMemoryDescPtr Memory::GetDescWithType<DnnlMemoryDesc, 0, 0>() const {
}
void Memory::setDataHandle(void *data) {
if (!mgrHandle->hasExtBuffer()) {
mgrHandle = DnnlMemMngrHandle(
std::make_shared<DnnlMemoryMngr>(std::unique_ptr<MemoryMngrWithReuse>(new MemoryMngrWithReuse())),
this);
}
size_t maxMemSize = pMemDesc->hasDefinedMaxSize() ? pMemDesc->getMaxMemSize() : 0;
mgrHandle->setExtBuff(data, maxMemSize);
prim->set_data_handle(mgrHandle->getRawPtr()); // for pads zeroing, to preserve dnnl::memory::set_data_handle behaviour

View File

@@ -176,6 +176,9 @@ public:
return prim != nullptr;
}
/**
* @brief Resets the memory manager to a new one created with the provided raw memory
*/
void setDataHandle(void* data);
const MemoryDesc& getDesc() const {

View File

@@ -258,7 +258,7 @@ int Edge::getOutputNum() const {
return child_port;
}
void Edge::allocate(const void* mem_ptr) {
void Edge::allocateCommon(const std::function<void(const MemoryPtr&, const MemoryDesc&)>& allocate) {
if (status != Status::NeedAllocation)
return;
@@ -272,11 +272,30 @@ void Edge::allocate(const void* mem_ptr) {
auto parentPtr = getParent();
memoryPtr.reset(new Memory(parentPtr->getEngine()));
memoryPtr->Create(inputDesc, mem_ptr, false); // no pads zeroing
allocate(memoryPtr, inputDesc);
status = Status::Allocated;
}
void Edge::allocate(const void* mem_ptr) {
// Delegate to the common allocation path, creating the memory over the
// externally provided raw pointer.
allocateCommon([mem_ptr](const MemoryPtr& memoryPtr, const MemoryDesc& inputDesc) {
memoryPtr->Create(inputDesc, mem_ptr, false); // no pads zeroing
});
}
void Edge::allocate(DnnlMemoryMngrPtr memMngr) {
if (!memMngr) {
IE_THROW(Unexpected) << "Memory manager ptr is NULL";
}
// Delegate to the common allocation path, binding the shared memory manager
// into the creation routine so the edge memory reuses it.
allocateCommon([memMngr](const MemoryPtr& memoryPtr, const MemoryDesc& inputDesc) {
memoryPtr->Create(inputDesc, memMngr);
});
}
std::string Edge::name() const {
auto parentPtr = getParent();
auto childPtr = getChild();
@@ -289,34 +308,10 @@ std::string Edge::name() const {
}
void Edge::externalAllocate(WeightsSharing::Ptr weightsCache) {
auto isInPlace = [](const NodePtr node, int port) -> bool {
const auto& selected_pd = node->getSelectedPrimitiveDescriptor();
if (selected_pd == nullptr)
IE_THROW() << "Preferable primitive descriptor is not set.";
const auto& config = selected_pd->getConfig();
for (const auto& in : config.inConfs) {
if (in.inPlace() == port) {
return true;
}
}
for (const auto& out : config.outConfs) {
if (out.inPlace() == port) {
return true;
}
}
return false;
};
if (status != Status::NeedAllocation)
return;
bool isTheOnlyChildEdgeAtPort = getParent()->getChildEdgesAtPort(getInputNum()).size() == 1;
bool isConcurrentUpdatePossible = isInPlace(getParent(), getInputNum()) || isInPlace(getChild(), getOutputNum()) || !isTheOnlyChildEdgeAtPort;
if (weightsCache && !isConcurrentUpdatePossible) {
if (weightsCache) {
auto alloc = [this] () {
allocate();
return memoryPtr;

View File

@@ -51,6 +51,7 @@ public:
void init();
void allocate(const void* mem_ptr = nullptr);
void allocate(DnnlMemoryMngrPtr memMngr);
void externalAllocate(WeightsSharing::Ptr weightsCache);
void reuse(MemoryPtr ptr);
void validate();
@@ -104,6 +105,8 @@ private:
EdgePtr getBaseEdge(int look = LOOK_BOTH);
bool inPlace(LOOK look = LOOK_BOTH);
void allocateCommon(const std::function<void(const MemoryPtr&, const MemoryDesc&)>& allocate);
friend class Graph;
};

View File

@@ -606,25 +606,17 @@ static edge_clusters_t findEdgeClusters(const std::vector<EdgePtr> & graphEdges)
edge_cluster_idx_map_t edge_cluster_indices;
for (auto &edge : graphEdges) {
if (!edge->hasDefinedMaxSize())
continue;
auto edge_it = edge_cluster_indices.find(edge);
if (edge_it != edge_cluster_indices.end())
continue; // edge is visited
size_t cluster_idx = edge_clusters.size();
EdgePtr last_shared_edge = nullptr;
//has_defined_max_path means all the edges on path from current to the actual shared edge
//have defined max memory size so they can be added to the clusters and resolved by mem solver
bool has_defined_max_path = true;
// find cluster index
for (auto shared_edge = edge->getSharedEdge(std::nothrow);
shared_edge;
shared_edge = shared_edge->getSharedEdge(std::nothrow)) {
has_defined_max_path = has_defined_max_path && shared_edge->hasDefinedMaxSize();
auto shared_edge_it = edge_cluster_indices.find(shared_edge);
if (shared_edge_it != edge_cluster_indices.end()) {
cluster_idx = shared_edge_it->second;
@@ -633,10 +625,6 @@ static edge_clusters_t findEdgeClusters(const std::vector<EdgePtr> & graphEdges)
}
}
if (!has_defined_max_path) {
continue;
}
// add shared edges to cluster
edge_cluster_indices.emplace(edge, cluster_idx);
@@ -689,22 +677,24 @@ void Graph::AllocateWithReuse() {
const int64_t alignment = 32; // 32 bytes
std::vector<MemorySolver::Box> boxes(edge_clusters.size());
std::vector<MemorySolver::Box> definedBoxes;
std::vector<MemorySolver::Box> undefinedBoxes;
for (int i = 0; i < edge_clusters.size(); i++) {
MemorySolver::Box &box = boxes[i];
box = { std::numeric_limits<int>::max(), 0, 0, i };
MemorySolver::Box box = { std::numeric_limits<int>::max(), 0, 0, i };
int64_t boxSize = 0;
for (auto &edge : edge_clusters[i]) {
int e_start = edge->getParent()->execIndex;
int e_finish = edge->getChild()->execIndex;
if (!edge->hasDefinedMaxSize()) {
IE_THROW() << "Can not allocate memory since the size is undefined.";
if (boxSize != -1 && edge->getDesc().hasDefinedMaxSize()) {
int64_t e_size = edge->getDesc().getMaxMemSize(); // size in bytes (from the beginning of data to the last element)
boxSize = std::max(e_size, boxSize);
} else {
boxSize = -1;
}
int64_t e_size = edge->getDesc().getMaxMemSize(); // size in bytes (from the beginning of data to the last element)
box.start = std::min(e_start, box.start);
box.finish = std::max(e_finish, box.finish);
box.size = std::max(e_size, box.size);
}
// Constant data are filled once on load.
@@ -727,11 +717,17 @@ void Graph::AllocateWithReuse() {
}
}
box.size = div_up(box.size, alignment);
if (boxSize != -1) {
box.size = div_up(boxSize, alignment);
definedBoxes.push_back(box);
} else {
box.size = boxSize;
undefinedBoxes.push_back(box);
}
}
MemorySolver memSolver(boxes);
size_t total_size = static_cast<size_t>(memSolver.solve()) * alignment;
MemorySolver staticMemSolver(definedBoxes);
size_t total_size = static_cast<size_t>(staticMemSolver.solve()) * alignment;
memWorkspace = std::make_shared<Memory>(eng);
memWorkspace->Create(DnnlBlockedMemoryDesc(InferenceEngine::Precision::I8, Shape(InferenceEngine::SizeVector{total_size})));
@@ -741,11 +737,11 @@ void Graph::AllocateWithReuse() {
auto* workspace_ptr = static_cast<int8_t*>(memWorkspace->GetData());
for (int i = 0; i < edge_clusters.size(); i++) {
for (auto& box : definedBoxes) {
int count = 0;
for (auto &edge : edge_clusters[i]) {
for (auto& edge : edge_clusters[box.id]) {
if (edge->getStatus() == Edge::Status::NeedAllocation) {
int64_t offset = memSolver.getOffset(i);
int64_t offset = staticMemSolver.getOffset(box.id);
// !! Fallback to individual memory allocation !!
// if you like to check infer without reuse just call this function without arguments.
edge->allocate(workspace_ptr + offset * alignment); // alignment in byte
@@ -761,6 +757,40 @@ void Graph::AllocateWithReuse() {
}
IE_ASSERT(count == 1);
}
if (!undefinedBoxes.empty()) {
MemorySolver::normalizeBoxes(undefinedBoxes);
std::vector<std::vector<MemorySolver::Box>> groups; //groups of nonoverlapping boxes
groups.push_back({undefinedBoxes.front()});
for (size_t i = 1; i < undefinedBoxes.size(); ++i) {
const auto& box = undefinedBoxes[i];
bool groupFound = false;
for (auto& group : groups) {
const auto& lastBox = group.back();
if (lastBox.start > box.finish || lastBox.finish < box.start) {
group.push_back(box);
groupFound = true;
break;
}
}
if (!groupFound) {
groups.push_back({box});
}
}
for (auto& group : groups) {
auto grpMemMngr =
std::make_shared<DnnlMemoryMngr>(std::unique_ptr<MemoryMngrWithReuse>(new MemoryMngrWithReuse()));
for (auto& box : group) {
for (auto& edge : edge_clusters[box.id]) {
if (edge->getStatus() == Edge::Status::NeedAllocation) {
edge->allocate(grpMemMngr);
}
}
}
}
}
}
void Graph::Allocate() {
@@ -774,9 +804,6 @@ void Graph::Allocate() {
// Allocate memory space for all edges marked with NeedAllocation
AllocateWithReuse();
// Create dummy memory with undefined desc for edges that need allocation but have not been allocated within the mem solver
for (auto& edge : graphEdges) edge->allocate();
// Resolve all other edges with status NotAllocated and in-place
for (auto& node : graphNodes) node->resolveInPlaceEdges();

View File

@@ -431,12 +431,8 @@ void LegacyInferRequest::SetBlob(const std::string& name, const InferenceEngine:
IE_THROW(ParameterMismatch) << "Failed to set input blob. Blocking descriptor mismatch.";
}
auto pBlob = MemoryDescUtils::interpretAsBlob(graph->getInputNodeByName(name)->getChildEdgesAtPort(0)[0]->getMemory());
if (!pBlob) {
IE_THROW() << "Blob returned after trying to interpret input node's memory is nullable. Input node name: " << name;
}
if (data->getTensorDesc() == pBlob->getTensorDesc() &&
auto pBlobDesc = MemoryDescUtils::interpretAsBlobDesc(graph->getInputNodeByName(name)->getChildEdgesAtPort(0)[0]->getMemory());
if (data->getTensorDesc() == pBlobDesc &&
graph->_normalizePreprocMap.find(name) == graph->_normalizePreprocMap.end() && !graph->getProperty().batchLimit) {
externalPtr[name] = data->buffer();
} else if (externalPtr.find(name) != externalPtr.end()) {
@@ -469,11 +465,8 @@ void LegacyInferRequest::SetBlob(const std::string& name, const InferenceEngine:
IE_THROW(ParameterMismatch) << "Failed to set output blob. Blocking descriptor mismatch.";
}
auto pBlob = MemoryDescUtils::interpretAsBlob(graph->getOutputNodeByName(name)->getParentEdgesAtPort(0)[0]->getMemory());
if (!pBlob)
IE_THROW() << "Blob returned after trying to interpret output node's memory is nullable. Output node name: " << name;
if (data->getTensorDesc() == pBlob->getTensorDesc() &&
auto pBlobDesc = MemoryDescUtils::interpretAsBlobDesc(graph->getOutputNodeByName(name)->getParentEdgesAtPort(0)[0]->getMemory());
if (data->getTensorDesc() == pBlobDesc &&
!graph->getProperty().batchLimit) {
externalPtr[name] = data->buffer();
} else if (externalPtr.find(name) != externalPtr.end()) {
@@ -502,12 +495,8 @@ InferenceEngine::Blob::Ptr LegacyInferRequest::GetBlob(const std::string& name)
}
if (_inputs.find(name) == _inputs.end()) {
auto pBlob = MemoryDescUtils::interpretAsBlob(graph->getInputNodeByName(name)->getChildEdgesAtPort(0)[0]->getMemory());
if (!pBlob) {
IE_THROW() << "Blob returned after trying to interpret input node's memory is nullable. Input node name: " << name;
}
InferenceEngine::TensorDesc desc = pBlob->getTensorDesc();
auto pBlobDesc = MemoryDescUtils::interpretAsBlobDesc(graph->getInputNodeByName(name)->getChildEdgesAtPort(0)[0]->getMemory());
InferenceEngine::TensorDesc desc = pBlobDesc;
if (_networkInputs.find(name) != _networkInputs.end()) {
InferenceEngine::Layout l = _networkInputs[name]->getLayout();
@@ -519,7 +508,7 @@ InferenceEngine::Blob::Ptr LegacyInferRequest::GetBlob(const std::string& name)
_inputs[name] = make_blob_with_precision(desc);
_inputs[name]->allocate();
if (pBlob->getTensorDesc() == desc &&
if (pBlobDesc == desc &&
graph->_normalizePreprocMap.find(name) == graph->_normalizePreprocMap.end() && !graph->getProperty().batchLimit) {
externalPtr[name] = _inputs[name]->buffer();
}
@@ -547,11 +536,7 @@ InferenceEngine::Blob::Ptr LegacyInferRequest::GetBlob(const std::string& name)
if (graph->hasOutputWithName(name)) {
if (_outputs.find(name) == _outputs.end()) {
auto pBlob = MemoryDescUtils::interpretAsBlob(graph->getOutputNodeByName(name)->getParentEdgesAtPort(0)[0]->getMemory());
if (!pBlob) {
IE_THROW() << "Blob returned after trying to interpret output node's memory is nullable. Output node name: " << name;
}
auto pBlobDesc = MemoryDescUtils::interpretAsBlobDesc(graph->getOutputNodeByName(name)->getParentEdgesAtPort(0)[0]->getMemory());
if (!data) {
InferenceEngine::TensorDesc desc = _networkOutputs[name]->getTensorDesc();
desc.setPrecision(normalizeToSupportedPrecision(desc.getPrecision()));
@@ -566,7 +551,7 @@ InferenceEngine::Blob::Ptr LegacyInferRequest::GetBlob(const std::string& name)
data = make_blob_with_precision(desc);
data->allocate();
} else {
const auto& expectedTensorDesc = pBlob->getTensorDesc();
const auto& expectedTensorDesc = pBlobDesc;
if (expectedTensorDesc.getPrecision() != data->getTensorDesc().getPrecision()) {
IE_THROW(ParameterMismatch) << "Network input and output use the same name: " << name << " but expect blobs with different precision: "
@@ -586,7 +571,7 @@ InferenceEngine::Blob::Ptr LegacyInferRequest::GetBlob(const std::string& name)
}
_outputs[name] = data;
if (!externalPtr.count(name) && data->getTensorDesc() == pBlob->getTensorDesc() && !graph->getProperty().batchLimit) {
if (!externalPtr.count(name) && data->getTensorDesc() == pBlobDesc && !graph->getProperty().batchLimit) {
externalPtr[name] = data->buffer();
}
}

View File

@@ -98,6 +98,13 @@ InferenceEngine::Blob::Ptr MemoryDescUtils::interpretAsBlob(const Memory &mem) {
return make_blob_with_precision(desc, mem.GetData());
}
InferenceEngine::TensorDesc MemoryDescUtils::interpretAsBlobDesc(const Memory &mem) {
// Convert the internal memory descriptor and rebuild the TensorDesc with the
// memory's static dims so callers get a fully-resolved descriptor.
const auto& internalDesc = mem.getDesc();
const auto converted = convertToTensorDesc(internalDesc);
return InferenceEngine::TensorDesc(converted.getPrecision(),
internalDesc.getShape().getStaticDims(),
converted.getBlockingDesc());
}
InferenceEngine::TensorDesc MemoryDescUtils::convertToTensorDesc(const MemoryDesc& desc) {
if (auto blockingDesc = dynamic_cast<const BlockedMemoryDesc*>(&desc)) {
InferenceEngine::BlockingDesc blkDesc = desc.getShape().hasZeroDims() ? InferenceEngine::BlockingDesc(blockingDesc->getBlockDims(),

View File

@@ -66,6 +66,13 @@ public:
*/
static InferenceEngine::Blob::Ptr interpretAsBlob(const Memory& mem);
/**
* @brief Creates InferenceEngine::TensorDesc from Memory
* @param mem Memory from which the InferenceEngine::TensorDesc will be created
* @return InferenceEngine::TensorDesc
*/
static InferenceEngine::TensorDesc interpretAsBlobDesc(const Memory& mem);
/**
* @brief Converts MemoryDesc to InferenceEngine::TensorDesc
* @param desc MemoryDesc to be converted

View File

@@ -266,11 +266,7 @@ void Split::prepareParams() {
continue;
}
if (uint8_t* dstData = reinterpret_cast<uint8_t*>(outMemPtr->GetPtr())) {
dstMemPtrs.emplace_back(port, dstData);
} else {
THROW_ERROR << "can't get child edge indx " << port << "data.";
}
dstMemPtrs.emplace_back(port, outMemPtr);
if (!canUseOptimizedNspc2Ncsp) {
outDescs.push_back(outMemPtr->GetDescWithType<BlockedMemoryDesc>());
@@ -306,7 +302,7 @@ void Split::execute(dnnl::stream strm) {
uint8_t* srcData = reinterpret_cast<uint8_t*>(srcMem.GetPtr());
IE_ASSERT(execPtr != nullptr);
execPtr->exec(srcData, dstMemPtrs, batch, MB);
execPtr->exec(srcData, getRawDstMemPtrs(), batch, MB);
}
bool Split::created() const {
@@ -506,7 +502,7 @@ void Split::optimizedNspc2Ncsp(size_t MB) {
const size_t strideOC = DHW * dataSize;
for (size_t i = 0, sIdx = 0; i < dstMemPtrs.size(); i++) {
auto dstData = dstMemPtrs[i].second;
auto dstData = reinterpret_cast<uint8_t*>(dstMemPtrs[i].second->GetPtr());
size_t innerSize = 1;
auto dims = getChildEdgesAtPort(dstMemPtrs[i].first)[0]->getMemory().getStaticDims();
@@ -533,6 +529,17 @@ void Split::optimizedNspc2Ncsp(size_t MB) {
}
}
std::vector<uint8_t*> Split::getRawDstMemPtrs() const {
// Collect the raw data pointers of all destination memories, failing fast
// on any destination whose data pointer is not available.
std::vector<uint8_t*> rawPtrs;
rawPtrs.reserve(dstMemPtrs.size());
for (const auto& portAndMem : dstMemPtrs) {
auto* dataPtr = reinterpret_cast<uint8_t*>(portAndMem.second->GetPtr());
if (!dataPtr) {
THROW_ERROR << "can't get child edge indx " << portAndMem.first << " data.";
}
rawPtrs.push_back(dataPtr);
}
return rawPtrs;
}
Split::SplitOptimizedExecutor::SplitOptimizedExecutor(BlockedMemoryDescCPtr inDesc, const std::vector<BlockedMemoryDescCPtr> &outDescs,
const size_t axis) {
// find axis order position
@@ -576,14 +583,14 @@ Split::SplitOptimizedExecutor::SplitOptimizedExecutor(BlockedMemoryDescCPtr inDe
}
}
void Split::SplitOptimizedExecutor::exec(const uint8_t* srcData, const std::vector<std::pair<size_t, uint8_t*>> &dstMemPtrs,
void Split::SplitOptimizedExecutor::exec(const uint8_t* srcData, const std::vector<uint8_t*>& dstRawMemPtrs,
const Dim origBatch, const Dim perInferBatch) {
size_t execCountStrides = countStrides;
if (origBatch != perInferBatch)
execCountStrides = execCountStrides / origBatch * perInferBatch;
parallel_for2d(dstMemPtrs.size(), execCountStrides, [&](size_t i, size_t j) {
uint8_t* dstData = dstMemPtrs[i].second;
parallel_for2d(dstRawMemPtrs.size(), execCountStrides, [&](size_t i, size_t j) {
uint8_t* dstData = dstRawMemPtrs[i];
cpu_memcpy(&dstData[j * dataSize[i]],
&srcData[srcDataOffsets[i] + j * srcDataStride],

View File

@@ -36,7 +36,7 @@ public:
private:
struct SplitExecutor {
virtual void exec(const uint8_t* srcData, const std::vector<std::pair<size_t, uint8_t*>> &dstMemPtrs,
virtual void exec(const uint8_t* srcData, const std::vector<uint8_t*>& dstRawMemPtrs,
const Dim origBatch, const Dim perInferBatch) = 0;
virtual ~SplitExecutor() = default;
};
@@ -45,7 +45,7 @@ private:
struct SplitOptimizedExecutor : public SplitExecutor {
public:
SplitOptimizedExecutor(BlockedMemoryDescCPtr inDesc, const std::vector<BlockedMemoryDescCPtr> &outDescs, const size_t axis);
void exec(const uint8_t* srcData, const std::vector<std::pair<size_t, uint8_t*>> &dstMemPtrs,
void exec(const uint8_t* srcData, const std::vector<uint8_t*>& dstRawMemPtrs,
const Dim origBatch, const Dim perInferBatch) override;
private:
@@ -56,11 +56,12 @@ private:
};
void optimizedNspc2Ncsp(size_t MB);
std::vector<uint8_t*> getRawDstMemPtrs() const;
bool canUseOptimizedNspc2Ncsp = false;
size_t axis = 1;
std::vector<std::pair<size_t, uint8_t*>> dstMemPtrs;
std::vector<std::pair<size_t, MemoryCPtr>> dstMemPtrs;
size_t INPUTS_NUM = 2;
};