[CPU] Zero-copy optimizations for model outputs (#18476)

- Implement zero-copy output between the plugin graph and the infer request, eliminating memory-copy overhead and improving performance
- Implement a double buffer for InferRequest outputs (see the sketch below)
Author: cecilia peng
Date: 2023-07-25 14:33:48 +08:00
Committed by: GitHub
parent fdfafbb7d2
commit 7fbd3a7ebf
25 changed files with 1239 additions and 95 deletions
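
At a high level, each graph output edge is now backed by a ProxyMemoryMngr, so the infer request can redirect the graph's output storage to a user-visible buffer, and PullOutputData can skip the copy when both sides alias the same pointer. Below is a minimal, self-contained toy model of that control flow; all types here (IMngr, HeapMngr, ProxyMngr) are simplified stand-ins invented for illustration, not the plugin's real classes.

#include <cassert>
#include <cstddef>
#include <iostream>
#include <memory>
#include <vector>

struct IMngr {                        // stand-in for IMemoryMngr
    virtual ~IMngr() = default;
    virtual void* ptr() = 0;
    virtual void resize(size_t n) = 0;
};

struct HeapMngr : IMngr {             // stand-in for MemoryMngrWithReuse
    std::vector<char> buf;
    void* ptr() override { return buf.data(); }
    void resize(size_t n) override { if (n > buf.size()) buf.resize(n); }
};

struct ProxyMngr : IMngr {            // stand-in for ProxyMemoryMngr
    std::shared_ptr<IMngr> target = std::make_shared<HeapMngr>();
    size_t size = 0;
    void* ptr() override { return target->ptr(); }
    void resize(size_t n) override { size = n; target->resize(n); }
    void redirect(std::shared_ptr<IMngr> t) {   // like setMemMngr()
        target = std::move(t);
        target->resize(size);   // keep capacity even though the shape did not change
    }
};

int main() {
    ProxyMngr graph_output;                           // graph writes through this proxy
    graph_output.resize(64);
    auto user_buffer = std::make_shared<HeapMngr>();  // buffer handed out to the user
    graph_output.redirect(user_buffer);               // inject: graph now writes in place
    // PullOutputData-style check: same pointer means no copy is needed.
    assert(graph_output.ptr() == user_buffer->ptr());
    std::cout << "zero-copy: " << (graph_output.ptr() == user_buffer->ptr()) << "\n";
}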

View File

@@ -105,7 +105,7 @@ void CommonOptimizations::SplitDimensionM(const std::shared_ptr<ov::snippets::op
const auto needed_new_dim = m_dim / batch_dim_multiplier; // m / (LCM(b, nthrs) / b) - needed factors of dimension m
auto is_optimized = [&](size_t batch_m_dim, size_t new_m_dim) {
return batch_m_dim != 1 && new_m_dim >= optimal_m_dim;
return batch_m_dim != 1 && new_m_dim >= static_cast<size_t>(optimal_m_dim);
};
if (batch_dim_multiplier * needed_new_dim == m_dim) {

View File

@@ -71,9 +71,9 @@ ov::SoPtr<ITensor> make_tensor(const std::shared_ptr<InferenceEngine::Blob>& ten
const InferenceEngine::Blob* get_hardware_blob(const InferenceEngine::Blob* blob);
InferenceEngine::Blob* get_hardware_blob(InferenceEngine::Blob* blob);
std::shared_ptr<InferenceEngine::Blob> tensor_to_blob(const ov::SoPtr<ITensor>& tensor,
bool unwrap = true,
InferenceEngine::TensorDesc desc = {});
OPENVINO_RUNTIME_API std::shared_ptr<InferenceEngine::Blob> tensor_to_blob(const ov::SoPtr<ITensor>& tensor,
bool unwrap = true,
InferenceEngine::TensorDesc desc = {});
/** @endcond */
IE_SUPPRESS_DEPRECATED_END

View File

@@ -260,8 +260,9 @@ void ov::ISyncInferRequest::check_tensor(const ov::Output<const ov::Node>& port,
" expecting ",
port.get_shape(),
".");
OPENVINO_ASSERT(std::dynamic_pointer_cast<ov::IRemoteTensor>(tensor._ptr) || tensor->data() != nullptr,
"Tensor data equal nullptr!");
OPENVINO_ASSERT(
std::dynamic_pointer_cast<ov::IRemoteTensor>(tensor._ptr) || tensor->data() != nullptr || is_dynamic,
"Tensor data equal nullptr!");
}
void ov::ISyncInferRequest::allocate_tensor(

View File

@@ -298,7 +298,9 @@ BlockingDesc::BlockingDesc(const SizeVector& blocked_dims,
this->offsetPaddingToData = dimOffsets;
// check that strides are valid
{
if (!std::any_of(blocked_dims.begin(), blocked_dims.end(), [](const size_t dim) {
return dim == 0ul;
})) {
size_t denseStride = 1;
for (size_t i = 1; i <= strides.size(); i++) {

View File

@@ -32,6 +32,7 @@ namespace ov {
namespace intel_cpu {
class Memory;
class ProxyMemoryMngr;
/**
* @interface IMemoryMngr
@@ -313,6 +314,7 @@ public:
private:
friend DnnlMemoryMngr;
friend ProxyMemoryMngr;
private:
void update();

View File

@@ -0,0 +1,98 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "cpu_tensor.h"
#include "ie_ngraph_utils.hpp"
#include "utils/debug_capabilities.h"
namespace ov {
namespace intel_cpu {
Tensor::Tensor(MemoryPtr memptr) : m_memptr{memptr} {
OPENVINO_ASSERT(m_memptr != nullptr);
// Only the plain data format (ncsp) is supported.
auto memdesc = m_memptr->getDescPtr();
OPENVINO_ASSERT(memdesc->hasLayoutType(LayoutType::ncsp), "intel_cpu::Tensor only supports memory with ncsp layout.");
m_element_type = InferenceEngine::details::convertPrecision(memdesc->getPrecision());
}
void Tensor::set_shape(ov::Shape new_shape) {
const auto& shape = m_memptr->getDescPtr()->getShape();
if (shape.isStatic()) {
DEBUG_LOG("tensor's memory object ", m_memptr.get(), ", ", vec2str(shape.getStaticDims()), " -> ", new_shape.to_string());
if (shape.getStaticDims() == new_shape) return;
}
auto desc = m_memptr->getDescPtr();
const auto newDesc = desc->cloneWithNewDims(new_shape, true);
m_memptr->redefineDesc(newDesc);
}
const ov::element::Type& Tensor::get_element_type() const {
return m_element_type;
}
const ov::Shape& Tensor::get_shape() const {
auto& shape = m_memptr->getDescPtr()->getShape();
OPENVINO_ASSERT(shape.isStatic(), "intel_cpu::Tensor has dynamic shape.");
std::lock_guard<std::mutex> guard(m_lock);
m_shape = ov::Shape{shape.getStaticDims()};
return m_shape;
}
size_t Tensor::get_size() const {
auto& desc = m_memptr->getDesc();
return desc.getShape().getElementsCount();
}
size_t Tensor::get_byte_size() const {
auto& desc = m_memptr->getDesc();
return desc.getCurrentMemSize();
}
const ov::Strides& Tensor::get_strides() const {
OPENVINO_ASSERT(m_memptr->getDescPtr()->isDefined(), "intel_cpu::Tensor requires memory with defined strides.");
std::lock_guard<std::mutex> guard(m_lock);
update_strides();
return m_strides;
}
void Tensor::update_strides() const {
auto blocked_desc = m_memptr->getDescWithType<BlockedMemoryDesc>();
OPENVINO_ASSERT(blocked_desc, "not a valid blocked memory descriptor.");
auto& strides = blocked_desc->getStrides();
m_strides.resize(strides.size());
std::transform(strides.cbegin(), strides.cend(), m_strides.begin(),
std::bind1st(std::multiplies<size_t>(), m_element_type.size()));
}
void* Tensor::data(const element::Type& element_type) const {
if (element_type != element::undefined && element_type != element::dynamic) {
OPENVINO_ASSERT(element_type == get_element_type(),
"Tensor data with element type ",
get_element_type(),
", is not representable as pointer to ",
element_type);
}
return m_memptr->getData();
}
/**
* @brief Creates tensor on graph memory
*
* @param mem Memory object
*
* @return Shared pointer to tensor interface
*/
std::shared_ptr<ITensor> make_tensor(MemoryPtr mem) {
return std::make_shared<Tensor>(mem);
}
} // namespace intel_cpu
} // namespace ov
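
As a usage note: the wrapper is constructed over a plain-layout (ncsp) Memory object; the Tensor constructor above rejects anything else. A short fragment, modeled on the create_memory() helper from the tests added later in this commit, assuming the plugin-internal headers it uses:

#include "cpu_memory.h"
#include "cpu_tensor.h"

using namespace ov::intel_cpu;

std::shared_ptr<ov::ITensor> make_f32_tensor() {
    dnnl::engine eng(dnnl::engine::kind::cpu, 0);
    auto desc = std::make_shared<CpuBlockedMemoryDesc>(InferenceEngine::Precision::FP32,
                                                       Shape{4, 3, 2});
    auto mem = std::make_shared<Memory>(eng, desc);
    return make_tensor(mem);   // plain ncsp layout, as required by the Tensor ctor
}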

View File

@@ -0,0 +1,48 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "openvino/runtime/itensor.hpp"
#include "cpu_memory.h"
namespace ov {
namespace intel_cpu {
class Tensor : public ITensor {
public:
// Only plain data format is supported.
explicit Tensor(MemoryPtr memptr);
void set_shape(ov::Shape shape) override;
const ov::element::Type& get_element_type() const override;
const ov::Shape& get_shape() const override;
size_t get_size() const override;
size_t get_byte_size() const override;
const ov::Strides& get_strides() const override;
void* data(const element::Type& type = {}) const override;
MemoryPtr get_memory() {return m_memptr;}
private:
void update_strides() const;
MemoryPtr m_memptr;
ov::element::Type m_element_type;
mutable ov::Shape m_shape;
mutable ov::Strides m_strides;
mutable std::mutex m_lock;
};
std::shared_ptr<ITensor> make_tensor(MemoryPtr mem);
} // namespace intel_cpu
} // namespace ov

View File

@@ -515,6 +515,13 @@ EdgePtr Edge::getBaseEdge(int look) {
if (edge->inPlace() && edge != edgesForSamePort[0]) return edge;
}
}
// Return the first output edge as the base if there are no inPlace consumers,
// which benefits zero-copy of outputs.
for (auto edge : edgesForSamePort) {
if (Type::Output == edge->getChild()->getType()) return edge;
}
return edgesForSamePort[0];
}

View File

@@ -812,8 +812,34 @@ void Graph::AllocateWithReuse() {
}
if (!undefinedBoxes.empty()) {
// Use proxy memory manager for output edges
for (auto& box : undefinedBoxes) {
for (auto& edge : edge_clusters[box.id]) {
const auto child = edge->getChild();
if (child->getType() == Type::Output &&
edge->getStatus() == Edge::Status::NeedAllocation) {
auto proxyMemMngr =
std::make_shared<ProxyMemoryMngr>();
DEBUG_LOG("ProxyMemoryMngr ", proxyMemMngr, " ", this);
edge->allocate(proxyMemMngr);
// Store the output memory managers
// so that the infer requests can access them.
int count = 0;
for (auto &output : outputNodesMap) {
if (output.second == child) {
outputNodesMemMngrMap[output.first] = proxyMemMngr;
count++;
}
}
// sometimes there are unused output ports.
IE_ASSERT(count <= 1) << "cannot find output node. count " << count;
}
}
}
if (!syncNodesInds.empty()) {
//We have to extend the lifespan of thensors that are crossing a sync point border in order to save
//We have to extend the lifespan of tensors that are crossing a sync point border in order to save
//the intermediate computation results from possible loss due to the tensor resize
std::vector<int> vecIntervals = {0};
for (const auto& item : syncNodesInds) {
@@ -990,6 +1016,7 @@ void Graph::PushInputData(const std::string& name, const InferenceEngine::Blob::
}
}
// assume the infer_request's intel_cpu::Tensor is always shared with the Graph when isDynamic.
void Graph::PullOutputData(BlobMap &out) {
if (!IsReady())
IE_THROW() << "Wrong state. Topology not ready.";
@@ -1006,6 +1033,8 @@ void Graph::PullOutputData(BlobMap &out) {
IE_THROW(Unexpected) << "The CPU plugin graph doesn't contain output node with name: \"" << name << "\"";
}
DEBUG_LOG(name, ", blob ", out[name], ", addr ", static_cast<void*>(out[name]->buffer()));
const auto actualDesc = MemoryDescUtils::convertToTensorDesc(intr_blob.getDesc());
auto &expectedDesc = ext_blob->getTensorDesc();
@@ -1029,7 +1058,12 @@ void Graph::PullOutputData(BlobMap &out) {
if (expectedDesc.getLayout() == InferenceEngine::Layout::BLOCKED) {
expectedDesc = TensorDesc(expectedDesc.getPrecision(), expectedDesc.getLayout());
}
DEBUG_LOG(name, ", blob ", out[name], ", addr ", static_cast<void*>(out[name]->buffer()),
" dims ", PartialShape(out[name]->getTensorDesc().getDims()), " -> ", PartialShape(outDims),
", intr ptr ", intr_blob.getData(), " , parentedge's memory object ", parentEdge->getMemoryPtr().get());
out[name]->setShape(outDims);
DEBUG_LOG(name, ", blob ", out[name], ", addr ", static_cast<void*>(out[name]->buffer()),
" dims ", PartialShape(out[name]->getTensorDesc().getDims()), ", intr ptr ", intr_blob.getData());
}
// check for empty output blob
@@ -1047,6 +1081,8 @@ void Graph::PullOutputData(BlobMap &out) {
void *ext_blob_ptr = ext_blob->buffer();
void *intr_blob_ptr = intr_blob.getData();
DEBUG_LOG(name, " @ ", intr_blob_ptr, " -> ", ext_blob_ptr, " zero-copy: ", intr_blob_ptr == ext_blob_ptr, " graph ", this, "\r\n");
// That is the same memory. No need to copy
if (ext_blob_ptr == intr_blob_ptr) continue;
@@ -1313,13 +1349,12 @@ inline void Graph::ExecuteNode(const NodePtr& node, const dnnl::stream& stream)
DUMP(node, getConfig().debugCaps, infer_count);
OV_ITT_SCOPED_TASK(itt::domains::intel_cpu, node->profiling.execute);
DEBUG_LOG(*node);
if (node->isDynamicNode()) {
node->executeDynamic(stream);
} else {
node->execute(stream);
}
DEBUG_LOG(*node);
}
void Graph::Infer(InferRequestBase* request) {

View File

@@ -19,6 +19,8 @@
#include <memory>
#include <atomic>
#include "proxy_mem_mgr.h"
namespace ov {
namespace intel_cpu {
@@ -190,6 +192,8 @@ public:
return graphHasDynamicInput;
}
Status getStatus() const {return status;}
protected:
void VisitNode(NodePtr node, std::vector<NodePtr>& sortedNodes);
@@ -248,6 +252,8 @@ private:
std::map<std::string, NodePtr> inputNodesMap;
std::map<std::string, NodePtr> outputNodesMap;
std::unordered_map<std::string, ProxyMemoryMngrPtr> outputNodesMemMngrMap;
// these node pointers (from graphNodes) are to avoid regular checking for
// constantness of nodes in Infer methods and calls of
// non-executable (optimized out) nodes, such as Input, Reshape, etc.

View File

@@ -25,6 +25,9 @@
#include "memory_desc/dnnl_blocked_memory_desc.h"
#include <transformations/utils/utils.hpp>
#include <ie_ngraph_utils.hpp>
#include "proxy_mem_mgr.h"
#include "openvino/runtime/make_tensor.hpp"
#include <utils/general_utils.h>
namespace ov {
namespace intel_cpu {
@@ -182,6 +185,13 @@ void InferRequestBase::InferImpl() {
ThrowIfCanceled();
// update output control blocks, if any, in order to refresh internal buffers
if (Graph::Status::ReadyDynamic == graph->getStatus()) {
for (auto&& item : outputControlBlocks) {
item.second.update();
}
}
graph->PullOutputData(_outputs);
}
@@ -202,93 +212,137 @@ static inline void changeEdgePtr(const EdgePtr &edge, InferenceEngine::Blob::Ptr
}
void InferRequestBase::changeDefaultPtr() {
const auto& inputNodesMap = graph->GetInputNodesMap();
const auto& outputNodesMap = graph->GetOutputNodesMap();
for (auto& it : externalPtr) {
const auto& inputNodesMap = graph->GetInputNodesMap();
auto input = inputNodesMap.find(it.first);
if (input != inputNodesMap.end()) {
NodePtr inputNodePtr = input->second;
if (inputNodePtr->getChildEdgeAt(0)->getMemory().getData() == static_cast<void*>(it.second->buffer()))
continue;
auto& childEdges = inputNodePtr->getChildEdges();
// Perform checks that the user's memory will not be modified
bool canBeInPlace = true;
for (auto& childEdge : childEdges) {
auto ce = childEdge.lock();
if (!ce)
if (inputNodesMap.end() == input) {
OPENVINO_ASSERT(outputNodesMap.count(it.first), "Cannot find input/output blob: ", it.first);
continue;
}
NodePtr inputNodePtr = input->second;
if (inputNodePtr->getChildEdgeAt(0)->getMemory().getData() == static_cast<void*>(it.second->buffer()))
continue;
auto& childEdges = inputNodePtr->getChildEdges();
// Perform checks that the user's memory will not be modified
bool canBeInPlace = true;
for (auto& childEdge : childEdges) {
auto ce = childEdge.lock();
if (!ce)
IE_THROW() << "Node " << inputNodePtr->getName() << " contains empty child edge";
auto& child = ce->getChild();
if (child->isConstant()) {
canBeInPlace = false;
break;
}
// the input memory should be referenced by the children, otherwise it should be written to a
// specific location
if (ce->inPlace(Edge::LOOK_DOWN)) {
canBeInPlace = false;
break;
}
if (auto result = ce->modifiedInPlace()) {
canBeInPlace = false;
break;
}
if (child->getType() == Type::Concatenation && child->isInPlace()) {
canBeInPlace = false;
break;
}
}
if (canBeInPlace) {
for (auto& edge : childEdges) {
auto e = edge.lock();
if (!e)
IE_THROW() << "Node " << inputNodePtr->getName() << " contains empty child edge";
auto& child = ce->getChild();
if (child->isConstant()) {
canBeInPlace = false;
break;
}
// the input memory should be referenced by the children, otherwise it should be written to a
// specific location
if (ce->inPlace(Edge::LOOK_DOWN)) {
canBeInPlace = false;
break;
}
if (auto result = ce->modifiedInPlace()) {
canBeInPlace = false;
break;
}
if (child->getType() == Type::Concatenation && child->isInPlace()) {
canBeInPlace = false;
break;
}
changeEdgePtr(e, it.second);
}
if (canBeInPlace) {
for (auto& edge : childEdges) {
auto e = edge.lock();
if (!e)
IE_THROW() << "Node " << inputNodePtr->getName() << " contains empty child edge";
}
}
changeEdgePtr(e, it.second);
}
}
for (auto& it : externalPtr) {
const auto& name = it.first;
auto output = outputNodesMap.find(name);
if (outputNodesMap.end() == output) {
continue;
}
auto parentEdge = output->second->getParentEdgeAt(0);
const auto& outputNodesMap = graph->GetOutputNodesMap();
auto output = outputNodesMap.find(it.first);
if (output != outputNodesMap.end()) {
auto parentEdge = output->second->getParentEdgeAt(0);
if (parentEdge->getMemory().getData() == static_cast<void*>(it.second->buffer()))
continue;
if (parentEdge->getMemory().getData() == static_cast<void*>(it.second->buffer()))
continue;
bool canBeInPlace = true;
void* defaultPtr = parentEdge->getMemory().getData();
// Cannot be in-place after concat because concat is using different ptrs without offsets
auto parent = parentEdge->getParent();
NodePtr previousParent;
do {
previousParent = parent;
if (parent->getChildEdges().size() != 1 || parent->isConstant() || parent->isInPlace()) {
canBeInPlace = false;
bool canBeInPlace = true;
void* defaultPtr = parentEdge->getMemory().getData();
// Cannot be in-place after concat because concat is using different ptrs without offsets
auto parent = parentEdge->getParent();
NodePtr previousParent;
do {
previousParent = parent;
if (parent->getChildEdges().size() != 1 || parent->isConstant() || parent->isInPlace()) {
canBeInPlace = false;
break;
}
auto& parentEdges = parent->getParentEdges();
for (auto& edge : parentEdges) {
auto e = edge.lock();
if (!e)
IE_THROW() << "Node " << parent->getName() << " contains empty parent edge";
if (e->getMemory().getData() == defaultPtr) {
parent = e->getParent();
break;
}
}
} while (previousParent != parent);
if (canBeInPlace)
changeEdgePtr(parentEdge, it.second);
}
auto& parentEdges = parent->getParentEdges();
for (auto& edge : parentEdges) {
auto e = edge.lock();
if (!e)
IE_THROW() << "Node " << parent->getName() << " contains empty parent edge";
if (Graph::Status::ReadyDynamic == graph->getStatus()) {
const auto &outMemMngrMap = graph->outputNodesMemMngrMap;
for (auto&& item : outMemMngrMap) {
const auto& name = item.first;
if (e->getMemory().getData() == defaultPtr) {
parent = e->getParent();
break;
// share the intel_cpu::Tensor with the Graph by injecting it into the corresponding ProxyMemoryMngr instance.
auto outputMemMngr = item.second;
OPENVINO_ASSERT(outputMemMngr, "proxy mem manager for output ", name, " is empty.");
auto controlBlockItr = outputControlBlocks.find(name);
if (controlBlockItr != outputControlBlocks.end()) {
auto output = outputNodesMap.find(name);
OPENVINO_ASSERT(outputNodesMap.end() != output, "Node with name: ", name, " is absent in the outputNodesMap");
auto parentEdge = output->second->getParentEdgeAt(0);
//avoid cyclic memory use
auto parentNode = parentEdge->getParent();
const auto& parentNodeInpEdges = parentNode->getParentEdges();
std::unordered_set<const void*> parentInputPtrs(parentNodeInpEdges.size());
for (auto&& edge : parentNodeInpEdges) {
if (auto edgePtr = edge.lock()) {
parentInputPtrs.insert(edgePtr->getMemoryPtr()->getData());
}
}
} while (previousParent != parent);
if (canBeInPlace)
changeEdgePtr(parentEdge, it.second);
continue;
auto&& controlBlock = controlBlockItr->second;
std::shared_ptr<IMemoryMngr> memMngr = parentInputPtrs.count(controlBlock.rawPtr()) ? // same memory is used on the input and output
controlBlock.nextMemMngr() : // then swap internal buffer to avoid data corruption
controlBlock.currentMemMngr(); // else reuse the existing buffer
outputMemMngr->setMemMngr(memMngr);
DEBUG_LOG("reset proxy ", outputMemMngr, ", actual ", controlBlock.currentMemMngr(), " graph ", graph, " inferrequest ", this);
DEBUG_LOG(name, ", blob ", controlBlock.blob(), ", tensor ", controlBlock.tensor());
} else {
outputMemMngr->reset(); // switch to the internal memory since memory sharing is no longer possible
}
}
IE_THROW() << "Cannot find input/output blob: " << it.first;
}
}
@@ -716,6 +770,7 @@ void InferRequest::SetBlob(const std::string& name, const InferenceEngine::Blob:
externalPtr.erase(name);
}
_outputs[name] = data;
outputControlBlocks.erase(name); // now the memory is under user's control
}
}
@@ -774,22 +829,39 @@ InferenceEngine::Blob::Ptr InferRequest::GetBlob(const std::string& name) {
if (_outputs.find(name) == _outputs.end()) {
auto outputNode = modelOutputsMap.find(name);
if (modelOutputsMap.find(name) != modelOutputsMap.end()) {
const auto shape = outputNode->second->get_input_partial_shape(0);
bool isDynamic = shape.is_dynamic();
const auto& model_shape = outputNode->second->get_input_partial_shape(0);
const auto& graph_shape = output->second->getInputShapeAtPort(0);
// WA, due to the transformations and constant folding, shape inference of the resulting model may
// have static shapes, while they are dynamic in the initial representation
const auto& shape = graph_shape.isDynamic() ? model_shape :
(model_shape.is_dynamic() ? graph_shape.toPartialShape() : model_shape);
const bool isDynamic = shape.is_dynamic();
if (!data) {
InferenceEngine::SizeVector dims;
if (isDynamic) {
dims = InferenceEngine::SizeVector(shape.rank().get_length(), 0);
const auto model_prec = InferenceEngine::details::convertPrecision(outputNode->second->get_input_element_type(0));
const auto graph_prec = output->second->getParentEdgesAtPort(0)[0]->getMemory().getDesc().getPrecision();
OutputControlBlock control_block{model_prec, Shape{shape}};
DEBUG_LOG(name,
", blob ", control_block.blob(),
", tensor ", control_block.tensor(),
", memmngr ", control_block.tensor()->get_memory()->getMemoryMngr(),
"memory object ", control_block.tensor()->get_memory().get());
data = control_block.blob();
if (model_prec == graph_prec) outputControlBlocks.emplace(std::make_pair(name, std::move(control_block)));
} else {
dims = shape.to_shape();
InferenceEngine::TensorDesc desc(InferenceEngine::details::convertPrecision(outputNode->second->get_input_element_type(0)),
dims, InferenceEngine::TensorDesc::getLayoutByRank(dims.size()));
data = make_blob_with_precision(desc);
data->allocate();
}
InferenceEngine::TensorDesc desc(InferenceEngine::details::convertPrecision(outputNode->second->get_input_element_type(0)),
dims, InferenceEngine::TensorDesc::getLayoutByRank(dims.size()));
data = make_blob_with_precision(desc);
data->allocate();
} else {
const auto& blobDims = data->getTensorDesc().getDims();
// in the static shape case, knowing the shapes are incompatible is enough to throw an exception
@@ -831,9 +903,23 @@ InferenceEngine::Blob::Ptr InferRequest::GetBlob(const std::string& name) {
IE_THROW() << "Cannot find blob with name: " << name;
}
DEBUG_LOG(name, ", blob ", data, ", ", static_cast<void*>(data->buffer()));
return data;
}
void InferRequest::checkBlobs() {
for (auto const& input : _inputs) {
checkBlob(input.second, input.first, true);
}
// won't check dynamic output blobs as they are not allocated.
for (auto const& output : _outputs) {
const auto out_node = findOutputByNodeName(output.first);
const auto isDynamic = out_node && out_node->get_output_partial_shape(0).is_dynamic();
if (!isDynamic) checkBlob(output.second, output.first, false);
}
}
void InferRequest::PushInputData() {
for (auto input : _inputs) {
auto inputName = input.first;
@@ -845,5 +931,22 @@ void InferRequest::PushInputData() {
}
}
InferRequestBase::OutputControlBlock::OutputControlBlock(const InferenceEngine::Precision& precision, const Shape& shape) {
dnnl::engine eng(dnnl::engine::kind::cpu, 0);
m_buffers[m_buffIndx] = std::make_shared<MemoryMngrWithReuse>();
m_proxyMemMngr = std::make_shared<ProxyMemoryMngr>(m_buffers[m_buffIndx]);
Shape memShape = shape.isDynamic() ?
Shape{VectorDims(shape.getRank(), 0)} : // this is a WA since the ITensor doesn't allow dyn shapes
Shape{shape};
CpuBlockedMemoryDescPtr desc =
std::make_shared<CpuBlockedMemoryDesc>(precision, memShape);
auto memory = std::make_shared<Memory>(eng, desc, m_proxyMemMngr);
m_tensor = std::make_shared<Tensor>(memory);
m_blob = tensor_to_blob({m_tensor, nullptr});
}
} // namespace intel_cpu
} // namespace ov

View File

@@ -9,6 +9,7 @@
#include <string>
#include <map>
#include <cpp_interfaces/interface/ie_iinfer_request_internal.hpp>
#include "cpu_tensor.h"
namespace ov {
namespace intel_cpu {
@@ -52,12 +53,65 @@ protected:
InferenceEngine::Precision normToInputSupportedPrec(const std::pair<const std::string, InferenceEngine::Blob::Ptr>& input) const;
void pushInput(const std::string& inputName, InferenceEngine::Blob::Ptr& inputBlob, InferenceEngine::Precision dataType);
protected:
class OutputControlBlock {
public:
using MemMngrPtr = std::shared_ptr<MemoryMngrWithReuse>;
public:
OutputControlBlock(const InferenceEngine::Precision& precision, const Shape& shape);
OutputControlBlock(const OutputControlBlock&) = delete;
OutputControlBlock& operator=(const OutputControlBlock&) = delete;
OutputControlBlock(OutputControlBlock&&) = default;
OutputControlBlock& operator=(OutputControlBlock&&) = default;
InferenceEngine::Blob::Ptr blob() const {
return m_blob;
}
std::shared_ptr<Tensor> tensor() const {
return m_tensor;
}
const void* rawPtr() const {
return m_tensor->get_memory()->getData();
}
MemMngrPtr currentMemMngr() const {
return m_buffers[m_buffIndx];
}
MemMngrPtr nextMemMngr() {
m_buffIndx ^= 0x1;
if (!m_buffers[m_buffIndx]) {
m_buffers[m_buffIndx] = std::make_shared<MemoryMngrWithReuse>();
}
return m_buffers[m_buffIndx];
}
void update() {
m_proxyMemMngr->setMemMngr(currentMemMngr());
}
private:
std::shared_ptr<Tensor> m_tensor = nullptr;
InferenceEngine::Blob::Ptr m_blob = nullptr;
ProxyMemoryMngrPtr m_proxyMemMngr = nullptr;
std::array<MemMngrPtr, 2> m_buffers;
int m_buffIndx = 0;
};
protected:
virtual void initBlobs() = 0;
virtual void PushInputData() = 0;
Graph* graph = nullptr;
std::unordered_map<std::string, InferenceEngine::Blob::Ptr> externalPtr;
std::unordered_map<std::string, OutputControlBlock> outputControlBlocks;
private:
void PushStates();
void PullStates();
@@ -97,6 +151,8 @@ public:
void SetBlobsImpl(const std::string& name, const InferenceEngine::BatchedBlob::Ptr& batched_blob) override;
InferenceEngine::Blob::Ptr GetBlob(const std::string& name) override;
void checkBlobs() override;
private:
void PushInputData() override;
void initBlobs() override;
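
To make the double-buffer mechanics above concrete: nextMemMngr() ping-pongs between two lazily allocated buffers, and changeDefaultPtr() (in infer_request.cpp earlier in this commit) swaps only when the producing node's inputs alias the current output pointer, i.e. the iteration-chaining case. A self-contained toy with stand-in types (Buf, ControlBlock, and pick are illustrative, not the real API):

#include <array>
#include <cassert>
#include <memory>
#include <unordered_set>
#include <vector>

struct Buf { std::vector<char> data = std::vector<char>(64); };
using BufPtr = std::shared_ptr<Buf>;

struct ControlBlock {                       // models OutputControlBlock's buffers
    std::array<BufPtr, 2> buffers{std::make_shared<Buf>(), nullptr};
    int idx = 0;
    BufPtr current() const { return buffers[idx]; }
    BufPtr next() {                         // like nextMemMngr(): flip, allocate lazily
        idx ^= 0x1;
        if (!buffers[idx]) buffers[idx] = std::make_shared<Buf>();
        return buffers[idx];
    }
    const void* rawPtr() const { return buffers[idx]->data.data(); }
};

// Mirrors the selection logic in changeDefaultPtr(): swap the internal buffer
// only when the output memory is also fed back as an input of the producer.
BufPtr pick(ControlBlock& cb, const std::unordered_set<const void*>& parentInputPtrs) {
    return parentInputPtrs.count(cb.rawPtr()) ? cb.next() : cb.current();
}

int main() {
    ControlBlock cb;
    std::unordered_set<const void*> inputs;
    assert(pick(cb, inputs) == cb.current());   // no aliasing: reuse the buffer
    inputs.insert(cb.rawPtr());                 // output chained back as next input
    BufPtr fresh = pick(cb, inputs);            // aliasing detected: swapped buffer
    assert(fresh->data.data() != *inputs.begin());
}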

View File

@@ -338,9 +338,6 @@ void Reorder::execute(dnnl::stream strm) {
} else if (canUseNcsp2Nspc) {
optimizedNcsp2Nspc();
} else {
// src_blocked->setDataHandle(getParentEdgeAt(0)->getMemory().GetData());
// dst_blocked->setDataHandle(getChildEdgeAt(0)->getMemory().GetData());
if (prim) {
prim.execute(strm, primArgs);
} else {

View File

@@ -0,0 +1,74 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "proxy_mem_mgr.h"
#include "utils/debug_capabilities.h"
using namespace ov::intel_cpu;
void ProxyMemoryMngr::setMemMngr(std::shared_ptr<IMemoryMngr> pMngr) {
OPENVINO_ASSERT(pMngr, "Attempt to set null memory manager to a ProxyMemoryMngr object");
if (m_pMngr == pMngr) {
return;
}
m_pMngr = pMngr;
m_pMngr->resize(m_size);
notifyUpdate();
}
void ProxyMemoryMngr::reset() {
if (!m_pOrigMngr) {
m_pOrigMngr = std::make_shared<MemoryMngrWithReuse>();
}
if (m_pMngr == m_pOrigMngr) {
return;
}
m_pMngr = m_pOrigMngr;
m_pMngr->resize(m_size);
notifyUpdate();
}
void* ProxyMemoryMngr::getRawPtr() const noexcept {
return m_pMngr->getRawPtr();
}
void ProxyMemoryMngr::setExtBuff(void* ptr, size_t size) {
m_pMngr->setExtBuff(ptr, size);
notifyUpdate();
}
bool ProxyMemoryMngr::resize(size_t size) {
auto res = m_pMngr->resize(size);
DEBUG_LOG(this, ", ", m_pMngr, " size ", m_size, " -> ", size, " resized? ", res, " RawPtr ", getRawPtr());
m_size = size;
notifyUpdate();
return res;
}
bool ProxyMemoryMngr::hasExtBuffer() const noexcept {
return m_pMngr->hasExtBuffer();
}
void ProxyMemoryMngr::registerMemory(Memory* memPtr) {
if (memPtr) {
m_setMemPtrs.insert(memPtr);
}
}
void ProxyMemoryMngr::unregisterMemory(Memory* memPtr) {
if (memPtr) {
m_setMemPtrs.erase(memPtr);
}
}
void ProxyMemoryMngr::notifyUpdate() {
for (auto& item : m_setMemPtrs) {
if (item) {
item->update();
}
}
}

View File

@@ -0,0 +1,52 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "cpu_memory.h"
namespace ov {
namespace intel_cpu {
/**
* @brief A proxy object that additionally implements observer pattern
*/
class ProxyMemoryMngr : public IMemoryMngrObserver {
public:
ProxyMemoryMngr() : m_pOrigMngr(std::make_shared<MemoryMngrWithReuse>()), m_pMngr(m_pOrigMngr) {}
explicit ProxyMemoryMngr(std::shared_ptr<IMemoryMngr> pMngr) {
OPENVINO_ASSERT(pMngr, "Memory manager is uninitialized");
m_pMngr = pMngr;
}
void* getRawPtr() const noexcept override;
void setExtBuff(void* ptr, size_t size) override;
bool resize(size_t size) override;
bool hasExtBuffer() const noexcept override;
void registerMemory(Memory* memPtr) override;
void unregisterMemory(Memory* memPtr) override;
void setMemMngr(std::shared_ptr<IMemoryMngr> pMngr);
void reset();
private:
void notifyUpdate();
// We keep the original MemMngr since we may fall back to copying the output.
std::shared_ptr<IMemoryMngr> m_pOrigMngr = nullptr;
std::shared_ptr<IMemoryMngr> m_pMngr = nullptr;
std::unordered_set<Memory*> m_setMemPtrs;
// WA: the resize stage might not run because there is no shape change,
// while the underlying memory manager has changed.
size_t m_size = 0ul;
};
using ProxyMemoryMngrPtr = std::shared_ptr<ProxyMemoryMngr>;
using ProxyMemoryMngrCPtr = std::shared_ptr<const ProxyMemoryMngr>;
} // namespace intel_cpu
} // namespace ov
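
The observer relationship mentioned in the @brief works as follows: Memory objects register with the proxy, and any manager change (setMemMngr, reset, setExtBuff, resize) calls notifyUpdate(), which asks every registered Memory to re-read its data pointer. A stripped-down sketch of that pattern, using stand-in types rather than the plugin's real classes:

#include <iostream>
#include <memory>
#include <unordered_set>
#include <vector>

struct Mngr {                            // stand-in for a concrete IMemoryMngr
    std::vector<char> buf = std::vector<char>(16);
    void* raw() { return buf.data(); }
};

struct Mem;

struct Proxy {                           // stand-in for ProxyMemoryMngr
    std::shared_ptr<Mngr> target = std::make_shared<Mngr>();
    std::unordered_set<Mem*> observers;  // like m_setMemPtrs
    void registerMemory(Mem* m) { observers.insert(m); }
    void setMngr(std::shared_ptr<Mngr> t);
};

struct Mem {                             // stand-in for Memory
    Proxy& proxy;
    void* cached = nullptr;
    explicit Mem(Proxy& p) : proxy(p) { p.registerMemory(this); update(); }
    void update() { cached = proxy.target->raw(); }   // like Memory::update()
};

void Proxy::setMngr(std::shared_ptr<Mngr> t) {
    target = std::move(t);
    for (auto* m : observers) m->update();            // notifyUpdate()
}

int main() {
    Proxy proxy;
    Mem mem(proxy);
    auto original = proxy.target;          // keep the old manager alive for comparison
    void* before = mem.cached;
    proxy.setMngr(std::make_shared<Mngr>());          // swap the underlying manager
    std::cout << "pointer refreshed: " << (before != mem.cached) << "\n";
}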

View File

@@ -29,7 +29,7 @@ bool BrgemmBlocking::run(snippets::lowered::LinearIR& linear_ir) {
const auto& loop_manager = linear_ir.get_loop_manager();
const auto dim_idx = 1;
const size_t dim_idx = 1;
auto blocking_loop_exists = [&](const ov::snippets::lowered::ExpressionPtr& expr,
const std::shared_ptr<ov::intel_cpu::BrgemmCPU>& brgemm) {

View File

@@ -254,7 +254,7 @@ std::ostream & operator<<(std::ostream & os, const Node &c_node) {
} else {
// no SPD yet, use original shapes
comma = "";
for (int i = 0; i < node.getOriginalOutputPrecisions().size(); i++) {
for (size_t i = 0; i < node.getOriginalOutputPrecisions().size(); i++) {
auto shape = node.getOutputShapeAtPort(i);
std::string prec_name = "Undef";
prec_name = node.getOriginalOutputPrecisionAtPort(i).name();
@@ -282,6 +282,10 @@ std::ostream & operator<<(std::ostream & os, const Node &c_node) {
auto n = edge->getParent();
os << comma;
os << node_id(*edge->getParent());
auto ptr = edge->getMemoryPtr();
if (ptr) {
os << "_" << ptr->getData();
}
if (!is_single_output_port(*n))
os << "[" << edge->getInputNum() << "]";
comma = ",";

View File

@@ -0,0 +1,42 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <limits.h>
#include "behavior/ov_infer_request/iteration_chaining.hpp"
#include "common_test_utils/test_constants.hpp"
using namespace ov::test::behavior;
namespace {
const std::vector<ov::AnyMap> configs = {
{}
};
const std::vector<ov::AnyMap> HeteroConfigs = {
{ov::device::priorities(CommonTestUtils::DEVICE_CPU)}
};
const std::vector<ov::AnyMap> AutoConfigs = {
{ov::device::priorities(CommonTestUtils::DEVICE_CPU)}
};
INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, OVIterationChaining,
::testing::Combine(
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::ValuesIn(configs)),
OVIterationChaining::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_Hetero_BehaviorTests, OVIterationChaining,
::testing::Combine(
::testing::Values(CommonTestUtils::DEVICE_HETERO),
::testing::ValuesIn(HeteroConfigs)),
OVIterationChaining::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_Auto_BehaviorTests, OVIterationChaining,
::testing::Combine(
::testing::Values(CommonTestUtils::DEVICE_AUTO),
::testing::ValuesIn(AutoConfigs)),
OVIterationChaining::getTestCaseName);
} // namespace

View File

@@ -90,7 +90,8 @@ protected:
auto paramOuts = ngraph::helpers::convert2OutputVector(ngraph::helpers::castOps2Nodes<ngraph::op::Parameter>(inputParams));
auto customOp = std::make_shared<CustomOp>(paramOuts);
ngraph::ResultVector results{std::make_shared<ngraph::opset3::Result>(customOp)};
ngraph::ResultVector results{std::make_shared<ngraph::opset3::Result>(customOp->output(0)),
std::make_shared<ngraph::opset3::Result>(customOp->output(1))};
function = std::make_shared<ngraph::Function>(results, inputParams, "customOpTest");
}

View File

@@ -0,0 +1,258 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <gmock/gmock-spec-builders.h>
#include <gmock/gmock.h>
#include <gtest/gtest-param-test.h>
#include <gtest/gtest.h>
#include <openvino/core/shape.hpp>
#include <openvino/core/strides.hpp>
#include <openvino/core/type/element_type.hpp>
#include "openvino/core/except.hpp"
#include "openvino/core/partial_shape.hpp"
#include "cpu_memory.h"
#include "cpu_tensor.h"
#include "openvino/runtime/itensor.hpp"
#include "ie_ngraph_utils.hpp"
using namespace ov::intel_cpu;
using namespace InferenceEngine;
using CPUTensorTest = ::testing::Test;
class MockBlockedMemoryDesc : public BlockedMemoryDesc {
public:
MockBlockedMemoryDesc(const Shape& _shape) : MemoryDesc(_shape, Blocked) {}
MOCK_METHOD(InferenceEngine::Precision, getPrecision, (), (const, override));
MOCK_METHOD(MemoryDescPtr, clone, (), (const, override));
MOCK_METHOD(size_t, getOffsetPadding, (), (const, override));
MOCK_METHOD(MemoryDescPtr, cloneWithNewDimsImp, (const VectorDims&), (const, override));
MOCK_METHOD(MemoryDescPtr, cloneWithNewPrecision, (const InferenceEngine::Precision), (const, override));
MOCK_METHOD(bool, isCompatible, (const MemoryDesc&), (const, override));
MOCK_METHOD(bool, hasLayoutType, (LayoutType), (const, override));
MOCK_METHOD(size_t, getMaxMemSize, (), (const, override));
MOCK_METHOD(const VectorDims&, getBlockDims, (), (const, override));
MOCK_METHOD(const VectorDims&, getOrder, (), (const, override));
MOCK_METHOD(const VectorDims&, getOffsetPaddingToData, (), (const, override));
MOCK_METHOD(const VectorDims&, getStrides, (), (const, override));
MOCK_METHOD(bool, blocksExtended, (), (const, override));
MOCK_METHOD(size_t, getPaddedElementsCount, (), (const, override));
MOCK_METHOD(bool, isCompatible, (const BlockedMemoryDesc &, CmpMask), (const, override));
MOCK_METHOD(void, setPrecision, (InferenceEngine::Precision), (override));
MOCK_METHOD(size_t, getCurrentMemSizeImp, (), (const, override));
MOCK_METHOD(size_t, getElementOffset, (size_t), (const, override));
MOCK_METHOD(bool, canComputeMemSizeZeroDims, (), (const, override));
MOCK_METHOD(bool, isDefinedImp, (), (const, override));
};
class MockIMemory : public IMemory {
public:
MockIMemory(MemoryDescPtr desc) : m_pMemDesc(desc) {}
MockIMemory(const MemoryDesc& desc) : m_pMemDesc(desc.clone()) {}
MOCK_METHOD(bool, isAllocated, (), (const, noexcept, override));
MOCK_METHOD(MemoryDesc&, getDesc, (), (const, override));
MOCK_METHOD(MemoryDescPtr, getDescPtr, (), (const, override));
MOCK_METHOD(size_t, getSize, (), (const, override));
MOCK_METHOD(const Shape&, getShape, (), (const, override));
MOCK_METHOD(const VectorDims&, getStaticDims, (), (const, override));
MOCK_METHOD(void, redefineDesc, (MemoryDescPtr), (override));
MOCK_METHOD(void, load, (const IMemory&, bool), (const, override));
MOCK_METHOD(MemoryMngrPtr, getMemoryMngr, (), (const, override));
MOCK_METHOD(dnnl::memory, getPrimitive, (), (const, override));
MOCK_METHOD(void, nullify, (), (override));
MOCK_METHOD(void*, getData, (), (const, override));
void set_memDesc(MemoryDescPtr memdesc) { m_pMemDesc = memdesc; }
void set_memDesc(const MemoryDesc& memdesc) { m_pMemDesc = memdesc.clone(); }
MemoryDesc& get_memDesc() const { return *m_pMemDesc; }
MemoryDescPtr get_memDescPtr() { return m_pMemDesc; }
private:
MemoryDescPtr m_pMemDesc;
};
// helper to get byte strides from strides.
static ov::Strides byte_strides(const ov::Strides& strides, const ov::element::Type& type) {
ov::Strides byte_strides(strides.size());
for (size_t i = 0; i < strides.size(); ++i)
byte_strides[i] = strides[i] * type.size();
return byte_strides;
}
// helpers to create a mock memory descriptor and Memory of ncsp layout.
inline MemoryDescPtr create_memdesc(Precision prec, const Shape& shape, const VectorDims& strides = {}) {
ov::Shape ov_shape = shape.toPartialShape().to_shape();
const std::size_t totalSize = ov::shape_size(ov_shape);
auto elem_type = InferenceEngine::details::convertPrecision(prec);
auto memdesc = std::make_shared<MockBlockedMemoryDesc>(shape);
::testing::Mock::AllowLeak(memdesc.get());
EXPECT_CALL(*memdesc, hasLayoutType(::testing::Eq(LayoutType::ncsp))).WillRepeatedly(::testing::Return(true));
EXPECT_CALL(*memdesc, getPrecision).WillRepeatedly(::testing::Return(prec));
EXPECT_CALL(*memdesc, getStrides).WillRepeatedly(::testing::ReturnRef(strides));
EXPECT_CALL(*memdesc, canComputeMemSizeZeroDims).WillRepeatedly(::testing::Return(true));
EXPECT_CALL(*memdesc, isDefinedImp).WillRepeatedly(::testing::Return(true));
EXPECT_CALL(*memdesc, getCurrentMemSizeImp).WillRepeatedly(::testing::Return(totalSize * elem_type.size()));
return memdesc;
}
inline MemoryPtr create_memory(MemoryDescPtr memdesc) {
auto memptr = std::make_shared<MockIMemory>(memdesc);
::testing::Mock::AllowLeak(memptr.get());
// getDesc
EXPECT_CALL(*memptr, getDescPtr)
.Times(::testing::AnyNumber())
.WillRepeatedly([memptr]() {
return memptr->get_memDescPtr();
});
EXPECT_CALL(*memptr, getDesc).WillRepeatedly(::testing::ReturnRef(memptr->get_memDesc()));
// data
static size_t memSize = 0;
EXPECT_CALL(*memptr, getData)
.WillRepeatedly([memptr]() {
auto memdesc = memptr->get_memDescPtr();
auto required = memdesc->getCurrentMemSize();
if (memSize >= required) {
return reinterpret_cast<void*>(memSize);
} else {
memSize = required;
return reinterpret_cast<void*>(required);
}
});
// redefineDesc
ON_CALL(*memptr, redefineDesc).WillByDefault([memptr](MemoryDescPtr desc) {
memptr->set_memDesc(desc);
});
EXPECT_CALL(*memptr, redefineDesc).Times(::testing::AtLeast(1));
return memptr;
}
TEST_F(CPUTensorTest, canCreateTensor) {
Shape shape{4, 3, 2};
ov::Shape ov_shape = shape.toPartialShape().to_shape();
auto strides = ov::Strides({6, 2, 1});
const std::size_t totalSize = ov::shape_size(ov_shape);
ov::element::Type elem_type = ov::element::f32;
auto memptr = create_memory(create_memdesc(Precision::FP32, shape, strides));
{
std::shared_ptr<ov::ITensor> t = std::make_shared<ov::intel_cpu::Tensor>(memptr);
ASSERT_EQ(totalSize, t->get_size());
ASSERT_NE(nullptr, t->data());
ASSERT_EQ(elem_type, t->get_element_type());
ASSERT_EQ(ov_shape, t->get_shape());
ASSERT_NE(ov_shape, t->get_strides());
ASSERT_EQ(byte_strides(ov::Strides({6, 2, 1}), t->get_element_type()), t->get_strides());
ASSERT_EQ(elem_type.size() * totalSize, t->get_byte_size());
ASSERT_THROW(t->data(ov::element::i64), ov::Exception);
ASSERT_THROW(t->data<std::int32_t>(), ov::Exception);
}
}
TEST_F(CPUTensorTest, canAccessF16Tensor) {
Shape shape = {4, 3, 2};
auto strides = ov::Strides({6, 2, 1});
auto memptr = create_memory(create_memdesc(Precision::FP16, shape, strides));
{
std::shared_ptr<ov::ITensor> t = std::make_shared<ov::intel_cpu::Tensor>(memptr);
EXPECT_NE(nullptr, t->data());
ASSERT_EQ(ov::element::f16, t->get_element_type());
EXPECT_NO_THROW(t->data(ov::element::f16));
EXPECT_NO_THROW(t->data<ov::float16>());
EXPECT_THROW(t->data<ov::bfloat16>(), ov::Exception);
EXPECT_THROW(t->data<std::uint16_t>(), ov::Exception);
EXPECT_THROW(t->data<std::int16_t>(), ov::Exception);
}
}
// SetShape
TEST_F(CPUTensorTest, canSetShape) {
const Shape origShape = {1, 2, 3};
const ov::Shape ov_origShape = origShape.toPartialShape().to_shape();
auto strides = ov::Strides({6, 3, 1});
auto memdesc = create_memdesc(Precision::FP32, origShape, strides);
auto memptr = create_memory(memdesc);
std::shared_ptr<ov::ITensor> t = std::make_shared<ov::intel_cpu::Tensor>(memptr);
const Shape newShape({4, 5, 6});
const ov::Shape ov_newShape = newShape.toPartialShape().to_shape();
auto new_strides = ov::Strides{30, 6, 1};
auto new_memdesc = create_memdesc(Precision::FP32, newShape, new_strides);
// set_shape to a bigger memory
{
auto blocked_memdesc = dynamic_cast<MockBlockedMemoryDesc*>(memdesc.get());
EXPECT_CALL(*blocked_memdesc, cloneWithNewDimsImp).WillRepeatedly(::testing::Return(new_memdesc));
const void* orig_data = t->data();
ASSERT_EQ(t->get_shape(), ov_origShape);
ASSERT_NO_THROW(t->set_shape(ov_newShape));
ASSERT_EQ(ov_newShape, t->get_shape());
ASSERT_EQ(byte_strides(ov::row_major_strides(ov_newShape), t->get_element_type()), t->get_strides());
ASSERT_NE(orig_data, t->data());
}
// set_shape for smaller memory - does not perform reallocation
{
auto new_blocked_memdesc = dynamic_cast<MockBlockedMemoryDesc*>(new_memdesc.get());
EXPECT_CALL(*new_blocked_memdesc, cloneWithNewDimsImp).WillRepeatedly(::testing::Return(memdesc));
const void* orig_data = t->data();
t->set_shape(ov_origShape);
ASSERT_EQ(ov_origShape, t->get_shape());
ASSERT_EQ(orig_data, t->data());
}
}
TEST_F(CPUTensorTest, canSyncMemoryAndTensor) {
const Shape origShape = {1, 2, 3};
const ov::Shape ov_origShape = origShape.toPartialShape().to_shape();
auto strides = ov::Strides({6, 3, 1});
auto memdesc = create_memdesc(Precision::FP32, origShape, strides);
auto memptr = create_memory(memdesc);
std::shared_ptr<ov::ITensor> t = std::make_shared<ov::intel_cpu::Tensor>(memptr);
ASSERT_EQ(memptr->getDescPtr()->getShape().toPartialShape().to_shape(), t->get_shape());
ASSERT_EQ(byte_strides(memptr->getDescWithType<BlockedMemoryDesc>()->getStrides(), t->get_element_type()), t->get_strides());
const Shape newShape({4, 5, 6});
const ov::Shape ov_newShape = newShape.toPartialShape().to_shape();
auto new_strides = ov::Strides{30, 6, 1};
auto new_memdesc = create_memdesc(Precision::FP32, newShape, new_strides);
// reallocate memory outside the tensor instance
{
auto blocked_memdesc = dynamic_cast<MockBlockedMemoryDesc*>(memdesc.get());
EXPECT_CALL(*blocked_memdesc, cloneWithNewDimsImp).WillRepeatedly(::testing::Return(new_memdesc));
auto desc2 = memptr->getDescPtr()->cloneWithNewDims(newShape.getStaticDims(), true);
memptr->redefineDesc(desc2);
ASSERT_EQ(memptr->getDescPtr()->getShape().toPartialShape().to_shape(), t->get_shape());
ASSERT_EQ(byte_strides(memptr->getDescWithType<BlockedMemoryDesc>()->getStrides(), t->get_element_type()), t->get_strides());
}
}

View File

@@ -0,0 +1,156 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <gmock/gmock-spec-builders.h>
#include <gmock/gmock.h>
#include <gtest/gtest-param-test.h>
#include <gtest/gtest.h>
#include <openvino/core/shape.hpp>
#include <openvino/core/strides.hpp>
#include <openvino/core/type/element_type.hpp>
#include "openvino/core/except.hpp"
#include "openvino/core/partial_shape.hpp"
#include "cpu_memory.h"
#include "cpu_tensor.h"
#include "openvino/runtime/itensor.hpp"
using namespace ov::intel_cpu;
using namespace InferenceEngine;
using CPUTensorExtTest = ::testing::Test;
static ov::Strides byteStrides(const ov::Strides& strides, const ov::element::Type& type) {
ov::Strides byte_strides(strides.size());
for (size_t i = 0; i < strides.size(); ++i)
byte_strides[i] = strides[i] * type.size();
return byte_strides;
}
inline MemoryPtr create_memory(Precision prc, const Shape& shape) {
dnnl::engine eng(dnnl::engine::kind::cpu, 0);
CpuBlockedMemoryDescPtr desc;
desc = std::make_shared<CpuBlockedMemoryDesc>(prc, shape);
return std::make_shared<Memory>(eng, desc);
}
TEST_F(CPUTensorExtTest, canCreateTensor) {
Shape shape{4, 3, 2};
ov::Shape ov_shape = shape.toPartialShape().to_shape();
std::shared_ptr<ov::ITensor> t = std::make_shared<ov::intel_cpu::Tensor>(create_memory(Precision::FP32, shape));
const std::size_t totalSize = ov::shape_size(ov_shape);
ASSERT_EQ(totalSize, t->get_size());
ASSERT_NE(nullptr, t->data());
ASSERT_EQ(ov::element::f32, t->get_element_type());
ASSERT_EQ(ov_shape, t->get_shape());
ASSERT_NE(ov_shape, t->get_strides());
ASSERT_EQ(byteStrides(ov::Strides({6, 2, 1}), t->get_element_type()), t->get_strides());
ASSERT_EQ(ov::element::f32.size() * totalSize, t->get_byte_size());
ASSERT_THROW(t->data(ov::element::i64), ov::Exception);
ASSERT_THROW(t->data<std::int32_t>(), ov::Exception);
}
TEST_F(CPUTensorExtTest, canAccessF16Tensor) {
Shape shape = {4, 3, 2};
std::shared_ptr<ov::ITensor> t = std::make_shared<ov::intel_cpu::Tensor>(create_memory(Precision::FP16, shape));
EXPECT_NE(nullptr, t->data());
ASSERT_EQ(ov::element::f16, t->get_element_type());
EXPECT_NO_THROW(t->data(ov::element::f16));
EXPECT_NO_THROW(t->data<ov::float16>());
EXPECT_THROW(t->data<ov::bfloat16>(), ov::Exception);
EXPECT_THROW(t->data<std::uint16_t>(), ov::Exception);
EXPECT_THROW(t->data<std::int16_t>(), ov::Exception);
}
// SetShape
TEST_F(CPUTensorExtTest, canSetShape) {
const ov::Shape origShape({1, 2, 3});
std::shared_ptr<ov::ITensor> t = std::make_shared<ov::intel_cpu::Tensor>(create_memory(Precision::FP32, {1, 2, 3}));
const ov::Shape newShape({4, 5, 6});
const void* orig_data = t->data();
ASSERT_EQ(t->get_shape(), origShape);
ASSERT_NO_THROW(t->set_shape({4, 5, 6}));
ASSERT_EQ(newShape, t->get_shape());
ASSERT_EQ(byteStrides(ov::row_major_strides(newShape), t->get_element_type()), t->get_strides());
ASSERT_NE(orig_data, t->data());
// set_shape for smaller memory - does not perform reallocation
{
orig_data = t->data();
t->set_shape(origShape);
ASSERT_EQ(origShape, t->get_shape());
ASSERT_EQ(orig_data, t->data());
}
}
TEST_F(CPUTensorExtTest, emptySize) {
ov::PartialShape pshape{0, 3, 2};
Shape shape{pshape};
const ov::Shape origShape({0, 3, 2});
std::shared_ptr<ov::ITensor> t = std::make_shared<ov::intel_cpu::Tensor>(create_memory(Precision::FP32, shape));
ASSERT_EQ(ov::element::f32, t->get_element_type());
ASSERT_EQ(0, t->get_size());
ASSERT_EQ(0, t->get_byte_size());
ASSERT_EQ(origShape, t->get_shape());
ASSERT_EQ(byteStrides(ov::Strides({0, 0, 0}), t->get_element_type()), t->get_strides());
EXPECT_NO_THROW(t->data());
}
TEST_F(CPUTensorExtTest, canCreateTensorWithDynamicShape) {
ov::PartialShape pshape{-1, 3, 2};
Shape shape{pshape};
std::shared_ptr<ov::ITensor> t;
// construct with memory with dynamic shape
ASSERT_NO_THROW(t = std::make_shared<ov::intel_cpu::Tensor>(create_memory(Precision::FP32, shape)));
ASSERT_THROW(t->get_shape(), ov::Exception);
ASSERT_THROW(t->get_strides(), ov::Exception);
// change memory to dynamic shape
{
auto memptr = create_memory(Precision::FP32, {4, 3, 2});
ASSERT_NO_THROW(t = std::make_shared<ov::intel_cpu::Tensor>(memptr));
ov::PartialShape pshape{{1, 10}, 3, 2};
CpuBlockedMemoryDescPtr desc2 = std::make_shared<CpuBlockedMemoryDesc>(Precision::FP32, Shape(pshape));
memptr->redefineDesc(desc2);
ASSERT_THROW(t->get_shape(), ov::Exception);
ASSERT_THROW(t->get_strides(), ov::Exception);
}
// set_shape
const ov::Shape newShape({4, 0, 2});
ASSERT_NO_THROW(t = std::make_shared<ov::intel_cpu::Tensor>(create_memory(Precision::FP32, {4, 3, 2})));
const void* orig_data = t->data();
ASSERT_NO_THROW(t->set_shape({4, 0, 2}));
ASSERT_EQ(newShape, t->get_shape());
ASSERT_EQ(ov::Strides({0, 0, 0}), t->get_strides());
ASSERT_EQ(orig_data, t->data());
}
TEST_F(CPUTensorExtTest, canSyncMemoryAndTensor) {
Shape orig_shape{4, 3, 2};
auto memptr = create_memory(Precision::FP32, orig_shape);
std::shared_ptr<ov::ITensor> t = std::make_shared<ov::intel_cpu::Tensor>(memptr);
ASSERT_EQ(memptr->getDescPtr()->getShape().toPartialShape().to_shape(), t->get_shape());
ASSERT_EQ(byteStrides(memptr->getDescWithType<BlockedMemoryDesc>()->getStrides(), t->get_element_type()), t->get_strides());
// reallocate memory outside the tensor instance
{
Shape new_shape{1, 5, 2};
auto desc2 = memptr->getDescPtr()->cloneWithNewDims(new_shape.getStaticDims(), true);
memptr->redefineDesc(desc2);
ASSERT_EQ(memptr->getDescPtr()->getShape().toPartialShape().to_shape(), t->get_shape());
ASSERT_EQ(byteStrides(memptr->getDescWithType<BlockedMemoryDesc>()->getStrides(), t->get_element_type()), t->get_strides());
}
}

View File

@@ -379,6 +379,10 @@ def compare_dump_file(ieb_file1, ieb_file2, visualize):
else:
diff_abs = np.abs(ieb1.value - ieb2.value)
if not np.all(diff_abs.shape):
print(" Shape{} has dim 0".format(ieb1.shape))
return
max_abs = np.amax(diff_abs)
max_idx = np.where(diff_abs >= max_abs)
max_org = np.abs(ieb2.value)[max_idx]

View File

@@ -0,0 +1,47 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <gtest/gtest.h>
#include <chrono>
#include <initializer_list>
#include <memory>
#include <string>
#include <tuple>
#include <vector>
#include "base/behavior_test_utils.hpp"
#include "openvino/core/attribute_visitor.hpp"
#include "openvino/core/model.hpp"
#include "openvino/core/node.hpp"
#include "openvino/core/partial_shape.hpp"
#include "openvino/core/rank.hpp"
#include "openvino/core/shape.hpp"
#include "openvino/core/type/element_type.hpp"
#include "openvino/core/type/element_type_traits.hpp"
#include "openvino/op/parameter.hpp"
#include "openvino/runtime/infer_request.hpp"
#include "openvino/runtime/tensor.hpp"
namespace ov {
namespace test {
namespace behavior {
struct OVIterationChaining : public OVInferRequestTests {
static std::string getTestCaseName(const testing::TestParamInfo<InferRequestParams>& obj);
void Run();
void SetUp() override;
void TearDown() override;
ov::InferRequest req;
private:
static std::shared_ptr<ov::Model> getIterativeFunction();
bool checkOutput(const ov::runtime::Tensor& in, const ov::runtime::Tensor& actual);
};
} // namespace behavior
} // namespace test
} // namespace ov

View File

@@ -188,6 +188,36 @@ TEST_P(OVInferRequestDynamicTests, InferDynamicNetworkSetOutputShapeBeforeInfer)
ASSERT_TRUE(checkOutput(req.get_tensor("input_tensor"), req.get_tensor(outputname)));
}
TEST_P(OVInferRequestDynamicTests, InferDynamicNetworkGetOutputThenSetOutputTensorPreAllocatedMemoryBeforeInfer) {
const std::string tensor_name = "input_tensor";
const ov::Shape refShape = inOutShapes[0].first;
const ov::Shape refOutShape = inOutShapes[0].second;
std::map<std::string, ov::PartialShape> shapes;
shapes[tensor_name] = {ov::Dimension::dynamic(), 4, 20, 20};
OV_ASSERT_NO_THROW(function->reshape(shapes));
// Load ov::Model to target plugins
auto execNet = ie->compile_model(function, target_device, configuration);
// Create InferRequest
ov::InferRequest req;
ov::runtime::Tensor tensor;
const std::string outputname = function->outputs().back().get_any_name();
OV_ASSERT_NO_THROW(req = execNet.create_infer_request());
tensor = ov::test::utils::create_and_fill_tensor(element::f32, refShape, 100, -50);
OV_ASSERT_NO_THROW(req.set_tensor("input_tensor", tensor));
// first, get output tensor
OV_ASSERT_NO_THROW(req.infer());
ASSERT_EQ(req.get_tensor(outputname).get_shape(), refOutShape);
ASSERT_TRUE(checkOutput(req.get_tensor("input_tensor"), req.get_tensor(outputname)));
// then, set output tensor
float ptr[5000];
ov::runtime::Tensor otensor(element::f32, refOutShape, ptr);
OV_ASSERT_NO_THROW(req.set_tensor(outputname, otensor));
OV_ASSERT_NO_THROW(req.infer());
ASSERT_EQ(req.get_tensor(outputname).data<float>(), ptr);
ASSERT_EQ(req.get_tensor(outputname).get_shape(), refOutShape);
ASSERT_TRUE(checkOutput(req.get_tensor("input_tensor"), req.get_tensor(outputname)));
}
TEST_P(OVInferRequestDynamicTests, InferDynamicNetworkWithoutSetShape) {
const std::string tensor_name = "input_tensor";
std::map<std::string, ov::PartialShape> shapes;

View File

@@ -0,0 +1,121 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <chrono>
#include <gtest/gtest.h>
#include <initializer_list>
#include <memory>
#include <string>
#include <tuple>
#include <vector>
#include "base/ov_behavior_test_utils.hpp"
#include "openvino/core/attribute_visitor.hpp"
#include "openvino/core/node.hpp"
#include "openvino/core/partial_shape.hpp"
#include "openvino/core/rank.hpp"
#include "openvino/core/shape.hpp"
#include "openvino/core/type/element_type.hpp"
#include "openvino/core/type/element_type_traits.hpp"
#include "openvino/op/parameter.hpp"
#include "openvino/core/model.hpp"
#include "ngraph_functions/builders.hpp"
#include "openvino/runtime/infer_request.hpp"
#include "openvino/runtime/tensor.hpp"
#include "behavior/ov_infer_request/iteration_chaining.hpp"
namespace ov {
namespace test {
namespace behavior {
std::string OVIterationChaining::getTestCaseName(const testing::TestParamInfo<InferRequestParams>& obj) {
return OVInferRequestTests::getTestCaseName(obj);
}
std::shared_ptr<ov::Model> OVIterationChaining::getIterativeFunction() {
const ov::PartialShape pshape{-1, 16};
auto params = ngraph::builder::makeDynamicParams(element::Type_t::f32, {pshape});
params[0]->get_output_tensor(0).set_names({"input_tensor_0"});
params[0]->set_friendly_name("param_0");
auto concat_const = ngraph::builder::makeConstant(element::Type_t::f32, {1, 16}, std::vector<float>{}, true);
auto concat = ngraph::builder::makeConcat({params[0], concat_const}, 0 /*axis*/);
auto eltwise_const = ngraph::builder::makeConstant(element::Type_t::f32, {1, 16}, std::vector<float>{}, true);
auto eltwise = ngraph::builder::makeEltwise(concat, eltwise_const, ngraph::helpers::EltwiseTypes::ADD);
concat->get_output_tensor(0).set_names({"result_tensor_0"});
concat->set_friendly_name("result_0");
eltwise->get_output_tensor(0).set_names({"result_tensor_1"});
eltwise->set_friendly_name("result_1");
return std::make_shared<ov::Model>(ov::NodeVector{concat, eltwise}, ov::ParameterVector(params));
}
void OVIterationChaining::SetUp() {
std::tie(target_device, configuration) = this->GetParam();
// Skip test according to plugin specific disabledTestPatterns() (if any)
SKIP_IF_CURRENT_TEST_IS_DISABLED()
APIBaseTest::SetUp();
function = getIterativeFunction();
ov::AnyMap params;
for (auto&& v : configuration) {
params.emplace(v.first, v.second);
}
execNet = core->compile_model(function, target_device, params);
try {
req = execNet.create_infer_request();
} catch (const std::exception& ex) {
FAIL() << "Can't Create Infer Requiest in SetUp \nException [" << ex.what() << "]"
<< std::endl;
}
}
void OVIterationChaining::TearDown() {
req = {};
OVInferRequestTests::TearDown();
}
bool OVIterationChaining::checkOutput(const ov::runtime::Tensor& in, const ov::runtime::Tensor& actual) {
bool result = true;
auto net = core->compile_model(function, CommonTestUtils::DEVICE_TEMPLATE);
ov::InferRequest req;
req = net.create_infer_request();
auto tensor = req.get_tensor(function->inputs().back().get_any_name());
tensor.set_shape(in.get_shape());
for (int i = 0; i < in.get_size(); i++) {
tensor.data<float>()[i] = in.data<float>()[i];
}
req.infer();
for (int i = 0; i < actual.get_size(); i++) {
if (fabs(req.get_output_tensor(0).data<float>()[i] - actual.data<float>()[i]) > std::numeric_limits<float>::epsilon())
return false;
}
return result;
}
void OVIterationChaining::Run() {
// perform iteration chaining by iteratively
// setting the input tensor to be the output tensor of the last inference, and
// beginning with an empty tensor
ov::Tensor t0(element::Type_t::f32, {0, 16});
OV_ASSERT_NO_THROW(req.set_tensor("input_tensor_0", t0));
for (size_t i = 0; i < 10; i++) {
OV_ASSERT_NO_THROW(req.infer());
ASSERT_TRUE(checkOutput(req.get_tensor("input_tensor_0"), req.get_tensor("result_tensor_0")));
const auto t1 = req.get_tensor("result_tensor_0");
OV_ASSERT_NO_THROW(req.set_tensor("input_tensor_0", t1));
}
ASSERT_TRUE(checkOutput(req.get_tensor("input_tensor_0"), req.get_tensor("result_tensor_0")));
}
TEST_P(OVIterationChaining, Simple) {
// Skip test according to plugin specific disabledTestPatterns() (if any)
SKIP_IF_CURRENT_TEST_IS_DISABLED()
Run();
}
} // namespace behavior
} // namespace test
} // namespace ov