diff --git a/src/common/snippets/src/pass/common_optimizations.cpp b/src/common/snippets/src/pass/common_optimizations.cpp index f9197afc4ec..ade3d12fda4 100644 --- a/src/common/snippets/src/pass/common_optimizations.cpp +++ b/src/common/snippets/src/pass/common_optimizations.cpp @@ -105,7 +105,7 @@ void CommonOptimizations::SplitDimensionM(const std::shared_ptr= optimal_m_dim; + return batch_m_dim != 1 && new_m_dim >= static_cast(optimal_m_dim); }; if (batch_dim_multiplier * needed_new_dim == m_dim) { diff --git a/src/inference/dev_api/openvino/runtime/make_tensor.hpp b/src/inference/dev_api/openvino/runtime/make_tensor.hpp index 434e72b5da5..e41ebd3688f 100644 --- a/src/inference/dev_api/openvino/runtime/make_tensor.hpp +++ b/src/inference/dev_api/openvino/runtime/make_tensor.hpp @@ -71,9 +71,9 @@ ov::SoPtr make_tensor(const std::shared_ptr& ten const InferenceEngine::Blob* get_hardware_blob(const InferenceEngine::Blob* blob); InferenceEngine::Blob* get_hardware_blob(InferenceEngine::Blob* blob); -std::shared_ptr tensor_to_blob(const ov::SoPtr& tensor, - bool unwrap = true, - InferenceEngine::TensorDesc desc = {}); +OPENVINO_RUNTIME_API std::shared_ptr tensor_to_blob(const ov::SoPtr& tensor, + bool unwrap = true, + InferenceEngine::TensorDesc desc = {}); /** @endcond */ IE_SUPPRESS_DEPRECATED_END diff --git a/src/inference/src/dev/isync_infer_request.cpp b/src/inference/src/dev/isync_infer_request.cpp index 1929e6f9d9d..347d4925cd9 100644 --- a/src/inference/src/dev/isync_infer_request.cpp +++ b/src/inference/src/dev/isync_infer_request.cpp @@ -260,8 +260,9 @@ void ov::ISyncInferRequest::check_tensor(const ov::Output& port, " expecting ", port.get_shape(), "."); - OPENVINO_ASSERT(std::dynamic_pointer_cast(tensor._ptr) || tensor->data() != nullptr, - "Tensor data equal nullptr!"); + OPENVINO_ASSERT( + std::dynamic_pointer_cast(tensor._ptr) || tensor->data() != nullptr || is_dynamic, + "Tensor data equal nullptr!"); } void ov::ISyncInferRequest::allocate_tensor( 
diff --git a/src/inference/src/ie_layouts.cpp b/src/inference/src/ie_layouts.cpp index 689a0d2af9c..6c2543b3605 100644 --- a/src/inference/src/ie_layouts.cpp +++ b/src/inference/src/ie_layouts.cpp @@ -298,7 +298,9 @@ BlockingDesc::BlockingDesc(const SizeVector& blocked_dims, this->offsetPaddingToData = dimOffsets; // check that strides are valid - { + if (!std::any_of(blocked_dims.begin(), blocked_dims.end(), [](const size_t dim) { + return dim == 0ul; + })) { size_t denseStride = 1; for (size_t i = 1; i <= strides.size(); i++) { diff --git a/src/plugins/intel_cpu/src/cpu_memory.h b/src/plugins/intel_cpu/src/cpu_memory.h index 856772c922d..373c14851f4 100644 --- a/src/plugins/intel_cpu/src/cpu_memory.h +++ b/src/plugins/intel_cpu/src/cpu_memory.h @@ -32,6 +32,7 @@ namespace ov { namespace intel_cpu { class Memory; +class ProxyMemoryMngr; /** * @interface IMemoryMngr @@ -313,6 +314,7 @@ public: private: friend DnnlMemoryMngr; + friend ProxyMemoryMngr; private: void update(); diff --git a/src/plugins/intel_cpu/src/cpu_tensor.cpp b/src/plugins/intel_cpu/src/cpu_tensor.cpp new file mode 100644 index 00000000000..48d8fdd4be2 --- /dev/null +++ b/src/plugins/intel_cpu/src/cpu_tensor.cpp @@ -0,0 +1,98 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "cpu_tensor.h" +#include "ie_ngraph_utils.hpp" + +#include "utils/debug_capabilities.h" + +namespace ov { +namespace intel_cpu { + +Tensor::Tensor(MemoryPtr memptr) : m_memptr{memptr} { + OPENVINO_ASSERT(m_memptr != nullptr); + + // only support plain data format ncsp. 
+ auto memdesc = m_memptr->getDescPtr(); + OPENVINO_ASSERT(memdesc->hasLayoutType(LayoutType::ncsp), "intel_cpu::Tensor only supports memory with ncsp layout."); + + m_element_type = InferenceEngine::details::convertPrecision(memdesc->getPrecision()); +} + +void Tensor::set_shape(ov::Shape new_shape) { + const auto& shape = m_memptr->getDescPtr()->getShape(); + if (shape.isStatic()) { + DEBUG_LOG("tensor's memory object ", m_memptr.get(), ", ", vec2str(shape.getStaticDims()), " -> ", new_shape.to_string()); + if (shape.getStaticDims() == new_shape) return; + } + + auto desc = m_memptr->getDescPtr(); + const auto newDesc = desc->cloneWithNewDims(new_shape, true); + m_memptr->redefineDesc(newDesc); +} + +const ov::element::Type& Tensor::get_element_type() const { + return m_element_type; +} + +const ov::Shape& Tensor::get_shape() const { + auto& shape = m_memptr->getDescPtr()->getShape(); + OPENVINO_ASSERT(shape.isStatic(), "intel_cpu::Tensor has dynamic shape."); + + std::lock_guard guard(m_lock); + m_shape = ov::Shape{shape.getStaticDims()}; + return m_shape; +} + +size_t Tensor::get_size() const { + auto& desc = m_memptr->getDesc(); + return desc.getShape().getElementsCount(); +} + +size_t Tensor::get_byte_size() const { + auto& desc = m_memptr->getDesc(); + return desc.getCurrentMemSize(); +} + +const ov::Strides& Tensor::get_strides() const { + OPENVINO_ASSERT(m_memptr->getDescPtr()->isDefined(), "intel_cpu::Tensor requires memory with defined strides."); + + std::lock_guard guard(m_lock); + update_strides(); + return m_strides; +} + +void Tensor::update_strides() const { + auto blocked_desc = m_memptr->getDescWithType(); + OPENVINO_ASSERT(blocked_desc, "not a valid blocked memory descriptor."); + auto& strides = blocked_desc->getStrides(); + m_strides.resize(strides.size()); + std::transform(strides.cbegin(), strides.cend(), m_strides.begin(), + std::bind1st(std::multiplies(), m_element_type.size())); +} + +void* Tensor::data(const element::Type& element_type) 
const { + if (element_type != element::undefined && element_type != element::dynamic) { + OPENVINO_ASSERT(element_type == get_element_type(), + "Tensor data with element type ", + get_element_type(), + ", is not representable as pointer to ", + element_type); + } + return m_memptr->getData(); +} + +/** + * @brief Creates tensor on graph memory + * + * @param mem Memory object + * + * @return Shared pointer to tensor interface + */ +std::shared_ptr make_tensor(MemoryPtr mem) { + return std::make_shared(mem); +} + +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/cpu_tensor.h b/src/plugins/intel_cpu/src/cpu_tensor.h new file mode 100644 index 00000000000..79ae805a2ff --- /dev/null +++ b/src/plugins/intel_cpu/src/cpu_tensor.h @@ -0,0 +1,48 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/runtime/itensor.hpp" +#include "cpu_memory.h" + +namespace ov { +namespace intel_cpu { + +class Tensor : public ITensor { +public: + // Only plain data format is supported. 
+ explicit Tensor(MemoryPtr memptr); + + void set_shape(ov::Shape shape) override; + + const ov::element::Type& get_element_type() const override; + + const ov::Shape& get_shape() const override; + + size_t get_size() const override; + + size_t get_byte_size() const override; + + const ov::Strides& get_strides() const override; + + void* data(const element::Type& type = {}) const override; + + MemoryPtr get_memory() {return m_memptr;} + +private: + void update_strides() const; + + MemoryPtr m_memptr; + + ov::element::Type m_element_type; + mutable ov::Shape m_shape; + mutable ov::Strides m_strides; + mutable std::mutex m_lock; +}; + +std::shared_ptr make_tensor(MemoryPtr mem); + +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/edge.cpp b/src/plugins/intel_cpu/src/edge.cpp index b18af69ec03..4c9b79908d9 100644 --- a/src/plugins/intel_cpu/src/edge.cpp +++ b/src/plugins/intel_cpu/src/edge.cpp @@ -515,6 +515,13 @@ EdgePtr Edge::getBaseEdge(int look) { if (edge->inPlace() && edge != edgesForSamePort[0]) return edge; } } + + // Return the first output edge as the base if there is no inPlace consumers + // thus benefits zero-copy of outputs. 
+ for (auto edge : edgesForSamePort) { + if (Type::Output == edge->getChild()->getType()) return edge; + } + return edgesForSamePort[0]; } diff --git a/src/plugins/intel_cpu/src/graph.cpp b/src/plugins/intel_cpu/src/graph.cpp index 4af6f881f70..81c1440ece4 100644 --- a/src/plugins/intel_cpu/src/graph.cpp +++ b/src/plugins/intel_cpu/src/graph.cpp @@ -812,8 +812,34 @@ void Graph::AllocateWithReuse() { } if (!undefinedBoxes.empty()) { + // Use proxy memory manager for output edges + for (auto& box : undefinedBoxes) { + for (auto& edge : edge_clusters[box.id]) { + const auto child = edge->getChild(); + if (child->getType() == Type::Output && + edge->getStatus() == Edge::Status::NeedAllocation) { + auto proxyMemMngr = + std::make_shared(); + DEBUG_LOG("ProxyMemoryMngr ", proxyMemMngr, " ", this); + edge->allocate(proxyMemMngr); + + // Store the output memory managers. + // So that, the infer requests can be able to access them. + int count = 0; + for (auto &output : outputNodesMap) { + if (output.second == child) { + outputNodesMemMngrMap[output.first] = proxyMemMngr; + count++; + } + } + // sometimes there are unused output ports. + IE_ASSERT(count <= 1) << "cannot find output node. count " << count; + } + } + } + if (!syncNodesInds.empty()) { - //We have to extend the lifespan of thensors that are crossing a sync point border in order to save + //We have to extend the lifespan of tensors that are crossing a sync point border in order to save //the intermediate computation results from possible loss due to the tensor resize std::vector vecIntervals = {0}; for (const auto& item : syncNodesInds) { @@ -990,6 +1016,7 @@ void Graph::PushInputData(const std::string& name, const InferenceEngine::Blob:: } } +// suppose always being shared infer_request intel_cpu::Tensor to Graph if isDynamic. void Graph::PullOutputData(BlobMap &out) { if (!IsReady()) IE_THROW() << "Wrong state. 
Topology not ready."; @@ -1006,6 +1033,8 @@ void Graph::PullOutputData(BlobMap &out) { IE_THROW(Unexpected) << "The CPU plugin graph doesn't contain output node with name: \"" << name << "\""; } + DEBUG_LOG(name, ", blob ", out[name], ", addr ", static_cast(out[name]->buffer())); + const auto actualDesc = MemoryDescUtils::convertToTensorDesc(intr_blob.getDesc()); auto &expectedDesc = ext_blob->getTensorDesc(); @@ -1029,7 +1058,12 @@ void Graph::PullOutputData(BlobMap &out) { if (expectedDesc.getLayout() == InferenceEngine::Layout::BLOCKED) { expectedDesc = TensorDesc(expectedDesc.getPrecision(), expectedDesc.getLayout()); } + DEBUG_LOG(name, ", blob ", out[name], ", addr ", static_cast(out[name]->buffer()), + " dims ", PartialShape(out[name]->getTensorDesc().getDims()), " -> ", PartialShape(outDims), + ", intr ptr ", intr_blob.getData(), " , parentedge's memory object ", parentEdge->getMemoryPtr().get()); out[name]->setShape(outDims); + DEBUG_LOG(name, ", blob ", out[name], ", addr ", static_cast(out[name]->buffer()), + " dims ", PartialShape(out[name]->getTensorDesc().getDims()), ", intr ptr ", intr_blob.getData()); } // check for empty output blob @@ -1047,6 +1081,8 @@ void Graph::PullOutputData(BlobMap &out) { void *ext_blob_ptr = ext_blob->buffer(); void *intr_blob_ptr = intr_blob.getData(); + DEBUG_LOG(name, " @ ", intr_blob_ptr, " -> ", ext_blob_ptr, " zero-copy: ", intr_blob_ptr == ext_blob_ptr, " graph ", this, "\r\n"); + // That is the same memory. 
No need to copy if (ext_blob_ptr == intr_blob_ptr) continue; @@ -1313,13 +1349,12 @@ inline void Graph::ExecuteNode(const NodePtr& node, const dnnl::stream& stream) DUMP(node, getConfig().debugCaps, infer_count); OV_ITT_SCOPED_TASK(itt::domains::intel_cpu, node->profiling.execute); - + DEBUG_LOG(*node); if (node->isDynamicNode()) { node->executeDynamic(stream); } else { node->execute(stream); } - DEBUG_LOG(*node); } void Graph::Infer(InferRequestBase* request) { diff --git a/src/plugins/intel_cpu/src/graph.h b/src/plugins/intel_cpu/src/graph.h index f2b9cae7ecd..b94db195c10 100644 --- a/src/plugins/intel_cpu/src/graph.h +++ b/src/plugins/intel_cpu/src/graph.h @@ -19,6 +19,8 @@ #include #include +#include "proxy_mem_mgr.h" + namespace ov { namespace intel_cpu { @@ -190,6 +192,8 @@ public: return graphHasDynamicInput; } + Status getStatus() const {return status;} + protected: void VisitNode(NodePtr node, std::vector& sortedNodes); @@ -248,6 +252,8 @@ private: std::map inputNodesMap; std::map outputNodesMap; + std::unordered_map outputNodesMemMngrMap; + // these node pointers (from graphNodes) are to avoid regular checking for // constantness of nodes in Infer methods and calls of // non-executable (optimized out) nodes, such as Input, Reshape, etc. 
diff --git a/src/plugins/intel_cpu/src/infer_request.cpp b/src/plugins/intel_cpu/src/infer_request.cpp index fc54c77a299..f2aacbd3db4 100644 --- a/src/plugins/intel_cpu/src/infer_request.cpp +++ b/src/plugins/intel_cpu/src/infer_request.cpp @@ -25,6 +25,9 @@ #include "memory_desc/dnnl_blocked_memory_desc.h" #include #include +#include "proxy_mem_mgr.h" +#include "openvino/runtime/make_tensor.hpp" +#include namespace ov { namespace intel_cpu { @@ -182,6 +185,13 @@ void InferRequestBase::InferImpl() { ThrowIfCanceled(); + // update output control blocks, if any, in order to refresh internal buffers + if (Graph::Status::ReadyDynamic == graph->getStatus()) { + for (auto&& item : outputControlBlocks) { + item.second.update(); + } + } + graph->PullOutputData(_outputs); } @@ -202,93 +212,137 @@ static inline void changeEdgePtr(const EdgePtr &edge, InferenceEngine::Blob::Ptr } void InferRequestBase::changeDefaultPtr() { + const auto& inputNodesMap = graph->GetInputNodesMap(); + const auto& outputNodesMap = graph->GetOutputNodesMap(); for (auto& it : externalPtr) { - const auto& inputNodesMap = graph->GetInputNodesMap(); auto input = inputNodesMap.find(it.first); - if (input != inputNodesMap.end()) { - NodePtr inputNodePtr = input->second; - if (inputNodePtr->getChildEdgeAt(0)->getMemory().getData() == static_cast(it.second->buffer())) - continue; - auto& childEdges = inputNodePtr->getChildEdges(); - // Perform checks that the user's memory will not be modified - bool canBeInPlace = true; - for (auto& childEdge : childEdges) { - auto ce = childEdge.lock(); - if (!ce) + if (inputNodesMap.end() == input) { + OPENVINO_ASSERT(outputNodesMap.count(it.first), "Cannot find input/output blob: ", it.first); + continue; + } + NodePtr inputNodePtr = input->second; + if (inputNodePtr->getChildEdgeAt(0)->getMemory().getData() == static_cast(it.second->buffer())) + continue; + auto& childEdges = inputNodePtr->getChildEdges(); + // Perform checks that the user's memory will not be 
modified + bool canBeInPlace = true; + for (auto& childEdge : childEdges) { + auto ce = childEdge.lock(); + if (!ce) + IE_THROW() << "Node " << inputNodePtr->getName() << " contains empty child edge"; + + auto& child = ce->getChild(); + + if (child->isConstant()) { + canBeInPlace = false; + break; + } + + // the input memory should be referenced by the children, otherwise it should be written to a + // specific location + if (ce->inPlace(Edge::LOOK_DOWN)) { + canBeInPlace = false; + break; + } + + if (auto result = ce->modifiedInPlace()) { + canBeInPlace = false; + break; + } + + if (child->getType() == Type::Concatenation && child->isInPlace()) { + canBeInPlace = false; + break; + } + } + if (canBeInPlace) { + for (auto& edge : childEdges) { + auto e = edge.lock(); + if (!e) IE_THROW() << "Node " << inputNodePtr->getName() << " contains empty child edge"; - auto& child = ce->getChild(); - - if (child->isConstant()) { - canBeInPlace = false; - break; - } - - // the input memory should be referenced by the children, otherwise it should be written to a - // specific location - if (ce->inPlace(Edge::LOOK_DOWN)) { - canBeInPlace = false; - break; - } - - if (auto result = ce->modifiedInPlace()) { - canBeInPlace = false; - break; - } - - if (child->getType() == Type::Concatenation && child->isInPlace()) { - canBeInPlace = false; - break; - } + changeEdgePtr(e, it.second); } - if (canBeInPlace) { - for (auto& edge : childEdges) { - auto e = edge.lock(); - if (!e) - IE_THROW() << "Node " << inputNodePtr->getName() << " contains empty child edge"; + } + } - changeEdgePtr(e, it.second); - } - } + for (auto& it : externalPtr) { + const auto& name = it.first; + auto output = outputNodesMap.find(name); + if (outputNodesMap.end() == output) { continue; } + auto parentEdge = output->second->getParentEdgeAt(0); - const auto& outputNodesMap = graph->GetOutputNodesMap(); - auto output = outputNodesMap.find(it.first); - if (output != outputNodesMap.end()) { - auto parentEdge = 
output->second->getParentEdgeAt(0); - if (parentEdge->getMemory().getData() == static_cast(it.second->buffer())) - continue; + if (parentEdge->getMemory().getData() == static_cast(it.second->buffer())) + continue; - bool canBeInPlace = true; - void* defaultPtr = parentEdge->getMemory().getData(); - // Cannot be in-place after concat because concat is using different ptrs without offsets - auto parent = parentEdge->getParent(); - NodePtr previousParent; - do { - previousParent = parent; - if (parent->getChildEdges().size() != 1 || parent->isConstant() || parent->isInPlace()) { - canBeInPlace = false; + bool canBeInPlace = true; + void* defaultPtr = parentEdge->getMemory().getData(); + // Cannot be in-place after concat because concat is using different ptrs without offsets + auto parent = parentEdge->getParent(); + NodePtr previousParent; + do { + previousParent = parent; + if (parent->getChildEdges().size() != 1 || parent->isConstant() || parent->isInPlace()) { + canBeInPlace = false; + break; + } + + auto& parentEdges = parent->getParentEdges(); + for (auto& edge : parentEdges) { + auto e = edge.lock(); + if (!e) + IE_THROW() << "Node " << parent->getName() << " contains empty parent edge"; + + if (e->getMemory().getData() == defaultPtr) { + parent = e->getParent(); break; } + } + } while (previousParent != parent); + if (canBeInPlace) + changeEdgePtr(parentEdge, it.second); + } - auto& parentEdges = parent->getParentEdges(); - for (auto& edge : parentEdges) { - auto e = edge.lock(); - if (!e) - IE_THROW() << "Node " << parent->getName() << " contains empty parent edge"; + if (Graph::Status::ReadyDynamic == graph->getStatus()) { + const auto &outMemMngrMap = graph->outputNodesMemMngrMap; + for (auto&& item : outMemMngrMap) { + const auto& name = item.first; - if (e->getMemory().getData() == defaultPtr) { - parent = e->getParent(); - break; + // share intel_cpu::Tensor to Graph by injecting to corresponding ProxyMemoryMngr instance. 
+ auto outputMemMngr = item.second; + OPENVINO_ASSERT(outputMemMngr, "proxy mem manager for output ", name, " is empty."); + + auto controlBlockItr = outputControlBlocks.find(name); + + if (controlBlockItr != outputControlBlocks.end()) { + auto output = outputNodesMap.find(name); + OPENVINO_ASSERT(outputNodesMap.end() != output, "Node with name: ", name, " is absent in the outputNodesMap"); + auto parentEdge = output->second->getParentEdgeAt(0); + //avoid cyclic memory use + auto parentNode = parentEdge->getParent(); + const auto& parentNodeInpEdges = parentNode->getParentEdges(); + std::unordered_set parentInputPtrs(parentNodeInpEdges.size()); + for (auto&& edge : parentNodeInpEdges) { + if (auto edgePtr = edge.lock()) { + parentInputPtrs.insert(edgePtr->getMemoryPtr()->getData()); } } - } while (previousParent != parent); - if (canBeInPlace) - changeEdgePtr(parentEdge, it.second); - continue; + + auto&& controlBlock = controlBlockItr->second; + + std::shared_ptr memMngr = parentInputPtrs.count(controlBlock.rawPtr()) ? 
// same memory is used on the input and output + controlBlock.nextMemMngr() : // then swap internal buffer to avoid data corruption + controlBlock.currentMemMngr(); // else reuse the existing buffer + + outputMemMngr->setMemMngr(memMngr); + DEBUG_LOG("reset proxy ", outputMemMngr, ", actual ", controlBlock.currentMemMngr(), " graph ", graph, " inferrequest ", this); + DEBUG_LOG(name, ", blob ", controlBlock.blob(), ", tensor ", controlBlock.tensor()); + } else { + outputMemMngr->reset(); // switch to the internal memory since memory sharing is no longer possible + } } - IE_THROW() << "Cannot find input/output blob: " << it.first; } } @@ -716,6 +770,7 @@ void InferRequest::SetBlob(const std::string& name, const InferenceEngine::Blob: externalPtr.erase(name); } _outputs[name] = data; + outputControlBlocks.erase(name); // now the memory is under user's control } } @@ -774,22 +829,39 @@ InferenceEngine::Blob::Ptr InferRequest::GetBlob(const std::string& name) { if (_outputs.find(name) == _outputs.end()) { auto outputNode = modelOutputsMap.find(name); if (modelOutputsMap.find(name) != modelOutputsMap.end()) { - const auto shape = outputNode->second->get_input_partial_shape(0); - bool isDynamic = shape.is_dynamic(); + const auto& model_shape = outputNode->second->get_input_partial_shape(0); + const auto& graph_shape = output->second->getInputShapeAtPort(0); + + // WA, due to the transformations and constant folding, shape inference of the resulting model may + // have static shapes, while they are dynamic in the initial representation + const auto& shape = graph_shape.isDynamic() ? model_shape : + (model_shape.is_dynamic() ? 
graph_shape.toPartialShape() : model_shape); + + const bool isDynamic = shape.is_dynamic(); if (!data) { InferenceEngine::SizeVector dims; if (isDynamic) { - dims = InferenceEngine::SizeVector(shape.rank().get_length(), 0); + const auto model_prec = InferenceEngine::details::convertPrecision(outputNode->second->get_input_element_type(0)); + const auto graph_prec = output->second->getParentEdgesAtPort(0)[0]->getMemory().getDesc().getPrecision(); + OutputControlBlock control_block{model_prec, Shape{shape}}; + + DEBUG_LOG(name, + ", blob ", control_block.blob(), + ", tensor ", control_block.tensor(), + ", memmngr ", control_block.tensor()->get_memory()->getMemoryMngr(), + "memory object ", control_block.tensor()->get_memory().get()); + + data = control_block.blob(); + if (model_prec == graph_prec) outputControlBlocks.emplace(std::make_pair(name, std::move(control_block))); } else { dims = shape.to_shape(); + + InferenceEngine::TensorDesc desc(InferenceEngine::details::convertPrecision(outputNode->second->get_input_element_type(0)), + dims, InferenceEngine::TensorDesc::getLayoutByRank(dims.size())); + data = make_blob_with_precision(desc); + data->allocate(); } - - InferenceEngine::TensorDesc desc(InferenceEngine::details::convertPrecision(outputNode->second->get_input_element_type(0)), - dims, InferenceEngine::TensorDesc::getLayoutByRank(dims.size())); - - data = make_blob_with_precision(desc); - data->allocate(); } else { const auto& blobDims = data->getTensorDesc().getDims(); // in static shape case is enough information that shapes are incompatible to throw exception @@ -831,9 +903,23 @@ InferenceEngine::Blob::Ptr InferRequest::GetBlob(const std::string& name) { IE_THROW() << "Cannot find blob with name: " << name; } + DEBUG_LOG(name, ", blob ", data, ", ", static_cast(data->buffer())); return data; } +void InferRequest::checkBlobs() { + for (auto const& input : _inputs) { + checkBlob(input.second, input.first, true); + } + + // won't check dynamic output blobs as 
they are not allocated. + for (auto const& output : _outputs) { + const auto out_node = findOutputByNodeName(output.first); + const auto isDynamic = out_node && out_node->get_output_partial_shape(0).is_dynamic(); + if (!isDynamic) checkBlob(output.second, output.first, false); + } +} + void InferRequest::PushInputData() { for (auto input : _inputs) { auto inputName = input.first; @@ -845,5 +931,22 @@ void InferRequest::PushInputData() { } } +InferRequestBase::OutputControlBlock::OutputControlBlock(const InferenceEngine::Precision& precision, const Shape& shape) { + dnnl::engine eng(dnnl::engine::kind::cpu, 0); + m_buffers[m_buffIndx] = std::make_shared(); + m_proxyMemMngr = std::make_shared(m_buffers[m_buffIndx]); + + Shape memShape = shape.isDynamic() ? + Shape{VectorDims(shape.getRank(), 0)} : // this is a WA since the ITensor doesn't allow dyn shapes + Shape{shape}; + + CpuBlockedMemoryDescPtr desc = + std::make_shared(precision, memShape); + + auto memory = std::make_shared(eng, desc, m_proxyMemMngr); + m_tensor = std::make_shared(memory); + m_blob = tensor_to_blob({m_tensor, nullptr}); +} + } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/infer_request.h b/src/plugins/intel_cpu/src/infer_request.h index dc1b34a9f4e..333b6b7cca6 100644 --- a/src/plugins/intel_cpu/src/infer_request.h +++ b/src/plugins/intel_cpu/src/infer_request.h @@ -9,6 +9,7 @@ #include #include #include +#include "cpu_tensor.h" namespace ov { namespace intel_cpu { @@ -52,12 +53,65 @@ protected: InferenceEngine::Precision normToInputSupportedPrec(const std::pair& input) const; void pushInput(const std::string& inputName, InferenceEngine::Blob::Ptr& inputBlob, InferenceEngine::Precision dataType); +protected: + class OutputControlBlock { + public: + using MemMngrPtr = std::shared_ptr; + + public: + OutputControlBlock(const InferenceEngine::Precision& precision, const Shape& shape); + + OutputControlBlock(const OutputControlBlock&) = delete; + OutputControlBlock& 
operator=(const OutputControlBlock&) = delete; + + OutputControlBlock(OutputControlBlock&&) = default; + OutputControlBlock& operator=(OutputControlBlock&&) = default; + + InferenceEngine::Blob::Ptr blob() const { + return m_blob; + } + + std::shared_ptr tensor() const { + return m_tensor; + } + + const void* rawPtr() const { + return m_tensor->get_memory()->getData(); + } + + MemMngrPtr currentMemMngr() const { + return m_buffers[m_buffIndx]; + } + + MemMngrPtr nextMemMngr() { + m_buffIndx ^= 0x1; + if (!m_buffers[m_buffIndx]) { + m_buffers[m_buffIndx] = std::make_shared(); + } + return m_buffers[m_buffIndx]; + } + + void update() { + m_proxyMemMngr->setMemMngr(currentMemMngr()); + } + + private: + std::shared_ptr m_tensor = nullptr; + InferenceEngine::Blob::Ptr m_blob = nullptr; + ProxyMemoryMngrPtr m_proxyMemMngr = nullptr; + std::array m_buffers; + int m_buffIndx = 0; + }; + +protected: virtual void initBlobs() = 0; virtual void PushInputData() = 0; Graph* graph = nullptr; std::unordered_map externalPtr; + std::unordered_map outputControlBlocks; + private: void PushStates(); void PullStates(); @@ -97,6 +151,8 @@ public: void SetBlobsImpl(const std::string& name, const InferenceEngine::BatchedBlob::Ptr& batched_blob) override; InferenceEngine::Blob::Ptr GetBlob(const std::string& name) override; + void checkBlobs() override; + private: void PushInputData() override; void initBlobs() override; diff --git a/src/plugins/intel_cpu/src/nodes/reorder.cpp b/src/plugins/intel_cpu/src/nodes/reorder.cpp index f8a9de782c2..bb1be492515 100644 --- a/src/plugins/intel_cpu/src/nodes/reorder.cpp +++ b/src/plugins/intel_cpu/src/nodes/reorder.cpp @@ -338,9 +338,6 @@ void Reorder::execute(dnnl::stream strm) { } else if (canUseNcsp2Nspc) { optimizedNcsp2Nspc(); } else { - // src_blocked->setDataHandle(getParentEdgeAt(0)->getMemory().GetData()); - // dst_blocked->setDataHandle(getChildEdgeAt(0)->getMemory().GetData()); - if (prim) { prim.execute(strm, primArgs); } else { diff --git 
a/src/plugins/intel_cpu/src/proxy_mem_mgr.cpp b/src/plugins/intel_cpu/src/proxy_mem_mgr.cpp new file mode 100644 index 00000000000..4a39fca50e0 --- /dev/null +++ b/src/plugins/intel_cpu/src/proxy_mem_mgr.cpp @@ -0,0 +1,74 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "proxy_mem_mgr.h" +#include "utils/debug_capabilities.h" + +using namespace ov::intel_cpu; + +void ProxyMemoryMngr::setMemMngr(std::shared_ptr pMngr) { + OPENVINO_ASSERT(pMngr, "Attempt to set null memory manager to a ProxyMemoryMngr object"); + if (m_pMngr == pMngr) { + return; + } + + m_pMngr = pMngr; + m_pMngr->resize(m_size); + notifyUpdate(); +} + +void ProxyMemoryMngr::reset() { + if (!m_pOrigMngr) { + m_pOrigMngr = std::make_shared(); + } + + if (m_pMngr == m_pOrigMngr) { + return; + } + + m_pMngr = m_pOrigMngr; + m_pMngr->resize(m_size); + notifyUpdate(); +} + +void* ProxyMemoryMngr::getRawPtr() const noexcept { + return m_pMngr->getRawPtr(); +} + +void ProxyMemoryMngr::setExtBuff(void* ptr, size_t size) { + m_pMngr->setExtBuff(ptr, size); + notifyUpdate(); +} + +bool ProxyMemoryMngr::resize(size_t size) { + auto res = m_pMngr->resize(size); + DEBUG_LOG(this, ", ", m_pMngr, " size ", m_size, " -> ", size, " resized? 
", res, " RawPtr ", getRawPtr()); + m_size = size; + notifyUpdate(); + return res; +} + +bool ProxyMemoryMngr::hasExtBuffer() const noexcept { + return m_pMngr->hasExtBuffer(); +} + +void ProxyMemoryMngr::registerMemory(Memory* memPtr) { + if (memPtr) { + m_setMemPtrs.insert(memPtr); + } +} + +void ProxyMemoryMngr::unregisterMemory(Memory* memPtr) { + if (memPtr) { + m_setMemPtrs.erase(memPtr); + } +} + +void ProxyMemoryMngr::notifyUpdate() { + for (auto& item : m_setMemPtrs) { + if (item) { + item->update(); + } + } +} \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/proxy_mem_mgr.h b/src/plugins/intel_cpu/src/proxy_mem_mgr.h new file mode 100644 index 00000000000..ec0c8e108ec --- /dev/null +++ b/src/plugins/intel_cpu/src/proxy_mem_mgr.h @@ -0,0 +1,52 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "cpu_memory.h" + +namespace ov { +namespace intel_cpu { + +/** + * @brief A proxy object that additionally implements observer pattern + */ +class ProxyMemoryMngr : public IMemoryMngrObserver { +public: + ProxyMemoryMngr() : m_pOrigMngr(std::make_shared()), m_pMngr(m_pOrigMngr) {} + explicit ProxyMemoryMngr(std::shared_ptr pMngr) { + OPENVINO_ASSERT(pMngr, "Memory manager is uninitialized"); + m_pMngr = pMngr; + } + + void* getRawPtr() const noexcept override; + void setExtBuff(void* ptr, size_t size) override; + bool resize(size_t size) override; + bool hasExtBuffer() const noexcept override; + + void registerMemory(Memory* memPtr) override; + void unregisterMemory(Memory* memPtr) override; + + void setMemMngr(std::shared_ptr pMngr); + void reset(); + +private: + void notifyUpdate(); + + // We keep the original MemMngr as may fallback to copy output. 
+ std::shared_ptr m_pOrigMngr = nullptr; + std::shared_ptr m_pMngr = nullptr; + + std::unordered_set m_setMemPtrs; + + // WA: resize stage might not work because there is no shape change, + // but the underlying actual memory manager changes. + size_t m_size = 0ul; +}; + +using ProxyMemoryMngrPtr = std::shared_ptr; +using ProxyMemoryMngrCPtr = std::shared_ptr; + +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp index a7992136e9b..3a68057af24 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_blocking.cpp @@ -29,7 +29,7 @@ bool BrgemmBlocking::run(snippets::lowered::LinearIR& linear_ir) { const auto& loop_manager = linear_ir.get_loop_manager(); - const auto dim_idx = 1; + const size_t dim_idx = 1; auto blocking_loop_exists = [&](const ov::snippets::lowered::ExpressionPtr& expr, const std::shared_ptr& brgemm) { diff --git a/src/plugins/intel_cpu/src/utils/debug_capabilities.cpp b/src/plugins/intel_cpu/src/utils/debug_capabilities.cpp index 300f62e27b1..6c37edf61f9 100644 --- a/src/plugins/intel_cpu/src/utils/debug_capabilities.cpp +++ b/src/plugins/intel_cpu/src/utils/debug_capabilities.cpp @@ -254,7 +254,7 @@ std::ostream & operator<<(std::ostream & os, const Node &c_node) { } else { // no SPD yet, use orginal shapes comma = ""; - for (int i = 0; i < node.getOriginalOutputPrecisions().size(); i++) { + for (size_t i = 0; i < node.getOriginalOutputPrecisions().size(); i++) { auto shape = node.getOutputShapeAtPort(i); std::string prec_name = "Undef"; prec_name = node.getOriginalOutputPrecisionAtPort(i).name(); @@ -282,6 +282,10 @@ std::ostream & operator<<(std::ostream & os, const Node &c_node) { auto n = edge->getParent(); os << 
comma; os << node_id(*edge->getParent()); + auto ptr = edge->getMemoryPtr(); + if (ptr) { + os << "_" << ptr->getData(); + } if (!is_single_output_port(*n)) os << "[" << edge->getInputNum() << "]"; comma = ","; diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/behavior/ov_infer_request/iteration_chaining.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/behavior/ov_infer_request/iteration_chaining.cpp new file mode 100644 index 00000000000..34ebc30203b --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/behavior/ov_infer_request/iteration_chaining.cpp @@ -0,0 +1,42 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include +#include "behavior/ov_infer_request/iteration_chaining.hpp" +#include "common_test_utils/test_constants.hpp" + +using namespace ov::test::behavior; + +namespace { + +const std::vector configs = { + {} +}; + +const std::vector HeteroConfigs = { + {ov::device::priorities(CommonTestUtils::DEVICE_CPU)} +}; + +const std::vector AutoConfigs = { + {ov::device::priorities(CommonTestUtils::DEVICE_CPU)} +}; + +INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, OVIterationChaining, + ::testing::Combine( + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::ValuesIn(configs)), + OVIterationChaining::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Hetero_BehaviorTests, OVIterationChaining, + ::testing::Combine( + ::testing::Values(CommonTestUtils::DEVICE_HETERO), + ::testing::ValuesIn(HeteroConfigs)), + OVIterationChaining::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Auto_BehaviorTests, OVIterationChaining, + ::testing::Combine( + ::testing::Values(CommonTestUtils::DEVICE_AUTO), + ::testing::ValuesIn(AutoConfigs)), + OVIterationChaining::getTestCaseName); + +} // namespace diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/custom_op_internal_dyn.cpp 
b/src/plugins/intel_cpu/tests/functional/single_layer_tests/custom_op_internal_dyn.cpp index 3680d5359cf..9242229d056 100644 --- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/custom_op_internal_dyn.cpp +++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/custom_op_internal_dyn.cpp @@ -90,7 +90,8 @@ protected: auto paramOuts = ngraph::helpers::convert2OutputVector(ngraph::helpers::castOps2Nodes(inputParams)); auto customOp = std::make_shared(paramOuts); - ngraph::ResultVector results{std::make_shared(customOp)}; + ngraph::ResultVector results{std::make_shared(customOp->output(0)), + std::make_shared(customOp->output(1))}; function = std::make_shared(results, inputParams, "customOpTest"); } diff --git a/src/plugins/intel_cpu/tests/unit/cpu_tensor_test.cpp b/src/plugins/intel_cpu/tests/unit/cpu_tensor_test.cpp new file mode 100644 index 00000000000..a522f48cf30 --- /dev/null +++ b/src/plugins/intel_cpu/tests/unit/cpu_tensor_test.cpp @@ -0,0 +1,258 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include + +#include +#include +#include +#include "openvino/core/except.hpp" +#include "openvino/core/partial_shape.hpp" + +#include "cpu_memory.h" +#include "cpu_tensor.h" +#include "openvino/runtime/itensor.hpp" + +#include "ie_ngraph_utils.hpp" + +using namespace ov::intel_cpu; +using namespace InferenceEngine; + +using CPUTensorTest = ::testing::Test; + +class MockBlockedMemoryDesc : public BlockedMemoryDesc { +public: + MockBlockedMemoryDesc(const Shape& _shape) : MemoryDesc(_shape, Blocked) {} + + MOCK_METHOD(InferenceEngine::Precision, getPrecision, (), (const, override)); + MOCK_METHOD(MemoryDescPtr, clone, (), (const, override)); + MOCK_METHOD(size_t, getOffsetPadding, (), (const, override)); + + MOCK_METHOD(MemoryDescPtr, cloneWithNewDimsImp, (const VectorDims&), (const, override)); + + MOCK_METHOD(MemoryDescPtr, cloneWithNewPrecision, (const 
InferenceEngine::Precision), (const, override)); + MOCK_METHOD(bool, isCompatible, (const MemoryDesc&), (const, override)); + + MOCK_METHOD(bool, hasLayoutType, (LayoutType), (const, override)); + + MOCK_METHOD(size_t, getMaxMemSize, (), (const, override)); + + MOCK_METHOD(const VectorDims&, getBlockDims, (), (const, override)); + MOCK_METHOD(const VectorDims&, getOrder, (), (const, override)); + MOCK_METHOD(const VectorDims&, getOffsetPaddingToData, (), (const, override)); + MOCK_METHOD(const VectorDims&, getStrides, (), (const, override)); + MOCK_METHOD(bool, blocksExtended, (), (const, override)); + MOCK_METHOD(size_t, getPaddedElementsCount, (), (const, override)); + MOCK_METHOD(bool, isCompatible, (const BlockedMemoryDesc &, CmpMask), (const, override)); + + MOCK_METHOD(void, setPrecision, (InferenceEngine::Precision), (override)); + + MOCK_METHOD(size_t, getCurrentMemSizeImp, (), (const, override)); + + MOCK_METHOD(size_t, getElementOffset, (size_t), (const, override)); + MOCK_METHOD(bool, canComputeMemSizeZeroDims, (), (const, override)); + MOCK_METHOD(bool, isDefinedImp, (), (const, override)); +}; + +class MockIMemory : public IMemory { +public: + MockIMemory(MemoryDescPtr desc) : m_pMemDesc(desc) {} + MockIMemory(const MemoryDesc& desc) : m_pMemDesc(desc.clone()) {} + + MOCK_METHOD(bool, isAllocated, (), (const, noexcept, override)); + MOCK_METHOD(MemoryDesc&, getDesc, (), (const, override)); + MOCK_METHOD(MemoryDescPtr, getDescPtr, (), (const, override)); + + MOCK_METHOD(size_t, getSize, (), (const, override)); + MOCK_METHOD(const Shape&, getShape, (), (const, override)); + MOCK_METHOD(const VectorDims&, getStaticDims, (), (const, override)); + + MOCK_METHOD(void, redefineDesc, (MemoryDescPtr), (override)); + MOCK_METHOD(void, load, (const IMemory&, bool), (const, override)); + MOCK_METHOD(MemoryMngrPtr, getMemoryMngr, (), (const, override)); + + MOCK_METHOD(dnnl::memory, getPrimitive, (), (const, override)); + MOCK_METHOD(void, nullify, (), (override)); 
+ MOCK_METHOD(void*, getData, (), (const, override)); + + void set_memDesc(MemoryDescPtr memdesc) { m_pMemDesc = memdesc; } + void set_memDesc(const MemoryDesc& memdesc) { m_pMemDesc = memdesc.clone(); } + MemoryDesc& get_memDesc() const { return *m_pMemDesc; } + MemoryDescPtr get_memDescPtr() { return m_pMemDesc; } + +private: + MemoryDescPtr m_pMemDesc; +}; + +// helper to get byte strides from strides. +static ov::Strides byte_strides(const ov::Strides& strides, const ov::element::Type& type) { + ov::Strides byte_strides(strides.size()); + for (size_t i = 0; i < strides.size(); ++i) + byte_strides[i] = strides[i] * type.size(); + return byte_strides; +} + +// helper to create Memory of ncsp layout. +inline MemoryDescPtr create_memdesc(Precision prec, const Shape& shape, const VectorDims& strides = {}) { + ov::Shape ov_shape = shape.toPartialShape().to_shape(); + const std::size_t totalSize = ov::shape_size(ov_shape); + auto elem_type = InferenceEngine::details::convertPrecision(prec); + + auto memdesc = std::make_shared(shape); + ::testing::Mock::AllowLeak(memdesc.get()); + + EXPECT_CALL(*memdesc, hasLayoutType(::testing::Eq(LayoutType::ncsp))).WillRepeatedly(::testing::Return(true)); + + EXPECT_CALL(*memdesc, getPrecision).WillRepeatedly(::testing::Return(prec)); + EXPECT_CALL(*memdesc, getStrides).WillRepeatedly(::testing::ReturnRef(strides)); + + EXPECT_CALL(*memdesc, canComputeMemSizeZeroDims).WillRepeatedly(::testing::Return(true)); + EXPECT_CALL(*memdesc, isDefinedImp).WillRepeatedly(::testing::Return(true)); + EXPECT_CALL(*memdesc, getCurrentMemSizeImp).WillRepeatedly(::testing::Return(totalSize * elem_type.size())); + + return memdesc; +} + +inline MemoryPtr create_memory(MemoryDescPtr memdesc) { + auto memptr = std::make_shared(memdesc); + ::testing::Mock::AllowLeak(memptr.get()); + + // getDesc + EXPECT_CALL(*memptr, getDescPtr) + .Times(::testing::AnyNumber()) + .WillRepeatedly([memptr]() { + return memptr->get_memDescPtr(); + }); + 
EXPECT_CALL(*memptr, getDesc).WillRepeatedly(::testing::ReturnRef(memptr->get_memDesc())); + + // data + static size_t memSize = 0; + EXPECT_CALL(*memptr, getData) + .WillRepeatedly([memptr]() { + auto memdesc = memptr->get_memDescPtr(); + auto required = memdesc->getCurrentMemSize(); + if (memSize >= required) { + return reinterpret_cast(memSize); + } else { + memSize = required; + return reinterpret_cast(required); + } + }); + + // redefineDesc + ON_CALL(*memptr, redefineDesc).WillByDefault([memptr](MemoryDescPtr desc) { + memptr->set_memDesc(desc); + }); + EXPECT_CALL(*memptr, redefineDesc).Times(::testing::AtLeast(1)); + + return memptr; +} + +TEST_F(CPUTensorTest, canCreateTensor) { + Shape shape{4, 3, 2}; + ov::Shape ov_shape = shape.toPartialShape().to_shape(); + auto strides = ov::Strides({6, 2, 1}); + const std::size_t totalSize = ov::shape_size(ov_shape); + ov::element::Type elem_type = ov::element::f32; + + auto memptr = create_memory(create_memdesc(Precision::FP32, shape, strides)); + { + std::shared_ptr t = std::make_shared(memptr); + ASSERT_EQ(totalSize, t->get_size()); + ASSERT_NE(nullptr, t->data()); + ASSERT_EQ(elem_type, t->get_element_type()); + ASSERT_EQ(ov_shape, t->get_shape()); + ASSERT_NE(ov_shape, t->get_strides()); + ASSERT_EQ(byte_strides(ov::Strides({6, 2, 1}), t->get_element_type()), t->get_strides()); + ASSERT_EQ(elem_type.size() * totalSize, t->get_byte_size()); + ASSERT_THROW(t->data(ov::element::i64), ov::Exception); + ASSERT_THROW(t->data(), ov::Exception); + } +} + +TEST_F(CPUTensorTest, canAccessF16Tensor) { + Shape shape = {4, 3, 2}; + auto strides = ov::Strides({6, 2, 1}); + + auto memptr = create_memory(create_memdesc(Precision::FP16, shape, strides)); + { + std::shared_ptr t = std::make_shared(memptr); + EXPECT_NE(nullptr, t->data()); + ASSERT_EQ(ov::element::f16, t->get_element_type()); + EXPECT_NO_THROW(t->data(ov::element::f16)); + EXPECT_NO_THROW(t->data()); + EXPECT_THROW(t->data(), ov::Exception); + 
EXPECT_THROW(t->data(), ov::Exception); + EXPECT_THROW(t->data(), ov::Exception); + } +} + +// SetShape +TEST_F(CPUTensorTest, canSetShape) { + const Shape origShape = {1, 2, 3}; + const ov::Shape ov_origShape = origShape.toPartialShape().to_shape(); + auto strides = ov::Strides({6, 3, 1}); + auto memdesc = create_memdesc(Precision::FP32, origShape, strides); + auto memptr = create_memory(memdesc); + std::shared_ptr t = std::make_shared(memptr); + + const Shape newShape({4, 5, 6}); + const ov::Shape ov_newShape = newShape.toPartialShape().to_shape(); + auto new_strides = ov::Strides{30, 6, 1}; + auto new_memdesc = create_memdesc(Precision::FP32, newShape, new_strides); + + // set_shape to a bigger memory + { + auto blocked_memdesc = dynamic_cast(memdesc.get()); + EXPECT_CALL(*blocked_memdesc, cloneWithNewDimsImp).WillRepeatedly(::testing::Return(new_memdesc)); + + const void* orig_data = t->data(); + ASSERT_EQ(t->get_shape(), ov_origShape); + ASSERT_NO_THROW(t->set_shape(ov_newShape)); + ASSERT_EQ(ov_newShape, t->get_shape()); + ASSERT_EQ(byte_strides(ov::row_major_strides(ov_newShape), t->get_element_type()), t->get_strides()); + ASSERT_NE(orig_data, t->data()); + } + + // set_shape for smaller memory - does not perform reallocation + { + auto new_blocked_memdesc = dynamic_cast(new_memdesc.get()); + EXPECT_CALL(*new_blocked_memdesc, cloneWithNewDimsImp).WillRepeatedly(::testing::Return(memdesc)); + const void* orig_data = t->data(); + t->set_shape(ov_origShape); + ASSERT_EQ(ov_origShape, t->get_shape()); + ASSERT_EQ(orig_data, t->data()); + } +} + +TEST_F(CPUTensorTest, canSyncMemoryAndTensor) { + const Shape origShape = {1, 2, 3}; + const ov::Shape ov_origShape = origShape.toPartialShape().to_shape(); + auto strides = ov::Strides({6, 3, 1}); + auto memdesc = create_memdesc(Precision::FP32, origShape, strides); + auto memptr = create_memory(memdesc); + std::shared_ptr t = std::make_shared(memptr); + + 
ASSERT_EQ(memptr->getDescPtr()->getShape().toPartialShape().to_shape(), t->get_shape()); + ASSERT_EQ(byte_strides(memptr->getDescWithType()->getStrides(), t->get_element_type()), t->get_strides()); + + const Shape newShape({4, 5, 6}); + const ov::Shape ov_newShape = newShape.toPartialShape().to_shape(); + auto new_strides = ov::Strides{30, 6, 1}; + auto new_memdesc = create_memdesc(Precision::FP32, newShape, new_strides); + + // reallocate memory out boundary of tensor instance + { + auto blocked_memdesc = dynamic_cast(memdesc.get()); + EXPECT_CALL(*blocked_memdesc, cloneWithNewDimsImp).WillRepeatedly(::testing::Return(new_memdesc)); + + auto desc2 = memptr->getDescPtr()->cloneWithNewDims(newShape.getStaticDims(), true); + memptr->redefineDesc(desc2); + ASSERT_EQ(memptr->getDescPtr()->getShape().toPartialShape().to_shape(), t->get_shape()); + ASSERT_EQ(byte_strides(memptr->getDescWithType()->getStrides(), t->get_element_type()), t->get_strides()); + } +} \ No newline at end of file diff --git a/src/plugins/intel_cpu/tests/unit/cpu_tensor_test_ext.cpp b/src/plugins/intel_cpu/tests/unit/cpu_tensor_test_ext.cpp new file mode 100644 index 00000000000..be8b6a05598 --- /dev/null +++ b/src/plugins/intel_cpu/tests/unit/cpu_tensor_test_ext.cpp @@ -0,0 +1,156 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include + +#include +#include +#include +#include "openvino/core/except.hpp" +#include "openvino/core/partial_shape.hpp" + +#include "cpu_memory.h" +#include "cpu_tensor.h" +#include "openvino/runtime/itensor.hpp" + +using namespace ov::intel_cpu; +using namespace InferenceEngine; + +using CPUTensorExtTest = ::testing::Test; + +static ov::Strides byteStrides(const ov::Strides& strides, const ov::element::Type& type) { + ov::Strides byte_strides(strides.size()); + for (size_t i = 0; i < strides.size(); ++i) + byte_strides[i] = strides[i] * type.size(); + return byte_strides; +} + +inline 
MemoryPtr create_memory(Precision prc, const Shape& shape) { + dnnl::engine eng(dnnl::engine::kind::cpu, 0); + CpuBlockedMemoryDescPtr desc; + desc = std::make_shared(prc, shape); + return std::make_shared(eng, desc); +} + +TEST_F(CPUTensorExtTest, canCreateTensor) { + Shape shape{4, 3, 2}; + ov::Shape ov_shape = shape.toPartialShape().to_shape(); + + std::shared_ptr t = std::make_shared(create_memory(Precision::FP32, shape)); + const std::size_t totalSize = ov::shape_size(ov_shape); + ASSERT_EQ(totalSize, t->get_size()); + ASSERT_NE(nullptr, t->data()); + ASSERT_EQ(ov::element::f32, t->get_element_type()); + ASSERT_EQ(ov_shape, t->get_shape()); + ASSERT_NE(ov_shape, t->get_strides()); + ASSERT_EQ(byteStrides(ov::Strides({6, 2, 1}), t->get_element_type()), t->get_strides()); + ASSERT_EQ(ov::element::f32.size() * totalSize, t->get_byte_size()); + ASSERT_THROW(t->data(ov::element::i64), ov::Exception); + ASSERT_THROW(t->data(), ov::Exception); +} + +TEST_F(CPUTensorExtTest, canAccessF16Tensor) { + Shape shape = {4, 3, 2}; + std::shared_ptr t = std::make_shared(create_memory(Precision::FP16, shape)); + EXPECT_NE(nullptr, t->data()); + ASSERT_EQ(ov::element::f16, t->get_element_type()); + EXPECT_NO_THROW(t->data(ov::element::f16)); + EXPECT_NO_THROW(t->data()); + EXPECT_THROW(t->data(), ov::Exception); + EXPECT_THROW(t->data(), ov::Exception); + EXPECT_THROW(t->data(), ov::Exception); +} + +// SetShape +TEST_F(CPUTensorExtTest, canSetShape) { + const ov::Shape origShape({1, 2, 3}); + std::shared_ptr t = std::make_shared(create_memory(Precision::FP32, {1, 2, 3})); + const ov::Shape newShape({4, 5, 6}); + + const void* orig_data = t->data(); + ASSERT_EQ(t->get_shape(), origShape); + ASSERT_NO_THROW(t->set_shape({4, 5, 6})); + ASSERT_EQ(newShape, t->get_shape()); + ASSERT_EQ(byteStrides(ov::row_major_strides(newShape), t->get_element_type()), t->get_strides()); + ASSERT_NE(orig_data, t->data()); + + // set_shape for smaller memory - does not perform reallocation + { + 
orig_data = t->data(); + t->set_shape(origShape); + ASSERT_EQ(origShape, t->get_shape()); + ASSERT_EQ(orig_data, t->data()); + } +} + +TEST_F(CPUTensorExtTest, emptySize) { + ov::PartialShape pshape{0, 3, 2}; + Shape shape{pshape}; + const ov::Shape origShape({0, 3, 2}); + + std::shared_ptr t = std::make_shared(create_memory(Precision::FP32, shape)); + + ASSERT_EQ(ov::element::f32, t->get_element_type()); + ASSERT_EQ(0, t->get_size()); + ASSERT_EQ(0, t->get_byte_size()); + ASSERT_EQ(origShape, t->get_shape()); + ASSERT_EQ(byteStrides(ov::Strides({0, 0, 0}), t->get_element_type()), t->get_strides()); + EXPECT_NO_THROW(t->data()); +} + +TEST_F(CPUTensorExtTest, canCreateTensorWithDynamicShape) { + ov::PartialShape pshape{-1, 3, 2}; + Shape shape{pshape}; + + std::shared_ptr t; + + // construct with memory with dynamic shape + ASSERT_NO_THROW(t = std::make_shared(create_memory(Precision::FP32, shape))); + ASSERT_THROW(t->get_shape(), ov::Exception); + ASSERT_THROW(t->get_strides(), ov::Exception); + + // change memory to dynamic shape + { + auto memptr = create_memory(Precision::FP32, {4, 3, 2}); + ASSERT_NO_THROW(t = std::make_shared(memptr)); + + ov::PartialShape pshape{{1, 10}, 3, 2}; + CpuBlockedMemoryDescPtr desc2 = std::make_shared(Precision::FP32, Shape(pshape)); + memptr->redefineDesc(desc2); + ASSERT_THROW(t->get_shape(), ov::Exception); + ASSERT_THROW(t->get_strides(), ov::Exception); + } + + // set_shape + const ov::Shape newShape({4, 0, 2}); + ASSERT_NO_THROW(t = std::make_shared(create_memory(Precision::FP32, {4, 3, 2}))); + + const void* orig_data = t->data(); + ASSERT_NO_THROW(t->set_shape({4, 0, 2})); + ASSERT_EQ(newShape, t->get_shape()); + ASSERT_EQ(ov::Strides({0, 0, 0}), t->get_strides()); + ASSERT_EQ(orig_data, t->data()); +} + +TEST_F(CPUTensorExtTest, canSyncMemoryAndTensor) { + Shape orig_shape{4, 3, 2}; + + auto memptr = create_memory(Precision::FP32, orig_shape); + std::shared_ptr t = std::make_shared(memptr); + 
ASSERT_EQ(memptr->getDescPtr()->getShape().toPartialShape().to_shape(), t->get_shape()); + ASSERT_EQ(byteStrides(memptr->getDescWithType()->getStrides(), t->get_element_type()), t->get_strides()); + + // reallocate memory out boundary of tensor instance + { + Shape new_shape{1, 5, 2}; + + auto desc2 = memptr->getDescPtr()->cloneWithNewDims(new_shape.getStaticDims(), true); + memptr->redefineDesc(desc2); + ASSERT_EQ(memptr->getDescPtr()->getShape().toPartialShape().to_shape(), t->get_shape()); + ASSERT_EQ(byteStrides(memptr->getDescWithType()->getStrides(), t->get_element_type()), t->get_strides()); + } +} \ No newline at end of file diff --git a/src/plugins/intel_cpu/tools/dump_check/dump_check.py b/src/plugins/intel_cpu/tools/dump_check/dump_check.py index 1a46a41facd..e21d6c9caa5 100644 --- a/src/plugins/intel_cpu/tools/dump_check/dump_check.py +++ b/src/plugins/intel_cpu/tools/dump_check/dump_check.py @@ -379,6 +379,10 @@ def compare_dump_file(ieb_file1, ieb_file2, visualize): else: diff_abs = np.abs(ieb1.value - ieb2.value) + if not np.all(diff_abs.shape): + print(" Shape{} has dim 0".format(ieb1.shape)) + return + max_abs = np.amax(diff_abs) max_idx = np.where(diff_abs >= max_abs) max_org = np.abs(ieb2.value)[max_idx] diff --git a/src/tests/functional/plugin/shared/include/behavior/ov_infer_request/iteration_chaining.hpp b/src/tests/functional/plugin/shared/include/behavior/ov_infer_request/iteration_chaining.hpp new file mode 100644 index 00000000000..a9711fd97a7 --- /dev/null +++ b/src/tests/functional/plugin/shared/include/behavior/ov_infer_request/iteration_chaining.hpp @@ -0,0 +1,47 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include +#include +#include +#include +#include +#include + +#include "base/behavior_test_utils.hpp" +#include "openvino/core/attribute_visitor.hpp" +#include "openvino/core/model.hpp" +#include "openvino/core/node.hpp" +#include "openvino/core/partial_shape.hpp" 
+#include "openvino/core/rank.hpp" +#include "openvino/core/shape.hpp" +#include "openvino/core/type/element_type.hpp" +#include "openvino/core/type/element_type_traits.hpp" +#include "openvino/op/parameter.hpp" +#include "openvino/runtime/infer_request.hpp" +#include "openvino/runtime/tensor.hpp" + +namespace ov { +namespace test { +namespace behavior { + +struct OVIterationChaining : public OVInferRequestTests { + static std::string getTestCaseName(const testing::TestParamInfo& obj); + void Run(); + + void SetUp() override; + void TearDown() override; + + ov::InferRequest req; + +private: + static std::shared_ptr getIterativeFunction(); + bool checkOutput(const ov::runtime::Tensor& in, const ov::runtime::Tensor& actual); +}; + +} // namespace behavior +} // namespace test +} // namespace ov diff --git a/src/tests/functional/plugin/shared/src/behavior/ov_infer_request/infer_request_dynamic.cpp b/src/tests/functional/plugin/shared/src/behavior/ov_infer_request/infer_request_dynamic.cpp index 76b5dc2c167..d8002f3df94 100644 --- a/src/tests/functional/plugin/shared/src/behavior/ov_infer_request/infer_request_dynamic.cpp +++ b/src/tests/functional/plugin/shared/src/behavior/ov_infer_request/infer_request_dynamic.cpp @@ -188,6 +188,36 @@ TEST_P(OVInferRequestDynamicTests, InferDynamicNetworkSetOutputShapeBeforeInfer) ASSERT_TRUE(checkOutput(req.get_tensor("input_tensor"), req.get_tensor(outputname))); } +TEST_P(OVInferRequestDynamicTests, InferDynamicNetworkGetOutputThenSetOutputTensorPreAllocatedMemoryBeforeInfer) { + const std::string tensor_name = "input_tensor"; + const ov::Shape refShape = inOutShapes[0].first; + const ov::Shape refOutShape = inOutShapes[0].second; + std::map shapes; + shapes[tensor_name] = {ov::Dimension::dynamic(), 4, 20, 20}; + OV_ASSERT_NO_THROW(function->reshape(shapes)); + // Load ov::Model to target plugins + auto execNet = ie->compile_model(function, target_device, configuration); + // Create InferRequest + ov::InferRequest req; + 
ov::runtime::Tensor tensor; + const std::string outputname = function->outputs().back().get_any_name(); + OV_ASSERT_NO_THROW(req = execNet.create_infer_request()); + tensor = ov::test::utils::create_and_fill_tensor(element::f32, refShape, 100, -50); + OV_ASSERT_NO_THROW(req.set_tensor("input_tensor", tensor)); + // first, get output tensor + OV_ASSERT_NO_THROW(req.infer()); + ASSERT_EQ(req.get_tensor(outputname).get_shape(), refOutShape); + ASSERT_TRUE(checkOutput(req.get_tensor("input_tensor"), req.get_tensor(outputname))); + // then, set output tensor + float ptr[5000]; + ov::runtime::Tensor otensor(element::f32, refOutShape, ptr); + OV_ASSERT_NO_THROW(req.set_tensor(outputname, otensor)); + OV_ASSERT_NO_THROW(req.infer()); + ASSERT_EQ(req.get_tensor(outputname).data(), ptr); + ASSERT_EQ(req.get_tensor(outputname).get_shape(), refOutShape); + ASSERT_TRUE(checkOutput(req.get_tensor("input_tensor"), req.get_tensor(outputname))); +} + TEST_P(OVInferRequestDynamicTests, InferDynamicNetworkWithoutSetShape) { const std::string tensor_name = "input_tensor"; std::map shapes; diff --git a/src/tests/functional/plugin/shared/src/behavior/ov_infer_request/iteration_chaining.cpp b/src/tests/functional/plugin/shared/src/behavior/ov_infer_request/iteration_chaining.cpp new file mode 100644 index 00000000000..e20eb00adc4 --- /dev/null +++ b/src/tests/functional/plugin/shared/src/behavior/ov_infer_request/iteration_chaining.cpp @@ -0,0 +1,121 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include +#include +#include +#include + +#include "base/ov_behavior_test_utils.hpp" +#include "openvino/core/attribute_visitor.hpp" +#include "openvino/core/node.hpp" +#include "openvino/core/partial_shape.hpp" +#include "openvino/core/rank.hpp" +#include "openvino/core/shape.hpp" +#include "openvino/core/type/element_type.hpp" +#include "openvino/core/type/element_type_traits.hpp" +#include 
"openvino/op/parameter.hpp" +#include "openvino/core/model.hpp" +#include "ngraph_functions/builders.hpp" +#include "openvino/runtime/infer_request.hpp" +#include "openvino/runtime/tensor.hpp" +#include "behavior/ov_infer_request/iteration_chaining.hpp" + +namespace ov { +namespace test { +namespace behavior { +std::string OVIterationChaining::getTestCaseName(const testing::TestParamInfo& obj) { + return OVInferRequestTests::getTestCaseName(obj); +} + +std::shared_ptr OVIterationChaining::getIterativeFunction() { + const ov::PartialShape pshape{-1, 16}; + auto params = ngraph::builder::makeDynamicParams(element::Type_t::f32, {pshape}); + params[0]->get_output_tensor(0).set_names({"input_tensor_0"}); + params[0]->set_friendly_name("param_0"); + auto concat_const = ngraph::builder::makeConstant(element::Type_t::f32, {1, 16}, std::vector{}, true); + auto concat = ngraph::builder::makeConcat({params[0], concat_const}, 0 /*axis*/); + auto eltwise_const = ngraph::builder::makeConstant(element::Type_t::f32, {1, 16}, std::vector{}, true); + auto eltwise = ngraph::builder::makeEltwise(concat, eltwise_const, ngraph::helpers::EltwiseTypes::ADD); + concat->get_output_tensor(0).set_names({"result_tensor_0"}); + concat->set_friendly_name("result_0"); + eltwise->get_output_tensor(0).set_names({"result_tensor_1"}); + eltwise->set_friendly_name("result_1"); + + return std::make_shared(ov::NodeVector{concat, eltwise}, ov::ParameterVector(params)); +} + +void OVIterationChaining::SetUp() { + std::tie(target_device, configuration) = this->GetParam(); + // Skip test according to plugin specific disabledTestPatterns() (if any) + SKIP_IF_CURRENT_TEST_IS_DISABLED() + APIBaseTest::SetUp(); + function = getIterativeFunction(); + ov::AnyMap params; + for (auto&& v : configuration) { + params.emplace(v.first, v.second); + } + execNet = core->compile_model(function, target_device, params); + + try { + req = execNet.create_infer_request(); + } catch (const std::exception& ex) { + FAIL() << 
"Can't Create Infer Requiest in SetUp \nException [" << ex.what() << "]" + << std::endl; + } +} + +void OVIterationChaining::TearDown() { + req = {}; + OVInferRequestTests::TearDown(); +} + +bool OVIterationChaining::checkOutput(const ov::runtime::Tensor& in, const ov::runtime::Tensor& actual) { + bool result = true; + auto net = core->compile_model(function, CommonTestUtils::DEVICE_TEMPLATE); + ov::InferRequest req; + req = net.create_infer_request(); + auto tensor = req.get_tensor(function->inputs().back().get_any_name()); + tensor.set_shape(in.get_shape()); + for (int i = 0; i < in.get_size(); i++) { + tensor.data()[i] = in.data()[i]; + } + req.infer(); + for (int i = 0; i < actual.get_size(); i++) { + if (fabs(req.get_output_tensor(0).data()[i] - actual.data()[i]) > std::numeric_limits::epsilon()) + return false; + } + return result; +} + +void OVIterationChaining::Run() { + // perform iteration chaining by iteratively + // setting input tensor to be output tensor of last inference, and + // beginnign with an empty tensor + ov::Tensor t0(element::Type_t::f32, {0, 16}); + + OV_ASSERT_NO_THROW(req.set_tensor("input_tensor_0", t0)); + for (size_t i = 0; i < 10; i++) { + OV_ASSERT_NO_THROW(req.infer()); + ASSERT_TRUE(checkOutput(req.get_tensor("input_tensor_0"), req.get_tensor("result_tensor_0"))); + + const auto t1 = req.get_tensor("result_tensor_0"); + OV_ASSERT_NO_THROW(req.set_tensor("input_tensor_0", t1)); + } + ASSERT_TRUE(checkOutput(req.get_tensor("input_tensor_0"), req.get_tensor("result_tensor_0"))); +} + +TEST_P(OVIterationChaining, Simple) { + // Skip test according to plugin specific disabledTestPatterns() (if any) + SKIP_IF_CURRENT_TEST_IS_DISABLED() + + Run(); +} + +} // namespace behavior +} // namespace test +} // namespace ov