Dynamic shape mem reuse solution (#11667)
* Dynamic shape memory reuse solution

* Fix Split node to properly work with dynamic memory

* Fix race condition for Memory mgrHandle

* Avoid Memory race condition between GetData and SetDataHandle

  Adding a lock around the race between ov::intel_cpu::Memory::GetData()
  and ov::intel_cpu::Memory::SetDataHandle() is not a good solution, as it
  would hurt inference performance. It turns out that fetching the edge's
  DataPtr in InferRequest::SetBlob/GetBlob is unnecessary: those methods
  only need the TensorDesc, so querying just the TensorDesc instead of the
  data pointer avoids the race condition (see the sketch below).

* Resolve reviewers' comments

* Avoid performance impact due to frequent resets of MemMngrHandle

  If the MemMngrHandle has already been assigned an external buffer, it
  can be reused; otherwise a new one has to be created.
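A minimal standalone sketch of the idea behind the race fix, using simplified stand-in types rather than the plugin's real classes: a query that reads only the immutable descriptor never touches the data handle that SetDataHandle() may be swapping on another thread.

    // Illustration only: simplified stand-ins, not the actual ov::intel_cpu classes.
    #include <atomic>
    #include <cstdint>
    #include <iostream>
    #include <utility>
    #include <vector>

    struct TensorDescLike {                 // immutable after construction
        std::vector<int64_t> dims;
    };

    class MemoryLike {
    public:
        explicit MemoryLike(TensorDescLike d) : desc_(std::move(d)), handle_(nullptr) {}

        // The racy pair: one thread swaps the handle while another dereferences it.
        void setDataHandle(void* p) { handle_.store(p, std::memory_order_release); }
        void* getData() const       { return handle_.load(std::memory_order_acquire); }

        // The safe query: the descriptor never changes after creation, so
        // SetBlob/GetBlob can validate shapes without touching the handle at all.
        const TensorDescLike& getDesc() const { return desc_; }

    private:
        const TensorDescLike desc_;
        std::atomic<void*> handle_;
    };

    int main() {
        MemoryLike mem(TensorDescLike{{1, 3, 224, 224}});
        std::cout << "rank = " << mem.getDesc().dims.size() << '\n';  // no data pointer involved
    }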
@@ -63,33 +63,31 @@ public:
         int64_t id;
     };
 
-    explicit MemorySolver(const std::vector<Box>& boxes) : _boxes(boxes) {
+    /** @brief Performs inplace normalization of the input boxes
+        @return lifespan of all boxes
+    */
+    static int normalizeBoxes(std::vector<Box>& boxes) {
         int max_ts = 0;
-        // TODO: add validation of data correctness:
-        // 1. Box.start >= 0 and Box.finish >= -1
-        // 2. Box.finish >= Box.start (except Box.finish == -1)
-        // 3. Box.size > 0 (or == 0 ?)
-        // 4. Box.id == any unique value
-        for (const Box& box : _boxes)
+        for (const Box& box : boxes)
             max_ts = std::max(std::max(max_ts, box.start), box.finish);
-        for (Box& box : _boxes)
+        for (Box& box : boxes)
             if (box.finish == -1)
                 box.finish = max_ts;
 
         // sort by start and finish ts
-        std::sort(_boxes.begin(), _boxes.end(), [](const Box& l, const Box& r) -> bool {
+        std::sort(boxes.begin(), boxes.end(), [](const Box& l, const Box& r) -> bool {
             return l.start < r.start || (l.start == r.start && l.finish < r.finish);
         });
 
         // remove unused timestamps (not a begin of some box)
         // each ts should start a box
         std::vector<bool> ts_exist(max_ts + 1);
-        for (const Box& b : _boxes)
+        for (const Box& b : boxes)
            ts_exist[b.start] = true;
 
        int rm_ts_s = 0, rm_ts_f = 0;
        int ts_s = 0, ts_f = 0;
-       for (Box& b : _boxes) {
+       for (Box& b : boxes) {
            while (ts_s < b.start)
                if (!ts_exist[ts_s++])
                    rm_ts_s++;
@@ -105,7 +103,16 @@ public:
             b.start -= rm_ts_s;
             b.finish -= rm_ts_f;
         }
-        _time_duration = ts_f - rm_ts_f;
+        return ts_f - rm_ts_f;
     }
 
+    explicit MemorySolver(const std::vector<Box>& boxes) : _boxes(boxes) {
+        // TODO: add validation of data correctness:
+        // 1. Box.start >= 0 and Box.finish >= -1
+        // 2. Box.finish >= Box.start (except Box.finish == -1)
+        // 3. Box.size > 0 (or == 0 ?)
+        // 4. Box.id == any unique value
+        _time_duration = normalizeBoxes(_boxes);
+    }
+
 inline bool popupTogetherWith(MemorySolver::Box& box_new, const MemorySolver::Box& box_old) {
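For reference, a small self-contained demo of the normalization contract the new static helper establishes, re-implementing its first steps locally (finish == -1 marks a box that stays alive until the last timestamp, which is how an edge with unknown lifetime is modeled; the timestamp-compaction pass is omitted here for brevity):

    // Demo of the normalization contract; local re-implementation, not the
    // real MemorySolver::normalizeBoxes from memory_solver.hpp.
    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <vector>

    struct Box { int start, finish; int64_t size; int64_t id; };

    int main() {
        std::vector<Box> boxes = {{2, -1, 64, 0}, {0, 3, 128, 1}, {1, 2, 32, 2}};

        int max_ts = 0;
        for (const Box& b : boxes)
            max_ts = std::max(std::max(max_ts, b.start), b.finish);
        for (Box& b : boxes)
            if (b.finish == -1)
                b.finish = max_ts;   // pin open-ended boxes to the last timestamp

        // sort by start and finish ts, same ordering as the solver uses
        std::sort(boxes.begin(), boxes.end(), [](const Box& l, const Box& r) {
            return l.start < r.start || (l.start == r.start && l.finish < r.finish);
        });

        for (const Box& b : boxes)
            std::cout << "box " << b.id << ": [" << b.start << ", " << b.finish << "]\n";
    }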
@@ -136,6 +136,12 @@ DnnlMemoryDescPtr Memory::GetDescWithType<DnnlMemoryDesc, 0, 0>() const {
 }
 
 void Memory::setDataHandle(void *data) {
+    if (!mgrHandle->hasExtBuffer()) {
+        mgrHandle = DnnlMemMngrHandle(
+            std::make_shared<DnnlMemoryMngr>(std::unique_ptr<MemoryMngrWithReuse>(new MemoryMngrWithReuse())),
+            this);
+    }
+
     size_t maxMemSize = pMemDesc->hasDefinedMaxSize() ? pMemDesc->getMaxMemSize() : 0;
     mgrHandle->setExtBuff(data, maxMemSize);
     prim->set_data_handle(mgrHandle->getRawPtr()); // for pads zeroing, to preserve dnnl::memory::set_data_handle behaviour
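The guard above is the "reuse if possible" rule from the commit message: the manager is recreated only while no external buffer has ever been attached, so repeated SetBlob calls with user-provided memory stop churning the handle. A condensed sketch of the same create-once logic with simplified stand-in types:

    // Simplified stand-ins for DnnlMemoryMngr / DnnlMemMngrHandle (illustration only).
    #include <iostream>
    #include <memory>

    struct MngrLike {
        void* ext = nullptr;
        bool hasExtBuffer() const { return ext != nullptr; }
        void setExtBuff(void* p) { ext = p; }
    };

    void setDataHandleLike(std::shared_ptr<MngrLike>& mgr, void* data) {
        if (!mgr->hasExtBuffer())          // create a fresh manager only once
            mgr = std::make_shared<MngrLike>();
        mgr->setExtBuff(data);             // afterwards just rebind the buffer
    }

    int main() {
        auto mgr = std::make_shared<MngrLike>();
        int a = 0, b = 0;
        setDataHandleLike(mgr, &a);
        auto* first = mgr.get();
        setDataHandleLike(mgr, &b);        // reuses the same manager
        std::cout << "manager reused: " << (mgr.get() == first ? "yes" : "no") << '\n';
    }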
@@ -176,6 +176,9 @@ public:
         return prim != nullptr;
     }
 
+    /**
+     * @brief Resets the memory manager to a new one created with the provided raw memory
+     */
     void setDataHandle(void* data);
 
     const MemoryDesc& getDesc() const {
@@ -258,7 +258,7 @@ int Edge::getOutputNum() const {
     return child_port;
 }
 
-void Edge::allocate(const void* mem_ptr) {
+void Edge::allocateCommon(const std::function<void(const MemoryPtr&, const MemoryDesc&)>& allocate) {
     if (status != Status::NeedAllocation)
         return;
 
@@ -272,11 +272,30 @@ void Edge::allocate(const void* mem_ptr) {
 
     auto parentPtr = getParent();
     memoryPtr.reset(new Memory(parentPtr->getEngine()));
 
-    memoryPtr->Create(inputDesc, mem_ptr, false); // no pads zeroing
+    allocate(memoryPtr, inputDesc);
     status = Status::Allocated;
 }
 
+void Edge::allocate(const void* mem_ptr) {
+    auto allocateFunc = [=](const MemoryPtr& memoryPtr, const MemoryDesc& inputDesc) {
+        memoryPtr->Create(inputDesc, mem_ptr, false); // no pads zeroing
+    };
+
+    allocateCommon(allocateFunc);
+}
+
+void Edge::allocate(DnnlMemoryMngrPtr memMngr) {
+    if (!memMngr) {
+        IE_THROW(Unexpected) << "Memory manager ptr is NULL";
+    }
+
+    auto allocateFunc = [=](const MemoryPtr& memoryPtr, const MemoryDesc& inputDesc) {
+        memoryPtr->Create(inputDesc, memMngr);
+    };
+
+    allocateCommon(allocateFunc);
+}
+
 std::string Edge::name() const {
     auto parentPtr = getParent();
     auto childPtr = getChild();
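The refactoring above is a small template-method extraction: the status check and the common bookkeeping live in allocateCommon(), and each allocate() overload injects only the Create() variant it needs. A generic sketch of the pattern with hypothetical types, not the plugin's API:

    #include <functional>
    #include <iostream>

    struct Buffer { bool allocated = false; };   // hypothetical stand-in

    void allocateCommon(Buffer& buf, const std::function<void(Buffer&)>& allocate) {
        if (buf.allocated)
            return;              // shared precondition check
        allocate(buf);           // variant-specific step injected by the caller
        buf.allocated = true;    // shared bookkeeping
    }

    int main() {
        Buffer a, b;
        allocateCommon(a, [](Buffer&) { std::cout << "plain allocation\n"; });
        allocateCommon(b, [](Buffer&) { std::cout << "allocation via shared manager\n"; });
    }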
@@ -289,34 +308,10 @@ std::string Edge::name() const {
 }
 
 void Edge::externalAllocate(WeightsSharing::Ptr weightsCache) {
-    auto isInPlace = [](const NodePtr node, int port) -> bool {
-        const auto& selected_pd = node->getSelectedPrimitiveDescriptor();
-        if (selected_pd == nullptr)
-            IE_THROW() << "Preferable primitive descriptor is not set.";
-
-        const auto& config = selected_pd->getConfig();
-
-        for (const auto& in : config.inConfs) {
-            if (in.inPlace() == port) {
-                return true;
-            }
-        }
-        for (const auto& out : config.outConfs) {
-            if (out.inPlace() == port) {
-                return true;
-            }
-        }
-
-        return false;
-    };
-
     if (status != Status::NeedAllocation)
         return;
 
-    bool isTheOnlyChildEdgeAtPort = getParent()->getChildEdgesAtPort(getInputNum()).size() == 1;
-    bool isConcurrentUpdatePossible = isInPlace(getParent(), getInputNum()) || isInPlace(getChild(), getOutputNum()) || !isTheOnlyChildEdgeAtPort;
-
-    if (weightsCache && !isConcurrentUpdatePossible) {
+    if (weightsCache) {
         auto alloc = [this] () {
             allocate();
             return memoryPtr;
@@ -51,6 +51,7 @@ public:
 
     void init();
     void allocate(const void* mem_ptr = nullptr);
+    void allocate(DnnlMemoryMngrPtr memMngr);
    void externalAllocate(WeightsSharing::Ptr weightsCache);
    void reuse(MemoryPtr ptr);
    void validate();
@@ -104,6 +105,8 @@ private:
 
     EdgePtr getBaseEdge(int look = LOOK_BOTH);
     bool inPlace(LOOK look = LOOK_BOTH);
+    void allocateCommon(const std::function<void(const MemoryPtr&, const MemoryDesc&)>& allocate);
 
     friend class Graph;
 };
 
@@ -606,25 +606,17 @@ static edge_clusters_t findEdgeClusters(const std::vector<EdgePtr> & graphEdges)
     edge_cluster_idx_map_t edge_cluster_indices;
 
     for (auto &edge : graphEdges) {
-        if (!edge->hasDefinedMaxSize())
-            continue;
-
         auto edge_it = edge_cluster_indices.find(edge);
 
         if (edge_it != edge_cluster_indices.end())
             continue; // edge is visited
 
         size_t cluster_idx = edge_clusters.size();
         EdgePtr last_shared_edge = nullptr;
-        //has_defined_max_path means all the edges on path from current to the actual shared edge
-        //have defined max memory size so they can be added to the clusters and resolved by mem solver
-        bool has_defined_max_path = true;
-
         // find cluster index
         for (auto shared_edge = edge->getSharedEdge(std::nothrow);
              shared_edge;
             shared_edge = shared_edge->getSharedEdge(std::nothrow)) {
-            has_defined_max_path = has_defined_max_path && shared_edge->hasDefinedMaxSize();
             auto shared_edge_it = edge_cluster_indices.find(shared_edge);
             if (shared_edge_it != edge_cluster_indices.end()) {
                 cluster_idx = shared_edge_it->second;
@@ -633,10 +625,6 @@ static edge_clusters_t findEdgeClusters(const std::vector<EdgePtr> & graphEdges)
             }
         }
 
-        if (!has_defined_max_path) {
-            continue;
-        }
-
         // add shared edges to cluster
         edge_cluster_indices.emplace(edge, cluster_idx);
 
@@ -689,22 +677,24 @@ void Graph::AllocateWithReuse() {
 
     const int64_t alignment = 32; // 32 bytes
 
-    std::vector<MemorySolver::Box> boxes(edge_clusters.size());
+    std::vector<MemorySolver::Box> definedBoxes;
+    std::vector<MemorySolver::Box> undefinedBoxes;
     for (int i = 0; i < edge_clusters.size(); i++) {
-        MemorySolver::Box &box = boxes[i];
-        box = { std::numeric_limits<int>::max(), 0, 0, i };
+        MemorySolver::Box box = { std::numeric_limits<int>::max(), 0, 0, i };
+        int64_t boxSize = 0;
         for (auto &edge : edge_clusters[i]) {
             int e_start = edge->getParent()->execIndex;
             int e_finish = edge->getChild()->execIndex;
 
-            if (!edge->hasDefinedMaxSize()) {
-                IE_THROW() << "Can not allocate memory since the size is undefined.";
+            if (boxSize != -1 && edge->getDesc().hasDefinedMaxSize()) {
+                int64_t e_size = edge->getDesc().getMaxMemSize(); // size in bytes (from the beginning of data to the last element)
+                boxSize = std::max(e_size, boxSize);
+            } else {
+                boxSize = -1;
             }
 
-            int64_t e_size = edge->getDesc().getMaxMemSize(); // size in bytes (from the beginning of data to the last element)
             box.start = std::min(e_start, box.start);
             box.finish = std::max(e_finish, box.finish);
-            box.size = std::max(e_size, box.size);
         }
 
         // Constant data are filled once on load.
@@ -727,11 +717,17 @@ void Graph::AllocateWithReuse() {
             }
         }
 
-        box.size = div_up(box.size, alignment);
+        if (boxSize != -1) {
+            box.size = div_up(boxSize, alignment);
+            definedBoxes.push_back(box);
+        } else {
+            box.size = boxSize;
+            undefinedBoxes.push_back(box);
+        }
     }
 
-    MemorySolver memSolver(boxes);
-    size_t total_size = static_cast<size_t>(memSolver.solve()) * alignment;
+    MemorySolver staticMemSolver(definedBoxes);
+    size_t total_size = static_cast<size_t>(staticMemSolver.solve()) * alignment;
 
     memWorkspace = std::make_shared<Memory>(eng);
     memWorkspace->Create(DnnlBlockedMemoryDesc(InferenceEngine::Precision::I8, Shape(InferenceEngine::SizeVector{total_size})));
@@ -741,11 +737,11 @@ void Graph::AllocateWithReuse() {
 
     auto* workspace_ptr = static_cast<int8_t*>(memWorkspace->GetData());
 
-    for (int i = 0; i < edge_clusters.size(); i++) {
+    for (auto& box : definedBoxes) {
         int count = 0;
-        for (auto &edge : edge_clusters[i]) {
+        for (auto& edge : edge_clusters[box.id]) {
             if (edge->getStatus() == Edge::Status::NeedAllocation) {
-                int64_t offset = memSolver.getOffset(i);
+                int64_t offset = staticMemSolver.getOffset(box.id);
                 // !! Fallback to individual memory allocation !!
                 // if you like to check infer without reuse just call this function without arguments.
                 edge->allocate(workspace_ptr + offset * alignment); // alignment in byte
@@ -761,6 +757,40 @@ void Graph::AllocateWithReuse() {
             }
         }
         IE_ASSERT(count == 1);
     }
+
+    if (!undefinedBoxes.empty()) {
+        MemorySolver::normalizeBoxes(undefinedBoxes);
+
+        std::vector<std::vector<MemorySolver::Box>> groups; //groups of nonoverlapping boxes
+        groups.push_back({undefinedBoxes.front()});
+        for (size_t i = 1; i < undefinedBoxes.size(); ++i) {
+            const auto& box = undefinedBoxes[i];
+            bool groupFound = false;
+            for (auto& group : groups) {
+                const auto& lastBox = group.back();
+                if (lastBox.start > box.finish || lastBox.finish < box.start) {
+                    group.push_back(box);
+                    groupFound = true;
+                    break;
+                }
+            }
+
+            if (!groupFound) {
+                groups.push_back({box});
+            }
+        }
+        for (auto& group : groups) {
+            auto grpMemMngr =
+                std::make_shared<DnnlMemoryMngr>(std::unique_ptr<MemoryMngrWithReuse>(new MemoryMngrWithReuse()));
+            for (auto& box : group) {
+                for (auto& edge : edge_clusters[box.id]) {
+                    if (edge->getStatus() == Edge::Status::NeedAllocation) {
+                        edge->allocate(grpMemMngr);
+                    }
+                }
+            }
+        }
+    }
 }
 
 void Graph::Allocate() {
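The loop above greedily packs boxes whose lifetimes do not overlap into the same group, so each group can be served by one shared, dynamically resizable memory manager. A standalone demo of the same greedy interval grouping (assuming, as the solver guarantees after normalization, that the boxes are sorted by start):

    // Local demo of the grouping pass; simplified Box, not the plugin's type.
    #include <cstddef>
    #include <iostream>
    #include <vector>

    struct Box { int start, finish; int id; };

    int main() {
        std::vector<Box> boxes = {{0, 1, 0}, {0, 2, 1}, {2, 3, 2}, {3, 4, 3}};

        std::vector<std::vector<Box>> groups{{boxes.front()}};
        for (size_t i = 1; i < boxes.size(); ++i) {
            const auto& box = boxes[i];
            bool placed = false;
            for (auto& group : groups) {
                const auto& last = group.back();
                // Disjoint lifetimes -> the group's buffer can be reused in turn.
                if (last.start > box.finish || last.finish < box.start) {
                    group.push_back(box);
                    placed = true;
                    break;
                }
            }
            if (!placed)
                groups.push_back({box});
        }

        // Prints "group 0: 0 2" and "group 1: 1 3" for the sample boxes.
        for (size_t g = 0; g < groups.size(); ++g) {
            std::cout << "group " << g << ":";
            for (const auto& b : groups[g]) std::cout << ' ' << b.id;
            std::cout << '\n';
        }
    }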
@@ -774,9 +804,6 @@ void Graph::Allocate() {
     // Allocate memory space for all edges marked with NeedAllocation
     AllocateWithReuse();
 
-    // Create dummy memory with undefined desc for edges that are need allocation but has not been allocated withing mem solver
-    for (auto& edge : graphEdges) edge->allocate();
-
     // Resolve all other edges with status NotAllocated and in-place
     for (auto& node : graphNodes) node->resolveInPlaceEdges();
 
@@ -431,12 +431,8 @@ void LegacyInferRequest::SetBlob(const std::string& name, const InferenceEngine:
         IE_THROW(ParameterMismatch) << "Failed to set input blob. Blocking descriptor mismatch.";
     }
 
-    auto pBlob = MemoryDescUtils::interpretAsBlob(graph->getInputNodeByName(name)->getChildEdgesAtPort(0)[0]->getMemory());
-    if (!pBlob) {
-        IE_THROW() << "Blob returned after trying to interpret input node's memory is nullable. Input node name: " << name;
-    }
-
-    if (data->getTensorDesc() == pBlob->getTensorDesc() &&
+    auto pBlobDesc = MemoryDescUtils::interpretAsBlobDesc(graph->getInputNodeByName(name)->getChildEdgesAtPort(0)[0]->getMemory());
+    if (data->getTensorDesc() == pBlobDesc &&
         graph->_normalizePreprocMap.find(name) == graph->_normalizePreprocMap.end() && !graph->getProperty().batchLimit) {
         externalPtr[name] = data->buffer();
     } else if (externalPtr.find(name) != externalPtr.end()) {
@@ -469,11 +465,8 @@ void LegacyInferRequest::SetBlob(const std::string& name, const InferenceEngine:
         IE_THROW(ParameterMismatch) << "Failed to set output blob. Blocking descriptor mismatch.";
     }
 
-    auto pBlob = MemoryDescUtils::interpretAsBlob(graph->getOutputNodeByName(name)->getParentEdgesAtPort(0)[0]->getMemory());
-    if (!pBlob)
-        IE_THROW() << "Blob returned after trying to interpret output node's memory is nullable. Output node name: " << name;
-
-    if (data->getTensorDesc() == pBlob->getTensorDesc() &&
+    auto pBlobDesc = MemoryDescUtils::interpretAsBlobDesc(graph->getOutputNodeByName(name)->getParentEdgesAtPort(0)[0]->getMemory());
+    if (data->getTensorDesc() == pBlobDesc &&
         !graph->getProperty().batchLimit) {
         externalPtr[name] = data->buffer();
     } else if (externalPtr.find(name) != externalPtr.end()) {
@@ -502,12 +495,8 @@ InferenceEngine::Blob::Ptr LegacyInferRequest::GetBlob(const std::string& name)
     }
 
     if (_inputs.find(name) == _inputs.end()) {
-        auto pBlob = MemoryDescUtils::interpretAsBlob(graph->getInputNodeByName(name)->getChildEdgesAtPort(0)[0]->getMemory());
-        if (!pBlob) {
-            IE_THROW() << "Blob returned after trying to interpret input node's memory is nullable. Input node name: " << name;
-        }
-
-        InferenceEngine::TensorDesc desc = pBlob->getTensorDesc();
+        auto pBlobDesc = MemoryDescUtils::interpretAsBlobDesc(graph->getInputNodeByName(name)->getChildEdgesAtPort(0)[0]->getMemory());
+        InferenceEngine::TensorDesc desc = pBlobDesc;
 
         if (_networkInputs.find(name) != _networkInputs.end()) {
             InferenceEngine::Layout l = _networkInputs[name]->getLayout();
@@ -519,7 +508,7 @@ InferenceEngine::Blob::Ptr LegacyInferRequest::GetBlob(const std::string& name)
 
         _inputs[name] = make_blob_with_precision(desc);
         _inputs[name]->allocate();
-        if (pBlob->getTensorDesc() == desc &&
+        if (pBlobDesc == desc &&
             graph->_normalizePreprocMap.find(name) == graph->_normalizePreprocMap.end() && !graph->getProperty().batchLimit) {
             externalPtr[name] = _inputs[name]->buffer();
         }
@@ -547,11 +536,7 @@ InferenceEngine::Blob::Ptr LegacyInferRequest::GetBlob(const std::string& name)
 
     if (graph->hasOutputWithName(name)) {
         if (_outputs.find(name) == _outputs.end()) {
-            auto pBlob = MemoryDescUtils::interpretAsBlob(graph->getOutputNodeByName(name)->getParentEdgesAtPort(0)[0]->getMemory());
-            if (!pBlob) {
-                IE_THROW() << "Blob returned after trying to interpret output node's memory is nullable. Output node name: " << name;
-            }
-
+            auto pBlobDesc = MemoryDescUtils::interpretAsBlobDesc(graph->getOutputNodeByName(name)->getParentEdgesAtPort(0)[0]->getMemory());
             if (!data) {
                 InferenceEngine::TensorDesc desc = _networkOutputs[name]->getTensorDesc();
                 desc.setPrecision(normalizeToSupportedPrecision(desc.getPrecision()));
@@ -566,7 +551,7 @@ InferenceEngine::Blob::Ptr LegacyInferRequest::GetBlob(const std::string& name)
                 data = make_blob_with_precision(desc);
                 data->allocate();
             } else {
-                const auto& expectedTensorDesc = pBlob->getTensorDesc();
+                const auto& expectedTensorDesc = pBlobDesc;
 
                 if (expectedTensorDesc.getPrecision() != data->getTensorDesc().getPrecision()) {
                     IE_THROW(ParameterMismatch) << "Network input and output use the same name: " << name << " but expect blobs with different precision: "
@@ -586,7 +571,7 @@ InferenceEngine::Blob::Ptr LegacyInferRequest::GetBlob(const std::string& name)
             }
 
             _outputs[name] = data;
-            if (!externalPtr.count(name) && data->getTensorDesc() == pBlob->getTensorDesc() && !graph->getProperty().batchLimit) {
+            if (!externalPtr.count(name) && data->getTensorDesc() == pBlobDesc && !graph->getProperty().batchLimit) {
                 externalPtr[name] = data->buffer();
             }
         }
 
@@ -98,6 +98,13 @@ InferenceEngine::Blob::Ptr MemoryDescUtils::interpretAsBlob(const Memory &mem) {
     return make_blob_with_precision(desc, mem.GetData());
 }
 
+InferenceEngine::TensorDesc MemoryDescUtils::interpretAsBlobDesc(const Memory &mem) {
+    auto& memDesc = mem.getDesc();
+    InferenceEngine::TensorDesc desc = convertToTensorDesc(memDesc);
+
+    return InferenceEngine::TensorDesc(desc.getPrecision(), memDesc.getShape().getStaticDims(), desc.getBlockingDesc());
+}
+
 InferenceEngine::TensorDesc MemoryDescUtils::convertToTensorDesc(const MemoryDesc& desc) {
     if (auto blockingDesc = dynamic_cast<const BlockedMemoryDesc*>(&desc)) {
         InferenceEngine::BlockingDesc blkDesc = desc.getShape().hasZeroDims() ? InferenceEngine::BlockingDesc(blockingDesc->getBlockDims(),
@@ -66,6 +66,13 @@ public:
      */
     static InferenceEngine::Blob::Ptr interpretAsBlob(const Memory& mem);
 
+    /**
+     * @brief Creates InferenceEngine::TensorDesc from Memory with the memory reuse
+     * @param mem Memory from which the InferenceEngine::TensorDesc will be created
+     * @return InferenceEngine::TensorDesc
+     */
+    static InferenceEngine::TensorDesc interpretAsBlobDesc(const Memory& mem);
+
     /**
      * @brief Converts MemoryDesc to InferenceEngine::TensorDesc
      * @param desc MemoryDesc to be converted
@@ -266,11 +266,7 @@ void Split::prepareParams() {
             continue;
         }
 
-        if (uint8_t* dstData = reinterpret_cast<uint8_t*>(outMemPtr->GetPtr())) {
-            dstMemPtrs.emplace_back(port, dstData);
-        } else {
-            THROW_ERROR << "can't get child edge indx " << port << "data.";
-        }
+        dstMemPtrs.emplace_back(port, outMemPtr);
 
         if (!canUseOptimizedNspc2Ncsp) {
             outDescs.push_back(outMemPtr->GetDescWithType<BlockedMemoryDesc>());
@@ -306,7 +302,7 @@ void Split::execute(dnnl::stream strm) {
 
     uint8_t* srcData = reinterpret_cast<uint8_t*>(srcMem.GetPtr());
     IE_ASSERT(execPtr != nullptr);
-    execPtr->exec(srcData, dstMemPtrs, batch, MB);
+    execPtr->exec(srcData, getRawDstMemPtrs(), batch, MB);
 }
 
 bool Split::created() const {
@@ -506,7 +502,7 @@ void Split::optimizedNspc2Ncsp(size_t MB) {
     const size_t strideOC = DHW * dataSize;
 
     for (size_t i = 0, sIdx = 0; i < dstMemPtrs.size(); i++) {
-        auto dstData = dstMemPtrs[i].second;
+        auto dstData = reinterpret_cast<uint8_t*>(dstMemPtrs[i].second->GetPtr());
 
         size_t innerSize = 1;
         auto dims = getChildEdgesAtPort(dstMemPtrs[i].first)[0]->getMemory().getStaticDims();
@@ -533,6 +529,17 @@ void Split::optimizedNspc2Ncsp(size_t MB) {
     }
 }
 
+std::vector<uint8_t*> Split::getRawDstMemPtrs() const {
+    std::vector<uint8_t*> result(dstMemPtrs.size());
+    for (size_t i = 0; i < dstMemPtrs.size(); ++i) {
+        result[i] = reinterpret_cast<uint8_t*>(dstMemPtrs[i].second->GetPtr());
+        if (!result[i]) {
+            THROW_ERROR << "can't get child edge indx " << dstMemPtrs[i].first << " data.";
+        }
+    }
+    return result;
+}
+
 Split::SplitOptimizedExecutor::SplitOptimizedExecutor(BlockedMemoryDescCPtr inDesc, const std::vector<BlockedMemoryDescCPtr> &outDescs,
                                                       const size_t axis) {
     // find axis order position
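Storing MemoryCPtr handles and resolving raw pointers via getRawDstMemPtrs() right before execution matters because a dynamically shaped output may be reallocated between inferences, so a pointer cached once in prepareParams() could go stale. A minimal sketch of the hazard with a hypothetical growable buffer:

    #include <cstddef>
    #include <cstdint>
    #include <iostream>
    #include <memory>
    #include <vector>

    // Hypothetical stand-in: a buffer that may be reallocated between inferences.
    struct GrowableMemory {
        std::vector<uint8_t> storage;
        uint8_t* ptr() { return storage.data(); }
        void resize(size_t n) { storage.resize(n); }   // may move the data
    };

    int main() {
        auto mem = std::make_shared<GrowableMemory>();
        mem->resize(16);

        uint8_t* cached = mem->ptr();   // caching once, as the old code effectively did

        mem->resize(1 << 20);           // dynamic shape grew -> likely reallocation

        // Resolving at execution time is always valid; the cached copy may dangle.
        std::cout << "cached == current: "
                  << (cached == mem->ptr() ? "yes" : "no (stale)") << '\n';
    }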
@@ -576,14 +583,14 @@ Split::SplitOptimizedExecutor::SplitOptimizedExecutor(BlockedMemoryDescCPtr inDe
         }
     }
 
-void Split::SplitOptimizedExecutor::exec(const uint8_t* srcData, const std::vector<std::pair<size_t, uint8_t*>> &dstMemPtrs,
+void Split::SplitOptimizedExecutor::exec(const uint8_t* srcData, const std::vector<uint8_t*>& dstRawMemPtrs,
                                          const Dim origBatch, const Dim perInferBatch) {
     size_t execCountStrides = countStrides;
     if (origBatch != perInferBatch)
         execCountStrides = execCountStrides / origBatch * perInferBatch;
 
-    parallel_for2d(dstMemPtrs.size(), execCountStrides, [&](size_t i, size_t j) {
-        uint8_t* dstData = dstMemPtrs[i].second;
+    parallel_for2d(dstRawMemPtrs.size(), execCountStrides, [&](size_t i, size_t j) {
+        uint8_t* dstData = dstRawMemPtrs[i];
 
         cpu_memcpy(&dstData[j * dataSize[i]],
                    &srcData[srcDataOffsets[i] + j * srcDataStride],
@@ -36,7 +36,7 @@ public:
 
 private:
     struct SplitExecutor {
-        virtual void exec(const uint8_t* srcData, const std::vector<std::pair<size_t, uint8_t*>> &dstMemPtrs,
+        virtual void exec(const uint8_t* srcData, const std::vector<uint8_t*>& dstRawMemPtrs,
                           const Dim origBatch, const Dim perInferBatch) = 0;
         virtual ~SplitExecutor() = default;
     };
@@ -45,7 +45,7 @@ private:
     struct SplitOptimizedExecutor : public SplitExecutor {
     public:
         SplitOptimizedExecutor(BlockedMemoryDescCPtr inDesc, const std::vector<BlockedMemoryDescCPtr> &outDescs, const size_t axis);
-        void exec(const uint8_t* srcData, const std::vector<std::pair<size_t, uint8_t*>> &dstMemPtrs,
+        void exec(const uint8_t* srcData, const std::vector<uint8_t*>& dstRawMemPtrs,
                   const Dim origBatch, const Dim perInferBatch) override;
 
     private:
@@ -56,11 +56,12 @@ private:
     };
 
     void optimizedNspc2Ncsp(size_t MB);
+    std::vector<uint8_t*> getRawDstMemPtrs() const;
 
     bool canUseOptimizedNspc2Ncsp = false;
 
     size_t axis = 1;
-    std::vector<std::pair<size_t, uint8_t*>> dstMemPtrs;
+    std::vector<std::pair<size_t, MemoryCPtr>> dstMemPtrs;
 
     size_t INPUTS_NUM = 2;
 };