Dynamic shape mem reuse solution (#11667)

* Dynamic shape memory reuse solution

* Fix Split node to properly work with dyn mem

* Fix race condition for Memory mgrHandle

* Avoid Memory race condition between GetData and SetDataHandle

Adding a lock for the race condition between ov::intel_cpu::Memory::GetData() and ov::intel_cpu::Memory::SetDataHandle() is not a good solution,
as it would impact inference performance. We found that it is unnecessary to get the edge DataPtr in InferRequest::SetBlob or GetBlob, which
only need the tensorDesc; so we can fetch only the tensorDesc instead of the dataPtr to avoid this race condition.

* Resolve reviewer's comments

* Avoid performance impact due to frequent reset of MemMngrHandle

If the MemMngrHandle has already been assigned an external buffer, it can be reused.
Otherwise a new one needs to be created.
This commit is contained in:
River Li
2022-06-01 18:49:47 +08:00
committed by GitHub
parent 8b1ed3d5b2
commit 042bd7274a
11 changed files with 154 additions and 106 deletions

View File

@@ -63,33 +63,31 @@ public:
int64_t id;
};
explicit MemorySolver(const std::vector<Box>& boxes) : _boxes(boxes) {
/** @brief Performs in-place normalization of the input boxes
@return lifespan of all boxes
*/
static int normalizeBoxes(std::vector<Box>& boxes) {
int max_ts = 0;
// TODO: add validation of data correctness:
// 1. Box.start >= 0 and Box.finish >= -1
// 2. Box.finish >= Box.start (except Box.finish == -1)
// 3. Box.size > 0 (or == 0 ?)
// 4. Box.id == any unique value
for (const Box& box : _boxes)
for (const Box& box : boxes)
max_ts = std::max(std::max(max_ts, box.start), box.finish);
for (Box& box : _boxes)
for (Box& box : boxes)
if (box.finish == -1)
box.finish = max_ts;
// sort by start and finish ts
std::sort(_boxes.begin(), _boxes.end(), [](const Box& l, const Box& r) -> bool {
std::sort(boxes.begin(), boxes.end(), [](const Box& l, const Box& r) -> bool {
return l.start < r.start || (l.start == r.start && l.finish < r.finish);
});
// remove unused timestamps (not a begin of some box)
// each ts should start a box
std::vector<bool> ts_exist(max_ts + 1);
for (const Box& b : _boxes)
for (const Box& b : boxes)
ts_exist[b.start] = true;
int rm_ts_s = 0, rm_ts_f = 0;
int ts_s = 0, ts_f = 0;
for (Box& b : _boxes) {
for (Box& b : boxes) {
while (ts_s < b.start)
if (!ts_exist[ts_s++])
rm_ts_s++;
@@ -105,7 +103,16 @@ public:
b.start -= rm_ts_s;
b.finish -= rm_ts_f;
}
_time_duration = ts_f - rm_ts_f;
return ts_f - rm_ts_f;
}
/** @brief Constructs the solver: copies the boxes and computes the time duration
 *         by normalizing the copy in place via normalizeBoxes(). */
explicit MemorySolver(const std::vector<Box>& boxes) : _boxes(boxes) {
// TODO: add validation of data correctness:
// 1. Box.start >= 0 and Box.finish >= -1
// 2. Box.finish >= Box.start (except Box.finish == -1)
// 3. Box.size > 0 (or == 0 ?)
// 4. Box.id == any unique value
_time_duration = normalizeBoxes(_boxes);
}
inline bool popupTogetherWith(MemorySolver::Box& box_new, const MemorySolver::Box& box_old) {

View File

@@ -136,6 +136,12 @@ DnnlMemoryDescPtr Memory::GetDescWithType<DnnlMemoryDesc, 0, 0>() const {
}
void Memory::setDataHandle(void *data) {
if (!mgrHandle->hasExtBuffer()) {
mgrHandle = DnnlMemMngrHandle(
std::make_shared<DnnlMemoryMngr>(std::unique_ptr<MemoryMngrWithReuse>(new MemoryMngrWithReuse())),
this);
}
size_t maxMemSize = pMemDesc->hasDefinedMaxSize() ? pMemDesc->getMaxMemSize() : 0;
mgrHandle->setExtBuff(data, maxMemSize);
prim->set_data_handle(mgrHandle->getRawPtr()); // for pads zeroing, to preserve dnnl::memory::set_data_handle behaviour

View File

@@ -176,6 +176,9 @@ public:
return prim != nullptr;
}
/**
* @brief Resets the memory manager to a new one created with the provided raw memory
*/
void setDataHandle(void* data);
const MemoryDesc& getDesc() const {

View File

@@ -258,7 +258,7 @@ int Edge::getOutputNum() const {
return child_port;
}
void Edge::allocate(const void* mem_ptr) {
void Edge::allocateCommon(const std::function<void(const MemoryPtr&, const MemoryDesc&)>& allocate) {
if (status != Status::NeedAllocation)
return;
@@ -272,11 +272,30 @@ void Edge::allocate(const void* mem_ptr) {
auto parentPtr = getParent();
memoryPtr.reset(new Memory(parentPtr->getEngine()));
memoryPtr->Create(inputDesc, mem_ptr, false); // no pads zeroing
allocate(memoryPtr, inputDesc);
status = Status::Allocated;
}
void Edge::allocate(const void* mem_ptr) {
// Delegate to the common allocation path, creating the memory over the
// externally provided raw pointer.
allocateCommon([mem_ptr](const MemoryPtr& memoryPtr, const MemoryDesc& inputDesc) {
memoryPtr->Create(inputDesc, mem_ptr, false); // no pads zeroing
});
}
void Edge::allocate(DnnlMemoryMngrPtr memMngr) {
if (!memMngr) {
IE_THROW(Unexpected) << "Memory manager ptr is NULL";
}
// Delegate to the common allocation path, binding the shared memory manager
// into the creation routine so the edge memory reuses it.
allocateCommon([memMngr](const MemoryPtr& memoryPtr, const MemoryDesc& inputDesc) {
memoryPtr->Create(inputDesc, memMngr);
});
}
std::string Edge::name() const {
auto parentPtr = getParent();
auto childPtr = getChild();
@@ -289,34 +308,10 @@ std::string Edge::name() const {
}
void Edge::externalAllocate(WeightsSharing::Ptr weightsCache) {
auto isInPlace = [](const NodePtr node, int port) -> bool {
const auto& selected_pd = node->getSelectedPrimitiveDescriptor();
if (selected_pd == nullptr)
IE_THROW() << "Preferable primitive descriptor is not set.";
const auto& config = selected_pd->getConfig();
for (const auto& in : config.inConfs) {
if (in.inPlace() == port) {
return true;
}
}
for (const auto& out : config.outConfs) {
if (out.inPlace() == port) {
return true;
}
}
return false;
};
if (status != Status::NeedAllocation)
return;
bool isTheOnlyChildEdgeAtPort = getParent()->getChildEdgesAtPort(getInputNum()).size() == 1;
bool isConcurrentUpdatePossible = isInPlace(getParent(), getInputNum()) || isInPlace(getChild(), getOutputNum()) || !isTheOnlyChildEdgeAtPort;
if (weightsCache && !isConcurrentUpdatePossible) {
if (weightsCache) {
auto alloc = [this] () {
allocate();
return memoryPtr;

View File

@@ -51,6 +51,7 @@ public:
void init();
void allocate(const void* mem_ptr = nullptr);
void allocate(DnnlMemoryMngrPtr memMngr);
void externalAllocate(WeightsSharing::Ptr weightsCache);
void reuse(MemoryPtr ptr);
void validate();
@@ -104,6 +105,8 @@ private:
EdgePtr getBaseEdge(int look = LOOK_BOTH);
bool inPlace(LOOK look = LOOK_BOTH);
void allocateCommon(const std::function<void(const MemoryPtr&, const MemoryDesc&)>& allocate);
friend class Graph;
};

View File

@@ -606,25 +606,17 @@ static edge_clusters_t findEdgeClusters(const std::vector<EdgePtr> & graphEdges)
edge_cluster_idx_map_t edge_cluster_indices;
for (auto &edge : graphEdges) {
if (!edge->hasDefinedMaxSize())
continue;
auto edge_it = edge_cluster_indices.find(edge);
if (edge_it != edge_cluster_indices.end())
continue; // edge is visited
size_t cluster_idx = edge_clusters.size();
EdgePtr last_shared_edge = nullptr;
//has_defined_max_path means all the edges on path from current to the actual shared edge
//have defined max memory size so they can be added to the clusters and resolved by mem solver
bool has_defined_max_path = true;
// find cluster index
for (auto shared_edge = edge->getSharedEdge(std::nothrow);
shared_edge;
shared_edge = shared_edge->getSharedEdge(std::nothrow)) {
has_defined_max_path = has_defined_max_path && shared_edge->hasDefinedMaxSize();
auto shared_edge_it = edge_cluster_indices.find(shared_edge);
if (shared_edge_it != edge_cluster_indices.end()) {
cluster_idx = shared_edge_it->second;
@@ -633,10 +625,6 @@ static edge_clusters_t findEdgeClusters(const std::vector<EdgePtr> & graphEdges)
}
}
if (!has_defined_max_path) {
continue;
}
// add shared edges to cluster
edge_cluster_indices.emplace(edge, cluster_idx);
@@ -689,22 +677,24 @@ void Graph::AllocateWithReuse() {
const int64_t alignment = 32; // 32 bytes
std::vector<MemorySolver::Box> boxes(edge_clusters.size());
std::vector<MemorySolver::Box> definedBoxes;
std::vector<MemorySolver::Box> undefinedBoxes;
for (int i = 0; i < edge_clusters.size(); i++) {
MemorySolver::Box &box = boxes[i];
box = { std::numeric_limits<int>::max(), 0, 0, i };
MemorySolver::Box box = { std::numeric_limits<int>::max(), 0, 0, i };
int64_t boxSize = 0;
for (auto &edge : edge_clusters[i]) {
int e_start = edge->getParent()->execIndex;
int e_finish = edge->getChild()->execIndex;
if (!edge->hasDefinedMaxSize()) {
IE_THROW() << "Can not allocate memory since the size is undefined.";
if (boxSize != -1 && edge->getDesc().hasDefinedMaxSize()) {
int64_t e_size = edge->getDesc().getMaxMemSize(); // size in bytes (from the beginning of data to the last element)
boxSize = std::max(e_size, boxSize);
} else {
boxSize = -1;
}
int64_t e_size = edge->getDesc().getMaxMemSize(); // size in bytes (from the beginning of data to the last element)
box.start = std::min(e_start, box.start);
box.finish = std::max(e_finish, box.finish);
box.size = std::max(e_size, box.size);
}
// Constant data are filled once on load.
@@ -727,11 +717,17 @@ void Graph::AllocateWithReuse() {
}
}
box.size = div_up(box.size, alignment);
if (boxSize != -1) {
box.size = div_up(boxSize, alignment);
definedBoxes.push_back(box);
} else {
box.size = boxSize;
undefinedBoxes.push_back(box);
}
}
MemorySolver memSolver(boxes);
size_t total_size = static_cast<size_t>(memSolver.solve()) * alignment;
MemorySolver staticMemSolver(definedBoxes);
size_t total_size = static_cast<size_t>(staticMemSolver.solve()) * alignment;
memWorkspace = std::make_shared<Memory>(eng);
memWorkspace->Create(DnnlBlockedMemoryDesc(InferenceEngine::Precision::I8, Shape(InferenceEngine::SizeVector{total_size})));
@@ -741,11 +737,11 @@ void Graph::AllocateWithReuse() {
auto* workspace_ptr = static_cast<int8_t*>(memWorkspace->GetData());
for (int i = 0; i < edge_clusters.size(); i++) {
for (auto& box : definedBoxes) {
int count = 0;
for (auto &edge : edge_clusters[i]) {
for (auto& edge : edge_clusters[box.id]) {
if (edge->getStatus() == Edge::Status::NeedAllocation) {
int64_t offset = memSolver.getOffset(i);
int64_t offset = staticMemSolver.getOffset(box.id);
// !! Fallback to individual memory allocation !!
// if you like to check infer without reuse just call this function without arguments.
edge->allocate(workspace_ptr + offset * alignment); // alignment in byte
@@ -761,6 +757,40 @@ void Graph::AllocateWithReuse() {
}
IE_ASSERT(count == 1);
}
if (!undefinedBoxes.empty()) {
MemorySolver::normalizeBoxes(undefinedBoxes);
std::vector<std::vector<MemorySolver::Box>> groups; //groups of nonoverlapping boxes
groups.push_back({undefinedBoxes.front()});
for (size_t i = 1; i < undefinedBoxes.size(); ++i) {
const auto& box = undefinedBoxes[i];
bool groupFound = false;
for (auto& group : groups) {
const auto& lastBox = group.back();
if (lastBox.start > box.finish || lastBox.finish < box.start) {
group.push_back(box);
groupFound = true;
break;
}
}
if (!groupFound) {
groups.push_back({box});
}
}
for (auto& group : groups) {
auto grpMemMngr =
std::make_shared<DnnlMemoryMngr>(std::unique_ptr<MemoryMngrWithReuse>(new MemoryMngrWithReuse()));
for (auto& box : group) {
for (auto& edge : edge_clusters[box.id]) {
if (edge->getStatus() == Edge::Status::NeedAllocation) {
edge->allocate(grpMemMngr);
}
}
}
}
}
}
void Graph::Allocate() {
@@ -774,9 +804,6 @@ void Graph::Allocate() {
// Allocate memory space for all edges marked with NeedAllocation
AllocateWithReuse();
// Create dummy memory with undefined desc for edges that need allocation but have not been allocated within the mem solver
for (auto& edge : graphEdges) edge->allocate();
// Resolve all other edges with status NotAllocated and in-place
for (auto& node : graphNodes) node->resolveInPlaceEdges();

View File

@@ -431,12 +431,8 @@ void LegacyInferRequest::SetBlob(const std::string& name, const InferenceEngine:
IE_THROW(ParameterMismatch) << "Failed to set input blob. Blocking descriptor mismatch.";
}
auto pBlob = MemoryDescUtils::interpretAsBlob(graph->getInputNodeByName(name)->getChildEdgesAtPort(0)[0]->getMemory());
if (!pBlob) {
IE_THROW() << "Blob returned after trying to interpret input node's memory is nullable. Input node name: " << name;
}
if (data->getTensorDesc() == pBlob->getTensorDesc() &&
auto pBlobDesc = MemoryDescUtils::interpretAsBlobDesc(graph->getInputNodeByName(name)->getChildEdgesAtPort(0)[0]->getMemory());
if (data->getTensorDesc() == pBlobDesc &&
graph->_normalizePreprocMap.find(name) == graph->_normalizePreprocMap.end() && !graph->getProperty().batchLimit) {
externalPtr[name] = data->buffer();
} else if (externalPtr.find(name) != externalPtr.end()) {
@@ -469,11 +465,8 @@ void LegacyInferRequest::SetBlob(const std::string& name, const InferenceEngine:
IE_THROW(ParameterMismatch) << "Failed to set output blob. Blocking descriptor mismatch.";
}
auto pBlob = MemoryDescUtils::interpretAsBlob(graph->getOutputNodeByName(name)->getParentEdgesAtPort(0)[0]->getMemory());
if (!pBlob)
IE_THROW() << "Blob returned after trying to interpret output node's memory is nullable. Output node name: " << name;
if (data->getTensorDesc() == pBlob->getTensorDesc() &&
auto pBlobDesc = MemoryDescUtils::interpretAsBlobDesc(graph->getOutputNodeByName(name)->getParentEdgesAtPort(0)[0]->getMemory());
if (data->getTensorDesc() == pBlobDesc &&
!graph->getProperty().batchLimit) {
externalPtr[name] = data->buffer();
} else if (externalPtr.find(name) != externalPtr.end()) {
@@ -502,12 +495,8 @@ InferenceEngine::Blob::Ptr LegacyInferRequest::GetBlob(const std::string& name)
}
if (_inputs.find(name) == _inputs.end()) {
auto pBlob = MemoryDescUtils::interpretAsBlob(graph->getInputNodeByName(name)->getChildEdgesAtPort(0)[0]->getMemory());
if (!pBlob) {
IE_THROW() << "Blob returned after trying to interpret input node's memory is nullable. Input node name: " << name;
}
InferenceEngine::TensorDesc desc = pBlob->getTensorDesc();
auto pBlobDesc = MemoryDescUtils::interpretAsBlobDesc(graph->getInputNodeByName(name)->getChildEdgesAtPort(0)[0]->getMemory());
InferenceEngine::TensorDesc desc = pBlobDesc;
if (_networkInputs.find(name) != _networkInputs.end()) {
InferenceEngine::Layout l = _networkInputs[name]->getLayout();
@@ -519,7 +508,7 @@ InferenceEngine::Blob::Ptr LegacyInferRequest::GetBlob(const std::string& name)
_inputs[name] = make_blob_with_precision(desc);
_inputs[name]->allocate();
if (pBlob->getTensorDesc() == desc &&
if (pBlobDesc == desc &&
graph->_normalizePreprocMap.find(name) == graph->_normalizePreprocMap.end() && !graph->getProperty().batchLimit) {
externalPtr[name] = _inputs[name]->buffer();
}
@@ -547,11 +536,7 @@ InferenceEngine::Blob::Ptr LegacyInferRequest::GetBlob(const std::string& name)
if (graph->hasOutputWithName(name)) {
if (_outputs.find(name) == _outputs.end()) {
auto pBlob = MemoryDescUtils::interpretAsBlob(graph->getOutputNodeByName(name)->getParentEdgesAtPort(0)[0]->getMemory());
if (!pBlob) {
IE_THROW() << "Blob returned after trying to interpret output node's memory is nullable. Output node name: " << name;
}
auto pBlobDesc = MemoryDescUtils::interpretAsBlobDesc(graph->getOutputNodeByName(name)->getParentEdgesAtPort(0)[0]->getMemory());
if (!data) {
InferenceEngine::TensorDesc desc = _networkOutputs[name]->getTensorDesc();
desc.setPrecision(normalizeToSupportedPrecision(desc.getPrecision()));
@@ -566,7 +551,7 @@ InferenceEngine::Blob::Ptr LegacyInferRequest::GetBlob(const std::string& name)
data = make_blob_with_precision(desc);
data->allocate();
} else {
const auto& expectedTensorDesc = pBlob->getTensorDesc();
const auto& expectedTensorDesc = pBlobDesc;
if (expectedTensorDesc.getPrecision() != data->getTensorDesc().getPrecision()) {
IE_THROW(ParameterMismatch) << "Network input and output use the same name: " << name << " but expect blobs with different precision: "
@@ -586,7 +571,7 @@ InferenceEngine::Blob::Ptr LegacyInferRequest::GetBlob(const std::string& name)
}
_outputs[name] = data;
if (!externalPtr.count(name) && data->getTensorDesc() == pBlob->getTensorDesc() && !graph->getProperty().batchLimit) {
if (!externalPtr.count(name) && data->getTensorDesc() == pBlobDesc && !graph->getProperty().batchLimit) {
externalPtr[name] = data->buffer();
}
}

View File

@@ -98,6 +98,13 @@ InferenceEngine::Blob::Ptr MemoryDescUtils::interpretAsBlob(const Memory &mem) {
return make_blob_with_precision(desc, mem.GetData());
}
InferenceEngine::TensorDesc MemoryDescUtils::interpretAsBlobDesc(const Memory &mem) {
// Convert the internal memory descriptor and rebuild the TensorDesc with the
// memory's static dims so callers get a fully-resolved descriptor.
const auto& internalDesc = mem.getDesc();
const auto converted = convertToTensorDesc(internalDesc);
return InferenceEngine::TensorDesc(converted.getPrecision(),
internalDesc.getShape().getStaticDims(),
converted.getBlockingDesc());
}
InferenceEngine::TensorDesc MemoryDescUtils::convertToTensorDesc(const MemoryDesc& desc) {
if (auto blockingDesc = dynamic_cast<const BlockedMemoryDesc*>(&desc)) {
InferenceEngine::BlockingDesc blkDesc = desc.getShape().hasZeroDims() ? InferenceEngine::BlockingDesc(blockingDesc->getBlockDims(),

View File

@@ -66,6 +66,13 @@ public:
*/
static InferenceEngine::Blob::Ptr interpretAsBlob(const Memory& mem);
/**
* @brief Creates InferenceEngine::TensorDesc from Memory
* @param mem Memory from which the InferenceEngine::TensorDesc will be created
* @return InferenceEngine::TensorDesc
*/
static InferenceEngine::TensorDesc interpretAsBlobDesc(const Memory& mem);
/**
* @brief Converts MemoryDesc to InferenceEngine::TensorDesc
* @param desc MemoryDesc to be converted

View File

@@ -266,11 +266,7 @@ void Split::prepareParams() {
continue;
}
if (uint8_t* dstData = reinterpret_cast<uint8_t*>(outMemPtr->GetPtr())) {
dstMemPtrs.emplace_back(port, dstData);
} else {
THROW_ERROR << "can't get child edge indx " << port << "data.";
}
dstMemPtrs.emplace_back(port, outMemPtr);
if (!canUseOptimizedNspc2Ncsp) {
outDescs.push_back(outMemPtr->GetDescWithType<BlockedMemoryDesc>());
@@ -306,7 +302,7 @@ void Split::execute(dnnl::stream strm) {
uint8_t* srcData = reinterpret_cast<uint8_t*>(srcMem.GetPtr());
IE_ASSERT(execPtr != nullptr);
execPtr->exec(srcData, dstMemPtrs, batch, MB);
execPtr->exec(srcData, getRawDstMemPtrs(), batch, MB);
}
bool Split::created() const {
@@ -506,7 +502,7 @@ void Split::optimizedNspc2Ncsp(size_t MB) {
const size_t strideOC = DHW * dataSize;
for (size_t i = 0, sIdx = 0; i < dstMemPtrs.size(); i++) {
auto dstData = dstMemPtrs[i].second;
auto dstData = reinterpret_cast<uint8_t*>(dstMemPtrs[i].second->GetPtr());
size_t innerSize = 1;
auto dims = getChildEdgesAtPort(dstMemPtrs[i].first)[0]->getMemory().getStaticDims();
@@ -533,6 +529,17 @@ void Split::optimizedNspc2Ncsp(size_t MB) {
}
}
std::vector<uint8_t*> Split::getRawDstMemPtrs() const {
// Collect the raw data pointers of all destination memories, failing fast
// on any destination whose data pointer is not available.
std::vector<uint8_t*> rawPtrs;
rawPtrs.reserve(dstMemPtrs.size());
for (const auto& portAndMem : dstMemPtrs) {
auto* dataPtr = reinterpret_cast<uint8_t*>(portAndMem.second->GetPtr());
if (!dataPtr) {
THROW_ERROR << "can't get child edge indx " << portAndMem.first << " data.";
}
rawPtrs.push_back(dataPtr);
}
return rawPtrs;
}
Split::SplitOptimizedExecutor::SplitOptimizedExecutor(BlockedMemoryDescCPtr inDesc, const std::vector<BlockedMemoryDescCPtr> &outDescs,
const size_t axis) {
// find axis order position
@@ -576,14 +583,14 @@ Split::SplitOptimizedExecutor::SplitOptimizedExecutor(BlockedMemoryDescCPtr inDe
}
}
void Split::SplitOptimizedExecutor::exec(const uint8_t* srcData, const std::vector<std::pair<size_t, uint8_t*>> &dstMemPtrs,
void Split::SplitOptimizedExecutor::exec(const uint8_t* srcData, const std::vector<uint8_t*>& dstRawMemPtrs,
const Dim origBatch, const Dim perInferBatch) {
size_t execCountStrides = countStrides;
if (origBatch != perInferBatch)
execCountStrides = execCountStrides / origBatch * perInferBatch;
parallel_for2d(dstMemPtrs.size(), execCountStrides, [&](size_t i, size_t j) {
uint8_t* dstData = dstMemPtrs[i].second;
parallel_for2d(dstRawMemPtrs.size(), execCountStrides, [&](size_t i, size_t j) {
uint8_t* dstData = dstRawMemPtrs[i];
cpu_memcpy(&dstData[j * dataSize[i]],
&srcData[srcDataOffsets[i] + j * srcDataStride],

View File

@@ -36,7 +36,7 @@ public:
private:
struct SplitExecutor {
virtual void exec(const uint8_t* srcData, const std::vector<std::pair<size_t, uint8_t*>> &dstMemPtrs,
virtual void exec(const uint8_t* srcData, const std::vector<uint8_t*>& dstRawMemPtrs,
const Dim origBatch, const Dim perInferBatch) = 0;
virtual ~SplitExecutor() = default;
};
@@ -45,7 +45,7 @@ private:
struct SplitOptimizedExecutor : public SplitExecutor {
public:
SplitOptimizedExecutor(BlockedMemoryDescCPtr inDesc, const std::vector<BlockedMemoryDescCPtr> &outDescs, const size_t axis);
void exec(const uint8_t* srcData, const std::vector<std::pair<size_t, uint8_t*>> &dstMemPtrs,
void exec(const uint8_t* srcData, const std::vector<uint8_t*>& dstRawMemPtrs,
const Dim origBatch, const Dim perInferBatch) override;
private:
@@ -56,11 +56,12 @@ private:
};
void optimizedNspc2Ncsp(size_t MB);
std::vector<uint8_t*> getRawDstMemPtrs() const;
bool canUseOptimizedNspc2Ncsp = false;
size_t axis = 1;
std::vector<std::pair<size_t, uint8_t*>> dstMemPtrs;
std::vector<std::pair<size_t, MemoryCPtr>> dstMemPtrs;
size_t INPUTS_NUM = 2;
};