[CPU] optimize TensorIterator DynamicBuffer (#15163)
* optimize TensorIterator DynamicBuffer by preallocating a large chunk of intermediate buffer; clean up the code
* review update: always copy in transfer, as avoiding the copy is not worthwhile
* review update: hold mem_holder_buffer as a dnnl::memory instead of a shared_ptr to it
* review update: reuse mem_holder_buffer even if the shape changes
* review update: growth factor
* review update: bug fix
* fix code style
* review update: rewrite the dynamic buffer using the CPU Memory class instead of dnnl::memory
* Update src/plugins/intel_cpu/src/nodes/tensoriterator.cpp (applied review suggestions)
* review update: minor fix

Co-authored-by: Maksim Kutakov <maxim.kutakov@gmail.com>
parent 8153664b52
commit 86b50044cd
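The heart of the change is the allocation policy: instead of reallocating and copying the whole intermediate buffer on every iteration, the buffer is preallocated for the estimated trip count and, when the trip count is unknown, grown by a factor of two as executions accumulate, which keeps the total bytes copied linear in the final output size. Below is a minimal, self-contained sketch of that policy; the names (`GrowingConcatBuffer`, `append`) are illustrative stand-ins, not the plugin's API, which works through the CPU `Memory` class as the diff shows.

```cpp
#include <cstddef>
#include <cstring>
#include <vector>

// Sketch of the DynamicBuffer growth policy: capacity is sized for an
// estimated iteration count and doubles once the estimate is exceeded.
class GrowingConcatBuffer {
public:
    GrowingConcatBuffer(size_t chunk_bytes, int max_iter_count = -1)
        : chunk_bytes_(chunk_bytes), max_iter_count_(max_iter_count) {}

    // Append one iteration's chunk, reallocating geometrically when full.
    void append(const void* chunk) {
        const size_t needed = static_cast<size_t>(num_execs_ + 1) * chunk_bytes_;
        if (needed > data_.size()) {
            // Known trip count -> allocate once; unknown -> growth factor 2.
            const int estimated_iters =
                (max_iter_count_ != -1) ? max_iter_count_
                                        : (num_execs_ == 0 ? 1 : 2 * num_execs_);
            size_t target = static_cast<size_t>(estimated_iters) * chunk_bytes_;
            if (target < needed) target = needed; // guard: never shrink below what is needed
            data_.resize(target);
        }
        std::memcpy(data_.data() + num_execs_ * chunk_bytes_, chunk, chunk_bytes_);
        ++num_execs_;
    }

    size_t valid_bytes() const { return num_execs_ * chunk_bytes_; }

private:
    std::vector<unsigned char> data_;
    size_t chunk_bytes_;
    int num_execs_ = 0;
    int max_iter_count_ = -1; // -1 means the trip count is not known up front
};
```

With an unknown trip count the capacities visited are 1, 2, 4, 8, ... chunks, so each byte is copied a constant number of times on average rather than once per iteration.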
src/plugins/intel_cpu/src/nodes/tensoriterator.cpp

@@ -231,88 +231,141 @@ DynamicBuffer::DynamicBuffer(const MemoryPtr &from_, const std::vector<MemoryPtr> &to_, const PortMap &map_rule_)
 }

 void DynamicBuffer::execute(const dnnl::engine& eng, const int iter) {
-    if (from->getStaticDims()[map_rule.axis] != std::abs(map_rule.stride))
-        IE_THROW() << "TensorIterator (Loop) has incorrect output shape[axis] after iteration for concatenation. " << std::abs(map_rule.stride) <<
-                   " is expected, but actual: " << from->getStaticDims()[map_rule.axis];
-
     if (iter == 0) {
         init(eng);
-        return;
     }

-    auto new_buffer = create_buffer(eng);
-    move_buffer(new_buffer);
+    // if chunk_offset_in_byte out of range of buffer holder, reallocate a larger chunk
+    if (check_buffer()) {
+        auto new_buffer = create_buffer(eng);
+        move_buffer(new_buffer);
+    }
+
     move_data();
 }
-void DynamicBuffer::init(const dnnl::engine& eng) {
-    chunk_offset_in_byte = 0;
-    buffer_offset_in_byte = 0;
+void DynamicBuffer::reset(int max_iter_count_) {
+    max_iter_count = max_iter_count_;
+}

+void DynamicBuffer::init(const dnnl::engine& eng) {
     const auto axis = map_rule.axis;
     const auto stride = map_rule.stride;
     const auto abs_stride = std::abs(stride);

-    auto src_mem = from->GetPrimitive();
-    auto src_desc = src_mem.get_desc();
-    auto dims = src_desc.dims();
+    // We have no idea of the "from" node memory dims until the sub_graph has been executed.
+    const auto& src_mem = from->GetPrimitive();
+    const auto& src_desc = src_mem.get_desc();
+    const auto& dims = src_desc.dims();
+
+    if (dims[axis] != abs_stride)
+        IE_THROW() << "TensorIterator (Loop) has incorrect output shape[axis] after iteration for concatenation. " << abs_stride <<
+                   " is expected, but actual: " << dims[axis];

     count = std::accumulate(dims.begin(), dims.begin() + map_rule.axis, size_t(1), std::multiplies<size_t>());
     len = std::accumulate(dims.begin() + map_rule.axis + 1, dims.end(), elem_size, std::multiplies<size_t>());
+    chunk_unit_in_byte = abs_stride * len;

-    mem_holder_buffer = memory(src_desc, eng);
-    copy(reinterpret_cast<const uint8_t*>(from->GetPtr()), get_ptr(mem_holder_buffer), 0, 0, 1, from->GetSize());
-}
-
-dnnl::memory DynamicBuffer::create_buffer(const dnnl::engine& eng) {
-    const auto axis = map_rule.axis;
-    const auto stride = map_rule.stride;
-    const auto abs_stride = std::abs(stride);
-
-    const auto old_desc = mem_holder_buffer.get_desc();
-    auto dims = old_desc.dims();
-
-    if (from->getStaticDims()[axis] != abs_stride)
-        IE_THROW() << "TensorIterator (Loop) has incorrect output shape[axis] after iteration for concatenation. " << abs_stride <<
-                   " is expected, but actual: " << from->getStaticDims()[axis];
-
-    dims[axis] += abs_stride;
-    dnnl::memory::desc new_buffer_desc(dims, old_desc.data_type(), DnnlExtensionUtils::GetPlainFormatByRank(dims.size()));
-
-    if (stride > 0.0f) {
-        chunk_offset_in_byte += new_buffer_desc.data.format_desc.blocking.strides[axis] * elem_size * abs_stride;
-    } else {
-        buffer_offset_in_byte = from->GetPrimitive().get_desc().data.format_desc.blocking.strides[axis] * elem_size * abs_stride;
-    }
-
-    return dnnl::memory(new_buffer_desc, eng);
-}
+    if (!mem_holder_buffer) { // else reuse the buffer holder of the last inference
+        // preallocate a large chunk of memory to hold the intermediate concatenated outputs of all iterations
+        mem_holder_buffer = create_buffer(eng);
+    }
+
+    // reset chunk_offset_in_byte for the first execution
+    chunk_stride_in_byte = mem_holder_buffer->GetSize() / count;
+    chunk_offset_in_byte = stride > 0 ? 0 : (chunk_stride_in_byte - chunk_unit_in_byte);
+    num_execs = 0;
+}
-void DynamicBuffer::move_buffer(dnnl::memory new_buffer) {
-    const auto axis = map_rule.axis;
-    const auto src_stride = mem_holder_buffer.get_desc().dims()[axis] * len;
-    const auto dst_stride = new_buffer.get_desc().dims()[axis] * len;
-
-    copy(get_ptr(mem_holder_buffer), get_ptr(new_buffer) + buffer_offset_in_byte,
-         src_stride, dst_stride, count, src_stride);
-
-    mem_holder_buffer = new_buffer;
-}
+bool DynamicBuffer::check_buffer() {
+    if (map_rule.stride > 0) {
+        if (chunk_offset_in_byte + chunk_unit_in_byte > chunk_stride_in_byte) return true;
+    } else {
+        if (chunk_offset_in_byte < 0) return true;
+    }
+    return false;
+}
+
+MemoryPtr DynamicBuffer::create_buffer(const dnnl::engine& eng) {
+    const auto abs_stride = std::abs(map_rule.stride);
+
+    const auto estimate_iters = [&] () {
+        if (max_iter_count != -1) return max_iter_count;
+
+        // in case of no idea of memory upper boundary
+        return (num_execs == 0) ? 1 : 2 * num_execs; // growth factor 2
+    };
+    const auto estimated_iters = estimate_iters();
+    const Shape _shape = Shape({count, static_cast<size_t>(abs_stride * estimated_iters), len / elem_size});
+    auto _descCreator = BlockedDescCreator::getCommonCreators().at(LayoutType::ncsp);
+    auto new_buffer_desc = _descCreator->createSharedDesc(from->getDesc().getPrecision(), _shape);
+
+    auto _ptr = std::make_shared<Memory>(eng);
+    _ptr->Create(*new_buffer_desc);
+    return _ptr;
+}
+void DynamicBuffer::move_buffer(const MemoryPtr& new_buffer) {
+    const auto stride = map_rule.stride;
+
+    // copy data from old buffer to new buffer
+    const auto src_stride = chunk_stride_in_byte;
+    const auto dst_stride = new_buffer->getStaticDims()[1] * len;
+
+    const auto valid_size = chunk_unit_in_byte * num_execs;
+    const auto src_offset_in_byte = stride > 0 ? 0 : (src_stride - valid_size);
+    chunk_offset_in_byte = stride > 0 ? 0 : (dst_stride - valid_size); // reset chunk_offset_in_byte
+
+    copy(reinterpret_cast<uint8_t*>(mem_holder_buffer->GetPtr()) + src_offset_in_byte, reinterpret_cast<uint8_t*>(new_buffer->GetPtr()) + chunk_offset_in_byte,
+         src_stride, dst_stride, count, valid_size);
+
+    // assign mem_holder_buffer
+    mem_holder_buffer = new_buffer;
+    chunk_stride_in_byte = mem_holder_buffer->GetSize() / count;
+
+    // adjust for next execution
+    if (stride > 0) {
+        chunk_offset_in_byte += valid_size;
+    } else {
+        chunk_offset_in_byte -= chunk_unit_in_byte;
+    }
+}
 void DynamicBuffer::move_data() {
-    const auto axis = map_rule.axis;
     const auto src_stride = abs(map_rule.stride) * len;
-    const auto dst_stride = mem_holder_buffer.get_desc().dims()[axis] * len;
+    const auto dst_stride = chunk_stride_in_byte;

-    copy(reinterpret_cast<const uint8_t*>(from->GetPtr()), get_ptr(mem_holder_buffer) + chunk_offset_in_byte,
-         src_stride, dst_stride, count, src_stride);
+    copy(reinterpret_cast<const uint8_t*>(from->GetPtr()), reinterpret_cast<uint8_t*>(mem_holder_buffer->GetPtr()) + chunk_offset_in_byte,
+         src_stride, dst_stride, count, chunk_unit_in_byte);
+
+    // adjust for next execution
+    num_execs++;
+    if (map_rule.stride > 0) {
+        chunk_offset_in_byte += chunk_unit_in_byte;
+    } else {
+        chunk_offset_in_byte -= chunk_unit_in_byte;
+    }
 }
 void DynamicBuffer::transfer(const Node* node) {
-    if (mem_holder_buffer) {
+    if (mem_holder_buffer && num_execs > 0) {
+        const auto axis = map_rule.axis;
+        const auto stride = map_rule.stride;
+        const auto abs_stride = std::abs(stride);
+
+        const auto& src_mem = from->GetPrimitive();
+        const auto& src_desc = src_mem.get_desc();
+        auto dims = src_desc.dims();
+        dims[axis] = abs_stride * num_execs;
         const auto desc = node->getBaseMemDescAtOutputPort(map_rule.from)->cloneWithNewDims(
-                DnnlExtensionUtils::convertToVectorDims(mem_holder_buffer.get_desc().dims()));
+                DnnlExtensionUtils::convertToVectorDims(dims));

         redefineToMemories(to, desc);

-        copy(get_ptr(mem_holder_buffer), reinterpret_cast<uint8_t*>(to.front()->GetPtr()), 0, 0, 1, to.front()->GetSize());
+        const auto src_stride = chunk_stride_in_byte;
+        const auto dst_stride = to.front()->getStaticDims()[axis] * len;
+        const auto valid_size = chunk_unit_in_byte * num_execs;
+        const auto src_offset_in_byte = stride > 0 ? 0 : (src_stride - valid_size);
+        copy(reinterpret_cast<uint8_t*>(mem_holder_buffer->GetPtr()) + src_offset_in_byte, reinterpret_cast<uint8_t*>(to.front()->GetPtr()),
+             src_stride, dst_stride, count, dst_stride);
     } else {
         VectorDims newDims = to.front()->GetShape().getDims();
         nullifyUndefinedDims(newDims);

@@ -320,8 +373,6 @@ void DynamicBuffer::transfer(const Node* node) {
         const auto desc = node->getBaseMemDescAtOutputPort(map_rule.from)->cloneWithNewDims(newDims);
         redefineToMemories(to, desc);
     }
-
-    mem_holder_buffer.reset(nullptr);
 }
@@ -330,13 +381,6 @@ void DynamicBuffer::copy(const uint8_t* src, uint8_t* dst, const size_t src_stride, const size_t dst_stride, const size_t count, const size_t len) {
     });
 }
-
-uint8_t* DynamicBuffer::get_ptr(dnnl::memory& prim) {
-    auto ptr = static_cast<uint8_t*>(prim.get_data_handle());
-    auto md = prim.get_desc().data;
-    dnnl::impl::memory_desc_wrapper wrapper(md);
-    return ptr + wrapper.offset0() * wrapper.data_type_size();
-}

 bool TensorIterator::isSupportedOperation(const std::shared_ptr<const ov::Node>& op, std::string& errorMessage) noexcept {
     try {
         if (!one_of(op->get_type_info(),
@@ -517,6 +561,10 @@ void TensorIterator::prepareParams() {
         prepareOutputPorts();
         prepareBackEdges();
     }
+
+    // reset local states of DynamicBuffer
+    for (auto& buffer : buffers)
+        buffer->reset(lastUsedTripCount);
 }
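All data movement in the functions above funnels through the six-argument `copy` helper, whose body is elided by the hunk. Its contract is a 2-D strided copy: `count` chunks of `len` bytes each, read at `src_stride` byte intervals and written at `dst_stride` byte intervals (the real implementation runs the outer loop with `parallel_for`). A sequential sketch of those semantics, with `strided_copy` as a stand-in name:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>

// 2-D strided copy: for each of `count` chunks, copy `len` bytes from
// src (advancing by src_stride) to dst (advancing by dst_stride).
// Sequential sketch; the actual helper parallelizes the outer loop.
static void strided_copy(const uint8_t* src, uint8_t* dst,
                         size_t src_stride, size_t dst_stride,
                         size_t count, size_t len) {
    for (size_t i = 0; i < count; ++i) {
        std::memcpy(dst + i * dst_stride, src + i * src_stride, len);
    }
}
```

Called with `count == 1` and zero strides it degenerates to a flat memcpy, which is how the pre-patch `transfer` used it.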
src/plugins/intel_cpu/src/nodes/tensoriterator.h

@@ -65,33 +65,40 @@ protected:
 class DynamicBuffer {
 public:
     DynamicBuffer(const MemoryPtr &from_, const std::vector<MemoryPtr> &to_, const PortMap &map_rule_);
     ~DynamicBuffer() = default;

     void execute(const dnnl::engine& eng, const int iter);
     void transfer(const Node* node);

+    void reset(int max_iter_count_); // reset local states
+
 private:
     void init(const dnnl::engine& eng);

     /* methods for resize and refill buffer */
-    dnnl::memory create_buffer(const dnnl::engine& eng);
-    void move_buffer(dnnl::memory new_buffer);
+    bool check_buffer();
+    MemoryPtr create_buffer(const dnnl::engine& eng);
+    void move_buffer(const MemoryPtr& new_buffer);
     void move_data();

     static void copy(const uint8_t* src, uint8_t* dst, const size_t src_stride, const size_t dst_stride, const size_t count, const size_t len);
-    static uint8_t* get_ptr(dnnl::memory& prim);

     /* variable states */
     size_t len = 1lu;
     size_t count = 1lu;
-    size_t elem_size = 0lu;
-    ptrdiff_t chunk_offset_in_byte = 0;
-    ptrdiff_t buffer_offset_in_byte = 0;
+
+    ptrdiff_t chunk_stride_in_byte = 0;
+    ptrdiff_t chunk_offset_in_byte = 0;
+    size_t chunk_unit_in_byte = 0lu; // the number of bytes copied per chunk per execution (iteration)
+    int num_execs = 0; // number of executions that have happened
+    int max_iter_count = -1; // estimated maximum iter count

     /* invariable states */
     MemoryPtr from;
     std::vector<MemoryPtr> to;
     PortMap map_rule;
+    size_t elem_size = 0lu;

-    dnnl::memory mem_holder_buffer;
+    MemoryPtr mem_holder_buffer;
 };

 class TensorIterator : public Node {
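For orientation, the three public entry points are driven per inference in a fixed order: `reset()` from `TensorIterator::prepareParams()` with the last used trip count, `execute()` once per body iteration, and `transfer()` once at the end to publish the concatenated output. A hypothetical driver sketch against the declarations above (`run_tensor_iterator_sketch` and its parameters are placeholders, not OpenVINO API; it assumes the plugin's headers for `DynamicBuffer`, `Node`, and `dnnl::engine`):

```cpp
// Hypothetical driver showing the DynamicBuffer call sequence per inference;
// `buffers`, `eng`, `trip_count`, and `node` stand in for TensorIterator state.
void run_tensor_iterator_sketch(std::vector<std::shared_ptr<DynamicBuffer>>& buffers,
                                const dnnl::engine& eng, int trip_count, const Node* node) {
    for (auto& buffer : buffers)
        buffer->reset(trip_count);      // may be -1 when the trip count is unknown

    for (int iter = 0; iter < trip_count; ++iter) {
        // ... execute the sub-graph body for this iteration ...
        for (auto& buffer : buffers)
            buffer->execute(eng, iter); // init on iter 0, then append one chunk
    }

    for (auto& buffer : buffers)
        buffer->transfer(node);         // resize outputs and copy the valid region
}
```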