[GPU] Prevent memory reset at runtime allocation for dynamic shape, fix wrong padding handling (#16351)

* Prevent memory reset at runtime allocation for dynamic shape

* Set default alloc to reset mem

* Additional fixes:
- If any convolution/deconvolution user requires padded input, enqueue a buffer reset when the buffer is reused.
- Removed clFinish from gpu_buffer::fill; the fill should be waited on only when needed, and otherwise synchronization is done via the returned event.
- Removed the buffer reset from on_execute of count_nonzero, which is no longer needed.

* Remove unused API

* Fix tensor offset to account for the padding

* Added unit test

* Applied review comments
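The failure mode motivating the reset-on-reuse logic is easy to reproduce in isolation. A minimal standalone sketch (hypothetical sizes, not code from this patch): when a pooled buffer is reused for a smaller padded tensor, only the data region is rewritten, so the pad border keeps stale values unless it is explicitly cleared.

#include <cassert>
#include <vector>

int main() {
    // First inference: a larger tensor filled the whole allocation with 5s.
    std::vector<float> reused(36, 5.0f);

    // Second inference reuses the allocation as a 4x4 buffer
    // (2x2 data + 1-element pad border); only the 2x2 data is written.
    const int pitch = 4;
    for (int y = 1; y <= 2; ++y)
        for (int x = 1; x <= 2; ++x)
            reused[y * pitch + x] = 11.0f;

    // The border still holds 5s from the previous run; a convolution reading
    // its input's padded border would sum that garbage. Hence the fix:
    // enqueue a fill(0) on reuse whenever a user needs padded input.
    assert(reused[0] == 5.0f);   // stale value in the pad area
    assert(reused[5] == 11.0f);  // data value at (y=1, x=1)
    return 0;
}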
Taylor Yeonbok Lee 2023-03-24 13:10:33 -07:00 committed by GitHub
parent 1ef94ec069
commit 6a25143045
15 changed files with 231 additions and 58 deletions


@@ -214,13 +214,6 @@ public:
bool is_primary_stream() const { return _is_primary_stream; }
bool is_dynamic() const { return _is_dynamic; }
/// Create memory object with specified @p layout and allocation @p type for primitive with @p id
/// Underlying memory handle can be reused with other primitives from memory pool based on @p dependencies
memory_ptr get_memory_from_pool(const layout& layout,
primitive_id id,
std::set<primitive_id> dependencies,
allocation_type type,
bool reusable = true);
memory_pool& get_memory_pool() {
return *_memory_pool;
}


@@ -106,13 +106,15 @@ public:
uint32_t network_id,
const std::set<primitive_id>& restrictions,
allocation_type type,
bool reusable = true); // get from pool or create memory allocation
bool reusable = true,
bool reset = true); // get from pool or create memory allocation
memory_ptr get_memory(const layout& layout, allocation_type type, bool reset = true);
memory_ptr get_from_non_padded_pool(const layout& layout,
const primitive_id& id,
uint32_t network_id,
const std::set<primitive_id>&,
allocation_type type);
allocation_type type,
bool reset = true);
memory_ptr get_from_padded_pool(const layout& layout,
const primitive_id& id,
uint32_t network_id,


@@ -42,6 +42,23 @@ public:
static std::string to_string(binary_convolution_node const& node);
typed_primitive_inst(network& network, binary_convolution_node const& node);
bool need_reset_input_memory() const override {
auto input_layout = _deps[0].first->_impl_params->get_output_layout(0);
if (input_layout.data_padding) {
return true;
}
return false;
}
bool need_reset_output_memory() const override {
bool res = parent::need_reset_output_memory();
auto output_layout = _impl_params->get_output_layout(0);
if (output_layout.data_padding) {
return true;
}
return res;
}
memory::ptr weights_memory() const { return dep_memory_ptr(1); }
};


@@ -127,6 +127,23 @@ public:
static layout calc_output_layout(convolution_node const& node, kernel_impl_params const& impl_param);
static std::string to_string(convolution_node const& node);
bool need_reset_input_memory() const override {
auto input_layout = _deps[0].first->_impl_params->get_output_layout(0);
if (input_layout.data_padding) {
return true;
}
return false;
}
bool need_reset_output_memory() const override {
bool res = parent::need_reset_output_memory();
auto output_layout = _impl_params->get_output_layout(0);
if (output_layout.data_padding) {
return true;
}
return res;
}
public:
typed_primitive_inst(network& network, convolution_node const& node);


@@ -60,6 +60,23 @@ public:
static layout calc_output_layout(deconvolution_node const& node, kernel_impl_params const& impl_param);
static std::string to_string(deconvolution_node const& node);
bool need_reset_input_memory() const override {
auto input_layout = _deps[0].first->_impl_params->get_output_layout(0);
if (input_layout.data_padding) {
return true;
}
return false;
}
bool need_reset_output_memory() const override {
bool res = parent::need_reset_output_memory();
auto output_layout = _impl_params->get_output_layout(0);
if (output_layout.data_padding) {
return true;
}
return res;
}
typed_primitive_inst(network& network, deconvolution_node const& node);
memory::ptr weights_memory() const {


@@ -40,9 +40,6 @@ public:
static std::string to_string(count_nonzero_node const& node);
typed_primitive_inst(network& network, count_nonzero_node const& node);
private:
void on_execute() override;
};
using count_nonzero_inst = typed_primitive_inst<count_nonzero>;


@@ -192,7 +192,7 @@ public:
void allocate_internal_buffers();
static memory::ptr allocate_output(engine& engine, memory_pool& pool, const program_node& _node,
const kernel_impl_params& impl_params, uint32_t net_id, bool is_internal, size_t idx = 0);
const kernel_impl_params& impl_params, uint32_t net_id, bool is_internal, size_t idx = 0, bool reset_mem = true);
std::vector<memory::cptr> get_intermediates_memories() const { return _intermediates_memory; }
@@ -284,7 +284,7 @@ protected:
size_t max_output_layout_size = 0;
std::vector<size_t> max_intermediates_memory_sizes;
std::vector<memory::ptr> allocate_outputs(kernel_impl_params* updated_params = nullptr);
std::vector<memory::ptr> allocate_outputs(kernel_impl_params* updated_params = nullptr, bool reset_mem = true);
memory::ptr allocate_internal_buffer(size_t idx);
static std::vector<std::shared_ptr<primitive_inst>> build_exec_deps(
std::vector<std::pair<std::shared_ptr<primitive_inst>, int32_t>> const& mem_deps);
@@ -298,7 +298,7 @@ protected:
virtual event::ptr update_weights();
// if primitive_inst doesn't replace impl with a new impl (static impl with opt kernel or dynamic impl), return false
bool update_impl();
void realloc_if_needed();
event::ptr realloc_if_needed();
cldnn::network::ptr get_unfused_subgraph();
@@ -332,6 +332,21 @@ protected:
return { layout(in_layout.get<ShapeType>(), output_type, in_layout.format) };
}
virtual bool need_reset_input_memory() const {
return false;
}
virtual bool need_reset_output_memory() const {
std::vector<primitive_id> users;
for (auto u : _node->get_users())
users.push_back(u->id());
for (auto u : _network.get_primitives(users)) {
if (u->need_reset_input_memory())
return true;
}
return false;
}
// This could be implemented via single map std::unordered_map<instrumentation::perf_counter_key, std::tuple<int64_t, size_t>>
// but the overhead on using perf_counter_key as map key is too big, thus we use hash as map key
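The need_reset_output_memory default above is a one-step check over the node's users. A runnable sketch of the same policy with simplified stand-in types (not the real primitive_inst hierarchy):

#include <vector>

struct inst_sketch {
    std::vector<const inst_sketch*> users;
    bool reads_padded_input = false;  // conv/deconv with a padded input layout

    bool need_reset_input_memory() const { return reads_padded_input; }
    bool need_reset_output_memory() const {
        // Output must be cleared on reuse iff some user reads its pad area.
        for (const auto* u : users)
            if (u->need_reset_input_memory())
                return true;
        return false;
    }
};

int main() {
    inst_sketch reorder, conv;
    conv.reads_padded_input = true;  // e.g. convolution with pad_before > 0
    reorder.users.push_back(&conv);
    return reorder.need_reset_output_memory() ? 0 : 1;  // exits 0: reset needed
}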


@@ -1322,16 +1322,6 @@ void network::transfer_memory_to_device(std::shared_ptr<primitive_inst> instance
}
}
memory::ptr network::get_memory_from_pool(const layout& layout,
primitive_id id,
std::set<primitive_id> dependencies,
allocation_type type,
bool reusable) {
if (_config.get_property(ov::intel_gpu::enable_memory_pool))
return _memory_pool->get_memory(layout, id, get_id(), dependencies, type, reusable);
return _memory_pool->get_memory(layout, type);
}
network::VariableState& network::get_variable_memory(const std::string &variable_id) {
auto it = _variables_states.find(variable_id);
if (it == _variables_states.end()) {


@@ -48,10 +48,6 @@ std::string count_nonzero_inst::to_string(count_nonzero_node const& node) {
count_nonzero_inst::typed_primitive_inst(network& network, count_nonzero_node const& node) : parent(network, node) {}
void count_nonzero_inst::on_execute() {
output_memory().fill(_network.get_stream(), 0);
}
// -----------------------------------------------
// gather_nonzero
// -----------------------------------------------


@@ -242,11 +242,11 @@ void primitive_inst::update_shape() {
}
}
void primitive_inst::realloc_if_needed() {
event::ptr primitive_inst::realloc_if_needed() {
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::memory_allocation);
event::ptr ev = nullptr;
// Update param if fake_alignment is available
auto updated_params = _node->type()->get_fake_aligned_params(*_impl_params);
auto actual_layout = updated_params.get_output_layout();
@@ -254,28 +254,31 @@ void primitive_inst::realloc_if_needed() {
// input_layout node is supposed to always use external memory in dynamic case
if (_node->is_type<input_layout>())
return;
return ev;
bool can_reuse_buffer = _outputs[0] && actual_layout.count() <= max_output_layout_size;
if (can_reuse_buffer) {
GPU_DEBUG_TRACE_DETAIL << id() << ": reuse previously allocated output buffer" << std::endl;
_outputs[0] = _network.get_engine().reinterpret_buffer(*_outputs[0], actual_layout);
if (need_reset_output_memory()) {
ev = _outputs[0]->fill(_network.get_stream());
}
} else {
GPU_DEBUG_TRACE_DETAIL << id() << ": realloc output memory. "
<< " Current buffer_size=" << max_output_layout_size
<< " Requested buffer_size=" << actual_layout.count() << std::endl;
_outputs = allocate_outputs(&updated_params);
_outputs = allocate_outputs(&updated_params, need_reset_output_memory());
// TODO : need to handle multiple outputs
max_output_layout_size = updated_params.output_layouts[0].count();
}
// intermediate memory allocation is required for primitives consisting of multiple kernels in dynamic case
{
if (_impl == nullptr)
return;
return ev;
const auto& ibuf_layouts = _impl->get_internal_buffer_layouts();
if (ibuf_layouts.empty())
return;
return ev;
for (size_t i = 0; i < ibuf_layouts.size(); ++i) {
if (i < _intermediates_memory.size() && ibuf_layouts[i].bytes_count() <= max_intermediates_memory_sizes[i]) {
@@ -293,6 +296,7 @@ void primitive_inst::realloc_if_needed() {
}
}
}
return ev;
}
bool primitive_inst::update_impl() {
@@ -431,7 +435,9 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
auto ev = update_weights();
if (ev)
dependencies.push_back(ev);
realloc_if_needed();
auto ev_reset = realloc_if_needed();
if (ev_reset)
dependencies.push_back(ev_reset);
}
}
}
@@ -763,15 +769,15 @@ static bool user_requesting_mem_reuse_false(const program_node& node) {
}
memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool, const program_node& _node, const kernel_impl_params& impl_params,
uint32_t net_id, bool is_internal, size_t idx) {
uint32_t net_id, bool is_internal, size_t idx, bool reset) {
auto get_memory_from_pool = [&](engine& _engine, const layout& layout, const primitive_id id, std::set<primitive_id> dependencies,
allocation_type type, bool reusable) {
allocation_type type, bool reusable, bool reset = true) {
OPENVINO_ASSERT(!layout.is_dynamic() || layout.has_upper_bound(), "[GPU] Can't allocate output for dynamic layout without upper bound");
// Use layout with max tensor for dynamic shape with upper bound
auto static_layout = cldnn::layout(layout.data_type, layout.format, layout.get_tensor(), layout.data_padding);
if (_node.get_program().get_config().get_property(ov::intel_gpu::enable_memory_pool))
return pool.get_memory(static_layout, id, net_id, dependencies, type, reusable);
return pool.get_memory(static_layout, type);
return pool.get_memory(static_layout, id, net_id, dependencies, type, reusable, reset);
return pool.get_memory(static_layout, type, reset);
};
@@ -817,7 +823,8 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool,
_node.id(),
_node.get_memory_dependencies(),
alloc_type,
false);
false,
reset);
} else if (is_internal && _node.is_output() && _node.is_type<generic_layer>() &&
_engine.supports_allocation(allocation_type::usm_device) && usm_device_allocatable) {
GPU_DEBUG_LOG << "[" << _node.id() << ": output]" << std::endl;
@@ -829,23 +836,24 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool,
return _engine.allocate_memory(layout, alloc_type, false);
} else if (is_internal || (!_node.can_share_buffer()) || _node.can_be_optimized() || _node.is_output()) {
GPU_DEBUG_LOG << "[" << _node.id() << ": output]" << std::endl;
return _engine.allocate_memory(layout, alloc_type);
return _engine.allocate_memory(layout, alloc_type, reset);
} else {
return get_memory_from_pool(_engine,
layout,
_node.id(),
_node.get_memory_dependencies(),
alloc_type,
true);
true,
reset);
}
}
std::vector<memory::ptr> primitive_inst::allocate_outputs(kernel_impl_params* updated_params) {
std::vector<memory::ptr> primitive_inst::allocate_outputs(kernel_impl_params* updated_params, bool reset_mem) {
std::vector<memory::ptr> outputs;
for (size_t i = 0; i < get_node().get_outputs_count() ; ++i) {
outputs.push_back(allocate_output(get_network().get_engine(), _network.get_memory_pool(),
*_node, (updated_params != nullptr) ? *updated_params : *_impl_params,
get_network_id(), _network.is_internal(), i));
get_network_id(), _network.is_internal(), i, reset_mem));
}
return outputs;
}


@@ -245,7 +245,6 @@ public:
JitDefinitions GetDefinitions(const Tensor::TensorBaseT<DType, Layout>& t) const {
JitDefinitions definitions{
{_name + "_OFFSET", toCodeString(t.GetFirstElementOffset())},
{_name + "_VIEW_OFFSET", toCodeString(t.GetViewOffset())},
{_name + "_LENGTH", toCodeString(t.LogicalSize())},
{_name + "_DIMS", toCodeString(t.GetDims().size())},
@@ -258,6 +257,7 @@ public:
definitions.insert(definitions.end(), type_defs.begin(), type_defs.end());
if (!t.is_dynamic()) {
definitions.push_back({_name + "_OFFSET", toCodeString(t.GetFirstElementOffset())});
definitions.push_back({_name + "_SIZE", toCodeString(t.GetDims().size())});
definitions.push_back(
{_name + "_SIZES_DATA",
@@ -265,13 +265,34 @@ public:
definitions.push_back(
{_name + "_PITCHES",
toVectorString(t.GetDims(), "size_t", KERNEL_SELECTOR_TENSOR_DIM_MAX, 1, [](const Tensor::Dim& d) { return d.pitch; })});
} else {
// calculate tensor offset
std::vector<std::string> padded_pitches = {
toVectorMulString({_name + "_X_PITCH", _name + "_PAD_BEFORE_SIZE_X"}),
toVectorMulString({_name + "_Y_PITCH", _name + "_PAD_BEFORE_SIZE_Y"}),
toVectorMulString({_name + "_Z_PITCH", _name + "_PAD_BEFORE_SIZE_Z"}),
toVectorMulString({_name + "_W_PITCH", _name + "_PAD_BEFORE_SIZE_W"}),
toVectorMulString({_name + "_FEATURE_PITCH", _name + "_PAD_BEFORE_FEATURE_NUM"}),
toVectorMulString({_name + "_BATCH_PITCH", _name + "_PAD_BEFORE_BATCH_NUM"})};
std::string offset_str = "(";
for (size_t i = 0; i < padded_pitches.size(); ++i) {
offset_str += padded_pitches[i];
if (i < padded_pitches.size() - 1)
offset_str += " + ";
}
offset_str += ")";
definitions.push_back({_name + "_OFFSET", offset_str});
}
definitions.push_back(
{_name + "_PAD_BEFORE",
toVectorString(t.GetDims(), "size_t", KERNEL_SELECTOR_TENSOR_DIM_MAX, 0, [](const Tensor::Dim& d) { return d.pad.before; })});
toVectorString(t.GetDims(), "size_t", KERNEL_SELECTOR_TENSOR_DIM_MAX, 0, [](const Tensor::Dim& d) {
return d.pad.before;
})});
definitions.push_back(
{_name + "_PAD_AFTER",
toVectorString(t.GetDims(), "size_t", KERNEL_SELECTOR_TENSOR_DIM_MAX, 0, [](const Tensor::Dim& d) { return d.pad.after; })});
toVectorString(t.GetDims(), "size_t", KERNEL_SELECTOR_TENSOR_DIM_MAX, 0, [](const Tensor::Dim& d) {
return d.pad.after;
})});
return definitions;
}
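For a dynamic padded tensor, the else-branch above emits the first-element offset as an expression over the runtime pitch and pad macros instead of a precomputed constant. The generated JIT looks roughly like this (illustrative define for a tensor named INPUT0, reusing kernel_selector's existing per-axis macro names; the exact spelling is an assumption):

// Illustrative generated define (dynamic-shape branch), assuming the usual
// per-axis pitch/pad_before macros are emitted elsewhere for the same tensor:
#define INPUT0_OFFSET ((INPUT0_X_PITCH * INPUT0_PAD_BEFORE_SIZE_X) + \
                       (INPUT0_Y_PITCH * INPUT0_PAD_BEFORE_SIZE_Y) + \
                       (INPUT0_Z_PITCH * INPUT0_PAD_BEFORE_SIZE_Z) + \
                       (INPUT0_W_PITCH * INPUT0_PAD_BEFORE_SIZE_W) + \
                       (INPUT0_FEATURE_PITCH * INPUT0_PAD_BEFORE_FEATURE_NUM) + \
                       (INPUT0_BATCH_PITCH * INPUT0_PAD_BEFORE_BATCH_NUM))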


@@ -120,7 +120,8 @@ memory::ptr memory_pool::get_from_non_padded_pool(const layout& layout,
const primitive_id& id,
uint32_t network_id,
const std::set<primitive_id>& restrictions,
allocation_type type) {
allocation_type type,
bool reset) {
auto it = _non_padded_pool.lower_bound(layout.bytes_count());
while (it != _non_padded_pool.end()) {
if (it->second._network_id == network_id &&
@@ -139,7 +140,7 @@ memory::ptr memory_pool::get_from_non_padded_pool(const layout& layout,
}
GPU_DEBUG_LOG << "[" << id << ": output]" << std::endl;
// didn't find anything for you? create new resource
auto mem = alloc_memory(layout, type);
auto mem = alloc_memory(layout, type, reset);
{
_non_padded_pool.emplace(layout.bytes_count(),
memory_record({{id, network_id}}, mem, network_id, type));
@@ -221,21 +222,22 @@ memory::ptr memory_pool::get_memory(const layout& layout,
uint32_t network_id,
const std::set<primitive_id>& restrictions,
allocation_type type,
bool reusable_across_network) {
bool reusable_across_network,
bool reset) {
if (reusable_across_network) {
// reusable within the same network
if (!layout.format.is_image() && layout.data_padding == padding{{0, 0, 0, 0}, 0}) {
// non-padded buffers
return get_from_non_padded_pool(layout, id, network_id, restrictions, type);
return get_from_non_padded_pool(layout, id, network_id, restrictions, type, reset);
} else if (!layout.format.is_image()) {
// padded buffers
return get_from_padded_pool(layout, id, network_id, restrictions, type);
} else {
// images (reuse not yet implemented)
return alloc_memory(layout, type);
return alloc_memory(layout, type, reset);
}
} else {
return alloc_memory(layout, type);
return alloc_memory(layout, type, reset);
}
}
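What the reset flag threaded through get_memory ultimately controls can be shown with simplified stand-ins for the allocator (hypothetical types, not the real cldnn classes):

#include <cstddef>
#include <memory>

std::unique_ptr<float[]> alloc_memory(std::size_t count, bool reset) {
    if (reset)
        return std::make_unique<float[]>(count);       // zero-initialized
    return std::unique_ptr<float[]>(new float[count]); // left uninitialized
}

std::unique_ptr<float[]> get_memory(std::size_t count, bool reusable, bool reset) {
    // Pool-miss path only; on a pool hit the cached buffer would be returned
    // and any required reset enqueued by the caller (see realloc_if_needed).
    (void)reusable;
    return alloc_memory(count, reset);
}

int main() {
    auto zeroed = get_memory(16, /*reusable=*/true, /*reset=*/true);
    auto raw    = get_memory(16, /*reusable=*/true, /*reset=*/false);
    return zeroed[0] == 0.0f ? 0 : 1;
}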


@@ -151,7 +151,7 @@ memory::ptr ocl_engine::allocate_memory(const layout& layout, allocation_type ty
}
if (reset || res->is_memory_reset_needed(layout)) {
res->fill(get_service_stream());
get_service_stream().wait_for_events({res->fill(get_service_stream())});
}
return res;


@@ -73,9 +73,6 @@ event::ptr gpu_buffer::fill(stream& stream, unsigned char pattern) {
cl::Event& ev_ocl = downcast<ocl_event>(ev.get())->get();
cl_stream.get_cl_queue().enqueueFillBuffer<unsigned char>(_buffer, pattern, 0, size(), nullptr, &ev_ocl);
// TODO: do we need sync here?
cl_stream.finish();
return ev;
}
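With the implicit finish removed, synchronization moves to the caller, which waits on the returned event only where a real dependency exists. A standalone sketch of the same pattern using the plain OpenCL C++ wrapper (generic OpenCL, not cldnn code):

#include <CL/cl2.hpp>  // header name varies by SDK; newer ones use <CL/opencl.hpp>

int main() {
    cl::Context ctx(CL_DEVICE_TYPE_GPU);
    cl::CommandQueue queue(ctx);
    cl::Buffer buf(ctx, CL_MEM_READ_WRITE, 1024);

    cl::Event ev;
    queue.enqueueFillBuffer<unsigned char>(buf, 0, 0, 1024, nullptr, &ev);

    // ... enqueue unrelated work; no queue-wide finish() after the fill ...

    ev.wait();  // block only at the point that actually depends on the fill
    return 0;
}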


@@ -6,6 +6,7 @@
#include <intel_gpu/primitives/input_layout.hpp>
#include <intel_gpu/primitives/softmax.hpp>
#include <intel_gpu/primitives/reorder.hpp>
#include <intel_gpu/primitives/data.hpp>
#include "softmax_inst.h"
@@ -19,6 +20,106 @@ using namespace cldnn;
using namespace ::tests;
namespace memory_realloc_tests {
TEST(memory_reuse_realloc_reset_test, basic_conv_with_padding) {
auto& engine = get_test_engine();
layout weight_layout = layout{ov::PartialShape{1, 3, 3, 3}, data_types::f16, format::bfyx};
auto weights = engine.allocate_memory(weight_layout);
set_values<FLOAT16>(weights, {
1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f,
//
2.0f, 2.0f, 2.0f,
2.0f, 2.0f, 2.0f,
2.0f, 2.0f, 2.0f,
//
3.0f, 3.0f, 3.0f,
3.0f, 3.0f, 3.0f,
3.0f, 3.0f, 3.0f,
});
layout input_layout_1 = layout{ov::PartialShape{1, 3, 5, 5}, data_types::f32, format::bfyx};
auto input_mem_1 = engine.allocate_memory(input_layout_1);
set_values(input_mem_1, {
1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
//
1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
//
1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
});
std::vector<float> ref_output_1 = {6, 18, 36, 54, 72, 54, 30, 12, 36, 72, 108, 144, 108,
60, 18, 54, 108, 162, 216, 162, 90, 18, 54, 108, 162, 216,
162, 90, 18, 54, 108, 162, 216, 162, 90, 12, 36, 72, 108,
144, 108, 60, 6, 18, 36, 54, 72, 54, 30};
layout input_layout_2 = layout{ov::PartialShape{1, 3, 2, 2}, data_types::f32, format::bfyx};
auto input_mem_2 = engine.allocate_memory(input_layout_2);
set_values(input_mem_2, {11.0f, 11.0f, 11.0f, 11.0f,
11.0f, 11.0f, 11.0f, 11.0f,
11.0f, 11.0f, 11.0f, 11.0f});
std::vector<float> ref_output_2 = { 66, 132, 132, 66, 132, 264, 264, 132, 132, 264, 264, 132, 66, 132, 132, 66};
std::vector<float> values_to_subtract = {};
auto input_l = layout{ov::PartialShape::dynamic(4), data_types::f32, format::bfyx};
topology topology(input_layout("input", input_l),
data("weights", weights),
reorder("reorder", input_info("input"), format::bfyx, data_types::f16,
values_to_subtract, reorder_mean_mode::subtract, padding{{0, 0, 2, 2}, 0}),
convolution("conv",
input_info("reorder"),
{"weights"},
{}, /*bias*/
{1, 1}, /*stride*/
{2, 2}, /*pad*/
{1, 1}, /*dilation*/
{2, 2}, /*pad_above*/
{2, 2}, /*pad_below*/
padding{{0, 0, 0, 0}, 0}),
reorder("output", input_info("conv"), format::bfyx, data_types::f32)); /*output padding*/
ExecutionConfig config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
network network(engine, topology, config);
network.set_input_data("input", input_mem_1);
auto outputs_1 = network.execute();
network.set_input_data("input", input_mem_2);
auto outputs_2 = network.execute();
auto output_mem_2 = outputs_2.begin()->second.get_memory();
cldnn::mem_lock<float> output_mem_2_ptr(output_mem_2, get_test_stream());
for (size_t i = 0; i < output_mem_2->get_layout().get_buffer_size().count(); ++i) {
ASSERT_EQ(output_mem_2_ptr[i], ref_output_2[i]);
}
// check padding of second run of reorder
// 0, 0, 0, 0, 0, 0,
// 0, 0, 0, 0, 0, 0,
// 0, 0, 11, 11, 0, 0,
// 0, 0, 11, 11, 0, 0,
// 0, 0,"0","0","0","0", // !! check pad_after
// 0, 0,"0","0","0","0", // !! check pad_after
auto reorder_mem = network.get_primitive("reorder")->output_memory_ptr();
cldnn::mem_lock<FLOAT16, mem_lock_type::read> reorder_mem_ptr(reorder_mem, get_test_stream());
for (size_t i = 26; i < 29; ++i) {
ASSERT_EQ((float)reorder_mem_ptr[i], 0.f);
}
for (size_t i = 32; i < 35; ++i) {
ASSERT_EQ((float)reorder_mem_ptr[i], 0.f);
}
}
TEST(softmax_gpu_dynamic_f32_test_upper_bound, input_same_values) {
static const int32_t