[GPU] Updated to allocate memory in order of size while deserializing (#16867)

* updated to allocate memory in order of size while deserializing

* fix windows build error

* updated to check dependencies between unconnected nodes
This commit is contained in:
Eddy Kim 2023-04-17 14:33:57 +09:00 committed by GitHub
parent 175db3523a
commit 9b9c31d46b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 528 additions and 415 deletions

View File

@ -92,7 +92,7 @@ void data_inst::load(BinaryInputBuffer& ib) {
pos += data_size;
ib.seekg(pos);
} else {
_outputs[0] = get_network().get_memory_pool().get_memory(output_layout, _allocation_type, false);
_outputs[0] = get_network().get_engine().allocate_memory(output_layout, _allocation_type, false);
if (_allocation_type == allocation_type::usm_host || _allocation_type == allocation_type::usm_shared) {
ib >> make_data(_outputs[0]->buffer_ptr(), data_size);

View File

@ -168,6 +168,14 @@ void oooq_memory_dependencies::run(program& p) {
if (!are_connected(A, B)) {
add_memory_dependency(*itr_A, *itr_B);
add_memory_dependency(*itr_B, *itr_A);
} else {
for (auto u : (*itr_A)->get_users()) {
if (u != (*itr_B) && !are_connected(B, user_map[u]) && !are_connected(user_map[u], B)) {
add_memory_dependency(*itr_A, *itr_B);
add_memory_dependency(*itr_B, *itr_A);
break;
}
}
}
itr_B++;
B++;

View File

@ -392,29 +392,34 @@ network::network(cldnn::BinaryInputBuffer& ib, const ExecutionConfig& config, st
_primitives[_primitive_id] = new_primitive_inst;
}
std::vector<std::shared_ptr<primitive_inst>> insts_to_allocate;
size_t exec_order_size;
ib >> exec_order_size;
_exec_order.clear();
std::vector<std::string> _exec_order_types;
_exec_order_types.resize(exec_order_size);
for (auto& type : _exec_order_types) {
for (size_t i = 0; i < exec_order_size; ++i) {
std::string type;
ib >> type;
std::shared_ptr<cldnn::primitive_inst> new_primitive_inst = prim_map_storage::instance().get_type_id(type)->create_instance(*this);
_exec_order.emplace_back(new_primitive_inst);
insts_to_allocate.emplace_back(new_primitive_inst);
}
_outputs.clear();
_output_chains.clear();
for (const auto& p_inst : _exec_order) {
for (const auto& p_inst : insts_to_allocate) {
ib >> *p_inst;
_primitives[p_inst->id()] = p_inst;
if (p_inst->get_impl() != nullptr)
p_inst->init_by_cached_kernels(kernels_cache);
}
std::vector<primitive_id> exec_order_ids;
ib >> exec_order_ids;
_exec_order.clear();
for (auto& exec_order_id : exec_order_ids) {
_exec_order.emplace_back(_primitives[exec_order_id]);
}
for (auto& item : _primitives) {
auto& p_inst = item.second;
if (p_inst->is_input())
@ -552,14 +557,35 @@ void network::save(cldnn::BinaryOutputBuffer& ob) {
size_t exec_order_size = _exec_order.size();
ob << exec_order_size;
std::unordered_map<primitive_id, size_t> exec_order_num;
size_t i = exec_order_size;
for (const auto& p_inst : _exec_order) {
exec_order_num[p_inst->id()] = --i;
}
std::vector<std::shared_ptr<primitive_inst>> insts_to_allocate(_exec_order.begin(), _exec_order.end());
std::sort(insts_to_allocate.begin(),
insts_to_allocate.end(),
[&exec_order_num, &exec_order_size](std::shared_ptr<primitive_inst> const& lhs, std::shared_ptr<primitive_inst> const& rhs) {
size_t lhs_size = (lhs->mem_allocated()) ? (lhs->get_output_layout().bytes_count() + exec_order_size) : exec_order_num[lhs->id()];
size_t rhs_size = (rhs->mem_allocated()) ? (rhs->get_output_layout().bytes_count() + exec_order_size) : exec_order_num[rhs->id()];
return (lhs_size > rhs_size);
});
for (const auto& p_inst : insts_to_allocate) {
ob << p_inst->get_node().get_primitive()->type_string();
}
for (const auto& p_inst : _exec_order) {
for (const auto& p_inst : insts_to_allocate) {
ob << *p_inst;
}
std::vector<primitive_id> exec_order_ids;
for (const auto& p_inst : _exec_order) {
exec_order_ids.emplace_back(p_inst->id());
}
ob << exec_order_ids;
std::map<std::string, std::string> reuse_map;
auto& po = _program->get_processing_order();

View File

@ -54,7 +54,10 @@ TEST(memory_tests, DISABLED_network_creation_loop)
}
}
#endif
TEST(memory_pool, basic_non_padded_relu_pipe) {
namespace {
class memory_pool: public ::testing::Test {
public:
void test_basic_non_padded_relu_pipe(bool is_caching_test) {
// We need a new engine here to get correct get_max_used_device_memory() result
// If we reuse common engine, then max memory value will be taken from some previously executed tests
// as it's tracked within engine instance
@ -80,14 +83,14 @@ TEST(memory_pool, basic_non_padded_relu_pipe) {
ExecutionConfig config = get_test_default_config(*engine);
config.set_property(ov::intel_gpu::optimize_data(true));
network network(*engine, topology, config);
network.set_input_data("input", input);
auto outputs = network.execute();
network::ptr network = get_network(*engine, topology, config, get_test_stream_ptr(), is_caching_test);
network->set_input_data("input", input);
auto outputs = network->execute();
ASSERT_EQ(engine->get_max_used_device_memory(), (uint64_t)64);
}
TEST(memory_pool, basic_non_padded_relu_and_pooling_pipe) {
void test_basic_non_padded_relu_and_pooling_pipe(bool is_caching_test) {
// We need a new engine here to get correct get_max_used_device_memory() result
// If we reuse common engine, then max memory value will be taken from some previously executed tests
// as it's tracked within engine instance
@ -112,14 +115,14 @@ TEST(memory_pool, basic_non_padded_relu_and_pooling_pipe) {
ExecutionConfig config = get_test_default_config(*engine);
config.set_property(ov::intel_gpu::optimize_data(true));
network network(*engine, topology, config);
network.set_input_data("input", input);
auto outputs = network.execute();
network::ptr network = get_network(*engine, topology, config, get_test_stream_ptr(), is_caching_test);
network->set_input_data("input", input);
auto outputs = network->execute();
ASSERT_EQ(engine->get_max_used_device_memory(), (uint64_t)896);
}
}
TEST(memory_pool, multi_outputs_network) {
void test_multi_outputs_network(bool is_caching_test) {
// -- relu -- relu1 -- relu4
// input<
// -- relu2 -- relu3 -- relu5--relu6--relu7
@ -147,14 +150,14 @@ TEST(memory_pool, multi_outputs_network) {
ExecutionConfig config = get_test_default_config(*engine);
config.set_property(ov::intel_gpu::optimize_data(true));
network network(*engine, topology, config);
network.set_input_data("input", input);
auto outputs = network.execute();
network::ptr network = get_network(*engine, topology, config, get_test_stream_ptr(), is_caching_test);
network->set_input_data("input", input);
auto outputs = network->execute();
ASSERT_EQ(engine->get_max_used_device_memory(), (uint64_t) 1536);
}
}
TEST(memory_pool, oooq) {
void test_oooq(bool is_caching_test) {
/* -- relu1 - concat1- relu4 --
input< -- relu2 / >-- concat2 -- relu6
-- relu3 -- relu5 ---------
@ -185,14 +188,14 @@ TEST(memory_pool, oooq) {
ExecutionConfig config = get_test_default_config(*engine);
config.set_property(ov::intel_gpu::optimize_data(true));
network network(*engine, topology, config);
network.set_input_data("input", input);
auto outputs = network.execute();
network::ptr network = get_network(*engine, topology, config, get_test_stream_ptr(), is_caching_test);
network->set_input_data("input", input);
auto outputs = network->execute();
ASSERT_EQ(engine->get_max_used_device_memory(), (uint64_t) 2560);
}
}
TEST(memory_pool, DISABLED_shared_mem_pool_same_topology_twice) {
void test_shared_mem_pool_same_topology_twice() {
/* -- relu1 - concat1- relu4 --
input< -- relu2 | >-- concat2 -- relu6
-- relu3 -- relu5 ---------
@ -272,9 +275,9 @@ TEST(memory_pool, DISABLED_shared_mem_pool_same_topology_twice) {
}
}
}
}
}
TEST(memory_pool, DISABLED_shared_mem_pool_same_topology_twice_weights) {
void test_shared_mem_pool_same_topology_twice_weights() {
// We need a new engine here to get correct get_max_used_device_memory() result
// If we reuse common engine, then max memory value will be taken from some previously executed tests
// as it's tracked within engine instance
@ -353,9 +356,9 @@ TEST(memory_pool, DISABLED_shared_mem_pool_same_topology_twice_weights) {
}
}
}
}
}
TEST(memory_pool, shared_mem_pool_diff_batches) {
void test_shared_mem_pool_diff_batches(bool is_caching_test) {
// We need a new engine here to get correct get_max_used_device_memory() result
// If we reuse common engine, then max memory value will be taken from some previously executed tests
// as it's tracked within engine instance
@ -391,22 +394,22 @@ TEST(memory_pool, shared_mem_pool_diff_batches) {
ExecutionConfig config = get_test_default_config(*engine);
config.set_property(ov::intel_gpu::optimize_data(true));
network network_first(*engine, topo, config);
network_first.set_input_data("input", input_8);
auto outputs = network_first.execute();
network::ptr network_first = get_network(*engine, topo, config, get_test_stream_ptr(), is_caching_test);
network_first->set_input_data("input", input_8);
auto outputs = network_first->execute();
auto dev_info = engine->get_device_info();
ASSERT_EQ(engine->get_max_used_device_memory(), (uint64_t)4744);
topo.change_input_layout("input", input_1->get_layout());//change input layout to batch=1
network network_second(*engine, topo, config);
network_second.set_input_data("input", input_1);
auto outputs_second = network_second.execute();
network::ptr network_second = get_network(*engine, topo, config, get_test_stream_ptr(), is_caching_test);
network_second->set_input_data("input", input_1);
auto outputs_second = network_second->execute();
ASSERT_EQ(engine->get_max_used_device_memory(), (uint64_t)5912);
}
}
TEST(memory_pool, shared_dep_two_output) {
void test_shared_dep_two_output(bool is_caching_test) {
// We need a new engine here to get correct get_max_used_device_memory() result
// If we reuse common engine, then max memory value will be taken from some previously executed tests
// as it's tracked within engine instance
@ -424,12 +427,12 @@ TEST(memory_pool, shared_dep_two_output) {
ExecutionConfig config = get_test_default_config(*engine);
config.set_property(ov::intel_gpu::optimize_data(true));
network network(*engine, topo, config);
auto outputs = network.execute();
network::ptr network = get_network(*engine, topo, config, get_test_stream_ptr(), is_caching_test);
auto outputs = network->execute();
ASSERT_EQ(engine->get_max_used_device_memory(), (uint64_t)192);
}
}
TEST(memory_pool, non_opt_intermidate_opt_after) {
void test_non_opt_intermidate_opt_after(bool is_caching_test) {
auto& engine = get_test_engine();
auto input_layout1 = layout(cldnn::data_types::f32, cldnn::format::bfyx, { 1, 1, 2, 2 });
auto input_layout2 = layout(cldnn::data_types::f32, cldnn::format::bfyx, { 1, 1, 2, 2 });
@ -465,10 +468,10 @@ TEST(memory_pool, non_opt_intermidate_opt_after) {
ExecutionConfig config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::optimize_data(false));
network network(engine, topology, config);
network.set_input_data("input1", input_memory1);
network.set_input_data("input2", input_memory2);
auto outputs = network.execute();
network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test);
network->set_input_data("input1", input_memory1);
network->set_input_data("input2", input_memory2);
auto outputs = network->execute();
ASSERT_EQ(outputs.size(), static_cast<size_t>(2));
auto out1 = outputs.at("elt1");
@ -478,9 +481,9 @@ TEST(memory_pool, non_opt_intermidate_opt_after) {
cldnn::mem_lock<float> out2_ptr(out2.get_memory(), get_test_stream());
ASSERT_EQ(out1_ptr[0], 1.0f);
ASSERT_EQ(out2_ptr[0], 2.0f);
}
}
TEST(memory_pool, add_mem_dep_test) {
void test_add_mem_dep(bool is_caching_test) {
auto& engine = get_test_engine();
auto input_layout1 = layout(cldnn::data_types::f32, cldnn::format::bfyx, { 1, 2, 2, 2 });
@ -514,9 +517,9 @@ TEST(memory_pool, add_mem_dep_test) {
ExecutionConfig config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::optimize_data(true));
network network(engine, topology, config);
network.set_input_data("input1", input_memory1);
auto outputs = network.execute();
network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test);
network->set_input_data("input1", input_memory1);
auto outputs = network->execute();
ASSERT_EQ(outputs.size(), static_cast<size_t>(2));
auto out1 = outputs.at("out3");
@ -533,4 +536,80 @@ TEST(memory_pool, add_mem_dep_test) {
ASSERT_EQ(out2_ptr[1], 6.0f);
ASSERT_EQ(out2_ptr[2], 7.0f);
ASSERT_EQ(out2_ptr[3], 8.0f);
}
};
TEST_F(memory_pool, basic_non_padded_relu_pipe) {
this->test_basic_non_padded_relu_pipe(false);
}
TEST_F(memory_pool, basic_non_padded_relu_and_pooling_pipe) {
this->test_basic_non_padded_relu_and_pooling_pipe(false);
}
TEST_F(memory_pool, multi_outputs_network) {
this->test_multi_outputs_network(false);
}
TEST_F(memory_pool, oooq) {
this->test_oooq(false);
}
TEST_F(memory_pool, DISABLED_shared_mem_pool_same_topology_twice) {
this->test_shared_mem_pool_same_topology_twice();
}
TEST_F(memory_pool, DISABLED_shared_mem_pool_same_topology_twice_weights) {
this->test_shared_mem_pool_same_topology_twice_weights();
}
TEST_F(memory_pool, shared_mem_pool_diff_batches) {
this->test_shared_mem_pool_diff_batches(false);
}
TEST_F(memory_pool, shared_dep_two_output) {
this->test_shared_dep_two_output(false);
}
TEST_F(memory_pool, non_opt_intermidate_opt_after) {
this->test_non_opt_intermidate_opt_after(false);
}
TEST_F(memory_pool, add_mem_dep_test) {
this->test_add_mem_dep(false);
}
#ifdef RUN_ALL_MODEL_CACHING_TESTS
TEST_F(memory_pool, basic_non_padded_relu_pipe_cached) {
this->test_basic_non_padded_relu_pipe(true);
}
TEST_F(memory_pool, basic_non_padded_relu_and_pooling_pipe_cached) {
this->test_basic_non_padded_relu_and_pooling_pipe(true);
}
TEST_F(memory_pool, multi_outputs_network_cached) {
this->test_multi_outputs_network(true);
}
TEST_F(memory_pool, oooq_cached) {
this->test_oooq(true);
}
TEST_F(memory_pool, shared_mem_pool_diff_batches_cached) {
this->test_shared_mem_pool_diff_batches(true);
}
TEST_F(memory_pool, shared_dep_two_output_cached) {
this->test_shared_dep_two_output(true);
}
TEST_F(memory_pool, non_opt_intermidate_opt_after_cached) {
this->test_non_opt_intermidate_opt_after(true);
}
#endif
TEST_F(memory_pool, add_mem_dep_test_cached) {
this->test_add_mem_dep(true);
}
}