diff --git a/src/plugins/intel_gpu/src/graph/data.cpp b/src/plugins/intel_gpu/src/graph/data.cpp
index 6de2652d429..dcc17faf531 100644
--- a/src/plugins/intel_gpu/src/graph/data.cpp
+++ b/src/plugins/intel_gpu/src/graph/data.cpp
@@ -92,7 +92,7 @@ void data_inst::load(BinaryInputBuffer& ib) {
         pos += data_size;
         ib.seekg(pos);
     } else {
-        _outputs[0] = get_network().get_memory_pool().get_memory(output_layout, _allocation_type, false);
+        _outputs[0] = get_network().get_engine().allocate_memory(output_layout, _allocation_type, false);
 
         if (_allocation_type == allocation_type::usm_host || _allocation_type == allocation_type::usm_shared) {
             ib >> make_data(_outputs[0]->buffer_ptr(), data_size);
diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/oooq_memory_dependencies.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/oooq_memory_dependencies.cpp
index f4905ebca2b..1fab112e6f1 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/oooq_memory_dependencies.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/oooq_memory_dependencies.cpp
@@ -168,6 +168,14 @@ void oooq_memory_dependencies::run(program& p) {
                 if (!are_connected(A, B)) {
                     add_memory_dependency(*itr_A, *itr_B);
                     add_memory_dependency(*itr_B, *itr_A);
+                } else {
+                    for (auto u : (*itr_A)->get_users()) {
+                        if (u != (*itr_B) && !are_connected(B, user_map[u]) && !are_connected(user_map[u], B)) {
+                            add_memory_dependency(*itr_A, *itr_B);
+                            add_memory_dependency(*itr_B, *itr_A);
+                            break;
+                        }
+                    }
                 }
                 itr_B++;
                 B++;
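
The new else branch above covers pairs that are already ordered: even when B is a (transitive) user of A, the pair still needs a mutual memory dependency if A has another user that is unordered with respect to B, because under an out-of-order queue that sibling may still be reading A's buffer while B runs. A minimal standalone sketch of the rule, using a plain reachability matrix and integer ids in place of the plugin's program_node graph and user_map (all names below are illustrative, not the real API):

    #include <cstddef>
    #include <iostream>
    #include <vector>

    using node_id = std::size_t;

    // reachable[a][b] == true when b can be reached from a, i.e. b is a transitive user of a.
    static bool are_connected(const std::vector<std::vector<bool>>& reachable, node_id a, node_id b) {
        return reachable[a][b];
    }

    // Mirrors the pass: siblings a and b get a mutual memory dependency when they are
    // unordered, or when a has another user u that is unordered with respect to b in
    // both directions; one such sibling is enough, hence the early return (the pass breaks).
    static bool needs_mutual_dependency(const std::vector<std::vector<bool>>& reachable,
                                        const std::vector<std::vector<node_id>>& users,
                                        node_id a, node_id b) {
        if (!are_connected(reachable, a, b))
            return true;
        for (node_id u : users[a]) {
            if (u != b && !are_connected(reachable, b, u) && !are_connected(reachable, u, b))
                return true;
        }
        return false;
    }

    int main() {
        // Toy graph: 0 -> 1, 0 -> 2, 1 -> 3; node 2 is unordered relative to 1 and 3.
        std::vector<std::vector<node_id>> users = {{1, 2}, {3}, {}, {}};
        std::vector<std::vector<bool>> reachable = {{false, true, true, true},
                                                    {false, false, false, true},
                                                    {false, false, false, false},
                                                    {false, false, false, false}};
        // 0 and 1 are connected, but 0's other user 2 is unordered w.r.t. 1,
        // so the pair still needs the dependency: prints 1.
        std::cout << needs_mutual_dependency(reachable, users, 0, 1) << "\n";
    }
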
diff --git a/src/plugins/intel_gpu/src/graph/network.cpp b/src/plugins/intel_gpu/src/graph/network.cpp
index 0ab0a8c9307..8ef109b5510 100644
--- a/src/plugins/intel_gpu/src/graph/network.cpp
+++ b/src/plugins/intel_gpu/src/graph/network.cpp
@@ -392,29 +392,34 @@ network::network(cldnn::BinaryInputBuffer& ib, const ExecutionConfig& config, st
         _primitives[_primitive_id] = new_primitive_inst;
     }
 
+    std::vector<std::shared_ptr<primitive_inst>> insts_to_allocate;
     size_t exec_order_size;
     ib >> exec_order_size;
-    _exec_order.clear();
 
-    std::vector<std::string> _exec_order_types;
-    _exec_order_types.resize(exec_order_size);
-
-    for (auto& type : _exec_order_types) {
+    for (size_t i = 0; i < exec_order_size; ++i) {
+        std::string type;
         ib >> type;
         std::shared_ptr<primitive_inst> new_primitive_inst = prim_map_storage::instance().get_type_id(type)->create_instance(*this);
-        _exec_order.emplace_back(new_primitive_inst);
+        insts_to_allocate.emplace_back(new_primitive_inst);
     }
 
     _outputs.clear();
     _output_chains.clear();
 
-    for (const auto& p_inst : _exec_order) {
+    for (const auto& p_inst : insts_to_allocate) {
         ib >> *p_inst;
         _primitives[p_inst->id()] = p_inst;
         if (p_inst->get_impl() != nullptr)
             p_inst->init_by_cached_kernels(kernels_cache);
     }
 
+    std::vector<primitive_id> exec_order_ids;
+    ib >> exec_order_ids;
+    _exec_order.clear();
+    for (auto& exec_order_id : exec_order_ids) {
+        _exec_order.emplace_back(_primitives[exec_order_id]);
+    }
+
     for (auto& item : _primitives) {
         auto& p_inst = item.second;
         if (p_inst->is_input())
@@ -552,14 +557,35 @@ void network::save(cldnn::BinaryOutputBuffer& ob) {
     size_t exec_order_size = _exec_order.size();
     ob << exec_order_size;
 
+    std::unordered_map<primitive_id, size_t> exec_order_num;
+    size_t i = exec_order_size;
     for (const auto& p_inst : _exec_order) {
+        exec_order_num[p_inst->id()] = --i;
+    }
+
+    std::vector<std::shared_ptr<primitive_inst>> insts_to_allocate(_exec_order.begin(), _exec_order.end());
+    std::sort(insts_to_allocate.begin(),
+              insts_to_allocate.end(),
+              [&exec_order_num, &exec_order_size](std::shared_ptr<primitive_inst> const& lhs, std::shared_ptr<primitive_inst> const& rhs) {
+                  size_t lhs_size = (lhs->mem_allocated()) ? (lhs->get_output_layout().bytes_count() + exec_order_size) : exec_order_num[lhs->id()];
+                  size_t rhs_size = (rhs->mem_allocated()) ? (rhs->get_output_layout().bytes_count() + exec_order_size) : exec_order_num[rhs->id()];
+                  return (lhs_size > rhs_size);
+              });
+
+    for (const auto& p_inst : insts_to_allocate) {
         ob << p_inst->get_node().get_primitive()->type_string();
     }
 
-    for (const auto& p_inst : _exec_order) {
+    for (const auto& p_inst : insts_to_allocate) {
         ob << *p_inst;
     }
 
+    std::vector<primitive_id> exec_order_ids;
+    for (const auto& p_inst : _exec_order) {
+        exec_order_ids.emplace_back(p_inst->id());
+    }
+    ob << exec_order_ids;
+
     std::map<primitive_id, primitive_id> reuse_map;
 
     auto& po = _program->get_processing_order();
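
The save()-side reordering above is the core of the change: instances that own their allocation (mem_allocated()) are serialized first, largest output buffer first, while the remaining instances keep their relative execution order, and load() now rebuilds _exec_order from the separately stored exec_order_ids instead of from the on-disk order, presumably so the big buffers get allocated before the small reusable ones. A minimal sketch of that sort key, with a placeholder struct standing in for primitive_inst (the struct, fields, and sample sizes are illustrative):

    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <string>
    #include <vector>

    // Placeholder for primitive_inst: only what the comparator reads.
    struct inst {
        std::string id;
        bool mem_allocated;   // stands in for mem_allocated()
        std::uint64_t bytes;  // stands in for get_output_layout().bytes_count()
    };

    int main() {
        std::vector<inst> exec_order = {
            {"input", true, 64}, {"relu", false, 0}, {"conv", true, 4096}, {"softmax", false, 0}};
        const std::uint64_t n = exec_order.size();

        // Same key as the diff: an allocating instance ranks by its size offset past n,
        // so it always outranks a non-allocating one, whose key (n - 1 - position)
        // merely preserves execution order among the non-allocating instances.
        auto key = [&](std::size_t pos) {
            const inst& p = exec_order[pos];
            return p.mem_allocated ? p.bytes + n : n - 1 - pos;
        };

        std::vector<std::size_t> order(exec_order.size());
        for (std::size_t i = 0; i < order.size(); ++i) order[i] = i;
        std::sort(order.begin(), order.end(),
                  [&](std::size_t l, std::size_t r) { return key(l) > key(r); });

        for (std::size_t pos : order)
            std::cout << exec_order[pos].id << "\n";  // conv, input, relu, softmax
    }

Because the schedule travels separately as exec_order_ids, this on-disk reordering never changes what executes, only the order in which load() performs its allocations.
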
diff --git a/src/plugins/intel_gpu/tests/test_cases/memory_test.cpp b/src/plugins/intel_gpu/tests/test_cases/memory_test.cpp
index 0d8c2df6c67..d07b59a3555 100644
--- a/src/plugins/intel_gpu/tests/test_cases/memory_test.cpp
+++ b/src/plugins/intel_gpu/tests/test_cases/memory_test.cpp
@@ -54,483 +54,562 @@ TEST(memory_tests, DISABLED_network_creation_loop)
     }
 }
 #endif
-TEST(memory_pool, basic_non_padded_relu_pipe) {
-    // We need a new engine here to get correct get_max_used_device_memory() result
-    // If we reuse common engine, then max memory value will be taken from some previously executed tests
-    // as it's tracked within engine instance
-    auto engine = create_test_engine();
-    auto batch_num = 1;
-    auto feature_num = 4;
-    auto x_size = 1;
-    auto y_size = 1;
+namespace {
+class memory_pool: public ::testing::Test {
+public:
+    void test_basic_non_padded_relu_pipe(bool is_caching_test) {
+        // We need a new engine here to get correct get_max_used_device_memory() result
+        // If we reuse common engine, then max memory value will be taken from some previously executed tests
+        // as it's tracked within engine instance
+        auto engine = create_test_engine();
+        auto batch_num = 1;
+        auto feature_num = 4;
+        auto x_size = 1;
+        auto y_size = 1;
 
-    auto input = engine->allocate_memory({ data_types::f32, format::bfyx, { tensor(spatial(x_size, y_size), feature(feature_num), batch(batch_num)) } });
+        auto input = engine->allocate_memory({ data_types::f32, format::bfyx, { tensor(spatial(x_size, y_size), feature(feature_num), batch(batch_num)) } });
 
-    topology topology;
-    topology.add(input_layout("input", input->get_layout()));
-    topology.add(activation("relu", input_info("input"), activation_func::relu));
-    topology.add(activation("relu1", input_info("relu"), activation_func::relu));
-    topology.add(activation("relu2", input_info("relu1"), activation_func::relu));
-    topology.add(activation("relu3", input_info("relu2"), activation_func::relu));
-    topology.add(activation("relu4", input_info("relu3"), activation_func::relu));
-    topology.add(activation("relu5", input_info("relu4"), activation_func::relu));
+        topology topology;
+        topology.add(input_layout("input", input->get_layout()));
+        topology.add(activation("relu", input_info("input"), activation_func::relu));
+        topology.add(activation("relu1", input_info("relu"), activation_func::relu));
+        topology.add(activation("relu2", input_info("relu1"), activation_func::relu));
+        topology.add(activation("relu3", input_info("relu2"), activation_func::relu));
+        topology.add(activation("relu4", input_info("relu3"), activation_func::relu));
+        topology.add(activation("relu5", input_info("relu4"), activation_func::relu));
 
-    std::vector<float> input_vec = { -1.f, 2.f, -3.f, 4.f };
-    set_values(input, input_vec);
-    ExecutionConfig config = get_test_default_config(*engine);
-    config.set_property(ov::intel_gpu::optimize_data(true));
+        std::vector<float> input_vec = { -1.f, 2.f, -3.f, 4.f };
+        set_values(input, input_vec);
+        ExecutionConfig config = get_test_default_config(*engine);
+        config.set_property(ov::intel_gpu::optimize_data(true));
 
-    network network(*engine, topology, config);
-    network.set_input_data("input", input);
-    auto outputs = network.execute();
+        network::ptr network = get_network(*engine, topology, config, get_test_stream_ptr(), is_caching_test);
+        network->set_input_data("input", input);
+        auto outputs = network->execute();
 
-    ASSERT_EQ(engine->get_max_used_device_memory(), (uint64_t)64);
-}
+        ASSERT_EQ(engine->get_max_used_device_memory(), (uint64_t)64);
+    }
 
-TEST(memory_pool, basic_non_padded_relu_and_pooling_pipe) {
-    // We need a new engine here to get correct get_max_used_device_memory() result
-    // If we reuse common engine, then max memory value will be taken from some previously executed tests
-    // as it's tracked within engine instance
-    auto engine = create_test_engine();
-    auto batch_num = 1;
-    auto feature_num = 4;
-    auto x_size = 4;
-    auto y_size = 4;
+    void test_basic_non_padded_relu_and_pooling_pipe(bool is_caching_test) {
+        // We need a new engine here to get correct get_max_used_device_memory() result
+        // If we reuse common engine, then max memory value will be taken from some previously executed tests
+        // as it's tracked within engine instance
+        auto engine = create_test_engine();
+        auto batch_num = 1;
+        auto feature_num = 4;
+        auto x_size = 4;
+        auto y_size = 4;
 
-    auto input = engine->allocate_memory({ data_types::f32, format::bfyx, { tensor(spatial(x_size, y_size), feature(feature_num), batch(batch_num)) } });
+        auto input = engine->allocate_memory({ data_types::f32, format::bfyx, { tensor(spatial(x_size, y_size), feature(feature_num), batch(batch_num)) } });
 
-    topology topology;
-    topology.add(input_layout("input", input->get_layout()));
-    topology.add(activation("relu", input_info("input"), activation_func::relu));
-    topology.add(activation("relu1", input_info("relu"), activation_func::relu));
-    topology.add(pooling("pool1", input_info("relu1"), pooling_mode::max, { 3, 3 }, { 2, 2 }));
-    topology.add(activation("relu2", input_info("pool1"), activation_func::relu));
-    topology.add(activation("relu3", input_info("relu2"), activation_func::relu));
-    topology.add(activation("relu4", input_info("relu3"), activation_func::relu));
-    topology.add(activation("relu5", input_info("relu4"), activation_func::relu));
+        topology topology;
+        topology.add(input_layout("input", input->get_layout()));
+        topology.add(activation("relu", input_info("input"), activation_func::relu));
+        topology.add(activation("relu1", input_info("relu"), activation_func::relu));
+        topology.add(pooling("pool1", input_info("relu1"), pooling_mode::max, { 3, 3 }, { 2, 2 }));
+        topology.add(activation("relu2", input_info("pool1"), activation_func::relu));
+        topology.add(activation("relu3", input_info("relu2"), activation_func::relu));
+        topology.add(activation("relu4", input_info("relu3"), activation_func::relu));
+        topology.add(activation("relu5", input_info("relu4"), activation_func::relu));
 
-    ExecutionConfig config = get_test_default_config(*engine);
-    config.set_property(ov::intel_gpu::optimize_data(true));
+        ExecutionConfig config = get_test_default_config(*engine);
+        config.set_property(ov::intel_gpu::optimize_data(true));
 
-    network network(*engine, topology, config);
-    network.set_input_data("input", input);
-    auto outputs = network.execute();
+        network::ptr network = get_network(*engine, topology, config, get_test_stream_ptr(), is_caching_test);
+        network->set_input_data("input", input);
+        auto outputs = network->execute();
 
-    ASSERT_EQ(engine->get_max_used_device_memory(), (uint64_t)896);
-}
+        ASSERT_EQ(engine->get_max_used_device_memory(), (uint64_t)896);
    }
 
-TEST(memory_pool, multi_outputs_network) {
-    //            -- relu -- relu1 -- relu4
-    //     input<
-    //            -- relu2 -- relu3 -- relu5--relu6--relu7
-    // neither of relu5, relu6 nor relu7 can share resource with relu4.
+    void test_multi_outputs_network(bool is_caching_test) {
+        //            -- relu -- relu1 -- relu4
+        //     input<
+        //            -- relu2 -- relu3 -- relu5--relu6--relu7
+        // none of relu5, relu6, or relu7 can share resources with relu4.
 
-    auto engine = create_test_engine();
-    auto batch_num = 1;
-    auto feature_num = 4;
-    auto x_size = 4;
-    auto y_size = 4;
+        auto engine = create_test_engine();
+        auto batch_num = 1;
+        auto feature_num = 4;
+        auto x_size = 4;
+        auto y_size = 4;
 
-    auto input = engine->allocate_memory({ data_types::f32, format::bfyx, { tensor(spatial(x_size, y_size), feature(feature_num), batch(batch_num)) } });
+        auto input = engine->allocate_memory({ data_types::f32, format::bfyx, { tensor(spatial(x_size, y_size), feature(feature_num), batch(batch_num)) } });
 
-    topology topology;
-    topology.add(input_layout("input", input->get_layout()));
-    topology.add(activation("relu", input_info("input"), activation_func::relu));
-    topology.add(activation("relu1", input_info("relu"), activation_func::relu));
-    topology.add(activation("relu2", input_info("input"), activation_func::relu));
-    topology.add(activation("relu3", input_info("relu2"), activation_func::relu));
-    topology.add(activation("relu4", input_info("relu1"), activation_func::relu));
-    topology.add(activation("relu5", input_info("relu3"), activation_func::relu));
-    topology.add(activation("relu6", input_info("relu5"), activation_func::relu));
-    topology.add(activation("relu7", input_info("relu6"), activation_func::relu));
+        topology topology;
+        topology.add(input_layout("input", input->get_layout()));
+        topology.add(activation("relu", input_info("input"), activation_func::relu));
+        topology.add(activation("relu1", input_info("relu"), activation_func::relu));
+        topology.add(activation("relu2", input_info("input"), activation_func::relu));
+        topology.add(activation("relu3", input_info("relu2"), activation_func::relu));
+        topology.add(activation("relu4", input_info("relu1"), activation_func::relu));
+        topology.add(activation("relu5", input_info("relu3"), activation_func::relu));
+        topology.add(activation("relu6", input_info("relu5"), activation_func::relu));
+        topology.add(activation("relu7", input_info("relu6"), activation_func::relu));
 
-    ExecutionConfig config = get_test_default_config(*engine);
-    config.set_property(ov::intel_gpu::optimize_data(true));
+        ExecutionConfig config = get_test_default_config(*engine);
+        config.set_property(ov::intel_gpu::optimize_data(true));
 
-    network network(*engine, topology, config);
-    network.set_input_data("input", input);
-    auto outputs = network.execute();
+        network::ptr network = get_network(*engine, topology, config, get_test_stream_ptr(), is_caching_test);
+        network->set_input_data("input", input);
+        auto outputs = network->execute();
 
-    ASSERT_EQ(engine->get_max_used_device_memory(), (uint64_t) 1536);
-}
+        ASSERT_EQ(engine->get_max_used_device_memory(), (uint64_t) 1536);
+    }
 
-TEST(memory_pool, oooq) {
-    /*                -- relu1 - concat1- relu4 --
-        input<        -- relu2 /                  >-- concat2 -- relu6
-                      -- relu3 -- relu5 ---------
-       neither of relu5, relu6 nor relu7 can share resource with relu4. */
+    void test_oooq(bool is_caching_test) {
+        /*                -- relu1 - concat1- relu4 --
+            input<        -- relu2 /                  >-- concat2 -- relu6
+                          -- relu3 -- relu5 ---------
+           neither relu5 nor relu6 can share resources with relu4. */
 
-    // We need a new engine here to get correct get_max_used_device_memory() result
-    // If we reuse common engine, then max memory value will be taken from some previously executed tests
-    // as it's tracked within engine instance
-    auto engine = create_test_engine();
-    auto batch_num = 1;
-    auto feature_num = 4;
-    auto x_size = 4;
-    auto y_size = 4;
+        // We need a new engine here to get correct get_max_used_device_memory() result
+        // If we reuse common engine, then max memory value will be taken from some previously executed tests
+        // as it's tracked within engine instance
+        auto engine = create_test_engine();
+        auto batch_num = 1;
+        auto feature_num = 4;
+        auto x_size = 4;
+        auto y_size = 4;
 
-    auto input = engine->allocate_memory({ data_types::f32, format::bfyx, { tensor(spatial(x_size, y_size), feature(feature_num), batch(batch_num)) } });
+        auto input = engine->allocate_memory({ data_types::f32, format::bfyx, { tensor(spatial(x_size, y_size), feature(feature_num), batch(batch_num)) } });
 
-    topology topology;
-    topology.add(input_layout("input", input->get_layout()));
-    topology.add(activation("relu1", input_info("input"), activation_func::relu));
-    topology.add(activation("relu2", input_info("input"), activation_func::relu));
-    topology.add(activation("relu3", input_info("input"), activation_func::relu));
-    topology.add(concatenation("concat1", { input_info("relu1"), input_info("relu2") }, 1));
-    topology.add(activation("relu4", input_info("concat1"), activation_func::relu));
-    topology.add(activation("relu5", input_info("relu3"), activation_func::relu));
-    topology.add(concatenation("concat2", { input_info("relu4"), input_info("relu5") }, 1));
-    topology.add(activation("relu6", input_info("concat2"), activation_func::relu));
+        topology topology;
+        topology.add(input_layout("input", input->get_layout()));
+        topology.add(activation("relu1", input_info("input"), activation_func::relu));
+        topology.add(activation("relu2", input_info("input"), activation_func::relu));
+        topology.add(activation("relu3", input_info("input"), activation_func::relu));
+        topology.add(concatenation("concat1", { input_info("relu1"), input_info("relu2") }, 1));
+        topology.add(activation("relu4", input_info("concat1"), activation_func::relu));
+        topology.add(activation("relu5", input_info("relu3"), activation_func::relu));
+        topology.add(concatenation("concat2", { input_info("relu4"), input_info("relu5") }, 1));
+        topology.add(activation("relu6", input_info("concat2"), activation_func::relu));
 
-    ExecutionConfig config = get_test_default_config(*engine);
-    config.set_property(ov::intel_gpu::optimize_data(true));
+        ExecutionConfig config = get_test_default_config(*engine);
+        config.set_property(ov::intel_gpu::optimize_data(true));
 
-    network network(*engine, topology, config);
-    network.set_input_data("input", input);
-    auto outputs = network.execute();
+        network::ptr network = get_network(*engine, topology, config, get_test_stream_ptr(), is_caching_test);
+        network->set_input_data("input", input);
+        auto outputs = network->execute();
 
-    ASSERT_EQ(engine->get_max_used_device_memory(), (uint64_t) 2560);
-}
+        ASSERT_EQ(engine->get_max_used_device_memory(), (uint64_t) 2560);
+    }
 
-TEST(memory_pool, DISABLED_shared_mem_pool_same_topology_twice) {
-    /*                -- relu1 - concat1- relu4 --
-        input<        -- relu2 |                  >-- concat2 -- relu6
-                      -- relu3 -- relu5 ---------
-       neither of relu5, relu6 nor relu7 can share resource with relu4. */
+    void test_shared_mem_pool_same_topology_twice() {
+        /*                -- relu1 - concat1- relu4 --
+            input<        -- relu2 |                  >-- concat2 -- relu6
+                          -- relu3 -- relu5 ---------
+           neither relu5 nor relu6 can share resources with relu4. */
 
-    // We need a new engine here to get correct get_max_used_device_memory() result
-    // If we reuse common engine, then max memory value will be taken from some previously executed tests
-    // as it's tracked within engine instance
-    auto engine = create_test_engine();
-    auto batch_num = 1;
-    auto feature_num = 4;
-    auto inp_x_size = 4;
-    auto inp_y_size = 4;
+        // We need a new engine here to get correct get_max_used_device_memory() result
+        // If we reuse common engine, then max memory value will be taken from some previously executed tests
+        // as it's tracked within engine instance
+        auto engine = create_test_engine();
+        auto batch_num = 1;
+        auto feature_num = 4;
+        auto inp_x_size = 4;
+        auto inp_y_size = 4;
 
-    auto input = engine->allocate_memory({ data_types::f32, format::bfyx, { tensor(spatial(inp_x_size, inp_y_size), feature(feature_num), batch(batch_num)) } });
+        auto input = engine->allocate_memory({ data_types::f32, format::bfyx, { tensor(spatial(inp_x_size, inp_y_size), feature(feature_num), batch(batch_num)) } });
 
-    set_values(input,
-        { 1.0f, 2.5f, 3.0f, 4.0f, 5.0f, 2.0f, 2.0f, 3.0f, 6.1f, 4.7f, 1.0f, 1.0f, 8.2f, 1.0f, 2.0f, 1.0f,
-          5.0f, 2.0f, 2.0f, 3.0f, 5.0f, 2.0f, 2.0f, 3.0f, 1.1f, 2.4f, 1.0f, 1.0f, 4.0f, 6.0f, 3.0f, 3.6f,
-          4.0f, 6.0f, 3.0f, 3.0f, 1.0f, 1.0f, 1.5f, 1.0f, 4.0f, 6.5f, 3.0f, 3.0f, 4.0f, 6.0f, 1.8f, 3.5f,
-          3.0f, 5.0f, 1.0f, 1.0f, 1.3f, 1.0f, 0.4f, 1.3f, 4.0f, 7.0f, 3.0f, 3.0f, 1.0f, 2.0f, 3.9f, 4.0f
-        });
+        set_values(input,
+            { 1.0f, 2.5f, 3.0f, 4.0f, 5.0f, 2.0f, 2.0f, 3.0f, 6.1f, 4.7f, 1.0f, 1.0f, 8.2f, 1.0f, 2.0f, 1.0f,
+              5.0f, 2.0f, 2.0f, 3.0f, 5.0f, 2.0f, 2.0f, 3.0f, 1.1f, 2.4f, 1.0f, 1.0f, 4.0f, 6.0f, 3.0f, 3.6f,
+              4.0f, 6.0f, 3.0f, 3.0f, 1.0f, 1.0f, 1.5f, 1.0f, 4.0f, 6.5f, 3.0f, 3.0f, 4.0f, 6.0f, 1.8f, 3.5f,
+              3.0f, 5.0f, 1.0f, 1.0f, 1.3f, 1.0f, 0.4f, 1.3f, 4.0f, 7.0f, 3.0f, 3.0f, 1.0f, 2.0f, 3.9f, 4.0f
+            });
 
-    topology topology;
-    topology.add(input_layout("input", input->get_layout()));
-    topology.add(activation("relu1", input_info("input"), activation_func::relu));
-    topology.add(activation("relu2", input_info("input"), activation_func::sqrt));
-    topology.add(activation("relu3", input_info("input"), activation_func::square));
-    topology.add(concatenation("concat1", { input_info("relu1"), input_info("relu2") }, 1));
-    topology.add(activation("relu4", input_info("concat1"), activation_func::relu));
-    topology.add(activation("relu5", input_info("relu3"), activation_func::relu));
-    topology.add(concatenation("concat2", { input_info("relu4"), input_info("relu5") }, 1));
-    topology.add(activation("relu6", input_info("concat2"), activation_func::linear, { 1.0f, 0.5f }));
+        topology topology;
+        topology.add(input_layout("input", input->get_layout()));
+        topology.add(activation("relu1", input_info("input"), activation_func::relu));
+        topology.add(activation("relu2", input_info("input"), activation_func::sqrt));
+        topology.add(activation("relu3", input_info("input"), activation_func::square));
+        topology.add(concatenation("concat1", { input_info("relu1"), input_info("relu2") }, 1));
+        topology.add(activation("relu4", input_info("concat1"), activation_func::relu));
+        topology.add(activation("relu5", input_info("relu3"), activation_func::relu));
+        topology.add(concatenation("concat2", { input_info("relu4"), input_info("relu5") }, 1));
+        topology.add(activation("relu6", input_info("concat2"), activation_func::linear, { 1.0f, 0.5f }));
 
-    ExecutionConfig config = get_test_default_config(*engine);
-    config.set_property(ov::intel_gpu::optimize_data(true));
+        ExecutionConfig config = get_test_default_config(*engine);
+        config.set_property(ov::intel_gpu::optimize_data(true));
 
-    network network_first(*engine, topology, config);
-    network_first.set_input_data("input", input);
-    auto outputs = network_first.execute();
+        network network_first(*engine, topology, config);
+        network_first.set_input_data("input", input);
+        auto outputs = network_first.execute();
 
-    auto output_memory_first = outputs.at("relu6").get_memory();
-    auto output_layout_first = output_memory_first->get_layout();
-    cldnn::mem_lock<float> output_ptr_first(output_memory_first, get_test_stream());
+        auto output_memory_first = outputs.at("relu6").get_memory();
+        auto output_layout_first = output_memory_first->get_layout();
+        cldnn::mem_lock<float> output_ptr_first(output_memory_first, get_test_stream());
 
-    ASSERT_EQ(engine->get_max_used_device_memory(), (uint64_t) 2560);
+        ASSERT_EQ(engine->get_max_used_device_memory(), (uint64_t) 2560);
 
-    network network_second(*engine, topology, config);
-    network_second.set_input_data("input", input);
-    auto outputs_second = network_second.execute();
+        network network_second(*engine, topology, config);
+        network_second.set_input_data("input", input);
+        auto outputs_second = network_second.execute();
 
-    auto output_memory_second = outputs_second.at("relu6").get_memory();
-    auto output_layout_second = output_memory_second->get_layout();
-    cldnn::mem_lock<float> output_ptr_second(output_memory_second, get_test_stream());
+        auto output_memory_second = outputs_second.at("relu6").get_memory();
+        auto output_layout_second = output_memory_second->get_layout();
+        cldnn::mem_lock<float> output_ptr_second(output_memory_second, get_test_stream());
 
-    ASSERT_EQ(engine->get_max_used_device_memory(), (uint64_t) 3328);
+        ASSERT_EQ(engine->get_max_used_device_memory(), (uint64_t) 3328);
 
-    ASSERT_EQ(output_layout_first, output_layout_second);
+        ASSERT_EQ(output_layout_first, output_layout_second);
 
-    int y_size = output_layout_first.spatial(1);
-    int x_size = output_layout_first.spatial(0);
-    int f_size = output_layout_first.feature();
-    int b_size = output_layout_first.batch();
-    int f_offset = y_size*x_size;
-    int b_offset = f_size * f_offset;
-    for (int b = 0; b < b_size; ++b)
-    {
-        for (int f = 0; f < f_size; ++f)
-        {
-            for (int y = 0; y < y_size; ++y)
-            {
-                for (int x = 0; x < x_size; ++x)
-                {
-                    int idx = b * b_offset + f * f_offset + y * x_size + x;
-                    ASSERT_EQ(output_ptr_first[idx], output_ptr_second[idx]);
-                }
-            }
-        }
-    }
-}
+        int y_size = output_layout_first.spatial(1);
+        int x_size = output_layout_first.spatial(0);
+        int f_size = output_layout_first.feature();
+        int b_size = output_layout_first.batch();
+        int f_offset = y_size*x_size;
+        int b_offset = f_size * f_offset;
+        for (int b = 0; b < b_size; ++b)
+        {
+            for (int f = 0; f < f_size; ++f)
+            {
+                for (int y = 0; y < y_size; ++y)
+                {
+                    for (int x = 0; x < x_size; ++x)
+                    {
+                        int idx = b * b_offset + f * f_offset + y * x_size + x;
+                        ASSERT_EQ(output_ptr_first[idx], output_ptr_second[idx]);
+                    }
+                }
+            }
+        }
+    }
 
-TEST(memory_pool, DISABLED_shared_mem_pool_same_topology_twice_weights) {
-    // We need a new engine here to get correct get_max_used_device_memory() result
-    // If we reuse common engine, then max memory value will be taken from some previously executed tests
-    // as it's tracked within engine instance
-    auto engine = create_test_engine();
-    auto batch_num = 1;
-    auto feature_num = 3;
-    auto inp_x_size = 4;
-    auto inp_y_size = 4;
+    void test_shared_mem_pool_same_topology_twice_weights() {
+        // We need a new engine here to get correct get_max_used_device_memory() result
+        // If we reuse common engine, then max memory value will be taken from some previously executed tests
+        // as it's tracked within engine instance
+        auto engine = create_test_engine();
+        auto batch_num = 1;
+        auto feature_num = 3;
+        auto inp_x_size = 4;
+        auto inp_y_size = 4;
 
-    auto input= engine->allocate_memory({ data_types::f32, format::bfyx, { tensor(spatial(inp_x_size, inp_y_size), feature(feature_num), batch(batch_num)) } });
-    auto weights = engine->allocate_memory({ data_types::f32, format::bfyx, { 1, 1, 3, 2 } });
+        auto input = engine->allocate_memory({ data_types::f32, format::bfyx, { tensor(spatial(inp_x_size, inp_y_size), feature(feature_num), batch(batch_num)) } });
+        auto weights = engine->allocate_memory({ data_types::f32, format::bfyx, { 1, 1, 3, 2 } });
 
-    std::vector<float> dummy_input_data_1 = {
-        /*f0 xy*/ 0.8f, 0.65f, 0.1f, 1.0f, 1.0f, 0.5f, 0.11f, 0.33f, 0.66f, 0.11f, 0.22f, 0.33f, 0.99f, 0.8f, 0.7f, 0.5f,
-        /*f1 xy*/ 0.48f, 0.05f, 0.35f, 1.0f, 1.0f, 0.51f, 0.51f, 0.13f, 0.86f, 0.10f, 0.29f, 0.53f, 0.99f, 0.4f, 0.3f, 0.1f,
-        /*f2 xy*/ 0.98f, 0.35f, 0.3f, 0.01f, 0.9f, 0.55f, 0.15f, 0.39f, 0.36f, 0.01f, 0.32f, 0.4f, 0.3f, 0.2f, 0.1f, 0.5f,
-    };
+        std::vector<float> dummy_input_data_1 = {
+            /*f0 xy*/ 0.8f, 0.65f, 0.1f, 1.0f, 1.0f, 0.5f, 0.11f, 0.33f, 0.66f, 0.11f, 0.22f, 0.33f, 0.99f, 0.8f, 0.7f, 0.5f,
+            /*f1 xy*/ 0.48f, 0.05f, 0.35f, 1.0f, 1.0f, 0.51f, 0.51f, 0.13f, 0.86f, 0.10f, 0.29f, 0.53f, 0.99f, 0.4f, 0.3f, 0.1f,
+            /*f2 xy*/ 0.98f, 0.35f, 0.3f, 0.01f, 0.9f, 0.55f, 0.15f, 0.39f, 0.36f, 0.01f, 0.32f, 0.4f, 0.3f, 0.2f, 0.1f, 0.5f,
+        };
 
-    set_values(input, dummy_input_data_1);
-    set_values(weights, { 0.10f, 0.2f, 0.1f, 0.2f, 0.1f, 0.2f });
+        set_values(input, dummy_input_data_1);
+        set_values(weights, { 0.10f, 0.2f, 0.1f, 0.2f, 0.1f, 0.2f });
 
-    topology topology(
-        input_layout("input", input->get_layout()),
-        data("weights", weights),
-        convolution("conv", input_info("input"), { "weights" }, { 1, 1, 1, 2 }),
-        softmax("softmax", input_info("conv")));
+        topology topology(
+            input_layout("input", input->get_layout()),
+            data("weights", weights),
+            convolution("conv", input_info("input"), { "weights" }, { 1, 1, 1, 2 }),
+            softmax("softmax", input_info("conv")));
 
-    ExecutionConfig config = get_test_default_config(*engine);
-    config.set_property(ov::intel_gpu::optimize_data(true));
+        ExecutionConfig config = get_test_default_config(*engine);
+        config.set_property(ov::intel_gpu::optimize_data(true));
 
-    network network_first(*engine, topology, config);
-    network_first.set_input_data("input", input);
-    auto outputs = network_first.execute();
-    uint64_t cl_mem_result = 824;
-    uint64_t usm_result = 1208; // USM have a higher peak, since transfering memory to device adds temporay memory bytes allocated. Old memory is deallocated quickly, but max peak is higher.
-    auto is_correct = engine->get_max_used_device_memory() == cl_mem_result
-        || engine->get_max_used_device_memory() == usm_result;
-    ASSERT_TRUE(is_correct) << "Memory max peak is not correct";
+        network network_first(*engine, topology, config);
+        network_first.set_input_data("input", input);
+        auto outputs = network_first.execute();
+        uint64_t cl_mem_result = 824;
+        uint64_t usm_result = 1208; // USM has a higher peak: transferring memory to the device temporarily allocates extra bytes. The old memory is deallocated quickly, but the max peak is higher.
+        auto is_correct = engine->get_max_used_device_memory() == cl_mem_result
+            || engine->get_max_used_device_memory() == usm_result;
+        ASSERT_TRUE(is_correct) << "Memory max peak is not correct";
 
-    auto output_memory_first = outputs.at("softmax").get_memory();
-    auto output_layout_first = output_memory_first->get_layout();
-    cldnn::mem_lock<float> output_ptr_first(output_memory_first, get_test_stream());
+        auto output_memory_first = outputs.at("softmax").get_memory();
+        auto output_layout_first = output_memory_first->get_layout();
+        cldnn::mem_lock<float> output_ptr_first(output_memory_first, get_test_stream());
 
-    network network_second(*engine, topology, config);
-    network_second.set_input_data("input", input);
-    auto outputs_second = network_second.execute();
+        network network_second(*engine, topology, config);
+        network_second.set_input_data("input", input);
+        auto outputs_second = network_second.execute();
 
-    auto output_memory_second = outputs_second.at("softmax").get_memory();
-    auto output_layout_second = output_memory_second->get_layout();
-    cldnn::mem_lock<float> output_ptr_second(output_memory_second, get_test_stream());
+        auto output_memory_second = outputs_second.at("softmax").get_memory();
+        auto output_layout_second = output_memory_second->get_layout();
+        cldnn::mem_lock<float> output_ptr_second(output_memory_second, get_test_stream());
 
-    cl_mem_result = 1224;
-    usm_result = 1992; // USM have a higher peak, since transfering memory to device adds temporay memory bytes allocated. Old memory is deallocated quickly, but max peak is higher.
-    is_correct = engine->get_max_used_device_memory() == cl_mem_result
-        || engine->get_max_used_device_memory() == usm_result;
-    ASSERT_TRUE(is_correct) << "Memory max peak is not correct";
-    ASSERT_EQ(output_layout_first, output_layout_second);
+        cl_mem_result = 1224;
+        usm_result = 1992; // USM has a higher peak: transferring memory to the device temporarily allocates extra bytes. The old memory is deallocated quickly, but the max peak is higher.
+        is_correct = engine->get_max_used_device_memory() == cl_mem_result
+            || engine->get_max_used_device_memory() == usm_result;
+        ASSERT_TRUE(is_correct) << "Memory max peak is not correct";
+        ASSERT_EQ(output_layout_first, output_layout_second);
 
-    int y_size = output_layout_first.spatial(1);
-    int x_size = output_layout_first.spatial(0);
-    int f_size = output_layout_first.feature();
-    int b_size = output_layout_first.batch();
-    int f_offset = y_size * x_size;
-    int b_offset = f_size * f_offset;
-    for (int b = 0; b < b_size; ++b)
-    {
-        for (int f = 0; f < f_size; ++f)
-        {
-            for (int y = 0; y < y_size; ++y)
-            {
-                for (int x = 0; x < x_size; ++x)
-                {
-                    int idx = b * b_offset + f * f_offset + y * x_size + x;
-                    ASSERT_EQ(output_ptr_first[idx], output_ptr_second[idx]);
-                }
-            }
-        }
-    }
-}
+        int y_size = output_layout_first.spatial(1);
+        int x_size = output_layout_first.spatial(0);
+        int f_size = output_layout_first.feature();
+        int b_size = output_layout_first.batch();
+        int f_offset = y_size * x_size;
+        int b_offset = f_size * f_offset;
+        for (int b = 0; b < b_size; ++b)
+        {
+            for (int f = 0; f < f_size; ++f)
+            {
+                for (int y = 0; y < y_size; ++y)
+                {
+                    for (int x = 0; x < x_size; ++x)
+                    {
+                        int idx = b * b_offset + f * f_offset + y * x_size + x;
+                        ASSERT_EQ(output_ptr_first[idx], output_ptr_second[idx]);
+                    }
+                }
+            }
+        }
+    }
+
+    void test_shared_mem_pool_diff_batches(bool is_caching_test) {
+        // We need a new engine here to get correct get_max_used_device_memory() result
+        // If we reuse common engine, then max memory value will be taken from some previously executed tests
+        // as it's tracked within engine instance
+        auto engine = create_test_engine();
+        auto batch_8 = 8;
+        auto batch_1 = 1;
+        auto feature_num = 3;
+        auto inp_x_size = 4;
+        auto inp_y_size = 4;
+        auto dt = data_types::f32;
+        auto fmt = format::bfyx;
+        layout lay_batch_1 = { dt, fmt, { tensor(spatial(inp_x_size, inp_y_size), feature(feature_num), batch(batch_1)) }};
+        layout lay_batch_8 = { dt, fmt, { tensor(spatial(inp_x_size, inp_y_size), feature(feature_num), batch(batch_8)) }};
+        auto input_1 = engine->allocate_memory(lay_batch_1);
+        auto input_8 = engine->allocate_memory(lay_batch_8);
+        auto weights = engine->allocate_memory({ dt, fmt, { 1, 3, 3, 2 } });
+
+        std::vector<float> dummy_input_data_1 = generate_random_1d<float>(batch_1 * feature_num * inp_x_size * inp_y_size, 0, 1);
+        std::vector<float> dummy_input_data_8 = generate_random_1d<float>(batch_8 * feature_num * inp_x_size * inp_y_size, 0, 1);
+
+        set_values(input_1, dummy_input_data_1);
+        set_values(input_8, dummy_input_data_8);
+        set_values(weights, { 0.10f, 0.2f, 0.1f, 0.2f, 0.1f, 0.2f,
+                              0.10f, 0.2f, 0.1f, 0.2f, 0.1f, 0.2f,
+                              0.10f, 0.2f, 0.1f, 0.2f, 0.1f, 0.2f });
+
+        topology topo(
+            input_layout("input", input_8->get_layout()),
+            data("weights", weights),
+            convolution("conv", input_info("input"), { "weights" }, { 2, 1 }),
+            softmax("softmax", input_info("conv")));
+
+        ExecutionConfig config = get_test_default_config(*engine);
+        config.set_property(ov::intel_gpu::optimize_data(true));
+
+        network::ptr network_first = get_network(*engine, topo, config, get_test_stream_ptr(), is_caching_test);
+        network_first->set_input_data("input", input_8);
+        auto outputs = network_first->execute();
+
+        auto dev_info = engine->get_device_info();
+        ASSERT_EQ(engine->get_max_used_device_memory(), (uint64_t)4744);
+
+        topo.change_input_layout("input", input_1->get_layout());  // change input layout to batch=1
+
+        network::ptr network_second = get_network(*engine, topo, config, get_test_stream_ptr(), is_caching_test);
+        network_second->set_input_data("input", input_1);
+        auto outputs_second = network_second->execute();
+        ASSERT_EQ(engine->get_max_used_device_memory(), (uint64_t)5912);
+    }
+
+    void test_shared_dep_two_output(bool is_caching_test) {
+        // We need a new engine here to get correct get_max_used_device_memory() result
+        // If we reuse common engine, then max memory value will be taken from some previously executed tests
+        // as it's tracked within engine instance
+        auto engine = create_test_engine();
+
+        auto input_1 = engine->allocate_memory({ {1, 1, 4, 4}, data_types::f32, format::bfyx });
+        set_random_values<float>(input_1);
+
+        // build and execute network
+        topology topo;
+        topo.add(cldnn::data("constant_0_0", input_1));
+        topo.add(cldnn::concatenation("result_1_0", { input_info("constant_0_0") }, 0));
+        topo.add(cldnn::concatenation("result_2_0", { input_info("constant_0_0") }, 0));
+
+        ExecutionConfig config = get_test_default_config(*engine);
+        config.set_property(ov::intel_gpu::optimize_data(true));
+
+        network::ptr network = get_network(*engine, topo, config, get_test_stream_ptr(), is_caching_test);
+        auto outputs = network->execute();
+        ASSERT_EQ(engine->get_max_used_device_memory(), (uint64_t)192);
+    }
+
+    void test_non_opt_intermidate_opt_after(bool is_caching_test) {
+        auto& engine = get_test_engine();
+        auto input_layout1 = layout(cldnn::data_types::f32, cldnn::format::bfyx, { 1, 1, 2, 2 });
+        auto input_layout2 = layout(cldnn::data_types::f32, cldnn::format::bfyx, { 1, 1, 2, 2 });
+
+        auto input_memory1 = engine.allocate_memory(input_layout1);
+        auto input_memory2 = engine.allocate_memory(input_layout2);
+        auto scale_memory = engine.allocate_memory(layout(cldnn::data_types::f32, cldnn::format::bfyx, { 1, 1, 1, 1 }));
+        auto data_memory = cldnn::data("scale_mem", scale_memory);
+
+        set_values(input_memory1, { 1.0f, 2.0f, 3.0f, 4.0f });
+        set_values(input_memory2, { 5.0f, 6.0f, 7.0f, 8.0f });
+        set_values(scale_memory, { 1.0f });
+
+        auto reshape_tensor = cldnn::tensor(8, 1, 1, 1);
+        auto input = cldnn::input_layout("input1", input_layout1);
+        auto input2 = cldnn::input_layout("input2", input_layout2);
+        auto concat = cldnn::concatenation("concat", { input_info("input1"), input_info("input2") }, 0);
+        auto reshape = cldnn::reshape("reshape", input_info("concat"), reshape_tensor);
+        auto crop1 = cldnn::crop("crop1", input_info("reshape"), { 1, 1, 1, 1 }, { 0, 0, 0, 0 });
+        auto crop2 = cldnn::crop("crop2", input_info("reshape"), { 1, 1, 1, 1 }, { 1, 0, 0, 0 });
+        auto eltwise1 = cldnn::eltwise("elt1", { input_info("crop1"), input_info("scale_mem") }, eltwise_mode::prod);
+        auto eltwise2 = cldnn::eltwise("elt2", { input_info("crop2"), input_info("scale_mem") }, eltwise_mode::prod);
+
+        auto topology = cldnn::topology(
+            input, input2,
+            concat,
+            reshape,
+            crop1, crop2,
+            eltwise1, eltwise2,
+            data_memory
+        );
+
+        ExecutionConfig config = get_test_default_config(engine);
+        config.set_property(ov::intel_gpu::optimize_data(false));
+
+        network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test);
+        network->set_input_data("input1", input_memory1);
+        network->set_input_data("input2", input_memory2);
+        auto outputs = network->execute();
+        ASSERT_EQ(outputs.size(), static_cast<size_t>(2));
+
+        auto out1 = outputs.at("elt1");
+        auto out2 = outputs.at("elt2");
+
+        cldnn::mem_lock<float> out1_ptr(out1.get_memory(), get_test_stream());
+        cldnn::mem_lock<float> out2_ptr(out2.get_memory(), get_test_stream());
+        ASSERT_EQ(out1_ptr[0], 1.0f);
+        ASSERT_EQ(out2_ptr[0], 2.0f);
+    }
+
+    void test_add_mem_dep(bool is_caching_test) {
+        auto& engine = get_test_engine();
+
+        auto input_layout1 = layout(cldnn::data_types::f32, cldnn::format::bfyx, { 1, 2, 2, 2 });
+
+        auto input_memory1 = engine.allocate_memory(input_layout1);
+        auto scale_memory = engine.allocate_memory(layout(cldnn::data_types::f32, cldnn::format::bfyx, { 1, 1, 1, 1 }));
+        auto data_memory = cldnn::data("scale_mem", scale_memory);
+
+        set_values(input_memory1, { 1.0f, 2.0f, 3.0f, 4.0f,
+                                    5.0f, 6.0f, 7.0f, 8.0f});
+        set_values(scale_memory, { 1.0f });
+
+        auto input = cldnn::input_layout("input1", input_layout1);
+        auto actv1 = cldnn::activation("input_activ1", input_info("input1"), activation_func::abs);
+        auto actv2 = cldnn::activation("input_activ2", input_info("input1"), activation_func::abs);
+        auto crop1 = cldnn::crop("crop1", input_info("input_activ1"), { 1, 1, 2, 2 }, { 0, 0, 0, 0 });
+        auto crop2 = cldnn::crop("crop2", input_info("input_activ2"), { 1, 1, 2, 2 }, { 0, 1, 0, 0 });
+        auto eltwise1 = cldnn::eltwise("elt1", { input_info("crop1"), input_info("scale_mem") }, eltwise_mode::prod);
+        auto eltwise2 = cldnn::eltwise("elt2", { input_info("crop2"), input_info("scale_mem") }, eltwise_mode::prod);
+        auto actv3 = cldnn::activation("out3", input_info("elt1"), activation_func::abs);
+        auto actv4 = cldnn::activation("out4", input_info("elt2"), activation_func::abs);
+
+        auto topology = cldnn::topology(
+            input,
+            crop1, crop2,
+            actv1, actv2,
+            eltwise1, eltwise2,
+            data_memory,
+            actv3, actv4
+        );
+
+        ExecutionConfig config = get_test_default_config(engine);
+        config.set_property(ov::intel_gpu::optimize_data(true));
+        network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test);
+        network->set_input_data("input1", input_memory1);
+        auto outputs = network->execute();
+        ASSERT_EQ(outputs.size(), static_cast<size_t>(2));
+
+        auto out1 = outputs.at("out3");
+        auto out2 = outputs.at("out4");
+
+        cldnn::mem_lock<float> out1_ptr(out1.get_memory(), get_test_stream());
+        cldnn::mem_lock<float> out2_ptr(out2.get_memory(), get_test_stream());
+        ASSERT_EQ(out1_ptr[0], 1.0f);
+        ASSERT_EQ(out1_ptr[1], 2.0f);
+        ASSERT_EQ(out1_ptr[2], 3.0f);
+        ASSERT_EQ(out1_ptr[3], 4.0f);
+
+        ASSERT_EQ(out2_ptr[0], 5.0f);
+        ASSERT_EQ(out2_ptr[1], 6.0f);
+        ASSERT_EQ(out2_ptr[2], 7.0f);
+        ASSERT_EQ(out2_ptr[3], 8.0f);
+    }
+};
+
+TEST_F(memory_pool, basic_non_padded_relu_pipe) {
+    this->test_basic_non_padded_relu_pipe(false);
+}
 
 TEST(memory_pool, shared_mem_pool_diff_batches) {
-    // We need a new engine here to get correct get_max_used_device_memory() result
-    // If we reuse common engine, then max memory value will be taken from some previously executed tests
-    // as it's tracked within engine instance
-    auto engine = create_test_engine();
-    auto batch_8 = 8;
-    auto batch_1 = 1;
-    auto feature_num = 3;
-    auto inp_x_size = 4;
-    auto inp_y_size = 4;
-    auto dt = data_types::f32;
-    auto fmt = format::bfyx;
-    layout lay_batch_1 = { dt, fmt, { tensor(spatial(inp_x_size, inp_y_size), feature(feature_num), batch(batch_1)) }};
-    layout lay_batch_8 = { dt, fmt, { tensor(spatial(inp_x_size, inp_y_size), feature(feature_num), batch(batch_8)) }};
-    auto input_1 = engine->allocate_memory(lay_batch_1);
-    auto input_8 = engine->allocate_memory(lay_batch_8);
-    auto weights = engine->allocate_memory({ dt, fmt, { 1, 3, 3, 2 } });
-
-    std::vector<float> dummy_input_data_1 = generate_random_1d<float>(batch_1 * feature_num * inp_x_size * inp_y_size, 0, 1);
-    std::vector<float> dummy_input_data_8 = generate_random_1d<float>(batch_8 * feature_num * inp_x_size * inp_y_size, 0, 1);
-
-    set_values(input_1, dummy_input_data_1);
-    set_values(input_8, dummy_input_data_8);
-    set_values(weights, { 0.10f, 0.2f, 0.1f, 0.2f, 0.1f, 0.2f,
-                          0.10f, 0.2f, 0.1f, 0.2f, 0.1f, 0.2f,
-                          0.10f, 0.2f, 0.1f, 0.2f, 0.1f, 0.2f });
-
-    topology topo(
-        input_layout("input", input_8->get_layout()),
-        data("weights", weights),
-        convolution("conv", input_info("input"), { "weights" }, { 2, 1 }),
-        softmax("softmax", input_info("conv")));
-
-    ExecutionConfig config = get_test_default_config(*engine);
-    config.set_property(ov::intel_gpu::optimize_data(true));
-
-    network network_first(*engine, topo, config);
-    network_first.set_input_data("input", input_8);
-    auto outputs = network_first.execute();
-
-    auto dev_info = engine->get_device_info();
-    ASSERT_EQ(engine->get_max_used_device_memory(), (uint64_t)4744);
-
-    topo.change_input_layout("input", input_1->get_layout());//change input layout to batch=1
-
-    network network_second(*engine, topo, config);
-    network_second.set_input_data("input", input_1);
-    auto outputs_second = network_second.execute();
-    ASSERT_EQ(engine->get_max_used_device_memory(), (uint64_t)5912);
-}
-
-TEST(memory_pool, shared_dep_two_output) {
-    // We need a new engine here to get correct get_max_used_device_memory() result
-    // If we reuse common engine, then max memory value will be taken from some previously executed tests
-    // as it's tracked within engine instance
-    auto engine = create_test_engine();
-
-    auto input_1 = engine->allocate_memory({ {1, 1, 4, 4}, data_types::f32, format::bfyx });
-    set_random_values<float>(input_1);
-
-    //build and execute network
-    topology topo;
-    topo.add(cldnn::data("constant_0_0", input_1));
-    topo.add(cldnn::concatenation("result_1_0", { input_info("constant_0_0") }, 0));
-    topo.add(cldnn::concatenation("result_2_0", { input_info("constant_0_0") }, 0));
-
-    ExecutionConfig config = get_test_default_config(*engine);
-    config.set_property(ov::intel_gpu::optimize_data(true));
-
-    network network(*engine, topo, config);
-    auto outputs = network.execute();
-    ASSERT_EQ(engine->get_max_used_device_memory(), (uint64_t)192);
-}
-
-TEST(memory_pool, non_opt_intermidate_opt_after) {
-    auto& engine = get_test_engine();
-    auto input_layout1 = layout(cldnn::data_types::f32, cldnn::format::bfyx, { 1, 1, 2, 2 });
-    auto input_layout2 = layout(cldnn::data_types::f32, cldnn::format::bfyx, { 1, 1, 2, 2 });
-
-    auto input_memory1 = engine.allocate_memory(input_layout1);
-    auto input_memory2 = engine.allocate_memory(input_layout2);
-    auto scale_memory = engine.allocate_memory(layout(cldnn::data_types::f32, cldnn::format::bfyx, { 1, 1, 1, 1 }));
-    auto data_memory = cldnn::data("scale_mem", scale_memory);
-
-    set_values(input_memory1, { 1.0f, 2.0f, 3.0f, 4.0f });
-    set_values(input_memory2, { 5.0f, 6.0f, 7.0f, 8.0f });
-    set_values(scale_memory, { 1.0f });
-
-    auto reshape_tensor = cldnn::tensor(8, 1, 1, 1);
-    auto input = cldnn::input_layout("input1", input_layout1);
-    auto input2 = cldnn::input_layout("input2", input_layout2);
-    auto concat = cldnn::concatenation("concat", { input_info("input1"), input_info("input2") }, 0);
-    auto reshape = cldnn::reshape("reshape", input_info("concat"), reshape_tensor);
-    auto crop1 = cldnn::crop("crop1", input_info("reshape"), { 1, 1, 1, 1 }, { 0, 0, 0, 0 });
-    auto crop2 = cldnn::crop("crop2", input_info("reshape"), { 1, 1, 1, 1 }, { 1, 0, 0, 0 });
-    auto eltwise1 = cldnn::eltwise("elt1", { input_info("crop1"), input_info("scale_mem") }, eltwise_mode::prod);
-    auto eltwise2 = cldnn::eltwise("elt2", { input_info("crop2"), input_info("scale_mem") }, eltwise_mode::prod);
-
-    auto topology = cldnn::topology(
-        input, input2,
-        concat,
-        reshape,
-        crop1, crop2,
-        eltwise1, eltwise2,
-        data_memory
-    );
-
-    ExecutionConfig config = get_test_default_config(engine);
-    config.set_property(ov::intel_gpu::optimize_data(false));
-
-    network network(engine, topology, config);
-    network.set_input_data("input1", input_memory1);
-    network.set_input_data("input2", input_memory2);
-    auto outputs = network.execute();
-    ASSERT_EQ(outputs.size(), static_cast<size_t>(2));
-
-    auto out1 = outputs.at("elt1");
-    auto out2 = outputs.at("elt2");
-
-    cldnn::mem_lock<float> out1_ptr(out1.get_memory(), get_test_stream());
-    cldnn::mem_lock<float> out2_ptr(out2.get_memory(), get_test_stream());
-    ASSERT_EQ(out1_ptr[0], 1.0f);
-    ASSERT_EQ(out2_ptr[0], 2.0f);
-}
-
-TEST(memory_pool, add_mem_dep_test) {
-    auto& engine = get_test_engine();
-
-    auto input_layout1 = layout(cldnn::data_types::f32, cldnn::format::bfyx, { 1, 2, 2, 2 });
-
-    auto input_memory1 = engine.allocate_memory(input_layout1);
-    auto scale_memory = engine.allocate_memory(layout(cldnn::data_types::f32, cldnn::format::bfyx, { 1, 1, 1, 1 }));
-    auto data_memory = cldnn::data("scale_mem", scale_memory);
-
-    set_values(input_memory1, { 1.0f, 2.0f, 3.0f, 4.0f,
-                                5.0f, 6.0f, 7.0f, 8.0f});
-    set_values(scale_memory, { 1.0f });
-
-    auto input = cldnn::input_layout("input1", input_layout1);
-    auto actv1 = cldnn::activation("input_activ1", input_info("input1"), activation_func::abs);
-    auto actv2 = cldnn::activation("input_activ2", input_info("input1"), activation_func::abs);
-    auto crop1 = cldnn::crop("crop1", input_info("input_activ1"), { 1, 1, 2, 2 }, { 0, 0, 0, 0 });
-    auto crop2 = cldnn::crop("crop2", input_info("input_activ2"), { 1, 1, 2, 2 }, { 0, 1, 0, 0 });
-    auto eltwise1 = cldnn::eltwise("elt1", { input_info("crop1"), input_info("scale_mem") }, eltwise_mode::prod);
-    auto eltwise2 = cldnn::eltwise("elt2", { input_info("crop2"), input_info("scale_mem") }, eltwise_mode::prod);
-    auto actv3 = cldnn::activation("out3", input_info("elt1"), activation_func::abs);
-    auto actv4 = cldnn::activation("out4", input_info("elt2"), activation_func::abs);
-
-    auto topology = cldnn::topology(
-        input,
-        crop1, crop2,
-        actv1, actv2,
-        eltwise1, eltwise2,
-        data_memory,
-        actv3, actv4
-    );
-
-    ExecutionConfig config = get_test_default_config(engine);
-    config.set_property(ov::intel_gpu::optimize_data(true));
-    network network(engine, topology, config);
-    network.set_input_data("input1", input_memory1);
-    auto outputs = network.execute();
-    ASSERT_EQ(outputs.size(), static_cast<size_t>(2));
-
-    auto out1 = outputs.at("out3");
-    auto out2 = outputs.at("out4");
-
-    cldnn::mem_lock<float> out1_ptr(out1.get_memory(), get_test_stream());
-    cldnn::mem_lock<float> out2_ptr(out2.get_memory(), get_test_stream());
-    ASSERT_EQ(out1_ptr[0], 1.0f);
-    ASSERT_EQ(out1_ptr[1], 2.0f);
-    ASSERT_EQ(out1_ptr[2], 3.0f);
-    ASSERT_EQ(out1_ptr[3], 4.0f);
-
-    ASSERT_EQ(out2_ptr[0], 5.0f);
-    ASSERT_EQ(out2_ptr[1], 6.0f);
-    ASSERT_EQ(out2_ptr[2], 7.0f);
-    ASSERT_EQ(out2_ptr[3], 8.0f);
+-    // We need a new engine here to get correct get_max_used_device_memory() result
+-    // If we reuse common engine, then max memory value will be taken from some previously executed tests
+-    // as it's tracked within engine instance
+-    auto engine = create_test_engine();
+-    auto batch_8 = 8;
+-    auto batch_1 = 1;
+-    auto feature_num = 3;
+-    auto inp_x_size = 4;
+-    auto inp_y_size = 4;
+-    auto dt = data_types::f32;
+-    auto fmt = format::bfyx;
+-    layout lay_batch_1 = { dt, fmt, { tensor(spatial(inp_x_size, inp_y_size), feature(feature_num), batch(batch_1)) }};
+-    layout lay_batch_8 = { dt, fmt, { tensor(spatial(inp_x_size, inp_y_size), feature(feature_num), batch(batch_8)) }};
+-    auto input_1 = engine->allocate_memory(lay_batch_1);
+-    auto input_8 = engine->allocate_memory(lay_batch_8);
+-    auto weights = engine->allocate_memory({ dt, fmt, { 1, 3, 3, 2 } });
+-
+-    std::vector<float> dummy_input_data_1 = generate_random_1d<float>(batch_1 * feature_num * inp_x_size * inp_y_size, 0, 1);
+-    std::vector<float> dummy_input_data_8 = generate_random_1d<float>(batch_8 * feature_num * inp_x_size * inp_y_size, 0, 1);
+-
+-    set_values(input_1, dummy_input_data_1);
+-    set_values(input_8, dummy_input_data_8);
+-    set_values(weights, { 0.10f, 0.2f, 0.1f, 0.2f, 0.1f, 0.2f,
+-                          0.10f, 0.2f, 0.1f, 0.2f, 0.1f, 0.2f,
+-                          0.10f, 0.2f, 0.1f, 0.2f, 0.1f, 0.2f });
+-
+-    topology topo(
+-        input_layout("input", input_8->get_layout()),
+-        data("weights", weights),
+-        convolution("conv", input_info("input"), { "weights" }, { 2, 1 }),
+-        softmax("softmax", input_info("conv")));
+-
+-    ExecutionConfig config = get_test_default_config(*engine);
+-    config.set_property(ov::intel_gpu::optimize_data(true));
+-
+-    network network_first(*engine, topo, config);
+-    network_first.set_input_data("input", input_8);
+-    auto outputs = network_first.execute();
+-
+-    auto dev_info = engine->get_device_info();
+-    ASSERT_EQ(engine->get_max_used_device_memory(), (uint64_t)4744);
+-
+-    topo.change_input_layout("input", input_1->get_layout());//change input layout to batch=1
+-
+-    network network_second(*engine, topo, config);
+-    network_second.set_input_data("input", input_1);
+-    auto outputs_second = network_second.execute();
+-    ASSERT_EQ(engine->get_max_used_device_memory(), (uint64_t)5912);
+-}
+-
+-TEST(memory_pool, shared_dep_two_output) {
+-    // We need a new engine here to get correct get_max_used_device_memory() result
+-    // If we reuse common engine, then max memory value will be taken from some previously executed tests
+-    // as it's tracked within engine instance
+-    auto engine = create_test_engine();
+-
+-    auto input_1 = engine->allocate_memory({ {1, 1, 4, 4}, data_types::f32, format::bfyx });
+-    set_random_values<float>(input_1);
+-
+-    //build and execute network
+-    topology topo;
+-    topo.add(cldnn::data("constant_0_0", input_1));
+-    topo.add(cldnn::concatenation("result_1_0", { input_info("constant_0_0") }, 0));
+-    topo.add(cldnn::concatenation("result_2_0", { input_info("constant_0_0") }, 0));
+-
+-    ExecutionConfig config = get_test_default_config(*engine);
+-    config.set_property(ov::intel_gpu::optimize_data(true));
+-
+-    network network(*engine, topo, config);
+-    auto outputs = network.execute();
+-    ASSERT_EQ(engine->get_max_used_device_memory(), (uint64_t)192);
+-}
+-
+-TEST(memory_pool, non_opt_intermidate_opt_after) {
+-    auto& engine = get_test_engine();
+-    auto input_layout1 = layout(cldnn::data_types::f32, cldnn::format::bfyx, { 1, 1, 2, 2 });
+-    auto input_layout2 = layout(cldnn::data_types::f32, cldnn::format::bfyx, { 1, 1, 2, 2 });
+-
+-    auto input_memory1 = engine.allocate_memory(input_layout1);
+-    auto input_memory2 = engine.allocate_memory(input_layout2);
+-    auto scale_memory = engine.allocate_memory(layout(cldnn::data_types::f32, cldnn::format::bfyx, { 1, 1, 1, 1 }));
+-    auto data_memory = cldnn::data("scale_mem", scale_memory);
+-
+-    set_values(input_memory1, { 1.0f, 2.0f, 3.0f, 4.0f });
+-    set_values(input_memory2, { 5.0f, 6.0f, 7.0f, 8.0f });
+-    set_values(scale_memory, { 1.0f });
+-
+-    auto reshape_tensor = cldnn::tensor(8, 1, 1, 1);
+-    auto input = cldnn::input_layout("input1", input_layout1);
+-    auto input2 = cldnn::input_layout("input2", input_layout2);
+-    auto concat = cldnn::concatenation("concat", { input_info("input1"), input_info("input2") }, 0);
+-    auto reshape = cldnn::reshape("reshape", input_info("concat"), reshape_tensor);
+-    auto crop1 = cldnn::crop("crop1", input_info("reshape"), { 1, 1, 1, 1 }, { 0, 0, 0, 0 });
+-    auto crop2 = cldnn::crop("crop2", input_info("reshape"), { 1, 1, 1, 1 }, { 1, 0, 0, 0 });
+-    auto eltwise1 = cldnn::eltwise("elt1", { input_info("crop1"), input_info("scale_mem") }, eltwise_mode::prod);
+-    auto eltwise2 = cldnn::eltwise("elt2", { input_info("crop2"), input_info("scale_mem") }, eltwise_mode::prod);
+-
+-    auto topology = cldnn::topology(
+-        input, input2,
+-        concat,
+-        reshape,
+-        crop1, crop2,
+-        eltwise1, eltwise2,
+-        data_memory
+-    );
+-
+-    ExecutionConfig config = get_test_default_config(engine);
+-    config.set_property(ov::intel_gpu::optimize_data(false));
+-
+-    network network(engine, topology, config);
+-    network.set_input_data("input1", input_memory1);
+-    network.set_input_data("input2", input_memory2);
+-    auto outputs = network.execute();
+-    ASSERT_EQ(outputs.size(), static_cast<size_t>(2));
+-
+-    auto out1 = outputs.at("elt1");
+-    auto out2 = outputs.at("elt2");
+-
+-    cldnn::mem_lock<float> out1_ptr(out1.get_memory(), get_test_stream());
+-    cldnn::mem_lock<float> out2_ptr(out2.get_memory(), get_test_stream());
+-    ASSERT_EQ(out1_ptr[0], 1.0f);
+-    ASSERT_EQ(out2_ptr[0], 2.0f);
+-}
+-
+-TEST(memory_pool, add_mem_dep_test) {
+-    auto& engine = get_test_engine();
+-
+-    auto input_layout1 = layout(cldnn::data_types::f32, cldnn::format::bfyx, { 1, 2, 2, 2 });
+-
+-    auto input_memory1 = engine.allocate_memory(input_layout1);
+-    auto scale_memory = engine.allocate_memory(layout(cldnn::data_types::f32, cldnn::format::bfyx, { 1, 1, 1, 1 }));
+-    auto data_memory = cldnn::data("scale_mem", scale_memory);
+-
+-    set_values(input_memory1, { 1.0f, 2.0f, 3.0f, 4.0f,
+-                                5.0f, 6.0f, 7.0f, 8.0f});
+-    set_values(scale_memory, { 1.0f });
+-
+-    auto input = cldnn::input_layout("input1", input_layout1);
+-    auto actv1 = cldnn::activation("input_activ1", input_info("input1"), activation_func::abs);
+-    auto actv2 = cldnn::activation("input_activ2", input_info("input1"), activation_func::abs);
+-    auto crop1 = cldnn::crop("crop1", input_info("input_activ1"), { 1, 1, 2, 2 }, { 0, 0, 0, 0 });
+-    auto crop2 = cldnn::crop("crop2", input_info("input_activ2"), { 1, 1, 2, 2 }, { 0, 1, 0, 0 });
+-    auto eltwise1 = cldnn::eltwise("elt1", { input_info("crop1"), input_info("scale_mem") }, eltwise_mode::prod);
+-    auto eltwise2 = cldnn::eltwise("elt2", { input_info("crop2"), input_info("scale_mem") }, eltwise_mode::prod);
+-    auto actv3 = cldnn::activation("out3", input_info("elt1"), activation_func::abs);
+-    auto actv4 = cldnn::activation("out4", input_info("elt2"), activation_func::abs);
+-
+-    auto topology = cldnn::topology(
+-        input,
+-        crop1, crop2,
+-        actv1, actv2,
+-        eltwise1, eltwise2,
+-        data_memory,
+-        actv3, actv4
+-    );
+-
+-    ExecutionConfig config = get_test_default_config(engine);
+-    config.set_property(ov::intel_gpu::optimize_data(true));
+-    network network(engine, topology, config);
+-    network.set_input_data("input1", input_memory1);
+-    auto outputs = network.execute();
+-    ASSERT_EQ(outputs.size(), static_cast<size_t>(2));
+-
+-    auto out1 = outputs.at("out3");
+-    auto out2 = outputs.at("out4");
+-
+-    cldnn::mem_lock<float> out1_ptr(out1.get_memory(), get_test_stream());
+-    cldnn::mem_lock<float> out2_ptr(out2.get_memory(), get_test_stream());
+-    ASSERT_EQ(out1_ptr[0], 1.0f);
+-    ASSERT_EQ(out1_ptr[1], 2.0f);
+-    ASSERT_EQ(out1_ptr[2], 3.0f);
+-    ASSERT_EQ(out1_ptr[3], 4.0f);
+-
+-    ASSERT_EQ(out2_ptr[0], 5.0f);
+-    ASSERT_EQ(out2_ptr[1], 6.0f);
+-    ASSERT_EQ(out2_ptr[2], 7.0f);
+-    ASSERT_EQ(out2_ptr[3], 8.0f);
+-}
+
+TEST_F(memory_pool, basic_non_padded_relu_and_pooling_pipe) {
+    this->test_basic_non_padded_relu_and_pooling_pipe(false);
+}
+
+TEST_F(memory_pool, multi_outputs_network) {
+    this->test_multi_outputs_network(false);
+}
+
+TEST_F(memory_pool, oooq) {
+    this->test_oooq(false);
+}
+
+TEST_F(memory_pool, DISABLED_shared_mem_pool_same_topology_twice) {
+    this->test_shared_mem_pool_same_topology_twice();
+}
+
+TEST_F(memory_pool, DISABLED_shared_mem_pool_same_topology_twice_weights) {
+    this->test_shared_mem_pool_same_topology_twice_weights();
+}
+
+TEST_F(memory_pool, shared_mem_pool_diff_batches) {
+    this->test_shared_mem_pool_diff_batches(false);
+}
+
+TEST_F(memory_pool, shared_dep_two_output) {
+    this->test_shared_dep_two_output(false);
+}
+
+TEST_F(memory_pool, non_opt_intermidate_opt_after) {
+    this->test_non_opt_intermidate_opt_after(false);
+}
+
+TEST_F(memory_pool, add_mem_dep_test) {
+    this->test_add_mem_dep(false);
+}
+
+#ifdef RUN_ALL_MODEL_CACHING_TESTS
+TEST_F(memory_pool, basic_non_padded_relu_pipe_cached) {
+    this->test_basic_non_padded_relu_pipe(true);
+}
+
+TEST_F(memory_pool, basic_non_padded_relu_and_pooling_pipe_cached) {
+    this->test_basic_non_padded_relu_and_pooling_pipe(true);
+}
+
+TEST_F(memory_pool, multi_outputs_network_cached) {
+    this->test_multi_outputs_network(true);
+}
+
+TEST_F(memory_pool, oooq_cached) {
+    this->test_oooq(true);
+}
+
+TEST_F(memory_pool, shared_mem_pool_diff_batches_cached) {
+    this->test_shared_mem_pool_diff_batches(true);
+}
+
+TEST_F(memory_pool, shared_dep_two_output_cached) {
+    this->test_shared_dep_two_output(true);
+}
+
+TEST_F(memory_pool, non_opt_intermidate_opt_after_cached) {
+    this->test_non_opt_intermidate_opt_after(true);
+}
+#endif
+
+TEST_F(memory_pool, add_mem_dep_test_cached) {
+    this->test_add_mem_dep(true);
+}
+}  // namespace