From 5c17c7e0a0c3687f9709d7d5023a8b0a17d50447 Mon Sep 17 00:00:00 2001
From: Paul Youngsoo Ahn
Date: Wed, 9 Nov 2022 10:34:40 +0900
Subject: [PATCH] [GPU] Fix multistream issue for dynamic shape (#13433)

- Separate kernels_cache::add_kernel from the factory (choose_impl)
- Reset kernels_cache._kernels after kernels_cache.compile()
- Create a cldnn unit test case to check multi-stream processing
---
 .../include/intel_gpu/graph/network.hpp       | 18 +++++
 .../include/intel_gpu/graph/program.hpp       |  7 --
 .../graph/graph_optimizer/compile_graph.cpp   |  6 +-
 .../graph_optimizer/post_input_reorder.cpp    |  4 +
 .../graph_optimizer/post_optimize_weights.cpp |  9 ++-
 .../remove_redundant_reorders.cpp             |  7 +-
 .../src/graph/impls/ocl/primitive_base.hpp    | 18 +++--
 .../src/graph/include/primitive_inst.h        |  3 +
 src/plugins/intel_gpu/src/graph/network.cpp   |  8 ++
 .../intel_gpu/src/graph/primitive_inst.cpp    | 26 +++---
 src/plugins/intel_gpu/src/graph/program.cpp   |  4 -
 .../intel_gpu/src/runtime/kernels_cache.cpp   | 64 ++++++++++++---
 .../intel_gpu/src/runtime/kernels_cache.hpp   |  2 +
 .../test_cases/multiple_streams_gpu_test.cpp  | 79 +++++++++++++++++++
 14 files changed, 209 insertions(+), 46 deletions(-)
 create mode 100644 src/plugins/intel_gpu/tests/test_cases/multiple_streams_gpu_test.cpp

diff --git a/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp b/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp
index 42092db3e58..805db2b576b 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp
@@ -11,6 +11,7 @@
 #include "intel_gpu/runtime/engine.hpp"
 #include "intel_gpu/runtime/event.hpp"
 #include "intel_gpu/runtime/stream.hpp"
+#include "intel_gpu/runtime/lru_cache.hpp"

 #include
 #include
@@ -216,6 +217,15 @@ public:
     /// Returns memory state @p variable_id of stateful network
     VariableState& get_variable_memory(const std::string &variable_id);

+    /// Returns kernels_cache
+    kernels_cache& get_kernels_cache() const { return *_kernels_cache; }
+
+    /// Returns implementations_cache
+    ImplementationsCache& get_implementations_cache() const { return *_impls_cache; }
+
+    /// Returns in_mem_kernels_cache
+    KernelsCache& get_in_mem_kernels_cache() const { return *_in_mem_kernels_cache; }
+
 private:
     using output_chains_map = std::map<primitive_id, std::vector<std::shared_ptr<primitive_inst>>>;
     uint32_t net_id = 0;
@@ -249,5 +259,13 @@ private:
     void check_names();
     void add_default_output_chains();
     output_chains_map::iterator add_output_chain(std::shared_ptr<primitive_inst>& p_inst);
+
+    std::unique_ptr<kernels_cache> _kernels_cache;
+    // Moved from cldnn::program to cldnn::network to fix a multi-stream (multi-thread) issue.
+    std::unique_ptr<ImplementationsCache> _impls_cache;
+    std::unique_ptr<KernelsCache> _in_mem_kernels_cache;
+    // TODO: the initial version uses unlimited caches. Adjust the capacities once the dynamic flow works on a wide set of models.
+    const size_t _impls_cache_capacity = 0;
+    const size_t _in_mem_kernels_cache_capacity = 0;
 };
 }  // namespace cldnn
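The heart of the fix is visible in this header: every cldnn::network (one per stream) now owns its own kernels_cache, ImplementationsCache, and in-memory KernelsCache, where previously all streams shared the single program-level caches and raced on them during dynamic-shape compilation. Below is a minimal sketch of the ownership pattern, using simplified stand-ins rather than the real cldnn classes (a capacity of 0 means "unbounded", matching the TODO above):

    #include <memory>
    #include <string>
    #include <unordered_map>

    struct impl_t {};  // stand-in for a compiled primitive implementation

    // Stand-in for cldnn's LruCache; eviction here is a placeholder.
    class simple_cache {
    public:
        explicit simple_cache(size_t capacity) : _capacity(capacity) {}
        bool has(const std::string& key) const { return _map.count(key) != 0; }
        std::shared_ptr<impl_t> get(const std::string& key) const { return _map.at(key); }
        void add(const std::string& key, std::shared_ptr<impl_t> value) {
            if (_capacity != 0 && _map.size() >= _capacity)
                _map.erase(_map.begin());  // naive eviction; the real cache evicts least-recently-used
            _map.emplace(key, std::move(value));
        }
    private:
        size_t _capacity;
        std::unordered_map<std::string, std::shared_ptr<impl_t>> _map;
    };

    // One instance per stream, so cache lookups and insertions during
    // dynamic-shape inference never race with another stream.
    struct per_stream_network {
        simple_cache impls_cache{0};           // layout key -> implementation
        simple_cache in_mem_kernels_cache{0};  // layout key -> cached entry (a reorder kernel in the real code)
    };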
diff --git a/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp b/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp
index be10119bc6b..0c65e6ca846 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp
@@ -247,8 +247,6 @@ public:
     void load_tuning_cache();
     std::shared_ptr<kernel_selector::TuningCache> get_tuning_cache() const { return tuning_cache; }

-    ImplementationsCache& get_implementations_cache() const { return *_impls_cache; }
-    KernelsCache& get_in_mem_kernels_cache() const { return *_in_mem_kernels_cache; }
     // returns {-1, -1} if it failed to estimate by allocating given batch size
     std::pair<int64_t, int64_t> get_estimated_device_mem_usage();
@@ -261,11 +259,6 @@ private:
     stream::ptr _stream;
     // TODO: Consider moving it to engine
     std::unique_ptr<kernels_cache> _kernels_cache;
-    std::unique_ptr<ImplementationsCache> _impls_cache;
-    std::unique_ptr<KernelsCache> _in_mem_kernels_cache;
-    // TODO: initial version use unlimited caches. Need to adjust it once dynamic flow works on wide set of models.
-    const size_t _impls_cache_capacity = 0;
-    const size_t _in_mem_kernels_cache_capacity = 0;
     build_options options;
     std::list<program_node*> inputs;
     std::vector<program_node*> outputs;
diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp
index b5c6f224a75..af001ea65c3 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp
@@ -34,9 +34,13 @@ void compile_graph::run(program& p) {
     for (size_t idx = 0; idx < proc_order.size(); idx++) {
         auto& node = *(std::next(proc_order.begin(), idx));
         if (!node->is_type<data>() && !(node->is_type<mutable_data>() && node->get_dependencies().empty()) && !node->is_dynamic()) {
-            tasks.push_back([node, &exception] {
+            tasks.push_back([node, &p, &exception] {
                 try {
                     node->selected_impl = node->type()->choose_impl(*node);
+                    if (node->selected_impl) {
+                        auto kernel_ids = p.get_kernels_cache().add_kernels_source(node->selected_impl->get_kernels_source());
+                        node->selected_impl->set_kernel_ids(kernel_ids);
+                    }
                 } catch(...) {
                     exception = std::current_exception();
                 }
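The change above establishes the pattern used throughout this patch: choose_impl() no longer registers kernel code with a cache as a side effect; instead the caller collects the implementation's kernel sources and registers them with whichever kernels_cache it owns (the program's cache here at build time, the network's cache at runtime). A compilable sketch of the idea, with illustrative stand-in types rather than the real cldnn API:

    #include <memory>
    #include <string>
    #include <vector>

    // Stand-ins for cldnn::kernel_string, cldnn::primitive_impl and cldnn::kernels_cache.
    struct source { std::string entry_point; std::string code; };

    struct impl {
        std::vector<std::shared_ptr<source>> sources;  // produced by the factory
        std::vector<std::string> kernel_ids;           // assigned by the cache owner
    };

    struct cache {
        size_t next_id = 0;
        std::vector<std::string> add_kernels_source(const std::vector<std::shared_ptr<source>>& srcs) {
            std::vector<std::string> ids;
            ids.reserve(srcs.size());
            for (const auto& s : srcs)
                ids.push_back(s->entry_point + "_" + std::to_string(next_id++));  // unique per cache
            return ids;  // the real cache also queues the code for batched compilation
        }
    };

    // The caller, not the factory, decides which cache the kernels land in.
    void register_impl(impl& i, cache& c) {
        i.kernel_ids = c.add_kernels_source(i.sources);
    }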
diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/post_input_reorder.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/post_input_reorder.cpp
index a646ac57ea2..ff95aa1ad86 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/post_input_reorder.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/post_input_reorder.cpp
@@ -66,6 +66,10 @@ void post_input_reorder::run(program& p) {
             reorder.get_output_layout(false);
             node->set_output_layout(previous_layout, false);
             reorder.set_selected_impl(reorder.type()->choose_impl(reorder));
+            if (auto impl = reorder.get_selected_impl()) {
+                auto kernel_ids = p.get_kernels_cache().add_kernels_source(impl->get_kernels_source());
+                impl->set_kernel_ids(kernel_ids);
+            }
         }
     }
 }
diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp
index 346318f74ad..6eca63349f7 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp
@@ -53,8 +53,13 @@ void post_optimize_weights::optimize_weights(T& node, program& p) {

         // Don't run impl selection to avoid double compilation of reorder kernels
         // in main program and internal program for constant propagation
-        if (!g_node.is_constant())
-            g_node.selected_impl = g_node.type()->choose_impl(g_node);
+        if (!g_node.is_constant()) {
+            g_node.set_selected_impl(g_node.type()->choose_impl(g_node));
+            if (auto impl = g_node.get_selected_impl()) {
+                auto kernel_ids = p.get_kernels_cache().add_kernels_source(impl->get_kernels_source());
+                impl->set_kernel_ids(kernel_ids);
+            }
+        }
     }
 }
diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp
index c90a282c8f1..126787b91ca 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp
@@ -36,8 +36,11 @@ void remove_redundant_reorders::run(program& p) {
             return;

         node.set_unique_id();
-        auto new_impl = node.type()->choose_impl(node);
-        node.set_selected_impl(std::move(new_impl));
+        node.set_selected_impl(node.type()->choose_impl(node));
+        if (auto impl = node.get_selected_impl()) {
+            auto kernel_ids = p.get_kernels_cache().add_kernels_source(impl->get_kernels_source());
+            impl->set_kernel_ids(kernel_ids);
+        }
     };

     // Fuse reorders into primitives
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp
index 19d7213ec58..370aa55e04c 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp
@@ -50,12 +50,6 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl<PType> {
         _kernel_data.weightsReorderParams.engine = kernel_selector::generic_kernel_params::Engine::NONE;
         _kernel_data.weightsReorderParams.cpuKernel = nullptr;
         _kernel_data.weightsReorderParams.clKernel = nullptr;
-
-        _kernel_ids.reserve(kd.kernels.size());
-        // Add selected kernels to kernels_cache for the following compilation and save output ids
-        for (size_t i = 0; i < kd.kernels.size(); ++i) {
-            _kernel_ids.emplace_back(arg.get_program().add_kernel(kd.kernels[i].code.kernelString));
-        }
     }

     bool is_cpu() const override { return false; }
@@ -198,6 +192,18 @@ protected:
         bool group_events = (all_events.size() > 1);
         return aggregate_events(all_events, stream, group_events);
     }
+
+    void set_kernel_ids(std::vector<kernel_id> kernel_ids) override {
+        _kernel_ids = kernel_ids;
+    }
+
+    std::vector<std::shared_ptr<kernel_string>> get_kernels_source() override {
+        std::vector<std::shared_ptr<kernel_string>> kernel_strings;
+        for (size_t i = 0; i < _kernel_data.kernels.size(); ++i) {
+            kernel_strings.push_back(_kernel_data.kernels[i].code.kernelString);
+        }
+        return kernel_strings;
+    }
 };

 }  // namespace ocl
diff --git a/src/plugins/intel_gpu/src/graph/include/primitive_inst.h b/src/plugins/intel_gpu/src/graph/include/primitive_inst.h
index d10d76a942c..6e0d9c9c054 100644
--- a/src/plugins/intel_gpu/src/graph/include/primitive_inst.h
+++ b/src/plugins/intel_gpu/src/graph/include/primitive_inst.h
@@ -54,6 +54,9 @@ struct primitive_impl {
     virtual std::vector<kernel_id> get_kernel_ids() { return {}; }
+    virtual std::vector<std::shared_ptr<kernel_string>> get_kernels_source() { return {}; }
+    virtual void set_kernels(std::vector<kernel::ptr>) {}
+    virtual void set_kernel_ids(std::vector<kernel_id> kernel_ids) {}

     // If this flag is set as false, the memory allocated for this primitive is not allowed to be reused
     bool can_reuse_memory = true;
diff --git a/src/plugins/intel_gpu/src/graph/network.cpp b/src/plugins/intel_gpu/src/graph/network.cpp
index 54785b67d20..665ebcfc18a 100644
--- a/src/plugins/intel_gpu/src/graph/network.cpp
+++ b/src/plugins/intel_gpu/src/graph/network.cpp
@@ -31,6 +31,7 @@
 #include "kernel_selector_helper.h"
 #include "program_helpers.h"
 #include "runtime/cldnn_itt.hpp"
+#include "kernels_cache.hpp"

 #include
 #include
@@ -293,6 +294,13 @@ network::network(program::ptr program, stream::ptr stream, bool is_internal, boo
     build_exec_order();
     validate_primitives();
     add_default_output_chains();
+
+    if (is_dynamic()) {
+        _kernels_cache = std::unique_ptr<kernels_cache>(new kernels_cache(program->get_engine(), program->get_id(),
+                                                        kernel_selector::KernelBase::get_db().get_batch_header_str()));
+        _impls_cache = std::unique_ptr<ImplementationsCache>(new ImplementationsCache(_impls_cache_capacity));
+        _in_mem_kernels_cache = std::unique_ptr<KernelsCache>(new KernelsCache(_in_mem_kernels_cache_capacity));
+    }
 }

 network::network(engine& engine,
diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
index 9de819e7a81..7487cc75ef4 100644
--- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
+++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
@@ -286,17 +286,19 @@ void primitive_inst::update_impl() {
     };

     auto layout_key = get_layout_key();
-    auto& cache = _network.get_program()->get_implementations_cache();
+    auto& cache = get_network().get_implementations_cache();
     if (cache.has(layout_key)) {
         _impl = cache.get(layout_key)->clone();
         GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(true);
     } else {
-        auto lru = cache.get_lru_element();
         _impl = _node->type()->choose_impl(*_node, *_impl_params);
-        _network.get_program()->compile();
-        _impl->init_kernels(_network.get_program()->get_kernels_cache());
+        auto& kernels_cache = get_network().get_kernels_cache();
+        auto kernel_ids = kernels_cache.add_kernels_source(_impl->get_kernels_source());
+        _impl->set_kernel_ids(kernel_ids);
+        kernels_cache.compile();
+        _impl->init_kernels(kernels_cache);
         cache.add(layout_key, _impl->clone());
-        _network.get_program()->get_kernels_cache().reset();
+        kernels_cache.reset();
     }

     reset_shape_change();
@@ -560,8 +562,6 @@ event::ptr primitive_inst::update_weights() {
     auto original_weights_memory = dep_memory_ptr(weights_idx);
     auto original_layout = original_weights_memory->get_layout();
     layout expected_layout = from_weights_tensor(weights_params.dest);
-
-    auto& program = _node->get_program();
     auto& engine = _network.get_engine();

     auto get_layout_key = [&]() -> std::string {
@@ -574,7 +574,7 @@ event::ptr primitive_inst::update_weights() {
     cldnn::kernel::ptr kernel = nullptr;
     auto layout_key = get_layout_key();
     if (layout_key != "") {
-        auto& cache = program.get_in_mem_kernels_cache();
+        auto& cache = get_network().get_in_mem_kernels_cache();
         if (cache.has(layout_key)) {
             GPU_DEBUG_IF(debug_config->verbose >= 4) {
                 GPU_DEBUG_COUT << id() << ": reorder weights (cached) from " << original_layout << "\nto " << expected_layout << std::endl;
@@ -585,14 +585,16 @@ event::ptr primitive_inst::update_weights() {
             GPU_DEBUG_IF(debug_config->verbose >= 4) {
                 GPU_DEBUG_COUT << id() << ": reorder weights from " << original_layout << "\nto " << expected_layout << std::endl;
             }
-            auto _kernel_id = program.add_kernel(weights_params.clKernel->code.kernelString);
-            program.compile();
-            kernel = program.get_kernel(_kernel_id);
+            auto& kernels_cache = get_network().get_kernels_cache();
+            auto kernel_id = kernels_cache.set_kernel_source(weights_params.clKernel->code.kernelString, false);
+            kernels_cache.compile();
+            kernel = kernels_cache.get_kernel(kernel_id);
             cache.add(layout_key, kernel);
+            kernels_cache.reset();
         }
     }

-    auto& stream = _network.get_stream();
+    auto& stream = get_network().get_stream();
     bool can_reuse = _impl_params->reordered_weights != nullptr && _impl_params->reordered_weights->size() <= expected_layout.bytes_count();
     if (can_reuse) {
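primitive_inst::update_impl() above is the runtime half of the change: when the shape changes, it consults the network-local implementations cache and compiles only if the layout key has not been seen before. A minimal sketch of that lookup-or-compile flow, assuming simplified stand-in types (the compile_new callback stands for the choose_impl / add_kernels_source / compile / init_kernels sequence in the real code):

    #include <functional>
    #include <memory>
    #include <string>
    #include <unordered_map>

    struct impl_t {
        std::shared_ptr<impl_t> clone() const { return std::make_shared<impl_t>(*this); }
    };

    using impls_cache_t = std::unordered_map<std::string, std::shared_ptr<impl_t>>;

    // Mirrors update_impl(): hit -> clone the cached impl,
    // miss -> compile once, cache a clone, return the fresh impl.
    std::shared_ptr<impl_t> get_impl_for_shape(impls_cache_t& cache,                // per-network, so no races
                                               const std::string& layout_key,      // encodes the concrete shapes
                                               const std::function<std::shared_ptr<impl_t>()>& compile_new) {
        auto it = cache.find(layout_key);
        if (it != cache.end())
            return it->second->clone();     // cache hit: each instance gets its own copy
        auto fresh = compile_new();         // the expensive path, taken once per new layout key
        cache.emplace(layout_key, fresh->clone());
        return fresh;                       // kernels_cache.reset() then drops the source bookkeeping
    }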
diff --git a/src/plugins/intel_gpu/src/graph/program.cpp b/src/plugins/intel_gpu/src/graph/program.cpp
index a31d07e51d3..05031a0f3dc 100644
--- a/src/plugins/intel_gpu/src/graph/program.cpp
+++ b/src/plugins/intel_gpu/src/graph/program.cpp
@@ -114,8 +114,6 @@ program::program(engine& engine_ref,
     prepare_nodes(topology);
     _kernels_cache = std::unique_ptr<kernels_cache>(new kernels_cache(_engine, prog_id,
                                                     kernel_selector::KernelBase::get_db().get_batch_header_str()));
-    _impls_cache = std::unique_ptr<ImplementationsCache>(new ImplementationsCache(_impls_cache_capacity));
-    _in_mem_kernels_cache = std::unique_ptr<KernelsCache>(new KernelsCache(_in_mem_kernels_cache_capacity));
     program_node::reset_unique_id();
     if (no_optimizations) {
         init_graph();
@@ -137,8 +135,6 @@ program::program(engine& engine_ref,
     set_options();
     _kernels_cache = std::unique_ptr<kernels_cache>(new kernels_cache(_engine, prog_id,
                                                     kernel_selector::KernelBase::get_db().get_batch_header_str()));
-    _impls_cache = std::unique_ptr<ImplementationsCache>(new ImplementationsCache(_impls_cache_capacity));
-    _in_mem_kernels_cache = std::unique_ptr<KernelsCache>(new KernelsCache(_in_mem_kernels_cache_capacity));
     pm = std::unique_ptr<pass_manager>(new pass_manager(*this));
     prepare_nodes(nodes);
     build_program(is_internal);
diff --git a/src/plugins/intel_gpu/src/runtime/kernels_cache.cpp b/src/plugins/intel_gpu/src/runtime/kernels_cache.cpp
index 98e05c7083e..62220631ddf 100644
--- a/src/plugins/intel_gpu/src/runtime/kernels_cache.cpp
+++ b/src/plugins/intel_gpu/src/runtime/kernels_cache.cpp
@@ -148,18 +148,8 @@ kernels_cache::kernels_cache(engine& engine, uint32_t prog_id, const std::vector<std::string>& batch_header_str)
 kernel_id kernels_cache::set_kernel_source(
     const std::shared_ptr<kernel_string>& kernel_string,
     bool dump_custom_program) {
-    std::lock_guard<std::mutex> lock(_mutex);
-    // we need unique id in order to avoid conflict across topologies.
-    const auto kernel_num = _kernels.size() + (_kernel_idx++);
-    kernel_id id = kernel_string->entry_point + "_" + std::to_string(kernel_num);
-
-    auto res = _kernels_code.emplace(kernel_string, id, dump_custom_program);
-
-    assert(_kernels.find(id) == _kernels.end());
-    if (res.second) {
-        _pending_compilation = true;
-    }
-    return id;
+    auto kernel_ids = add_kernels_source({kernel_string}, dump_custom_program);
+    return kernel_ids[0];
 }

 static std::vector<unsigned char> getProgramBinaries(cl::Program program) {
@@ -384,4 +374,54 @@ void kernels_cache::reset() {
     _pending_compilation = false;
 }

+std::vector<kernel_id> kernels_cache::add_kernels_source(std::vector<std::shared_ptr<kernel_string>> kernel_sources, bool dump_custom_program) {
+    std::vector<kernel_id> kernel_ids;
+    kernel_ids.reserve(kernel_sources.size());
+    for (size_t i = 0; i < kernel_sources.size(); ++i) {
+        std::lock_guard<std::mutex> lock(_mutex);
+        auto kernel_string = kernel_sources[i];
+        // We need a unique id in order to avoid conflicts across topologies.
+        const auto kernel_num = _kernels.size() + (_kernel_idx++);
+        kernel_id id = kernel_string->entry_point + "_" + std::to_string(kernel_num);
+
+        auto res = _kernels_code.emplace(kernel_string, id, dump_custom_program);
+
+        assert(_kernels.find(id) == _kernels.end());
+        if (res.second) {
+            _pending_compilation = true;
+        }
+        kernel_ids.emplace_back(id);
+    }
+    return kernel_ids;
+}
+
+void kernels_cache::compile() {
+    OV_ITT_SCOPED_TASK(itt::domains::CLDNN, "KernelsCache::BuildAll");
+
+    std::unique_ptr<ocl::ocl_engine> _build_engine = nullptr;
+    if (_engine.type() == engine_types::ocl) {
+        _build_engine = std::unique_ptr<ocl::ocl_engine>(new ocl::ocl_engine(_engine.get_device(), runtime_types::ocl,
+                                                         _engine.configuration(), _engine.get_task_executor()));
+    }
+
+    // create batches
+    std::vector<batch_program> batches;
+    get_program_source(_kernels_code, &batches);
+
+    // build batches
+    for (size_t idx = 0; idx < batches.size(); idx++) {
+        build_batch(*_build_engine, batches[idx]);
+    }
+
+    _kernels_code.clear();
+    _pending_compilation = false;
+#if defined(__unix__) && !defined(__ANDROID__)
+    // NOTE: On Linux, without malloc_trim, the memory used by compilation is not returned to the system even though it has been freed
+    // (at least 500 MB is retained when we perform parallel compilation).
+    // Freeing the memory manually with malloc_trim saves a significant amount of memory.
+    // Also, this does not happen on Windows.
+    // So malloc_trim is added for the Linux build until we figure out a better solution.
+    malloc_trim(0);
+#endif
+}
 }  // namespace cldnn
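Together, add_kernels_source(), compile() and the existing reset() form a self-contained compile cycle for a single implementation. A sketch of the calling convention this patch establishes; the helper below is hypothetical glue, while the member functions it calls are the ones added or used above:

    // Hypothetical glue; Impl and Cache stand for cldnn::primitive_impl and
    // cldnn::kernels_cache, whose member functions are used as in the patch.
    template <typename Impl, typename Cache>
    void compile_with(Impl& impl, Cache& cache) {
        auto kernel_ids = cache.add_kernels_source(impl.get_kernels_source());
        impl.set_kernel_ids(kernel_ids);  // impl remembers which ids belong to it
        cache.compile();                  // builds only the pending batch (and trims the heap on Linux)
        impl.init_kernels(cache);         // resolve the ids to compiled kernels
        cache.reset();                    // reset _kernels so the next cycle starts clean
    }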
diff --git a/src/plugins/intel_gpu/src/runtime/kernels_cache.hpp b/src/plugins/intel_gpu/src/runtime/kernels_cache.hpp
index 8404b49b0ac..d3fc6ac4b9b 100644
--- a/src/plugins/intel_gpu/src/runtime/kernels_cache.hpp
+++ b/src/plugins/intel_gpu/src/runtime/kernels_cache.hpp
@@ -101,6 +101,8 @@ public:
     void remove_kernel(kernel_id id) {
         _kernels.erase(id);
     }
+    std::vector<kernel_id> add_kernels_source(std::vector<std::shared_ptr<kernel_string>> kernel_sources, bool dump_custom_program = false);
+    void compile();
 };

 }  // namespace cldnn
diff --git a/src/plugins/intel_gpu/tests/test_cases/multiple_streams_gpu_test.cpp b/src/plugins/intel_gpu/tests/test_cases/multiple_streams_gpu_test.cpp
new file mode 100644
index 00000000000..7bea1dc0cf9
--- /dev/null
+++ b/src/plugins/intel_gpu/tests/test_cases/multiple_streams_gpu_test.cpp
@@ -0,0 +1,79 @@
+// Copyright (C) 2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "test_utils.h"
+
+#include <intel_gpu/primitives/input_layout.hpp>
+#include <intel_gpu/primitives/data.hpp>
+#include <intel_gpu/primitives/eltwise.hpp>
+#include <intel_gpu/primitives/fully_connected.hpp>
+#include <intel_gpu/primitives/shape_of.hpp>
+
+#include <thread>
+#include <threading/ie_cpu_streams_executor.hpp>
+
+using namespace cldnn;
+using namespace ::tests;
+
+
+TEST(multistream_gpu, basic) {
+    const int num_streams = 2;
+    auto config = InferenceEngine::CPUStreamsExecutor::Config();
+    config._streams = num_streams;
+    auto task_executor = std::make_shared<InferenceEngine::CPUStreamsExecutor>(config);
+    auto& engine = get_test_engine();
+
+    build_options bo;
+    bo.set_option(build_option::allow_new_shape_infer(true));
+
+    auto input1_dyn_layout = layout{ ov::PartialShape::dynamic(3), data_types::f16, format::bfyx };
+    auto input2_dyn_layout = layout{ ov::PartialShape::dynamic(3), data_types::f16, format::bfyx };
+    auto weights = engine.allocate_memory({ {512, 512}, data_types::f32, format::bfyx });
+
+    topology topology;
+    topology.add(input_layout("input1", input1_dyn_layout));
+    topology.add(input_layout("input2", input2_dyn_layout));
+    topology.add(data("weights", weights));
+    topology.add(eltwise("eltwise", "input1", "input2", eltwise_mode::sum));
+    topology.add(fully_connected("fc", "eltwise", "weights"));
+    topology.add(shape_of("shape_of", "fc", 3, data_types::i32));
+
+    auto prog_ptr = program::build_program(engine, topology, bo);
+    std::vector<network::ptr> networks;
+    for (size_t i = 0; i < num_streams; i++) {
+        networks.push_back(network::allocate_network(engine, prog_ptr));
+    }
+
+    std::vector<InferenceEngine::Task> tasks;
+    for (size_t i = 0; i < num_streams; i++) {
+        tasks.push_back([&networks, i, &engine] {
+            auto net = networks[i];
+            std::vector<int> various_size = {32, 128, 16, 64};
+            for (size_t iter = 0; iter < 8; iter++) {
+                int len = various_size[iter % various_size.size()];
+                auto input1_mem = engine.allocate_memory({ ov::PartialShape{1, len, 512}, data_types::f16, format::bfyx });
+                auto input2_mem = engine.allocate_memory({ ov::PartialShape{1, len, 512}, data_types::f16, format::bfyx });
+                net->set_input_data("input1", input1_mem);
+                net->set_input_data("input2", input2_mem);
+
+                auto outputs = net->execute();
+
+                auto output = outputs.at("shape_of").get_memory();
+                cldnn::mem_lock<int32_t> output_ptr(output, get_test_stream());
+
+                std::vector<int32_t> expected_results = {1, len, 512};
+
+                for (size_t out_idx = 0; out_idx < expected_results.size(); ++out_idx) {
+                    EXPECT_TRUE(are_equal(expected_results[out_idx], output_ptr[out_idx]));
+                }
+            }
+        });
+    }
+
+    task_executor->runAndWait(tasks);
+    tasks.clear();
+    networks.clear();
+}
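A note on the test design: both networks are allocated from a single shared program, and each executor task cycles through four input lengths, so the two streams repeatedly trigger shape-specialized compilation of the same primitives at the same time. Before this patch those compilations raced on the program-wide caches; with the per-network caches introduced here, each stream compiles independently, and the shape_of output check confirms that both streams still compute correct results for every shape.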