From 974ef62ce6346baa336bb9ac1591b1184c0ed37e Mon Sep 17 00:00:00 2001
From: Taylor Yeonbok Lee
Date: Mon, 31 Jul 2023 20:40:06 -0700
Subject: [PATCH] [GPU] Do weight reorder for fc shape agnostic kernels at build time (#18829)

* Do weight reorder at build time

* Add test
---
 .../include/intel_gpu/graph/program.hpp       |  2 +
 .../include/intel_gpu/primitives/reorder.hpp  |  1 +
 .../intel_gpu/runtime/debug_configuration.hpp |  3 +-
 .../graph_optimizer/post_optimize_weights.cpp | 36 +++++++-
 src/plugins/intel_gpu/src/graph/program.cpp   |  4 +-
 .../src/runtime/debug_configuration.cpp       |  4 +-
 .../kernel_impl_params_relevance_test.cpp     |  7 +-
 .../unit/passes/post_optimize_weights.cpp     | 82 +++++++++++++++++++
 .../test_cases/fully_connected_gpu_test.cpp   |  4 +-
 9 files changed, 133 insertions(+), 10 deletions(-)

diff --git a/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp b/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp
index 16cdd2620e5..746028e26cd 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp
@@ -151,6 +151,7 @@ public:
         return outputs;
     }
     // ToDo: redesign reorder-inputs pass to make it const as_well as get_engine and get options
     bool is_loop_body() const { return is_body_program; }
+    bool is_internal_program() const { return is_internal; }
     const nodes_ordering& get_processing_order() const;
     nodes_ordering& get_processing_order();
     uint32_t get_prog_id() { return prog_id; }
@@ -278,6 +279,7 @@ private:
     std::vector<program_node*> outputs;
     nodes_ordering processing_order;
     std::unique_ptr<pass_manager> pm;
+    bool is_internal;
     bool is_body_program;
     std::unique_ptr<ImplementationsCache> _impls_cache;
     const size_t _impls_cache_capacity = 10000;
diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/reorder.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/reorder.hpp
index 0e18063c443..1c59eebe657 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/primitives/reorder.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/reorder.hpp
@@ -49,6 +49,7 @@ struct WeightsReorderParams {
     bool get_grouped() const { return _grouped; }

     void set_input_layout(const layout& layout) { _in_layout = layout; }
+    void set_output_layout(const layout& layout) { _out_layout = layout; }

 protected:
     layout _in_layout;
diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp
index 431ffab9f1b..fcafd336558 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp
@@ -122,7 +122,8 @@ public:
     int disable_async_compilation;        // Disable async compilation
     int disable_dynamic_impl;             // Disable dynamic implementation
     int disable_runtime_buffer_fusing;    // Disable runtime buffer fusing
-    int disable_memory_reuse;             // Disable memmory reuse among layers
+    int disable_memory_reuse;             // Disable memory reuse among layers
+    int disable_build_time_weight_reorder_for_dynamic_nodes; // Disable build-time weight reordering for dynamic nodes
     std::set<int64_t> dump_iteration;     // Dump n-th execution of network
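The new knob follows the existing debug-configuration pattern: an integer flag defaulting to 0 that is read from an environment variable via `get_gpu_debug_env_var` (registered in debug_configuration.cpp below). A minimal, self-contained sketch of that contract, assuming the usual `OV_GPU_` prefix for GPU debug variables; `read_debug_flag` is an illustrative helper, not the plugin's actual parsing code:

    // Hypothetical stand-in for the plugin's env-var parsing; only the
    // contract matters: an unset or "0" variable leaves the knob disabled.
    #include <cstdlib>
    #include <string>

    static int read_debug_flag(const std::string& name) {
        const std::string full_name = "OV_GPU_" + name;  // assumed prefix
        const char* value = std::getenv(full_name.c_str());
        return value ? std::atoi(value) : 0;
    }

    int main() {
        // Name matches the entry registered in debug_configuration.cpp below.
        int disable = read_debug_flag("DisableBuildTimeWeightReorderForDynamicNodes");
        return disable ? 1 : 0;  // non-zero: the pass keeps the old runtime-reorder path
    }

Setting the variable to 1 restores the previous behavior, which is useful for bisecting performance or accuracy issues introduced by the build-time reorder.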
     std::vector<std::string> load_layers_raw_dump; // List of layers to load dumped raw binary and filenames
     static const debug_configuration *get_instance();
diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp
index 06b50782588..3790b299a25 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp
@@ -37,9 +37,24 @@ void post_optimize_weights::optimize_weights(T& node, program& p) {
     if (!impl)
         return;

-    if (impl->is_dynamic())
-        return;
-
+    if (impl->is_dynamic()) {
+        GPU_DEBUG_GET_INSTANCE(debug_config);
+        GPU_DEBUG_IF(debug_config->disable_build_time_weight_reorder_for_dynamic_nodes) {
+            return;
+        }
+        // TODO: Relax these limitations as the weight reorder process is optimized further. With dynamic
+        // shapes the selected weight format can change at runtime, and blocked-to-blocked reordering is
+        // not fully verified yet, so primitives such as convolution can only be enabled after verifying
+        // reorders between the possible layouts. We also skip onednn impls: onednn fully connected uses
+        // a simple weight format, so a build-time reorder to the cldnn shape-agnostic kernel's preferred
+        // blocked format does not help performance; this may change once onednn shape-agnostic kernels are used.
+        if (p.is_internal_program())
+            return;
+        if (node.get_preferred_impl_type() == impl_types::onednn)
+            return;
+        if (node.type() != fully_connected::type_id())
+            return;
+    }
     // Don't run impl selection to avoid double compilation of reorder kernels
     // in main program and internal program for constant propagation
     auto set_implementation = [&p, &impl](program_node& weights_reorder_node) {
@@ -69,13 +84,26 @@ void post_optimize_weights::optimize_weights(T& node, program& p) {
                 !prev_node.has_fused_primitives() &&
                 !prev_node.as<reorder>().has_mean() &&
                 prev_node.as<reorder>().get_primitive()->subtract_per_feature.empty();
+            if (impl->is_dynamic()) {
+                if (weights_reorder_params->get_output_layout().compatible(prev_node.get_output_layout())) {
+                    // If compatible, the buffer can be reinterpreted, so there is no need to reorder at build time
+                    continue;
+                }
+                // Need to restore the original shape
+                auto updated_output_layout = weights_reorder_params->get_output_layout();
+                auto orig_rank = prev_node.get_output_layout().get_partial_shape().size();
+                auto weight_format_dims = format::dimension(weights_reorder_params->get_output_layout().format);
+                updated_output_layout.set_partial_shape(
+                    updated_output_layout.get_tensor().get_partial_shape(orig_rank, weight_format_dims));
+                if (updated_output_layout != weights_reorder_params->get_output_layout())
+                    weights_reorder_params->set_output_layout(updated_output_layout);
+            }
             if (can_be_fused) {
                 // Need to update input data_type for correct merging format reorder with precision reorder
                 data_types input_dtype = prev_node.get_input_layouts()[0].data_type;
                 auto updated_input_layout = weights_reorder_params->get_input_layout();
                 updated_input_layout.data_type = input_dtype;
                 weights_reorder_params->set_input_layout(updated_input_layout);
-
                 auto weights_reorder = _rf.get_weights_reorder(prev_node.get_primitive()->input[0].pid, weights_reorder_params);
                 auto& weights_reorder_node = p.get_or_create(weights_reorder.first);
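Two details of the dynamic branch above are easy to miss. First, if the reorder's output layout is `compatible()` with the weights' current layout, the buffer can simply be reinterpreted, so no build-time reorder is emitted. Second, weight formats such as os_iyx_osv16 are described with more dimensions than a 2D fully connected weight tensor, so the reorder's output shape is trimmed back to the original rank before the reorder node is created. A simplified sketch of that decision, assuming layouts with equal element counts are reinterpretable; `Layout` and `needs_build_time_reorder` are illustrative names, not the cldnn API:

    #include <functional>
    #include <numeric>
    #include <vector>

    struct Layout {
        std::vector<std::size_t> shape;  // static dims only, for illustration

        std::size_t count() const {
            return std::accumulate(shape.begin(), shape.end(), std::size_t{1},
                                   std::multiplies<std::size_t>());
        }
        // Rough stand-in for cldnn::layout::compatible(): equal element counts
        // are treated as "reinterpretable without moving data".
        bool compatible(const Layout& other) const { return count() == other.count(); }
    };

    // Returns true if a build-time weight reorder must be emitted; on true,
    // reorder_out is trimmed back to the weights' original rank, mirroring the
    // set_partial_shape(get_partial_shape(orig_rank, weight_format_dims)) call.
    bool needs_build_time_reorder(const Layout& weights, Layout& reorder_out) {
        if (reorder_out.compatible(weights))
            return false;  // same storage: reinterpret at runtime instead
        reorder_out.shape.resize(weights.shape.size(), 1);  // restore original rank
        return true;
    }

    int main() {
        Layout weights{{4, 32}};        // 2D fc weights
        Layout padded{{16, 32, 1, 1}};  // osv16-padded 4D weights-format shape
        return needs_build_time_reorder(weights, padded) ? 0 : 1;  // reorder needed
    }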
diff --git a/src/plugins/intel_gpu/src/graph/program.cpp b/src/plugins/intel_gpu/src/graph/program.cpp
index 285ead0ae7a..f49ce55cf83 100644
--- a/src/plugins/intel_gpu/src/graph/program.cpp
+++ b/src/plugins/intel_gpu/src/graph/program.cpp
@@ -157,6 +157,7 @@ program::program(engine& engine_ref,
       _config(config),
       _task_executor(task_executor),
       processing_order(),
+      is_internal(is_internal),
       is_body_program(is_body_program) {
     _config.apply_user_properties(_engine.get_device_info());
     init_primitives();
@@ -181,7 +182,8 @@ program::program(engine& engine_ref,
       _stream(_engine.create_stream(config)),
       _config(config),
       _task_executor(task_executor),
-      processing_order() {
+      processing_order(),
+      is_internal(is_internal) {
     _config.apply_user_properties(_engine.get_device_info());
     init_primitives();
     init_program();
diff --git a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp
index 86c620d9646..75cdaa176a6 100644
--- a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp
+++ b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp
@@ -188,7 +188,8 @@ debug_configuration::debug_configuration()
     , disable_async_compilation(0)
     , disable_dynamic_impl(0)
     , disable_runtime_buffer_fusing(0)
-    , disable_memory_reuse(0) {
+    , disable_memory_reuse(0)
+    , disable_build_time_weight_reorder_for_dynamic_nodes(0) {
 #ifdef GPU_DEBUG_CONFIG
     get_gpu_debug_env_var("Help", help);
     get_common_debug_env_var("Verbose", verbose);
@@ -222,6 +223,7 @@ debug_configuration::debug_configuration()
     get_gpu_debug_env_var("DisableDynamicImpl", disable_dynamic_impl);
     get_gpu_debug_env_var("DisableRuntimeBufferFusing", disable_runtime_buffer_fusing);
     get_gpu_debug_env_var("DisableMemoryReuse", disable_memory_reuse);
+    get_gpu_debug_env_var("DisableBuildTimeWeightReorderForDynamicNodes", disable_build_time_weight_reorder_for_dynamic_nodes);
     std::string dump_iteration_str;
    get_gpu_debug_env_var("DumpIteration", dump_iteration_str);
     std::string mem_preallocation_params_str;
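One C++ subtlety in the constructor hunks above: `is_internal(is_internal)` initializes the member from the same-named constructor parameter (in a mem-initializer, the name before the parentheses is looked up as a member and the name inside as a parameter), and members are initialized in declaration order, which here matches the list order since `is_internal` is declared before `is_body_program`. A stripped-down illustration:

    #include <iostream>

    class program {
    public:
        program(bool is_internal, bool is_body_program)
            // Member-from-parameter with a single name is well-defined:
            // the target is the member, the argument is the parameter.
            : is_internal(is_internal),
              is_body_program(is_body_program) {}

        bool is_internal_program() const { return is_internal; }

    private:
        bool is_internal;  // declared (and therefore initialized) first
        bool is_body_program;
    };

    int main() {
        program p(/*is_internal=*/true, /*is_body_program=*/false);
        std::cout << std::boolalpha << p.is_internal_program() << '\n';  // true
    }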
diff --git a/src/plugins/intel_gpu/tests/unit/module_tests/kernel_impl_params_relevance_test.cpp b/src/plugins/intel_gpu/tests/unit/module_tests/kernel_impl_params_relevance_test.cpp
index c2c8e7d295b..abfbabce9c9 100644
--- a/src/plugins/intel_gpu/tests/unit/module_tests/kernel_impl_params_relevance_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/module_tests/kernel_impl_params_relevance_test.cpp
@@ -63,8 +63,11 @@ TEST(kernel_impl_params_relevance, weights_layout) {
     auto fc_inst = std::dynamic_pointer_cast<fully_connected_inst>(inst);
     ASSERT_TRUE(fc_inst != nullptr);

-    // 6. Requset instance's weights memory, compare it with original weights buffer and check
+    // 6. The weights memory of the fc node is reordered at build time for the fully_connected_gpu_bf_tiled kernel
+    ASSERT_EQ(fc_inst->get_node().get_dependency(1).get_output_layout().format, format::os_iyx_osv16);
+
+    // 7. Request instance's weights memory, compare it with original weights buffer and check
     // if original layout is used (required for `fully_connected_gpu_bfyx_ref` kernel)
     auto used_weights_memory = fc_inst->weights_memory()->get_layout();
-    ASSERT_EQ(weights_data->get_layout(), used_weights_memory);
+    ASSERT_EQ(weights_data->get_layout().compatible(used_weights_memory), true);
 }
diff --git a/src/plugins/intel_gpu/tests/unit/passes/post_optimize_weights.cpp b/src/plugins/intel_gpu/tests/unit/passes/post_optimize_weights.cpp
index b205f21938f..1a51a2a46a2 100644
--- a/src/plugins/intel_gpu/tests/unit/passes/post_optimize_weights.cpp
+++ b/src/plugins/intel_gpu/tests/unit/passes/post_optimize_weights.cpp
@@ -35,6 +35,36 @@ TEST(post_optimize_weights, fuse_reorder_to_weights_reorder_test) {
     ASSERT_TRUE(format::is_weights_format(prog->get_node("reorder_dt").get_output_layout().format));
 }

+TEST(post_optimize_weights, fuse_reorder_to_weights_reorder_test_dynamic) {
+    auto& engine = get_test_engine();
+    if (engine.get_device_info().supports_immad)
+        return;
+
+    auto weights = engine.allocate_memory({{ 2, 32 }, data_types::f32, format::bfyx });
+
+    auto in_layout = layout{ ov::PartialShape{ov::Dimension(1), ov::Dimension(-1), ov::Dimension(32)}, data_types::f16, format::bfyx };
+
+    topology topology(
+        input_layout("input", in_layout),
+        input_layout("weights", weights->get_layout()),
+        reorder("reorder_dt", input_info("weights"), format::bfyx, data_types::f16),
+        fully_connected("fc", input_info("input"), { "reorder_dt" }, "", data_types::f16, {}, 3)
+    );
+
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::optimize_data(true));
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+    auto prog = program::build_program(engine, topology, config, false, true);
+
+    reorder_factory rf;
+    program_wrapper::apply_opt_pass<compile_graph>(*prog);
+    program_wrapper::apply_opt_pass<post_optimize_weights>(*prog, rf);
+
+    ASSERT_TRUE(has_node(*prog, "reorder_dt"));
+    ASSERT_NE(prog->get_node("fc").get_selected_impl(), nullptr);
+    ASSERT_TRUE(format::is_weights_format(prog->get_node("reorder_dt").get_output_layout().format));
+}
+
 TEST(post_optimize_weights, weights_reorder_constant_folding_test) {
     auto& engine = get_test_engine();

@@ -85,3 +115,55 @@ TEST(post_optimize_weights, weights_reorder_constant_folding_test) {
         ASSERT_EQ(weights_mem[i], expected[i]);
     }
 }
+
+TEST(post_optimize_weights, weights_reorder_constant_folding_test_dynamic) {
+    auto& engine = get_test_engine();
+    if (engine.get_device_info().supports_immad)
+        return;
+    ov::Shape pshape = { 4, 32 };
+    auto in_layout = layout{ ov::PartialShape{ov::Dimension(1), ov::Dimension(-1), ov::Dimension(32)}, data_types::f16, format::bfyx };
+    auto weights = engine.allocate_memory({ pshape, data_types::f32, format::bfyx });
+
+    std::vector<float> weights_data(pshape[0] * pshape[1]);
+    std::iota(weights_data.begin(), weights_data.end(), 0.f);
+    set_values(weights, weights_data);
+
+    topology topology(
+        input_layout("input", in_layout),
+        data("weights", weights),
+        fully_connected("fc", input_info("input"), { "weights" }, "", data_types::f16, {}, 3)
+    );
+
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::optimize_data(true));
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+
+    auto prog = program::build_program(engine, topology, config, false, true);
+
+    reorder_factory rf;
+    program_wrapper::apply_opt_pass<compile_graph>(*prog);
+    program_wrapper::apply_opt_pass<post_optimize_weights>(*prog, rf);
+    program_wrapper::apply_opt_pass<propagate_constants>(*prog);
+
+    ASSERT_TRUE(has_node(*prog, "weights_weights_reorder_0"));
+    auto& weights_node = prog->get_node("weights_weights_reorder_0");
+    ASSERT_TRUE(weights_node.is_type<data>());
+
+    size_t align = 16;  // os_iyx_osv16 format
+    size_t aligned_b_size = pshape[0] % align == 0 ? pshape[0]
+                                                   : pshape[0] - pshape[0] % align + align;
+    std::vector<float> expected(aligned_b_size * pshape[1], 0.f);
+    size_t input_idx = 0;
+    for (size_t i = 0; i < pshape[0]; ++i) {
+        for (size_t j = 0; j < pshape[1]; ++j) {
+            expected[j * align + i] = weights_data[input_idx++];
+        }
+    }
+
+    auto weights_mem_ptr = weights_node.as<data>().get_attached_memory_ptr();
+    cldnn::mem_lock<ov::float16, mem_lock_type::read> weights_mem(weights_mem_ptr, get_test_stream());
+
+    for (size_t i = 0; i < expected.size(); ++i) {
+        ASSERT_EQ(weights_mem[i], expected[i]);
+    }
+}
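The expected-value loop in the dynamic constant-folding test encodes os_iyx_osv16 for the simple case where all output channels fit in one 16-wide block: element (o, i) of the original 2D weights lands at offset i * 16 + o, and the output-channel dimension is zero-padded up to 16. A standalone sketch of that round trip with plain arrays instead of cldnn memory (the indexing only holds while the padded output-channel count equals one osv block):

    #include <cassert>
    #include <cstddef>
    #include <numeric>
    #include <vector>

    int main() {
        const std::size_t O = 4, I = 32, osv = 16;       // weights shape, block size
        std::vector<float> weights(O * I);
        std::iota(weights.begin(), weights.end(), 0.f);  // 0, 1, 2, ...

        const std::size_t O_aligned = (O + osv - 1) / osv * osv;  // 16
        std::vector<float> reordered(O_aligned * I, 0.f);         // zero padding
        for (std::size_t o = 0; o < O; ++o)
            for (std::size_t i = 0; i < I; ++i)
                reordered[i * osv + o] = weights[o * I + i];

        // Spot check: row-major element (1, 2) ends up at 2 * 16 + 1.
        assert(reordered[2 * osv + 1] == weights[1 * I + 2]);
        return 0;
    }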
diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp
index af69b7bc0cb..d96182b974b 100644
--- a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp
@@ -2512,7 +2512,9 @@ TEST(fully_connected_gpu, has_cached_weights_reorder) {
     auto reorder_kernel_params = impl->get_weights_reorder_kernel_params();
     ASSERT_TRUE(reorder_kernel_params != nullptr);
     auto reorder_impl = network.get_program()->get_implementations_cache().get(*reorder_kernel_params);
-    ASSERT_TRUE(reorder_impl != nullptr);
+    // The weight reorder for the cldnn shape-agnostic kernel is done at build time, so the reorder
+    // is no longer in the cache; instead, the weights program_node already carries the preferred format.
+    ASSERT_TRUE(reorder_impl == nullptr);

     auto out_l = network.get_output_layout(outputs.begin()->first);
     ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(input_b, 8)); // fake_alignment
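The last assertion is unchanged and still checks fake alignment: for this kernel, the batch dimension of the output layout is padded up to a multiple of 8. A one-line equivalent of the `align_to` helper as used there, reconstructed from its usage rather than from the cldnn utility headers:

    #include <cstddef>

    // Round v up to the nearest multiple of align (align > 0), matching the
    // test's align_to(input_b, 8) expectation for the fake-aligned batch.
    constexpr std::size_t align_to(std::size_t v, std::size_t align) {
        return (v + align - 1) / align * align;
    }

    static_assert(align_to(1, 8) == 8 && align_to(8, 8) == 8 && align_to(9, 8) == 16,
                  "batch is padded up to the next multiple of 8");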