[GPU] Do weight reorder for fc shape agnostic kernels at build time (#18829)

* Do weight reorder at build time

* Add test
Taylor Yeonbok Lee 2023-07-31 20:40:06 -07:00 committed by GitHub
parent 5813b6d27a
commit 974ef62ce6
9 changed files with 133 additions and 10 deletions

@@ -151,6 +151,7 @@ public:
return outputs;
} // ToDo: redesign reorder-inputs pass to make it const as well as get_engine and get options
bool is_loop_body() const { return is_body_program; }
bool is_internal_program() const { return is_internal; }
const nodes_ordering& get_processing_order() const;
nodes_ordering& get_processing_order();
uint32_t get_prog_id() { return prog_id; }
@@ -278,6 +279,7 @@ private:
std::vector<program_node*> outputs;
nodes_ordering processing_order;
std::unique_ptr<pass_manager> pm;
bool is_internal;
bool is_body_program;
std::unique_ptr<ImplementationsCache> _impls_cache;
const size_t _impls_cache_capacity = 10000;

@@ -49,6 +49,7 @@ struct WeightsReorderParams {
bool get_grouped() const { return _grouped; }
void set_input_layout(const layout& layout) { _in_layout = layout; }
void set_output_layout(const layout& layout) { _out_layout = layout; }
protected:
layout _in_layout;

@@ -122,7 +122,8 @@ public:
int disable_async_compilation; // Disable async compilation
int disable_dynamic_impl; // Disable dynamic implementation
int disable_runtime_buffer_fusing; // Disable runtime buffer fusing
int disable_memory_reuse; // Disable memory reuse among layers
int disable_build_time_weight_reorder_for_dynamic_nodes; // Disable build time weight reordering for dynamic nodes
std::set<int64_t> dump_iteration; // Dump n-th execution of network.
std::vector<std::string> load_layers_raw_dump; // List of layers to load dumped raw binary and filenames
static const debug_configuration *get_instance();

@@ -37,9 +37,24 @@ void post_optimize_weights::optimize_weights(T& node, program& p) {
if (!impl)
return;
if (impl->is_dynamic())
return;
if (impl->is_dynamic()) {
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->disable_build_time_weight_reorder_for_dynamic_nodes) {
return;
}
// TODO: Relax the current limitation in a future optimization of the weight reorder process.
// For dynamic shapes, the selected weight format can change at runtime, and reordering from one blocked
// format to another is not fully verified yet, so other primitives such as convolution can only be
// enabled after verifying reorders between the possible layouts.
// We also skip the weight reorder for onednn impls: the onednn fully connected layer uses a simple format,
// so reordering to the cldnn shape-agnostic kernel's preferred blocked format at build time does not help performance.
// This may change once an onednn shape-agnostic kernel is used in the future.
if (p.is_internal_program())
return;
if (node.get_preferred_impl_type() == impl_types::onednn)
return;
if (node.type() != fully_connected::type_id())
return;
}
// Don't run impl selection to avoid double compilation of reorder kernels
// in main program and internal program for constant propagation
auto set_implementation = [&p, &impl](program_node& weights_reorder_node) {
@@ -69,13 +84,26 @@ void post_optimize_weights::optimize_weights(T& node, program& p) {
!prev_node.has_fused_primitives() &&
!prev_node.as<reorder>().has_mean() &&
prev_node.as<reorder>().get_primitive()->subtract_per_feature.empty();
if (impl->is_dynamic()) {
if (weights_reorder_params->get_output_layout().compatible(prev_node.get_output_layout())) {
// If the layouts are compatible, the buffer can simply be reinterpreted, so no build-time reorder is needed
continue;
}
// Need to restore the original shape
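// The reordered layout is expressed in the weights format's own rank, which may differ
// from the node's original rank, so reinterpret the tensor back to the original rank.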
auto updated_output_layout = weights_reorder_params->get_output_layout();
auto orig_rank = prev_node.get_output_layout().get_partial_shape().size();
auto weight_format_dims = format::dimension(weights_reorder_params->get_output_layout().format);
updated_output_layout.set_partial_shape(
updated_output_layout.get_tensor().get_partial_shape(orig_rank, weight_format_dims));
if (updated_output_layout != weights_reorder_params->get_output_layout())
weights_reorder_params->set_output_layout(updated_output_layout);
}
if (can_be_fused) {
// Need to update input data_type for correct merging format reorder with precision reorder
data_types input_dtype = prev_node.get_input_layouts()[0].data_type;
auto updated_input_layout = weights_reorder_params->get_input_layout();
updated_input_layout.data_type = input_dtype;
weights_reorder_params->set_input_layout(updated_input_layout);
auto weights_reorder = _rf.get_weights_reorder(prev_node.get_primitive()->input[0].pid,
weights_reorder_params);
auto& weights_reorder_node = p.get_or_create(weights_reorder.first);

@@ -157,6 +157,7 @@ program::program(engine& engine_ref,
_config(config),
_task_executor(task_executor),
processing_order(),
is_internal(is_internal),
is_body_program(is_body_program) {
_config.apply_user_properties(_engine.get_device_info());
init_primitives();
@@ -181,7 +182,8 @@ program::program(engine& engine_ref,
_stream(_engine.create_stream(config)),
_config(config),
_task_executor(task_executor),
processing_order() {
processing_order(),
is_internal(is_internal) {
_config.apply_user_properties(_engine.get_device_info());
init_primitives();
init_program();

@@ -188,7 +188,8 @@ debug_configuration::debug_configuration()
, disable_async_compilation(0)
, disable_dynamic_impl(0)
, disable_runtime_buffer_fusing(0)
, disable_memory_reuse(0) {
, disable_memory_reuse(0)
, disable_build_time_weight_reorder_for_dynamic_nodes(0) {
#ifdef GPU_DEBUG_CONFIG
get_gpu_debug_env_var("Help", help);
get_common_debug_env_var("Verbose", verbose);
@@ -222,6 +223,7 @@ debug_configuration::debug_configuration()
get_gpu_debug_env_var("DisableDynamicImpl", disable_dynamic_impl);
get_gpu_debug_env_var("DisableRuntimeBufferFusing", disable_runtime_buffer_fusing);
get_gpu_debug_env_var("DisableMemoryReuse", disable_memory_reuse);
get_gpu_debug_env_var("DisableBuildTimeWeightReorderForDynamicNodes", disable_build_time_weight_reorder_for_dynamic_nodes);
std::string dump_iteration_str;
get_gpu_debug_env_var("DumpIteration", dump_iteration_str);
std::string mem_preallocation_params_str;

@@ -63,8 +63,11 @@ TEST(kernel_impl_params_relevance, weights_layout) {
auto fc_inst = std::dynamic_pointer_cast<fully_connected_inst>(inst);
ASSERT_TRUE(fc_inst != nullptr);
// 6. Request instance's weights memory, compare it with original weights buffer and check
// 6. The weight memory of the fc node is reordered at build time for the fully_connected_gpu_bf_tiled kernel
ASSERT_EQ(fc_inst->get_node().get_dependency(1).get_output_layout().format, format::os_iyx_osv16);
// 7. Request instance's weights memory, compare it with original weights buffer and check
// if original layout is used (required for `fully_connected_gpu_bfyx_ref` kernel)
auto used_weights_memory = fc_inst->weights_memory()->get_layout();
ASSERT_EQ(weights_data->get_layout(), used_weights_memory);
ASSERT_EQ(weights_data->get_layout().compatible(used_weights_memory), true);
}

@@ -35,6 +35,36 @@ TEST(post_optimize_weights, fuse_reorder_to_weights_reorder_test) {
ASSERT_TRUE(format::is_weights_format(prog->get_node("reorder_dt").get_output_layout().format));
}
TEST(post_optimize_weights, fuse_reorder_to_weights_reorder_test_dynamic) {
auto& engine = get_test_engine();
if (engine.get_device_info().supports_immad)
return;
auto weights = engine.allocate_memory({{ 2, 32 }, data_types::f32, format::bfyx });
auto in_layout = layout{ ov::PartialShape{ov::Dimension(1), ov::Dimension(-1), ov::Dimension(32)}, data_types::f16, format::bfyx };
topology topology(
input_layout("input", in_layout),
input_layout("weights", weights->get_layout()),
reorder("reorder_dt", input_info("weights"), format::bfyx, data_types::f16),
fully_connected("fc", input_info("input"), { "reorder_dt" }, "", data_types::f16, {}, 3)
);
ExecutionConfig config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::optimize_data(true));
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
auto prog = program::build_program(engine, topology, config, false, true);
reorder_factory rf;
program_wrapper::apply_opt_pass<compile_graph>(*prog);
program_wrapper::apply_opt_pass<post_optimize_weights>(*prog, rf);
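// post_optimize_weights should fuse the data-type reorder into a build-time weights reorder,
// leaving "reorder_dt" with a weights format (checked below)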
ASSERT_TRUE(has_node(*prog, "reorder_dt"));
ASSERT_NE(prog->get_node("fc").get_selected_impl(), nullptr);
ASSERT_TRUE(format::is_weights_format(prog->get_node("reorder_dt").get_output_layout().format));
}
TEST(post_optimize_weights, weights_reorder_constant_folding_test) {
auto& engine = get_test_engine();
@@ -85,3 +115,55 @@ TEST(post_optimize_weights, weights_reorder_constant_folding_test) {
ASSERT_EQ(weights_mem[i], expected[i]);
}
}
TEST(post_optimize_weights, weights_reorder_constant_folding_test_dynamic) {
auto& engine = get_test_engine();
if (engine.get_device_info().supports_immad)
return;
ov::Shape pshape = { 4, 32 };
auto in_layout = layout{ ov::PartialShape{ov::Dimension(1), ov::Dimension(-1), ov::Dimension(32)}, data_types::f16, format::bfyx };
auto weights = engine.allocate_memory({pshape, data_types::f32, format::bfyx });
std::vector<float> weights_data(pshape[0] * pshape[1]);
std::iota(weights_data.begin(), weights_data.end(), 0.f);
set_values(weights, weights_data);
topology topology(
input_layout("input", in_layout),
data("weights", weights),
fully_connected("fc", input_info("input"), { "weights" }, "", data_types::f16, {}, 3)
);
ExecutionConfig config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::optimize_data(true));
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
auto prog = program::build_program(engine, topology, config, false, true);
reorder_factory rf;
program_wrapper::apply_opt_pass<compile_graph>(*prog);
program_wrapper::apply_opt_pass<post_optimize_weights>(*prog, rf);
program_wrapper::apply_opt_pass<propagate_constants>(*prog);
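// propagate_constants folds the build-time weights reorder, so the reordered weights
// end up as a constant data node (checked below)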
ASSERT_TRUE(has_node(*prog, "weights_weights_reorder_0"));
auto& weights_node = prog->get_node("weights_weights_reorder_0");
ASSERT_TRUE(weights_node.is_type<data>());
size_t align = 16; // os_iyx_osv16 format
size_t aligned_b_size = pshape[0] % align == 0 ? pshape[0]
: pshape[0] - pshape[0] % align + align;
std::vector<float> expected(aligned_b_size * pshape[1], 0.f);
size_t input_idx = 0;
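// os_iyx_osv16 stores blocks of 16 output channels innermost; with 4 output channels
// (padded to 16), original element (o, i) lands at offset i * 16 + o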
for (size_t i = 0; i < pshape[0]; ++i) {
for (size_t j = 0; j < pshape[1]; ++j) {
expected[j * align + i] = weights_data[input_idx++];
}
}
auto weights_mem_ptr = weights_node.as<data>().get_attached_memory_ptr();
cldnn::mem_lock<float, mem_lock_type::read> weights_mem(weights_mem_ptr, get_test_stream());
for (size_t i = 0; i < expected.size(); ++i) {
ASSERT_EQ(weights_mem[i], expected[i]);
}
}

@@ -2512,7 +2512,9 @@ TEST(fully_connected_gpu, has_cached_weights_reorder) {
auto reorder_kernel_params = impl->get_weights_reorder_kernel_params();
ASSERT_TRUE(reorder_kernel_params != nullptr);
auto reorder_impl = network.get_program()->get_implementations_cache().get(*reorder_kernel_params);
ASSERT_TRUE(reorder_impl != nullptr);
// The weight reorder for the cldnn shape-agnostic kernel is done at build time,
// so the reorder is no longer in the cache, but the weight data's program_node is already in the preferred format
ASSERT_TRUE(reorder_impl == nullptr);
auto out_l = network.get_output_layout(outputs.begin()->first);
ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(input_b, 8)); // fake_alignment