From 974ef62ce6346baa336bb9ac1591b1184c0ed37e Mon Sep 17 00:00:00 2001
From: Taylor Yeonbok Lee
Date: Mon, 31 Jul 2023 20:40:06 -0700
Subject: [PATCH] [GPU] Do weight reorder for fc shape agnostic kernels at build time (#18829)

* Do weight reorder at build time

* Add test
---
 .../include/intel_gpu/graph/program.hpp       |  2 +
 .../include/intel_gpu/primitives/reorder.hpp  |  1 +
 .../intel_gpu/runtime/debug_configuration.hpp |  3 +-
 .../graph_optimizer/post_optimize_weights.cpp | 36 +++++++-
 src/plugins/intel_gpu/src/graph/program.cpp   |  4 +-
 .../src/runtime/debug_configuration.cpp       |  4 +-
 .../kernel_impl_params_relevance_test.cpp     |  7 +-
 .../unit/passes/post_optimize_weights.cpp     | 82 +++++++++++++++++++
 .../test_cases/fully_connected_gpu_test.cpp   |  4 +-
 9 files changed, 133 insertions(+), 10 deletions(-)

diff --git a/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp b/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp
index 16cdd2620e5..746028e26cd 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp
@@ -151,6 +151,7 @@ public:
         return outputs;
     }
     // ToDo: redesign reorder-inputs pass to make it const as_well as get_engine and get options
     bool is_loop_body() const { return is_body_program; }
+    bool is_internal_program() const { return is_internal; }
     const nodes_ordering& get_processing_order() const;
     nodes_ordering& get_processing_order();
     uint32_t get_prog_id() { return prog_id; }
@@ -278,6 +279,7 @@ private:
     std::vector<program_node*> outputs;
     nodes_ordering processing_order;
     std::unique_ptr<pass_manager> pm;
+    bool is_internal;
     bool is_body_program;
     std::unique_ptr<ImplementationsCache> _impls_cache;
     const size_t _impls_cache_capacity = 10000;
diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/reorder.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/reorder.hpp
index 0e18063c443..1c59eebe657 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/primitives/reorder.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/reorder.hpp
@@ -49,6 +49,7 @@ struct WeightsReorderParams {
     bool get_grouped() const { return _grouped; }

     void set_input_layout(const layout& layout) { _in_layout = layout; }
+    void set_output_layout(const layout& layout) { _out_layout = layout; }

 protected:
     layout _in_layout;
diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp
index 431ffab9f1b..fcafd336558 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp
@@ -122,7 +122,8 @@ public:
     int disable_async_compilation;        // Disable async compilation
     int disable_dynamic_impl;             // Disable dynamic implementation
     int disable_runtime_buffer_fusing;    // Disable runtime buffer fusing
-    int disable_memory_reuse;             // Disable memmory reuse among layers
+    int disable_memory_reuse;             // Disable memory reuse among layers
+    int disable_build_time_weight_reorder_for_dynamic_nodes; // Disable build-time weight reordering for dynamic nodes
     std::set<int64_t> dump_iteration;     // Dump n-th execution of network
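The new knob follows the existing debug-configuration pattern: an integer flag defaulting to 0 that is read from an environment variable via `get_gpu_debug_env_var` (registered in debug_configuration.cpp below). A minimal, self-contained sketch of that contract, assuming the usual `OV_GPU_` prefix for GPU debug variables; `read_debug_flag` is an illustrative helper, not the plugin's actual parsing code:

    // Hypothetical stand-in for the plugin's env-var parsing; only the
    // contract matters: an unset or "0" variable leaves the knob disabled.
    #include <cstdlib>
    #include <string>

    static int read_debug_flag(const std::string& name) {
        const std::string full_name = "OV_GPU_" + name;  // assumed prefix
        const char* value = std::getenv(full_name.c_str());
        return value ? std::atoi(value) : 0;
    }

    int main() {
        // Name matches the entry registered in debug_configuration.cpp below.
        int disable = read_debug_flag("DisableBuildTimeWeightReorderForDynamicNodes");
        return disable ? 1 : 0;  // non-zero: the pass keeps the old runtime-reorder path
    }

Setting the variable to 1 restores the previous behavior, which is useful for bisecting performance or accuracy issues introduced by the build-time reorder.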
     std::vector<std::string> load_layers_raw_dump; // List of layers to load dumped raw binary and filenames
     static const debug_configuration *get_instance();
diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp
index 06b50782588..3790b299a25 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp
@@ -37,9 +37,24 @@ void post_optimize_weights::optimize_weights(T& node, program& p) {
     if (!impl)
         return;

-    if (impl->is_dynamic())
-        return;
-
+    if (impl->is_dynamic()) {
+        GPU_DEBUG_GET_INSTANCE(debug_config);
+        GPU_DEBUG_IF(debug_config->disable_build_time_weight_reorder_for_dynamic_nodes) {
+            return;
+        }
+        // TODO: Relax these limitations as the weight reorder process is optimized further. With dynamic
+        // shapes the selected weight format can change at runtime, and blocked-to-blocked reordering is
+        // not fully verified yet, so primitives such as convolution can only be enabled after verifying
+        // reorders between the possible layouts. We also skip onednn impls: onednn fully connected uses
+        // a simple weight format, so a build-time reorder to the cldnn shape-agnostic kernel's preferred
+        // blocked format does not help performance; this may change once onednn shape-agnostic kernels are used.
+        if (p.is_internal_program())
+            return;
+        if (node.get_preferred_impl_type() == impl_types::onednn)
+            return;
+        if (node.type() != fully_connected::type_id())
+            return;
+    }
     // Don't run impl selection to avoid double compilation of reorder kernels
     // in main program and internal program for constant propagation
     auto set_implementation = [&p, &impl](program_node& weights_reorder_node) {
@@ -69,13 +84,26 @@ void post_optimize_weights::optimize_weights(T& node, program& p) {
                 !prev_node.has_fused_primitives() &&
                 !prev_node.as<reorder>().has_mean() &&
                 prev_node.as<reorder>().get_primitive()->subtract_per_feature.empty();
+            if (impl->is_dynamic()) {
+                if (weights_reorder_params->get_output_layout().compatible(prev_node.get_output_layout())) {
+                    // If compatible, the buffer can be reinterpreted, so there is no need to reorder at build time
+                    continue;
+                }
+                // Need to restore the original shape
+                auto updated_output_layout = weights_reorder_params->get_output_layout();
+                auto orig_rank = prev_node.get_output_layout().get_partial_shape().size();
+                auto weight_format_dims = format::dimension(weights_reorder_params->get_output_layout().format);
+                updated_output_layout.set_partial_shape(
+                    updated_output_layout.get_tensor().get_partial_shape(orig_rank, weight_format_dims));
+                if (updated_output_layout != weights_reorder_params->get_output_layout())
+                    weights_reorder_params->set_output_layout(updated_output_layout);
+            }
             if (can_be_fused) {
                 // Need to update input data_type for correct merging format reorder with precision reorder
                 data_types input_dtype = prev_node.get_input_layouts()[0].data_type;
                 auto updated_input_layout = weights_reorder_params->get_input_layout();
                 updated_input_layout.data_type = input_dtype;
                 weights_reorder_params->set_input_layout(updated_input_layout);
-
                 auto weights_reorder = _rf.get_weights_reorder(prev_node.get_primitive()->input[0].pid, weights_reorder_params);
                 auto& weights_reorder_node = p.get_or_create(weights_reorder.first);
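Two details of the dynamic branch above are easy to miss. First, if the reorder's output layout is `compatible()` with the weights' current layout, the buffer can simply be reinterpreted, so no build-time reorder is emitted. Second, weight formats such as os_iyx_osv16 are described with more dimensions than a 2D fully connected weight tensor, so the reorder's output shape is trimmed back to the original rank before the reorder node is created. A simplified sketch of that decision, assuming layouts with equal element counts are reinterpretable; `Layout` and `needs_build_time_reorder` are illustrative names, not the cldnn API:

    #include <functional>
    #include <numeric>
    #include <vector>

    struct Layout {
        std::vector<std::size_t> shape;  // static dims only, for illustration

        std::size_t count() const {
            return std::accumulate(shape.begin(), shape.end(), std::size_t{1},
                                   std::multiplies<std::size_t>());
        }
        // Rough stand-in for cldnn::layout::compatible(): equal element counts
        // are treated as "reinterpretable without moving data".
        bool compatible(const Layout& other) const { return count() == other.count(); }
    };

    // Returns true if a build-time weight reorder must be emitted; on true,
    // reorder_out is trimmed back to the weights' original rank, mirroring the
    // set_partial_shape(get_partial_shape(orig_rank, weight_format_dims)) call.
    bool needs_build_time_reorder(const Layout& weights, Layout& reorder_out) {
        if (reorder_out.compatible(weights))
            return false;  // same storage: reinterpret at runtime instead
        reorder_out.shape.resize(weights.shape.size(), 1);  // restore original rank
        return true;
    }

    int main() {
        Layout weights{{4, 32}};        // 2D fc weights
        Layout padded{{16, 32, 1, 1}};  // osv16-padded 4D weights-format shape
        return needs_build_time_reorder(weights, padded) ? 0 : 1;  // reorder needed
    }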
diff --git a/src/plugins/intel_gpu/src/graph/program.cpp b/src/plugins/intel_gpu/src/graph/program.cpp
index 285ead0ae7a..f49ce55cf83 100644
--- a/src/plugins/intel_gpu/src/graph/program.cpp
+++ b/src/plugins/intel_gpu/src/graph/program.cpp
@@ -157,6 +157,7 @@ program::program(engine& engine_ref,
       _config(config),
       _task_executor(task_executor),
       processing_order(),
+      is_internal(is_internal),
       is_body_program(is_body_program) {
     _config.apply_user_properties(_engine.get_device_info());
     init_primitives();
@@ -181,7 +182,8 @@ program::program(engine& engine_ref,
       _stream(_engine.create_stream(config)),
       _config(config),
       _task_executor(task_executor),
-      processing_order() {
+      processing_order(),
+      is_internal(is_internal) {
     _config.apply_user_properties(_engine.get_device_info());
     init_primitives();
     init_program();
diff --git a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp
index 86c620d9646..75cdaa176a6 100644
--- a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp
+++ b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp
@@ -188,7 +188,8 @@ debug_configuration::debug_configuration()
     , disable_async_compilation(0)
     , disable_dynamic_impl(0)
     , disable_runtime_buffer_fusing(0)
-    , disable_memory_reuse(0) {
+    , disable_memory_reuse(0)
+    , disable_build_time_weight_reorder_for_dynamic_nodes(0) {
 #ifdef GPU_DEBUG_CONFIG
     get_gpu_debug_env_var("Help", help);
     get_common_debug_env_var("Verbose", verbose);
@@ -222,6 +223,7 @@ debug_configuration::debug_configuration()
     get_gpu_debug_env_var("DisableDynamicImpl", disable_dynamic_impl);
     get_gpu_debug_env_var("DisableRuntimeBufferFusing", disable_runtime_buffer_fusing);
     get_gpu_debug_env_var("DisableMemoryReuse", disable_memory_reuse);
+    get_gpu_debug_env_var("DisableBuildTimeWeightReorderForDynamicNodes", disable_build_time_weight_reorder_for_dynamic_nodes);
     std::string dump_iteration_str;
    get_gpu_debug_env_var("DumpIteration", dump_iteration_str);
     std::string mem_preallocation_params_str;
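One C++ subtlety in the constructor hunks above: `is_internal(is_internal)` initializes the member from the same-named constructor parameter (in a mem-initializer, the name before the parentheses is looked up as a member and the name inside as a parameter), and members are initialized in declaration order, which here matches the list order since `is_internal` is declared before `is_body_program`. A stripped-down illustration:

    #include <iostream>

    class program {
    public:
        program(bool is_internal, bool is_body_program)
            // Member-from-parameter with a single name is well-defined:
            // the target is the member, the argument is the parameter.
            : is_internal(is_internal),
              is_body_program(is_body_program) {}

        bool is_internal_program() const { return is_internal; }

    private:
        bool is_internal;  // declared (and therefore initialized) first
        bool is_body_program;
    };

    int main() {
        program p(/*is_internal=*/true, /*is_body_program=*/false);
        std::cout << std::boolalpha << p.is_internal_program() << '\n';  // true
    }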
diff --git a/src/plugins/intel_gpu/tests/unit/module_tests/kernel_impl_params_relevance_test.cpp b/src/plugins/intel_gpu/tests/unit/module_tests/kernel_impl_params_relevance_test.cpp
index c2c8e7d295b..abfbabce9c9 100644
--- a/src/plugins/intel_gpu/tests/unit/module_tests/kernel_impl_params_relevance_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/module_tests/kernel_impl_params_relevance_test.cpp
@@ -63,8 +63,11 @@ TEST(kernel_impl_params_relevance, weights_layout) {
     auto fc_inst = std::dynamic_pointer_cast<fully_connected_inst>(inst);
     ASSERT_TRUE(fc_inst != nullptr);

-    // 6. Requset instance's weights memory, compare it with original weights buffer and check
+    // 6. The weights memory of the fc node is reordered at build time for the fully_connected_gpu_bf_tiled kernel
+    ASSERT_EQ(fc_inst->get_node().get_dependency(1).get_output_layout().format, format::os_iyx_osv16);
+
+    // 7. Request instance's weights memory, compare it with original weights buffer and check
     // if original layout is used (required for `fully_connected_gpu_bfyx_ref` kernel)
     auto used_weights_memory = fc_inst->weights_memory()->get_layout();
-    ASSERT_EQ(weights_data->get_layout(), used_weights_memory);
+    ASSERT_EQ(weights_data->get_layout().compatible(used_weights_memory), true);
 }
diff --git a/src/plugins/intel_gpu/tests/unit/passes/post_optimize_weights.cpp b/src/plugins/intel_gpu/tests/unit/passes/post_optimize_weights.cpp
index b205f21938f..1a51a2a46a2 100644
--- a/src/plugins/intel_gpu/tests/unit/passes/post_optimize_weights.cpp
+++ b/src/plugins/intel_gpu/tests/unit/passes/post_optimize_weights.cpp
@@ -35,6 +35,36 @@ TEST(post_optimize_weights, fuse_reorder_to_weights_reorder_test) {
     ASSERT_TRUE(format::is_weights_format(prog->get_node("reorder_dt").get_output_layout().format));
 }

+TEST(post_optimize_weights, fuse_reorder_to_weights_reorder_test_dynamic) {
+    auto& engine = get_test_engine();
+    if (engine.get_device_info().supports_immad)
+        return;
+
+    auto weights = engine.allocate_memory({{ 2, 32 }, data_types::f32, format::bfyx });
+
+    auto in_layout = layout{ ov::PartialShape{ov::Dimension(1), ov::Dimension(-1), ov::Dimension(32)}, data_types::f16, format::bfyx };
+
+    topology topology(
+        input_layout("input", in_layout),
+        input_layout("weights", weights->get_layout()),
+        reorder("reorder_dt", input_info("weights"), format::bfyx, data_types::f16),
+        fully_connected("fc", input_info("input"), { "reorder_dt" }, "", data_types::f16, {}, 3)
+    );
+
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::optimize_data(true));
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+    auto prog = program::build_program(engine, topology, config, false, true);
+
+    reorder_factory rf;
+    program_wrapper::apply_opt_pass<compile_graph>(*prog);
+    program_wrapper::apply_opt_pass<post_optimize_weights>(*prog, rf);
+
+    ASSERT_TRUE(has_node(*prog, "reorder_dt"));
+    ASSERT_NE(prog->get_node("fc").get_selected_impl(), nullptr);
+    ASSERT_TRUE(format::is_weights_format(prog->get_node("reorder_dt").get_output_layout().format));
+}
+
 TEST(post_optimize_weights, weights_reorder_constant_folding_test) {
     auto& engine = get_test_engine();

@@ -85,3 +115,55 @@ TEST(post_optimize_weights, weights_reorder_constant_folding_test) {
         ASSERT_EQ(weights_mem[i], expected[i]);
     }
 }
+
+TEST(post_optimize_weights, weights_reorder_constant_folding_test_dynamic) {
+    auto& engine = get_test_engine();
+    if (engine.get_device_info().supports_immad)
+        return;
+    ov::Shape pshape = { 4, 32 };
+    auto in_layout = layout{ ov::PartialShape{ov::Dimension(1), ov::Dimension(-1), ov::Dimension(32)}, data_types::f16, format::bfyx };
+    auto weights = engine.allocate_memory({ pshape, data_types::f32, format::bfyx });
+
+    std::vector<float> weights_data(pshape[0] * pshape[1]);
+    std::iota(weights_data.begin(), weights_data.end(), 0.f);
+    set_values(weights, weights_data);
+
+    topology topology(
+        input_layout("input", in_layout),
+        data("weights", weights),
+        fully_connected("fc", input_info("input"), { "weights" }, "", data_types::f16, {}, 3)
+    );
+
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::optimize_data(true));
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+
+    auto prog = program::build_program(engine, topology, config, false, true);
+
+    reorder_factory rf;
+    program_wrapper::apply_opt_pass<compile_graph>(*prog);
+    program_wrapper::apply_opt_pass<post_optimize_weights>(*prog, rf);
+    program_wrapper::apply_opt_pass<propagate_constants>(*prog);
+
+    ASSERT_TRUE(has_node(*prog, "weights_weights_reorder_0"));
+    auto& weights_node = prog->get_node("weights_weights_reorder_0");
+    ASSERT_TRUE(weights_node.is_type<data>());
+
+    size_t align = 16;  // os_iyx_osv16 format
+    size_t aligned_b_size = pshape[0] % align == 0 ? pshape[0]
+                                                   : pshape[0] - pshape[0] % align + align;
+    std::vector<float> expected(aligned_b_size * pshape[1], 0.f);
+    size_t input_idx = 0;
+    for (size_t i = 0; i < pshape[0]; ++i) {
+        for (size_t j = 0; j < pshape[1]; ++j) {
+            expected[j * align + i] = weights_data[input_idx++];
+        }
+    }
+
+    auto weights_mem_ptr = weights_node.as<data>().get_attached_memory_ptr();
+    cldnn::mem_lock<ov::float16, mem_lock_type::read> weights_mem(weights_mem_ptr, get_test_stream());
+
+    for (size_t i = 0; i < expected.size(); ++i) {
+        ASSERT_EQ(weights_mem[i], expected[i]);
+    }
+}
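The expected-value loop in the dynamic constant-folding test encodes os_iyx_osv16 for the simple case where all output channels fit in one 16-wide block: element (o, i) of the original 2D weights lands at offset i * 16 + o, and the output-channel dimension is zero-padded up to 16. A standalone sketch of that round trip with plain arrays instead of cldnn memory (the indexing only holds while the padded output-channel count equals one osv block):

    #include <cassert>
    #include <cstddef>
    #include <numeric>
    #include <vector>

    int main() {
        const std::size_t O = 4, I = 32, osv = 16;       // weights shape, block size
        std::vector<float> weights(O * I);
        std::iota(weights.begin(), weights.end(), 0.f);  // 0, 1, 2, ...

        const std::size_t O_aligned = (O + osv - 1) / osv * osv;  // 16
        std::vector<float> reordered(O_aligned * I, 0.f);         // zero padding
        for (std::size_t o = 0; o < O; ++o)
            for (std::size_t i = 0; i < I; ++i)
                reordered[i * osv + o] = weights[o * I + i];

        // Spot check: row-major element (1, 2) ends up at 2 * 16 + 1.
        assert(reordered[2 * osv + 1] == weights[1 * I + 2]);
        return 0;
    }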
diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp
index af69b7bc0cb..d96182b974b 100644
--- a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp
@@ -2512,7 +2512,9 @@ TEST(fully_connected_gpu, has_cached_weights_reorder) {
     auto reorder_kernel_params = impl->get_weights_reorder_kernel_params();
     ASSERT_TRUE(reorder_kernel_params != nullptr);
     auto reorder_impl = network.get_program()->get_implementations_cache().get(*reorder_kernel_params);
-    ASSERT_TRUE(reorder_impl != nullptr);
+    // The weight reorder for the cldnn shape-agnostic kernel is done at build time, so the reorder
+    // is no longer in the cache; instead, the weights program_node already carries the preferred format.
+    ASSERT_TRUE(reorder_impl == nullptr);

     auto out_l = network.get_output_layout(outputs.begin()->first);
     ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(input_b, 8)); // fake_alignment
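The last assertion is unchanged and still checks fake alignment: for this kernel, the batch dimension of the output layout is padded up to a multiple of 8. A one-line equivalent of the `align_to` helper as used there, reconstructed from its usage rather than from the cldnn utility headers:

    #include <cstddef>

    // Round v up to the nearest multiple of align (align > 0), matching the
    // test's align_to(input_b, 8) expectation for the fake-aligned batch.
    constexpr std::size_t align_to(std::size_t v, std::size_t align) {
        return (v + align - 1) / align * align;
    }

    static_assert(align_to(1, 8) == 8 && align_to(8, 8) == 8 && align_to(9, 8) == 16,
                  "batch is padded up to the next multiple of 8");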