[GPU] Do weight reorder for fc shape agnostic kernels at build time (#18829)

* Do weight reorder at build time

* Add test
Taylor Yeonbok Lee 2023-07-31 20:40:06 -07:00 committed by GitHub
parent 5813b6d27a
commit 974ef62ce6
9 changed files with 133 additions and 10 deletions

@@ -151,6 +151,7 @@ public:
return outputs;
} // ToDo: redesign reorder-inputs pass to make it const as well as get_engine and get options
bool is_loop_body() const { return is_body_program; }
bool is_internal_program() const { return is_internal; }
const nodes_ordering& get_processing_order() const;
nodes_ordering& get_processing_order();
uint32_t get_prog_id() { return prog_id; }
@@ -278,6 +279,7 @@ private:
std::vector<program_node*> outputs;
nodes_ordering processing_order;
std::unique_ptr<pass_manager> pm;
bool is_internal;
bool is_body_program;
std::unique_ptr<ImplementationsCache> _impls_cache;
const size_t _impls_cache_capacity = 10000;

@@ -49,6 +49,7 @@ struct WeightsReorderParams {
bool get_grouped() const { return _grouped; }
void set_input_layout(const layout& layout) { _in_layout = layout; }
void set_output_layout(const layout& layout) { _out_layout = layout; }
protected:
layout _in_layout;

@@ -122,7 +122,8 @@ public:
int disable_async_compilation; // Disable async compilation
int disable_dynamic_impl; // Disable dynamic implementation
int disable_runtime_buffer_fusing; // Disable runtime buffer fusing
int disable_memory_reuse; // Disable memory reuse among layers
int disable_build_time_weight_reorder_for_dynamic_nodes; // Disable build time weight reordering for dynamic nodes
std::set<int64_t> dump_iteration; // Dump n-th execution of network.
std::vector<std::string> load_layers_raw_dump; // List of layers to load dumped raw binary and filenames
static const debug_configuration *get_instance();

@@ -37,9 +37,24 @@ void post_optimize_weights::optimize_weights(T& node, program& p) {
if (!impl)
return;
if (impl->is_dynamic())
return;
if (impl->is_dynamic()) {
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->disable_build_time_weight_reorder_for_dynamic_nodes) {
return;
}
// TODO: Relax the current limitation in a future optimization of the weight reorder process.
// For dynamic shapes, the selected weight format can change at runtime, and reordering from one blocked
// format to another is not fully verified yet, so other primitives such as convolution can only be
// enabled after verifying reorders between the possible layouts.
// We also skip the weight reorder for onednn impls: the onednn fully connected layer uses a simple format,
// so reordering to the cldnn shape-agnostic kernel's preferred blocked format at build time does not help performance.
// This may change once an onednn shape-agnostic kernel is used in the future.
if (p.is_internal_program())
return;
if (node.get_preferred_impl_type() == impl_types::onednn)
return;
if (node.type() != fully_connected::type_id())
return;
}
// Don't run impl selection to avoid double compilation of reorder kernels
// in main program and internal program for constant propagation
auto set_implementation = [&p, &impl](program_node& weights_reorder_node) {
@@ -69,13 +84,26 @@ void post_optimize_weights::optimize_weights(T& node, program& p) {
!prev_node.has_fused_primitives() &&
!prev_node.as<reorder>().has_mean() &&
prev_node.as<reorder>().get_primitive()->subtract_per_feature.empty();
if (impl->is_dynamic()) {
if (weights_reorder_params->get_output_layout().compatible(prev_node.get_output_layout())) {
// If the layouts are compatible, the buffer can simply be reinterpreted, so no build-time reorder is needed
continue;
}
// Need to restore the original shape
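// The reordered layout is expressed in the weights format's own rank, which may differ
// from the node's original rank, so reinterpret the tensor back to the original rank.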
auto updated_output_layout = weights_reorder_params->get_output_layout();
auto orig_rank = prev_node.get_output_layout().get_partial_shape().size();
auto weight_format_dims = format::dimension(weights_reorder_params->get_output_layout().format);
updated_output_layout.set_partial_shape(
updated_output_layout.get_tensor().get_partial_shape(orig_rank, weight_format_dims));
if (updated_output_layout != weights_reorder_params->get_output_layout())
weights_reorder_params->set_output_layout(updated_output_layout);
}
if (can_be_fused) {
// Need to update input data_type for correct merging format reorder with precision reorder
data_types input_dtype = prev_node.get_input_layouts()[0].data_type;
auto updated_input_layout = weights_reorder_params->get_input_layout();
updated_input_layout.data_type = input_dtype;
weights_reorder_params->set_input_layout(updated_input_layout);
auto weights_reorder = _rf.get_weights_reorder(prev_node.get_primitive()->input[0].pid,
weights_reorder_params);
auto& weights_reorder_node = p.get_or_create(weights_reorder.first);

@@ -157,6 +157,7 @@ program::program(engine& engine_ref,
_config(config),
_task_executor(task_executor),
processing_order(),
is_internal(is_internal),
is_body_program(is_body_program) {
_config.apply_user_properties(_engine.get_device_info());
init_primitives();
@@ -181,7 +182,8 @@ program::program(engine& engine_ref,
_stream(_engine.create_stream(config)),
_config(config),
_task_executor(task_executor),
processing_order() {
processing_order(),
is_internal(is_internal) {
_config.apply_user_properties(_engine.get_device_info());
init_primitives();
init_program();

@@ -188,7 +188,8 @@ debug_configuration::debug_configuration()
, disable_async_compilation(0)
, disable_dynamic_impl(0)
, disable_runtime_buffer_fusing(0)
, disable_memory_reuse(0) {
, disable_memory_reuse(0)
, disable_build_time_weight_reorder_for_dynamic_nodes(0) {
#ifdef GPU_DEBUG_CONFIG
get_gpu_debug_env_var("Help", help);
get_common_debug_env_var("Verbose", verbose);
@@ -222,6 +223,7 @@ debug_configuration::debug_configuration()
get_gpu_debug_env_var("DisableDynamicImpl", disable_dynamic_impl);
get_gpu_debug_env_var("DisableRuntimeBufferFusing", disable_runtime_buffer_fusing);
get_gpu_debug_env_var("DisableMemoryReuse", disable_memory_reuse);
get_gpu_debug_env_var("DisableBuildTimeWeightReorderForDynamicNodes", disable_build_time_weight_reorder_for_dynamic_nodes);
std::string dump_iteration_str;
get_gpu_debug_env_var("DumpIteration", dump_iteration_str);
std::string mem_preallocation_params_str;

@@ -63,8 +63,11 @@ TEST(kernel_impl_params_relevance, weights_layout) {
auto fc_inst = std::dynamic_pointer_cast<fully_connected_inst>(inst);
ASSERT_TRUE(fc_inst != nullptr);
// 6. Request instance's weights memory, compare it with original weights buffer and check
// 6. The weight memory of the fc node is reordered at build time for the fully_connected_gpu_bf_tiled kernel
ASSERT_EQ(fc_inst->get_node().get_dependency(1).get_output_layout().format, format::os_iyx_osv16);
// 7. Request instance's weights memory, compare it with original weights buffer and check
// if original layout is used (required for `fully_connected_gpu_bfyx_ref` kernel)
auto used_weights_memory = fc_inst->weights_memory()->get_layout();
ASSERT_EQ(weights_data->get_layout(), used_weights_memory);
ASSERT_EQ(weights_data->get_layout().compatible(used_weights_memory), true);
}

@@ -35,6 +35,36 @@ TEST(post_optimize_weights, fuse_reorder_to_weights_reorder_test) {
ASSERT_TRUE(format::is_weights_format(prog->get_node("reorder_dt").get_output_layout().format));
}
TEST(post_optimize_weights, fuse_reorder_to_weights_reorder_test_dynamic) {
auto& engine = get_test_engine();
if (engine.get_device_info().supports_immad)
return;
auto weights = engine.allocate_memory({{ 2, 32 }, data_types::f32, format::bfyx });
auto in_layout = layout{ ov::PartialShape{ov::Dimension(1), ov::Dimension(-1), ov::Dimension(32)}, data_types::f16, format::bfyx };
topology topology(
input_layout("input", in_layout),
input_layout("weights", weights->get_layout()),
reorder("reorder_dt", input_info("weights"), format::bfyx, data_types::f16),
fully_connected("fc", input_info("input"), { "reorder_dt" }, "", data_types::f16, {}, 3)
);
ExecutionConfig config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::optimize_data(true));
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
auto prog = program::build_program(engine, topology, config, false, true);
reorder_factory rf;
program_wrapper::apply_opt_pass<compile_graph>(*prog);
program_wrapper::apply_opt_pass<post_optimize_weights>(*prog, rf);
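// post_optimize_weights should fuse the data-type reorder into a build-time weights reorder,
// leaving "reorder_dt" with a weights format (checked below)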
ASSERT_TRUE(has_node(*prog, "reorder_dt"));
ASSERT_NE(prog->get_node("fc").get_selected_impl(), nullptr);
ASSERT_TRUE(format::is_weights_format(prog->get_node("reorder_dt").get_output_layout().format));
}
TEST(post_optimize_weights, weights_reorder_constant_folding_test) {
auto& engine = get_test_engine();
@@ -85,3 +115,55 @@ TEST(post_optimize_weights, weights_reorder_constant_folding_test) {
ASSERT_EQ(weights_mem[i], expected[i]);
}
}
TEST(post_optimize_weights, weights_reorder_constant_folding_test_dynamic) {
auto& engine = get_test_engine();
if (engine.get_device_info().supports_immad)
return;
ov::Shape pshape = { 4, 32 };
auto in_layout = layout{ ov::PartialShape{ov::Dimension(1), ov::Dimension(-1), ov::Dimension(32)}, data_types::f16, format::bfyx };
auto weights = engine.allocate_memory({pshape, data_types::f32, format::bfyx });
std::vector<float> weights_data(pshape[0] * pshape[1]);
std::iota(weights_data.begin(), weights_data.end(), 0.f);
set_values(weights, weights_data);
topology topology(
input_layout("input", in_layout),
data("weights", weights),
fully_connected("fc", input_info("input"), { "weights" }, "", data_types::f16, {}, 3)
);
ExecutionConfig config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::optimize_data(true));
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
auto prog = program::build_program(engine, topology, config, false, true);
reorder_factory rf;
program_wrapper::apply_opt_pass<compile_graph>(*prog);
program_wrapper::apply_opt_pass<post_optimize_weights>(*prog, rf);
program_wrapper::apply_opt_pass<propagate_constants>(*prog);
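// propagate_constants folds the build-time weights reorder, so the reordered weights
// end up as a constant data node (checked below)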
ASSERT_TRUE(has_node(*prog, "weights_weights_reorder_0"));
auto& weights_node = prog->get_node("weights_weights_reorder_0");
ASSERT_TRUE(weights_node.is_type<data>());
size_t align = 16; // os_iyx_osv16 format
size_t aligned_b_size = pshape[0] % align == 0 ? pshape[0]
: pshape[0] - pshape[0] % align + align;
std::vector<float> expected(aligned_b_size * pshape[1], 0.f);
size_t input_idx = 0;
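// os_iyx_osv16 stores blocks of 16 output channels innermost; with 4 output channels
// (padded to 16), original element (o, i) lands at offset i * 16 + o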
for (size_t i = 0; i < pshape[0]; ++i) {
for (size_t j = 0; j < pshape[1]; ++j) {
expected[j * align + i] = weights_data[input_idx++];
}
}
auto weights_mem_ptr = weights_node.as<data>().get_attached_memory_ptr();
cldnn::mem_lock<float, mem_lock_type::read> weights_mem(weights_mem_ptr, get_test_stream());
for (size_t i = 0; i < expected.size(); ++i) {
ASSERT_EQ(weights_mem[i], expected[i]);
}
}

@@ -2512,7 +2512,9 @@ TEST(fully_connected_gpu, has_cached_weights_reorder) {
auto reorder_kernel_params = impl->get_weights_reorder_kernel_params();
ASSERT_TRUE(reorder_kernel_params != nullptr);
auto reorder_impl = network.get_program()->get_implementations_cache().get(*reorder_kernel_params);
ASSERT_TRUE(reorder_impl != nullptr);
// The weight reorder for the cldnn shape-agnostic kernel is done at build time,
// so the reorder is no longer in the cache, but the weight data's program_node is already in the preferred format
ASSERT_TRUE(reorder_impl == nullptr);
auto out_l = network.get_output_layout(outputs.begin()->first);
ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(input_b, 8)); // fake_alignment