[GPU] Add debug config for disabled primitive fusing (#18938)

* Add debug config for disabled primitive fusing

* Apply comment

* Fix failed TCs on CI
This commit is contained in:
Andrew Kwangwoong Park 2023-08-08 10:02:48 +09:00 committed by GitHub
parent d278ff5786
commit a2807f1edb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 54 additions and 37 deletions

View File

@ -94,40 +94,41 @@ private:
public:
static const char *prefix;
int help; // Print help messages
int verbose; // Verbose execution
int verbose_color; // Print verbose color
int list_layers; // Print list layers
int print_multi_kernel_perf; // Print execution time of each kernel in multi-kernel primitive
int disable_usm; // Disable usm usage
int disable_onednn; // Disable onednn for discrete GPU (no effect for integrated GPU)
int disable_onednn_opt_post_ops; // Disable onednn optimize post operators
std::string dump_profiling_data; // Enables dump of extended performance profiling to specified dir
std::string dump_graphs; // Dump optimized graph
std::string dump_sources; // Dump opencl sources
std::string dump_layers_path; // Enable dumping intermediate buffers and set the dest path
std::vector<std::string> dump_layers; // Dump intermediate buffers of specified layers only
std::string dry_run_path; // Dry run and serialize execution graph into the specified path
int dump_layers_dst_only; // Dump only output of layers
int dump_layers_result; // Dump result layers
int dump_layers_input; // Dump input layers
int dump_layers_limit_batch; // Limit the size of batch to dump
int dump_layers_raw; // Dump raw data.
int dump_layers_binary; // Dump binary data.
int dump_runtime_memory_pool; // Dump memory pool status at each iteration
int base_batch_for_memory_estimation; // Base batch size to be used in memory estimation
std::vector<std::string> after_proc; // Start inference after the listed processes
int serialize_compile; // Serialize creating primitives and compiling kernels
std::vector<std::string> forced_impl_types; // Force implementation type either ocl or onednn
int max_kernels_per_batch; // Maximum number of kernels in a batch during compiling kernels
int disable_async_compilation; // Disable async compilation
int disable_dynamic_impl; // Disable dynamic implementation
int disable_runtime_buffer_fusing; // Disable runtime buffer fusing
int disable_memory_reuse; // Disable memory reuse among layers
int disable_build_time_weight_reorder_for_dynamic_nodes; // Disable build time weight reordering for dynamic nodes
int disable_runtime_skip_reorder; // Disable runtime skip reorder
std::set<int64_t> dump_iteration; // Dump n-th execution of network.
std::vector<std::string> load_layers_raw_dump; // List of layers to load dumped raw binary and filenames
int help; // Print help messages
int verbose; // Verbose execution
int verbose_color; // Print verbose color
int list_layers; // Print list layers
int print_multi_kernel_perf; // Print execution time of each kernel in multi-kernel primitive
int disable_usm; // Disable usm usage
int disable_onednn; // Disable onednn for discrete GPU (no effect for integrated GPU)
int disable_onednn_opt_post_ops; // Disable onednn optimize post operators
std::string dump_profiling_data; // Enables dump of extended performance profiling to specified dir
std::string dump_graphs; // Dump optimized graph
std::string dump_sources; // Dump opencl sources
std::string dump_layers_path; // Enable dumping intermediate buffers and set the dest path
std::vector<std::string> dump_layers; // Dump intermediate buffers of specified layers only
std::string dry_run_path; // Dry run and serialize execution graph into the specified path
int dump_layers_dst_only; // Dump only output of layers
int dump_layers_result; // Dump result layers
int dump_layers_input; // Dump input layers
int dump_layers_limit_batch; // Limit the size of batch to dump
int dump_layers_raw; // Dump raw data.
int dump_layers_binary; // Dump binary data.
int dump_runtime_memory_pool; // Dump memory pool status at each iteration
int base_batch_for_memory_estimation; // Base batch size to be used in memory estimation
std::vector<std::string> after_proc; // Start inference after the listed processes
int serialize_compile; // Serialize creating primitives and compiling kernels
std::vector<std::string> forced_impl_types; // Force implementation type either ocl or onednn
int max_kernels_per_batch; // Maximum number of kernels in a batch during compiling kernels
int disable_async_compilation; // Disable async compilation
int disable_dynamic_impl; // Disable dynamic implementation
int disable_runtime_buffer_fusing; // Disable runtime buffer fusing
int disable_memory_reuse; // Disable memory reuse among layers
int disable_build_time_weight_reorder_for_dynamic_nodes; // Disable build time weight reordering for dynamic nodes
int disable_runtime_skip_reorder; // Disable runtime skip reorder
int disable_primitive_fusing; // Disable primitive fusing
std::set<int64_t> dump_iteration; // Dump n-th execution of network.
std::vector<std::string> load_layers_raw_dump; // List of layers to load dumped raw binary and filenames
static const debug_configuration *get_instance();
std::vector<std::string> get_filenames_for_matched_layer_loading_binaries(const std::string& id) const;
std::string get_name_for_dump(const std::string& file_name) const;

View File

@ -543,11 +543,24 @@ void program::pre_optimize_graph(bool is_internal) {
reorder_factory rf;
if (optimize_data) {
apply_opt_pass<prepare_primitive_fusing_through>();
GPU_DEBUG_GET_INSTANCE(debug_config);
#ifdef GPU_DEBUG_CONFIG
GPU_DEBUG_IF(!debug_config->disable_primitive_fusing) {
#else
{
#endif
apply_opt_pass<prepare_primitive_fusing_through>();
}
apply_opt_pass<pre_replace_deconv>(lo);
apply_opt_pass<prepare_primitive_fusing>(lo);
#ifdef GPU_DEBUG_CONFIG
GPU_DEBUG_IF(!debug_config->disable_primitive_fusing) {
#else
{
#endif
apply_opt_pass<prepare_primitive_fusing>(lo);
}
apply_opt_pass<select_preferred_formats>(lo);

View File

@ -137,6 +137,7 @@ static void print_help_messages() {
message_list.emplace_back("OV_GPU_DisableDynamicImpl", "Disable dynamic implementation");
message_list.emplace_back("OV_GPU_DisableRuntimeBufferFusing", "Disable runtime buffer fusing");
message_list.emplace_back("OV_GPU_DisableMemoryReuse", "Disable memory reuse");
message_list.emplace_back("OV_GPU_DisablePrimitiveFusing", "Disable primitive fusing");
message_list.emplace_back("OV_GPU_DumpIteration", "Dump n-th execution of network, separated by space.");
message_list.emplace_back("OV_GPU_MemPreallocationOptions", "Controls buffer pre-allocation feature. Expects 4 values separated by space in"
"the following order: number of iterations for pre-allocation(int), max size of single iteration in bytes(int), "
@ -191,7 +192,8 @@ debug_configuration::debug_configuration()
, disable_runtime_buffer_fusing(0)
, disable_memory_reuse(0)
, disable_build_time_weight_reorder_for_dynamic_nodes(0)
, disable_runtime_skip_reorder(0) {
, disable_runtime_skip_reorder(0)
, disable_primitive_fusing(0) {
#ifdef GPU_DEBUG_CONFIG
get_gpu_debug_env_var("Help", help);
get_common_debug_env_var("Verbose", verbose);
@ -228,6 +230,7 @@ debug_configuration::debug_configuration()
get_gpu_debug_env_var("DisableMemoryReuse", disable_memory_reuse);
get_gpu_debug_env_var("DisableBuildTimeWeightReorderForDynamicNodes", disable_build_time_weight_reorder_for_dynamic_nodes);
get_gpu_debug_env_var("DisableRuntimeSkipReorder", disable_runtime_skip_reorder);
get_gpu_debug_env_var("DisablePrimitiveFusing", disable_primitive_fusing);
std::string dump_iteration_str;
get_gpu_debug_env_var("DumpIteration", dump_iteration_str);
std::string mem_preallocation_params_str;