[GPU] Add debug config for disabled primitive fusing (#18938)
* Add debug config for disabled primitive fusing
* Apply comment
* Fix failed TCs on CI
This commit is contained in:
parent
d278ff5786
commit
a2807f1edb
@ -94,40 +94,41 @@ private:
|
||||
|
||||
public:
|
||||
static const char *prefix;
|
||||
int help; // Print help messages
|
||||
int verbose; // Verbose execution
|
||||
int verbose_color; // Print verbose color
|
||||
int list_layers; // Print list layers
|
||||
int print_multi_kernel_perf; // Print execution time of each kernel in multi-kernel primitive
|
||||
int disable_usm; // Disable usm usage
|
||||
int disable_onednn; // Disable onednn for discrete GPU (no effect for integrated GPU)
|
||||
int disable_onednn_opt_post_ops; // Disable onednn optimize post operators
|
||||
std::string dump_profiling_data; // Enables dump of extended performance profiling to specified dir
|
||||
std::string dump_graphs; // Dump optimized graph
|
||||
std::string dump_sources; // Dump opencl sources
|
||||
std::string dump_layers_path; // Enable dumping intermediate buffers and set the dest path
|
||||
std::vector<std::string> dump_layers; // Dump intermediate buffers of specified layers only
|
||||
std::string dry_run_path; // Dry run and serialize execution graph into the specified path
|
||||
int dump_layers_dst_only; // Dump only output of layers
|
||||
int dump_layers_result; // Dump result layers
|
||||
int dump_layers_input; // Dump input layers
|
||||
int dump_layers_limit_batch; // Limit the size of batch to dump
|
||||
int dump_layers_raw; // Dump raw data.
|
||||
int dump_layers_binary; // Dump binary data.
|
||||
int dump_runtime_memory_pool; // Dump memory pool status at each iteration
|
||||
int base_batch_for_memory_estimation; // Base batch size to be used in memory estimation
|
||||
std::vector<std::string> after_proc; // Start inference after the listed processes
|
||||
int serialize_compile; // Serialize creating primitives and compiling kernels
|
||||
std::vector<std::string> forced_impl_types; // Force implementation type either ocl or onednn
|
||||
int max_kernels_per_batch; // Maximum number of kernels in a batch during compiling kernels
|
||||
int disable_async_compilation; // Disable async compilation
|
||||
int disable_dynamic_impl; // Disable dynamic implementation
|
||||
int disable_runtime_buffer_fusing; // Disable runtime buffer fusing
|
||||
int disable_memory_reuse; // Disable memory reuse among layers
|
||||
int disable_build_time_weight_reorder_for_dynamic_nodes; // Disable build time weight reordering for dynamic nodes
|
||||
int disable_runtime_skip_reorder; // Disable runtime skip reorder
|
||||
std::set<int64_t> dump_iteration; // Dump n-th execution of network.
|
||||
std::vector<std::string> load_layers_raw_dump; // List of layers to load dumped raw binary and filenames
|
||||
int help; // Print help messages
|
||||
int verbose; // Verbose execution
|
||||
int verbose_color; // Print verbose color
|
||||
int list_layers; // Print list layers
|
||||
int print_multi_kernel_perf; // Print execution time of each kernel in multi-kernel primitive
|
||||
int disable_usm; // Disable usm usage
|
||||
int disable_onednn; // Disable onednn for discrete GPU (no effect for integrated GPU)
|
||||
int disable_onednn_opt_post_ops; // Disable onednn optimize post operators
|
||||
std::string dump_profiling_data; // Enables dump of extended performance profiling to specified dir
|
||||
std::string dump_graphs; // Dump optimized graph
|
||||
std::string dump_sources; // Dump opencl sources
|
||||
std::string dump_layers_path; // Enable dumping intermediate buffers and set the dest path
|
||||
std::vector<std::string> dump_layers; // Dump intermediate buffers of specified layers only
|
||||
std::string dry_run_path; // Dry run and serialize execution graph into the specified path
|
||||
int dump_layers_dst_only; // Dump only output of layers
|
||||
int dump_layers_result; // Dump result layers
|
||||
int dump_layers_input; // Dump input layers
|
||||
int dump_layers_limit_batch; // Limit the size of batch to dump
|
||||
int dump_layers_raw; // Dump raw data.
|
||||
int dump_layers_binary; // Dump binary data.
|
||||
int dump_runtime_memory_pool; // Dump memory pool status at each iteration
|
||||
int base_batch_for_memory_estimation; // Base batch size to be used in memory estimation
|
||||
std::vector<std::string> after_proc; // Start inference after the listed processes
|
||||
int serialize_compile; // Serialize creating primitives and compiling kernels
|
||||
std::vector<std::string> forced_impl_types; // Force implementation type either ocl or onednn
|
||||
int max_kernels_per_batch; // Maximum number of kernels in a batch during compiling kernels
|
||||
int disable_async_compilation; // Disable async compilation
|
||||
int disable_dynamic_impl; // Disable dynamic implementation
|
||||
int disable_runtime_buffer_fusing; // Disable runtime buffer fusing
|
||||
int disable_memory_reuse; // Disable memory reuse among layers
|
||||
int disable_build_time_weight_reorder_for_dynamic_nodes; // Disable build time weight reordering for dynamic nodes
|
||||
int disable_runtime_skip_reorder; // Disable runtime skip reorder
|
||||
int disable_primitive_fusing; // Disable primitive fusing
|
||||
std::set<int64_t> dump_iteration; // Dump n-th execution of network.
|
||||
std::vector<std::string> load_layers_raw_dump; // List of layers to load dumped raw binary and filenames
|
||||
static const debug_configuration *get_instance();
|
||||
std::vector<std::string> get_filenames_for_matched_layer_loading_binaries(const std::string& id) const;
|
||||
std::string get_name_for_dump(const std::string& file_name) const;
|
||||
|
@ -543,11 +543,24 @@ void program::pre_optimize_graph(bool is_internal) {
|
||||
|
||||
reorder_factory rf;
|
||||
if (optimize_data) {
|
||||
apply_opt_pass<prepare_primitive_fusing_through>();
|
||||
GPU_DEBUG_GET_INSTANCE(debug_config);
|
||||
#ifdef GPU_DEBUG_CONFIG
|
||||
GPU_DEBUG_IF(!debug_config->disable_primitive_fusing) {
|
||||
#else
|
||||
{
|
||||
#endif
|
||||
apply_opt_pass<prepare_primitive_fusing_through>();
|
||||
}
|
||||
|
||||
apply_opt_pass<pre_replace_deconv>(lo);
|
||||
|
||||
apply_opt_pass<prepare_primitive_fusing>(lo);
|
||||
#ifdef GPU_DEBUG_CONFIG
|
||||
GPU_DEBUG_IF(!debug_config->disable_primitive_fusing) {
|
||||
#else
|
||||
{
|
||||
#endif
|
||||
apply_opt_pass<prepare_primitive_fusing>(lo);
|
||||
}
|
||||
|
||||
apply_opt_pass<select_preferred_formats>(lo);
|
||||
|
||||
|
@ -137,6 +137,7 @@ static void print_help_messages() {
|
||||
message_list.emplace_back("OV_GPU_DisableDynamicImpl", "Disable dynamic implementation");
|
||||
message_list.emplace_back("OV_GPU_DisableRuntimeBufferFusing", "Disable runtime buffer fusing");
|
||||
message_list.emplace_back("OV_GPU_DisableMemoryReuse", "Disable memory reuse");
|
||||
message_list.emplace_back("OV_GPU_DisablePrimitiveFusing", "Disable primitive fusing");
|
||||
message_list.emplace_back("OV_GPU_DumpIteration", "Dump n-th execution of network, separated by space.");
|
||||
message_list.emplace_back("OV_GPU_MemPreallocationOptions", "Controls buffer pre-allocation feature. Expects 4 values separated by space in"
|
||||
"the following order: number of iterations for pre-allocation(int), max size of single iteration in bytes(int), "
|
||||
@ -191,7 +192,8 @@ debug_configuration::debug_configuration()
|
||||
, disable_runtime_buffer_fusing(0)
|
||||
, disable_memory_reuse(0)
|
||||
, disable_build_time_weight_reorder_for_dynamic_nodes(0)
|
||||
, disable_runtime_skip_reorder(0) {
|
||||
, disable_runtime_skip_reorder(0)
|
||||
, disable_primitive_fusing(0) {
|
||||
#ifdef GPU_DEBUG_CONFIG
|
||||
get_gpu_debug_env_var("Help", help);
|
||||
get_common_debug_env_var("Verbose", verbose);
|
||||
@ -228,6 +230,7 @@ debug_configuration::debug_configuration()
|
||||
get_gpu_debug_env_var("DisableMemoryReuse", disable_memory_reuse);
|
||||
get_gpu_debug_env_var("DisableBuildTimeWeightReorderForDynamicNodes", disable_build_time_weight_reorder_for_dynamic_nodes);
|
||||
get_gpu_debug_env_var("DisableRuntimeSkipReorder", disable_runtime_skip_reorder);
|
||||
get_gpu_debug_env_var("DisablePrimitiveFusing", disable_primitive_fusing);
|
||||
std::string dump_iteration_str;
|
||||
get_gpu_debug_env_var("DumpIteration", dump_iteration_str);
|
||||
std::string mem_preallocation_params_str;
|
||||
|
Loading…
Reference in New Issue
Block a user