Add Debug Config for maximum kernels per batch (#12068)
Signed-off-by: Andrew Park <andrew.park@intel.com>
parent: cd6c7da91c
commit: 32937ab7ca
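The new option is exposed through the same OV_GPU_ debug-variable mechanism as the existing settings and only takes effect in builds where GPU_DEBUG_CONFIG is defined. A minimal sketch of enabling it programmatically (assuming the variable must be set before the GPU plugin constructs its debug_configuration singleton; the surrounding application code is illustrative, not part of this change):

#include <cstdlib>

int main() {
    // Set the new debug variable before any GPU plugin code runs, so the
    // debug_configuration singleton picks it up on construction.
    // Values >= 1 override the default batch size of 8 (see kernels_cache below).
    setenv("OV_GPU_MaxKernelsPerBatch", "4", 1);  // POSIX; use _putenv_s on Windows

    // ... create the core object, compile the model on "GPU", and run inference as usual ...
    return 0;
}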
@@ -23,25 +23,27 @@ namespace cldnn {
 class debug_configuration {
 private:
     debug_configuration();

 public:
     static const char *prefix;
     int help; // Print help messages
     int verbose; // Verbose execution
     int print_multi_kernel_perf; // Print execution time of each kernel in multi-kernel primitive
     int disable_usm; // Disable usm usage
     int disable_onednn; // Disable onednn for discrete GPU (no effect for integrated GPU)
     std::string dump_graphs; // Dump optimized graph
     std::string dump_sources; // Dump opencl sources
     std::string dump_layers_path; // Enable dumping intermediate buffers and set the dest path
     std::vector<std::string> dump_layers; // Dump intermediate buffers of specified layers only
     std::string dry_run_path; // Dry run and serialize execution graph into the specified path
     int dump_layers_dst_only; // Dump only output of layers
     int dump_layers_result; // Dump result layers
     int dump_layers_limit_batch; // Limit the size of batch to dump
     int base_batch_for_memory_estimation; // Base batch size to be used in memory estimation
     std::vector<std::string> after_proc; // Start inference after the listed processes
     int serialize_compile; // Serialize creating primitives and compiling kernels
     std::string forced_impl_type; // Force implementation type either ocl or onednn
+    int max_kernels_per_batch; // Maximum number of kernels in a batch during compiling kernels
     static const debug_configuration *get_instance();
     bool is_dumped_layer(const std::string& layerName, bool is_output = false) const;
 };
@@ -120,6 +120,7 @@ static void print_help_messages() {
     message_list.emplace_back("OV_GPU_SerialCompile", "Serialize creating primitives and compiling kernels");
     message_list.emplace_back("OV_GPU_ForceImplType", "Force implementation type of a target primitive or layer. [primitive or layout_name]:[impl_type]"
                               "For primitives, fc:onednn, fc:ocl, do:cpu, do:ocl, reduce:ocl and reduce:onednn are supported");
+    message_list.emplace_back("OV_GPU_MaxKernelsPerBatch", "Maximum number of kernels in a batch during compiling kernels");

     auto max_name_length_item = std::max_element(message_list.begin(), message_list.end(),
         [](std::pair<std::string, std::string>& a, std::pair<std::string, std::string>& b){
@@ -150,7 +151,8 @@ debug_configuration::debug_configuration()
     , dump_layers_limit_batch(std::numeric_limits<int>::max())
     , base_batch_for_memory_estimation(-1)
     , serialize_compile(0)
-    , forced_impl_type(std::string()) {
+    , forced_impl_type(std::string())
+    , max_kernels_per_batch(0) {
 #ifdef GPU_DEBUG_CONFIG
     get_gpu_debug_env_var("Help", help);
     get_common_debug_env_var("Verbose", verbose);
@@ -171,6 +173,7 @@ debug_configuration::debug_configuration()
     get_gpu_debug_env_var("AfterProc", after_proc_str);
     get_gpu_debug_env_var("SerialCompile", serialize_compile);
     get_gpu_debug_env_var("ForceImplType", forced_impl_type);
+    get_gpu_debug_env_var("MaxKernelsPerBatch", max_kernels_per_batch);

     if (help > 0) {
         print_help_messages();
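The implementation of get_gpu_debug_env_var is not part of this diff. As a rough illustration only, a reader of that kind (assuming it prepends the OV_GPU_ prefix and stream-parses the value into the caller's variable; the helper name below is hypothetical) might look like:

#include <cstdlib>
#include <sstream>
#include <string>

// Hypothetical helper, not the actual OpenVINO implementation: look up
// "OV_GPU_<name>" in the environment and parse it into the target variable.
template <typename T>
static void read_debug_env_var(const std::string& name, T& value) {
    if (const char* env = std::getenv(("OV_GPU_" + name).c_str())) {
        std::istringstream ss(env);
        ss >> value;  // e.g. OV_GPU_MaxKernelsPerBatch=4 -> value == 4
    }
}

With such a helper, an unset variable simply leaves the constructor default in place, which for the new member is max_kernels_per_batch(0).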
@@ -148,6 +148,10 @@ bool kernels_cache::is_cache_enabled() const {
 }

 size_t kernels_cache::get_max_kernels_per_batch() const {
+    GPU_DEBUG_GET_INSTANCE(debug_config);
+    GPU_DEBUG_IF(debug_config->max_kernels_per_batch >= 1) {
+        return static_cast<size_t>(debug_config->max_kernels_per_batch);
+    }
     return 8;
 }
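The batching code that consumes get_max_kernels_per_batch() is outside this diff. As a hedged sketch of the idea (helper name and types are illustrative, not taken from kernels_cache), splitting kernel sources into compilation batches of at most that size could look like:

#include <algorithm>
#include <cstddef>
#include <string>
#include <vector>

// Illustrative only: group kernel sources into batches of at most `limit`
// entries, so each batch can be built as one OpenCL program.
static std::vector<std::vector<std::string>> split_into_batches(const std::vector<std::string>& sources,
                                                                size_t limit) {
    if (limit == 0)
        limit = 8;  // defensive: mirror the hard-coded default above
    std::vector<std::vector<std::string>> batches;
    for (size_t i = 0; i < sources.size(); i += limit) {
        const size_t end = std::min(i + limit, sources.size());
        batches.emplace_back(sources.begin() + i, sources.begin() + end);
    }
    return batches;
}

With OV_GPU_MaxKernelsPerBatch=1 every kernel would land in its own batch, which can make it easier to pinpoint a source that fails to compile; when the variable is unset or zero, the default of 8 kernels per batch is kept.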