Add Debug Config for maximum kernels per batch (#12068)

Signed-off-by: Andrew Park <andrew.park@intel.com>
This commit is contained in:
Andrew Kwangwoong Park 2022-07-07 20:26:51 +09:00 committed by GitHub
parent cd6c7da91c
commit 32937ab7ca
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 27 additions and 18 deletions

View File

@ -23,25 +23,27 @@ namespace cldnn {
// Process-wide debug knobs for the GPU (cldnn) plugin, populated once from
// OV_GPU_* environment variables. Obtain the singleton via get_instance();
// fields are set during construction and treated as read-only afterwards.
//
// NOTE(review): the diff rendering had duplicated the member list (pre- and
// post-change copies); only the post-change set is kept here so the class
// has each member declared exactly once.
class debug_configuration {
private:
    debug_configuration();                      // Singleton: constructed by get_instance()
public:
    static const char *prefix;                  // Log-message prefix used when printing debug output
    int help;                                   // Print help messages
    int verbose;                                // Verbose execution
    int print_multi_kernel_perf;                // Print execution time of each kernel in multi-kernel primitive
    int disable_usm;                            // Disable usm usage
    int disable_onednn;                         // Disable onednn for discrete GPU (no effect for integrated GPU)
    std::string dump_graphs;                    // Dump optimized graph
    std::string dump_sources;                   // Dump opencl sources
    std::string dump_layers_path;               // Enable dumping intermediate buffers and set the dest path
    std::vector<std::string> dump_layers;       // Dump intermediate buffers of specified layers only
    std::string dry_run_path;                   // Dry run and serialize execution graph into the specified path
    int dump_layers_dst_only;                   // Dump only output of layers
    int dump_layers_result;                     // Dump result layers
    int dump_layers_limit_batch;                // Limit the size of batch to dump
    int base_batch_for_memory_estimation;       // Base batch size to be used in memory estimation
    std::vector<std::string> after_proc;        // Start inference after the listed processes
    int serialize_compile;                      // Serialize creating primitives and compiling kernels
    std::string forced_impl_type;               // Force implementation type either ocl or onednn
    int max_kernels_per_batch;                  // Maximum number of kernels in a batch during compiling kernels
    static const debug_configuration *get_instance();
    bool is_dumped_layer(const std::string& layerName, bool is_output = false) const;
};

View File

@ -120,6 +120,7 @@ static void print_help_messages() {
message_list.emplace_back("OV_GPU_SerialCompile", "Serialize creating primitives and compiling kernels");
message_list.emplace_back("OV_GPU_ForceImplType", "Force implementation type of a target primitive or layer. [primitive or layout_name]:[impl_type]"
"For primitives, fc:onednn, fc:ocl, do:cpu, do:ocl, reduce:ocl and reduce:onednn are supported");
message_list.emplace_back("OV_GPU_MaxKernelsPerBatch", "Maximum number of kernels in a batch during compiling kernels");
auto max_name_length_item = std::max_element(message_list.begin(), message_list.end(),
[](std::pair<std::string, std::string>& a, std::pair<std::string, std::string>& b){
@ -150,7 +151,8 @@ debug_configuration::debug_configuration()
, dump_layers_limit_batch(std::numeric_limits<int>::max())
, base_batch_for_memory_estimation(-1)
, serialize_compile(0)
, forced_impl_type(std::string()) {
, forced_impl_type(std::string())
, max_kernels_per_batch(0) {
#ifdef GPU_DEBUG_CONFIG
get_gpu_debug_env_var("Help", help);
get_common_debug_env_var("Verbose", verbose);
@ -171,6 +173,7 @@ debug_configuration::debug_configuration()
get_gpu_debug_env_var("AfterProc", after_proc_str);
get_gpu_debug_env_var("SerialCompile", serialize_compile);
get_gpu_debug_env_var("ForceImplType", forced_impl_type);
get_gpu_debug_env_var("MaxKernelsPerBatch", max_kernels_per_batch);
if (help > 0) {
print_help_messages();

View File

@ -148,6 +148,10 @@ bool kernels_cache::is_cache_enabled() const {
}
// Returns the maximum number of kernels grouped into one compilation batch.
// Defaults to 8; can be overridden at runtime through the debug configuration
// (OV_GPU_MaxKernelsPerBatch) when the override value is >= 1.
size_t kernels_cache::get_max_kernels_per_batch() const {
// NOTE(review): GPU_DEBUG_GET_INSTANCE / GPU_DEBUG_IF are project macros;
// presumably they compile to no-ops when GPU_DEBUG_CONFIG is undefined —
// confirm against the debug_configuration header.
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->max_kernels_per_batch >= 1) {
return static_cast<size_t>(debug_config->max_kernels_per_batch); // user override (>= 1) wins
}
return 8; // built-in default batch size
}