[GPU] Add DumpLayersLimitBatch config to GPU debug utils. (#9196)

* Add DumpLayersLimitBatch config to GPU debug utils.
  + Support OV_GPU_Help config
  + Only run first inference if OV_GPU_DumpLayersPath is set.
  + Fix dump graph bug.
* Apply some comments
* Remove unnecessary code.
parent abcd7486a9
commit a4518ae595
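All of the behaviour added here is driven by OV_GPU_* environment variables and only takes effect in builds compiled with GPU_DEBUG_CONFIG. As a quick orientation (not part of the commit), the variables touched by this change could also be set from code before the GPU plugin is loaded; setenv is POSIX and the values are placeholders:

// Illustration only: the variables this commit is about, set programmatically.
// setenv() is POSIX (use _putenv_s on Windows); paths and values are examples.
#include <cstdlib>

int main() {
    // Prints the supported OV_GPU_* variables and exits; unset it for a real run.
    setenv("OV_GPU_Help", "1", 1);
    // Enables intermediate-buffer dumping; with this commit only the first inference runs.
    setenv("OV_GPU_DumpLayersPath", "/tmp/gpu_dumps/", 1);
    // Caps how many batch elements of each buffer are written.
    setenv("OV_GPU_DumpLayersLimitBatch", "1", 1);
    return 0;
}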
@@ -24,6 +24,7 @@ private:
     debug_configuration();
 public:
     static const char *prefix;
+    int help;                                 // Print help messages
     int verbose;                              // Verbose execution
     int print_multi_kernel_perf;              // Print execution time of each kernel in multi-kernel primitimive
     int disable_usm;                          // Disable usm usage
@@ -34,6 +35,7 @@ public:
     std::string dump_layers;                  // Dump intermediate buffers of specified layers only, separated by space
     std::string dry_run_path;                 // Dry run and serialize execution graph into the specified path
     int dump_layers_dst_only;                 // Dump only output of layers
+    int dump_layers_limit_batch;              // Limit the size of batch to dump
     int base_batch_for_memory_estimation;     // Base batch size to be used in memory estimation
     static const debug_configuration *get_instance();
 };
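The new fields are meant to be read through the existing debug macros rather than directly; a minimal sketch of a hypothetical consumer follows (the check itself is not part of this commit, and the helper name is made up for illustration). It assumes the debug_configuration header above is available.

#include <iostream>
#include <limits>
#include "intel_gpu/runtime/debug_configuration.hpp"

void report_batch_limit() {  // hypothetical helper, for illustration only
    GPU_DEBUG_GET_INSTANCE(debug_config);
    // The default of INT_MAX means "no limit", so only report when the user set OV_GPU_DumpLayersLimitBatch.
    GPU_DEBUG_IF(debug_config->dump_layers_limit_batch != std::numeric_limits<int>::max()) {
        GPU_DEBUG_COUT << "dumping at most " << debug_config->dump_layers_limit_batch
                       << " batch element(s) per buffer" << std::endl;
    }
}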
@@ -3,7 +3,9 @@
 //
 
 #include "intel_gpu/runtime/debug_configuration.hpp"
+#include <algorithm>
 #include <iostream>
+#include <iomanip>
 #include <memory>
 #include <vector>
 #include <sstream>
@@ -97,10 +99,39 @@ void get_common_debug_env_var(const std::string &var, T &val) {
     return get_debug_env_var(var, val, allowed_option_prefixes);
 }
 
+static void print_help_messages() {
+    std::vector<std::pair<std::string, std::string>> message_list;
+    message_list.emplace_back("OV_GPU_Help", "Print help messages");
+    message_list.emplace_back("OV_GPU_Verbose", "Verbose execution");
+    message_list.emplace_back("OV_GPU_PrintMultiKernelPerf", "Print execution time of each kernel in multi-kernel primitimive");
+    message_list.emplace_back("OV_GPU_DisableUsm", "Disable usm usage");
+    message_list.emplace_back("OV_GPU_DisableOnednn", "Disable onednn for discrete GPU (no effect for integrated GPU)");
+    message_list.emplace_back("OV_GPU_DumpGraphs", "Dump optimized graph");
+    message_list.emplace_back("OV_GPU_DumpSources", "Dump opencl sources");
+    message_list.emplace_back("OV_GPU_DumpLayersPath", "Enable dumping intermediate buffers and set the dest path");
+    message_list.emplace_back("OV_GPU_DumpLayers", "Dump intermediate buffers of specified layers only, separated by space");
+    message_list.emplace_back("OV_GPU_DumpLayersDstOnly", "Dump only output of layers");
+    message_list.emplace_back("OV_GPU_DumpLayersLimitBatch", "Limit the size of batch to dump");
+    message_list.emplace_back("OV_GPU_DryRunPath", "Dry run and serialize execution graph into the specified path");
+    message_list.emplace_back("OV_GPU_BaseBatchForMemEstimation", "Base batch size to be used in memory estimation");
+
+    auto max_name_length_item = std::max_element(message_list.begin(), message_list.end(),
+                                                 [](std::pair<std::string, std::string>& a, std::pair<std::string, std::string>& b){
+                                                     return a.first.size() < b.first.size();
+                                                 });
+    int name_width = static_cast<int>(max_name_length_item->first.size()) + 2;
+
+    GPU_DEBUG_COUT << "Supported environment variables for debugging" << std::endl;
+    for (auto& p : message_list) {
+        GPU_DEBUG_COUT << " - " << std::left << std::setw(name_width) << p.first + ": " << p.second << std::endl;
+    }
+}
+
 #endif
 
 debug_configuration::debug_configuration()
-        : verbose(0)
+        : help(0)
+        , verbose(0)
         , print_multi_kernel_perf(0)
         , disable_usm(0)
         , dump_graphs(std::string())
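The column alignment in the help output comes from applying std::left/std::setw to the already concatenated "name: " string, with the field width two characters wider than the longest variable name. A standalone reproduction (plain std::cout instead of GPU_DEBUG_COUT, shortened list) behaves the same way:

// Standalone reproduction of the alignment logic in print_help_messages().
#include <algorithm>
#include <iomanip>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

int main() {
    std::vector<std::pair<std::string, std::string>> message_list = {
        {"OV_GPU_Help", "Print help messages"},
        {"OV_GPU_DumpLayersLimitBatch", "Limit the size of batch to dump"},
    };
    auto longest = std::max_element(message_list.begin(), message_list.end(),
                                    [](const std::pair<std::string, std::string>& a,
                                       const std::pair<std::string, std::string>& b) {
                                        return a.first.size() < b.first.size();
                                    });
    int name_width = static_cast<int>(longest->first.size()) + 2;  // two columns past the longest name

    for (auto& p : message_list) {
        // setw pads the concatenated "name: " string, so every description starts in the same column.
        std::cout << " - " << std::left << std::setw(name_width) << p.first + ": " << p.second << std::endl;
    }
    return 0;
}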
@@ -110,8 +141,10 @@ debug_configuration::debug_configuration()
     , dump_layers_dst_only(0)
     , dry_run_path(std::string())
     , disable_onednn(0)
+    , dump_layers_limit_batch(std::numeric_limits<int>::max())
     , base_batch_for_memory_estimation(-1) {
 #ifdef GPU_DEBUG_CONFIG
+    get_gpu_debug_env_var("Help", help);
     get_common_debug_env_var("Verbose", verbose);
     get_gpu_debug_env_var("PrintMultiKernelPerf", print_multi_kernel_perf);
     get_gpu_debug_env_var("DisableUsm", disable_usm);
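The new option defaults to std::numeric_limits<int>::max(), i.e. "no limit", and is filled from OV_GPU_DumpLayersLimitBatch. The actual parsing lives in get_debug_env_var(), which is outside this hunk, so the sketch below only approximates the idea; the OV_GPU_ prefix is taken from the help text above, not from the helper itself.

// Rough approximation of reading an integer debug option from the environment.
// Not the plugin's real get_gpu_debug_env_var(), which goes through
// get_debug_env_var() with a list of allowed prefixes.
#include <cstdlib>
#include <sstream>
#include <string>

static void read_int_option(const std::string& name, int& val) {
    const std::string env_name = std::string("OV_GPU_") + name;  // prefix assumed from the help text
    if (const char* env = std::getenv(env_name.c_str())) {
        std::istringstream ss(env);
        int parsed = 0;
        if (ss >> parsed)
            val = parsed;  // keep the default (e.g. INT_MAX for DumpLayersLimitBatch) on parse failure
    }
}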
@@ -120,10 +153,16 @@ debug_configuration::debug_configuration()
     get_gpu_debug_env_var("DumpLayersPath", dump_layers_path);
     get_gpu_debug_env_var("DumpLayers", dump_layers);
     get_gpu_debug_env_var("DumpLayersDstOnly", dump_layers_dst_only);
+    get_gpu_debug_env_var("DumpLayersLimitBatch", dump_layers_limit_batch);
     get_gpu_debug_env_var("DisableOnednn", disable_onednn);
     get_gpu_debug_env_var("DryRunPath", dry_run_path);
     get_gpu_debug_env_var("BaseBatchForMemEstimation", base_batch_for_memory_estimation);
+
+    if (help > 0) {
+        print_help_messages();
+        exit(0);
+    }
 
     if (dump_layers.length() > 0)
         dump_layers = " " + dump_layers + " "; // Insert delimiter for easier parsing when used
 #endif
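dump_layers is padded with a leading and trailing space so that a consumer can match whole layer names with a single substring search. The matching code is not part of this diff, so the helper below is only an illustration of what the padding enables, not the plugin's actual check.

// Sketch of the membership test the padded dump_layers string enables.
#include <iostream>
#include <string>

static bool is_dumped_layer(const std::string& dump_layers, const std::string& layer_name) {
    if (dump_layers.empty())
        return true;  // no filter set: dump everything
    // dump_layers was stored as " layerA layerB ", so exact names can be matched
    // without one name accidentally matching as a prefix of another.
    return dump_layers.find(" " + layer_name + " ") != std::string::npos;
}

int main() {
    const std::string filter = " conv1 relu4 ";              // as produced by the constructor above
    std::cout << is_dumped_layer(filter, "relu4") << " "     // 1
              << is_dumped_layer(filter, "relu") << std::endl;  // 0: not a whole-token match
    return 0;
}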
@@ -436,10 +436,5 @@ void graph_initializations::run(program& p) {
     }
     set_outputs(p);
     p.get_processing_order().calc_processing_order(p);
-
-    for (auto& node : p.get_processing_order()) {
-        if (!node->is_type<data>())
-            node->get_output_layout();
-    }
 }
 }  // namespace cldnn
@@ -110,8 +110,18 @@ template <class T>
 static void dump(memory::ptr mem, stream& stream, std::ofstream& file_stream) {
     auto&& size = mem->get_layout().size;
 
-    file_stream << "shape: " << size.to_string() << " ";
-    file_stream << "(count: " << size.count() << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format) << ")" << std::endl;
+    GPU_DEBUG_GET_INSTANCE(debug_config);
+    auto batch_size = std::max(std::min(debug_config->dump_layers_limit_batch, size.batch[0]), 1);
+    tensor tmp_size(size);
+    tmp_size.batch[0] = batch_size;
+    if (tmp_size == size) {
+        file_stream << "shape: " << size.to_string() << " ";
+        file_stream << "(count: " << size.count() << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format) << ")" << std::endl;
+    } else {
+        file_stream << "shape: " << tmp_size.to_string() << " ";
+        file_stream << "(count: " << tmp_size.count() << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format)
+                    << ", original shape: " << size.to_string() << ")" << std::endl;
+    }
 
     mem_lock<T, mem_lock_type::read> lock(mem, stream);
     auto mem_ptr = lock.data();
@@ -119,7 +129,7 @@ static void dump(memory::ptr mem, stream& stream, std::ofstream& file_stream) {
     std::stringstream buffer;
 
     for (cldnn::tensor::value_type g = 0; g < size.group[0]; ++g) {
-        for (cldnn::tensor::value_type b = 0; b < size.batch[0]; ++b) {
+        for (cldnn::tensor::value_type b = 0; b < batch_size; ++b) {
             for (cldnn::tensor::value_type f = 0; f < size.feature[0]; ++f) {
                 for (cldnn::tensor::value_type w = 0; w < size.spatial[3]; ++w) {
                     for (cldnn::tensor::value_type z = 0; z < size.spatial[2]; ++z) {
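The clamp keeps the dumped batch within [1, size.batch[0]] and caps it at the configured limit; with the INT_MAX default the whole batch is still written. A few worked values, with plain ints standing in for the cldnn::tensor fields:

// Worked example of the clamp used in dump():
//   batch_size = max(min(dump_layers_limit_batch, size.batch[0]), 1)
#include <algorithm>
#include <cassert>
#include <limits>

int main() {
    auto clamp_batch = [](int limit, int batch) { return std::max(std::min(limit, batch), 1); };

    assert(clamp_batch(std::numeric_limits<int>::max(), 32) == 32);  // default limit: dump the full batch
    assert(clamp_batch(4, 32) == 4);                                 // OV_GPU_DumpLayersLimitBatch=4
    assert(clamp_batch(4, 2) == 2);                                  // limit larger than the actual batch
    assert(clamp_batch(0, 32) == 1);                                 // never fewer than one element
    return 0;
}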
@@ -1225,8 +1225,13 @@ program::primitives_info program::get_current_stage_info() const {
 
 void program::save_pass_info(std::string pass_name) {
     // TODO: Directory path here can be probably changed to some bool flag
-    if (!options.get<build_option_type::graph_dumps_dir>()->directory_path.empty())
+    if (!options.get<build_option_type::graph_dumps_dir>()->directory_path.empty()) {
+        for (auto& node : this->get_processing_order()) {
+            if (!node->is_type<data>())
+                node->get_output_layout();
+        }
         optimizer_passes_info.emplace_back(pass_name, get_current_stage_info());
+    }
 }
 
 void program::add_optimized_primitive_info(primitive_id optimized_primitive_id,
@@ -557,6 +557,13 @@ void InferRequest::enqueue() {
 
     internal_outputs.clear();
     internal_outputs = m_graph->GetNetwork()->execute(dependencies);
+
+    // If dump layers path is set, only runs first inference.
+    GPU_DEBUG_GET_INSTANCE(debug_config);
+    GPU_DEBUG_IF(debug_config->dump_layers_path.length() > 0) {
+        GPU_DEBUG_COUT << "Only run first inference to dump layers." << std::endl;
+        exit(0);
+    }
 }
 
 void InferRequest::wait_notify() {
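Because the new guard calls exit(0) right after the first execute(), an application run with OV_GPU_DumpLayersPath set performs exactly one inference and then the process terminates. A rough harness for trying this out follows; the ov::Core calls, model path, and dump directory are illustrative assumptions (only the environment variable names come from this commit), and the early exit only happens in builds with GPU_DEBUG_CONFIG.

// Minimal harness illustrating the "first inference only" behaviour.
#include <cstdlib>
#include <iostream>
#include <openvino/openvino.hpp>

int main() {
    setenv("OV_GPU_DumpLayersPath", "/tmp/gpu_dumps/", 1);  // POSIX; placeholder dump directory
    setenv("OV_GPU_DumpLayers", "conv1", 1);                // optional: restrict dumping to one layer

    ov::Core core;
    auto model = core.read_model("model.xml");              // placeholder model path
    auto compiled = core.compile_model(model, "GPU");
    auto request = compiled.create_infer_request();         // input setup omitted for brevity

    request.infer();  // buffers are dumped during this call; the plugin then calls exit(0),
                      // so in a debug build the line below is never reached
    std::cout << "not reached when dumping is enabled" << std::endl;
    return 0;
}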