[GPU] Improve OV_GPU_DumpLayers debug configuration (#15719)

Co-authored-by: Kim,SungEun <sungeun.kim@intel.com>
2023-02-19 23:57:19 +09:00 · 2023-02-19 23:57:19 +09:00 · b7bcef6864
commit b7bcef6864
parent 1d5839fb92
3 changed files with 65 additions and 38 deletions
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp
@ -93,6 +93,7 @@ public:
    int dump_layers_dst_only;                   // Dump only output of layers
    int dump_layers_result;                     // Dump result layers
    int dump_layers_limit_batch;                // Limit the size of batch to dump
    int dump_layers_raw;                        // Dump layer buffers element-by-element in raw memory order (no per-coordinate layout traversal)
    int base_batch_for_memory_estimation;       // Base batch size to be used in memory estimation
    std::vector<std::string> after_proc;        // Start inference after the listed processes
    int serialize_compile;                      // Serialize creating primitives and compiling kernels
--- a/src/plugins/intel_gpu/src/graph/network.cpp
+++ b/src/plugins/intel_gpu/src/graph/network.cpp
@ -146,7 +146,7 @@ size_t get_x_pitch(const layout& layout) {
 }
 template <class T>
-void dump(memory::ptr mem, stream& stream, std::ofstream& file_stream) {
+void dump(memory::ptr mem, stream& stream, std::ofstream& file_stream, bool dump_raw) {
    auto&& size = mem->get_layout().get_tensor();
    GPU_DEBUG_GET_INSTANCE(debug_config);
@ -155,11 +155,15 @@ void dump(memory::ptr mem, stream& stream, std::ofstream& file_stream) {
    tmp_size.batch[0] = batch_size;
    if (tmp_size == size) {
        file_stream << "shape: " << size.to_string() << " ";
-        file_stream << "(count: " << size.count() << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format) << ")" << std::endl;
+        file_stream << "(count: " << size.count()
                    << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format) << ")"
                    << (dump_raw ? " raw data" : "") << std::endl;
    } else {
        file_stream << "shape: " << tmp_size.to_string() << " ";
-        file_stream << "(count: " << tmp_size.count() << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format)
+        file_stream << "(count: " << tmp_size.count()
-            << ", original shape: " << size.to_string() << ")" << std::endl;
+                    << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format)
                    << ", original shape: " << size.to_string() << ")"
                    << (dump_raw ? " raw data" : "") << std::endl;
    }
    if (size.count() == 0) {
@ -172,29 +176,35 @@ void dump(memory::ptr mem, stream& stream, std::ofstream& file_stream) {
    auto x_pitch = get_x_pitch(mem->get_layout());
    std::stringstream buffer;
-    for (cldnn::tensor::value_type g = 0; g < size.group[0]; ++g) {
+    if (!dump_raw) {
-        for (cldnn::tensor::value_type b = 0; b < batch_size; ++b) {
+        for (cldnn::tensor::value_type g = 0; g < size.group[0]; ++g) {
-            for (cldnn::tensor::value_type f = 0; f < size.feature[0]; ++f) {
+            for (cldnn::tensor::value_type b = 0; b < batch_size; ++b) {
-                for (cldnn::tensor::value_type w = 0; w < size.spatial[3]; ++w) {
+                for (cldnn::tensor::value_type f = 0; f < size.feature[0]; ++f) {
-                    for (cldnn::tensor::value_type z = 0; z < size.spatial[2]; ++z) {
+                    for (cldnn::tensor::value_type w = 0; w < size.spatial[3]; ++w) {
-                        for (cldnn::tensor::value_type y = 0; y < size.spatial[1]; ++y) {
+                        for (cldnn::tensor::value_type z = 0; z < size.spatial[2]; ++z) {
-                            cldnn::tensor t(cldnn::group(g), cldnn::batch(b), cldnn::feature(f), cldnn::spatial(0, y, z, w));
+                            for (cldnn::tensor::value_type y = 0; y < size.spatial[1]; ++y) {
-                            size_t input_it = mem->get_layout().get_linear_offset(t);
+                                cldnn::tensor t(cldnn::group(g), cldnn::batch(b), cldnn::feature(f), cldnn::spatial(0, y, z, w));
                                size_t input_it = mem->get_layout().get_linear_offset(t);
-                            for (cldnn::tensor::value_type x = 0; x < size.spatial[0]; ++x, input_it += x_pitch) {
+                                for (cldnn::tensor::value_type x = 0; x < size.spatial[0]; ++x, input_it += x_pitch) {
-                                buffer << std::fixed << std::setprecision(6) << convert_element(mem_ptr[input_it]) << std::endl;
+                                    buffer << std::fixed << std::setprecision(6) << convert_element(mem_ptr[input_it]) << std::endl;
                                }
                            }
                        }
                    }
                }
            }
        }
    } else {
        for (size_t i = 0; i < lock.size(); ++i) {
            buffer << std::fixed << std::setprecision(6) << convert_element(mem_ptr[i]) << std::endl;
        }
    }
    file_stream << buffer.str();
 }
 template <>
-void dump<uint32_t>(memory::ptr mem, stream& stream, std::ofstream& file_stream) {
+void dump<uint32_t>(memory::ptr mem, stream& stream, std::ofstream& file_stream, bool dump_raw) {
    auto&& l = mem->get_layout();
    file_stream << "shape: ";
@ -207,23 +217,29 @@ void dump<uint32_t>(memory::ptr mem, stream& stream, std::ofstream& file_stream)
    mem_lock<uint32_t, mem_lock_type::read> lock(mem, stream);
    auto mem_ptr = lock.data();
-    for (cldnn::tensor::value_type b = 0; b < l.batch(); ++b) {
+    if (!dump_raw) {
-        for (cldnn::tensor::value_type f = 0; f < (cldnn::tensor::value_type)ceil_div(l.feature(), 32); ++f) {
+        for (cldnn::tensor::value_type b = 0; b < l.batch(); ++b) {
-            for (cldnn::tensor::value_type z = 0; z < l.spatial(2); ++z) {
+            for (cldnn::tensor::value_type f = 0; f < (cldnn::tensor::value_type)ceil_div(l.feature(), 32); ++f) {
-                for (cldnn::tensor::value_type y = 0; y < l.spatial(1); ++y) {
+                for (cldnn::tensor::value_type z = 0; z < l.spatial(2); ++z) {
-                    for (cldnn::tensor::value_type x = 0; x < l.spatial(0); ++x) {
+                    for (cldnn::tensor::value_type y = 0; y < l.spatial(1); ++y) {
-                        cldnn::tensor t(cldnn::batch(b), cldnn::feature(f), cldnn::spatial(x, y, z, 0));
+                        for (cldnn::tensor::value_type x = 0; x < l.spatial(0); ++x) {
-                        size_t input_it = mem->get_layout().get_linear_offset(t);
+                            cldnn::tensor t(cldnn::batch(b), cldnn::feature(f), cldnn::spatial(x, y, z, 0));
-                        file_stream << mem_ptr[input_it] << std::endl;
+                            size_t input_it = mem->get_layout().get_linear_offset(t);
                            file_stream << mem_ptr[input_it] << std::endl;
                        }
                    }
                }
            }
        }
    } else {
        for (size_t i = 0; i < lock.size(); ++i) {
            file_stream << std::fixed << std::setprecision(6) << mem_ptr[i] << std::endl;
        }
    }
 }
-void log_memory_to_file(memory::ptr mem, stream& stream, std::string layerName) {
+void log_memory_to_file(memory::ptr mem, stream& stream, std::string layerName, bool dump_raw) {
-    std::cout << "Dump " << layerName << std::endl;
+    std::cout << "Dump " << (dump_raw ? "raw " : "") << layerName << std::endl;
    GPU_DEBUG_GET_INSTANCE(debug_config);
    std::string filename = layerName;
    std::replace(filename.begin(), filename.end(), '\\', '_');
@ -239,17 +255,17 @@ void log_memory_to_file(memory::ptr mem, stream& stream, std::string layerName)
    auto mem_dt = mem->get_layout().data_type;
    if (mem_dt == cldnn::data_types::f32)
-        dump<float>(mem, stream, file_stream);
+        dump<float>(mem, stream, file_stream, dump_raw);
    else if (mem_dt == cldnn::data_types::f16)
-        dump<half_t>(mem, stream, file_stream);
+        dump<half_t>(mem, stream, file_stream, dump_raw);
    else if (mem_dt == cldnn::data_types::bin)
-        dump<uint32_t>(mem, stream, file_stream);
+        dump<uint32_t>(mem, stream, file_stream, dump_raw);
    else if (mem_dt == cldnn::data_types::i32)
-        dump<int32_t>(mem, stream, file_stream);
+        dump<int32_t>(mem, stream, file_stream, dump_raw);
    else if (mem_dt == cldnn::data_types::i8)
-        dump<int8_t>(mem, stream, file_stream);
+        dump<int8_t>(mem, stream, file_stream, dump_raw);
    else if (mem_dt == cldnn::data_types::u8)
-        dump<uint8_t>(mem, stream, file_stream);
+        dump<uint8_t>(mem, stream, file_stream, dump_raw);
 }
 void wait_for_the_turn() {
@ -272,7 +288,7 @@ void wait_for_the_turn() {
 #else
 void dump_perf_data_raw(std::string, const std::list<std::shared_ptr<primitive_inst>>&) {}
-void log_memory_to_file(memory::ptr, stream&, std::string) {}
+void log_memory_to_file(memory::ptr, stream&, std::string, bool dump_raw) {}
 void wait_for_the_turn() {}
 #endif
 }  // namespace
@ -988,11 +1004,14 @@ void network::execute_impl(const std::vector<event::ptr>& events) {
                std::cerr << inst->id() << std::endl;
            }
-            GPU_DEBUG_IF(debug_config->dump_layers_dst_only == 0 &&
+            GPU_DEBUG_IF(debug_config->dump_layers_dst_only == 0 && debug_config->is_dumped_layer(layer_name)) {
                            debug_config->is_dumped_layer(layer_name)) {
                for (size_t i = 0; i < get_primitive(inst->id())->dependencies().size(); i++) {
-                    log_memory_to_file(get_primitive(inst->id())->dep_memory_ptr(i), get_stream(),
+                    log_memory_to_file(get_primitive(inst->id())->dep_memory_ptr(i),
-                                       layer_name + "_src_" + std::to_string(i));
+                                       get_stream(),
                                       "program" + std::to_string(get_program()->get_id()) +
                                       "_network" + std::to_string(get_id()) +
                                       "_" + layer_name + "_src" + std::to_string(i),
                                       debug_config->dump_layers_raw);
                }
            }
        }
@ -1004,8 +1023,12 @@ void network::execute_impl(const std::vector<event::ptr>& events) {
            const std::string layer_name = inst->id();
            GPU_DEBUG_IF(debug_config->is_dumped_layer(layer_name, inst->is_output())) {
                for (size_t i = 0; i < get_primitive(inst->id())->outputs_memory_count(); i++) {
-                    log_memory_to_file(get_primitive(inst->id())->output_memory_ptr(i), get_stream(),
+                    log_memory_to_file(get_primitive(inst->id())->output_memory_ptr(i),
-                                       layer_name + "_dst_" + std::to_string(i));
+                                       get_stream(),
                                       "program" + std::to_string(get_program()->get_id()) +
                                       "_network" + std::to_string(get_id()) +
                                       "_" + layer_name + "_dst" + std::to_string(i),
                                       debug_config->dump_layers_raw);
                }
            }
        }
--- a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp
+++ b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp
@ -117,6 +117,7 @@ static void print_help_messages() {
    message_list.emplace_back("OV_GPU_DumpLayersResult", "Dump output buffers of result layers only");
    message_list.emplace_back("OV_GPU_DumpLayersDstOnly", "Dump only output of layers");
    message_list.emplace_back("OV_GPU_DumpLayersLimitBatch", "Limit the size of batch to dump");
    message_list.emplace_back("OV_GPU_DumpLayersRaw", "If true, dump data is stored in raw memory format.");
    message_list.emplace_back("OV_GPU_DryRunPath", "Dry run and serialize execution graph into the specified path");
    message_list.emplace_back("OV_GPU_BaseBatchForMemEstimation", "Base batch size to be used in memory estimation");
    message_list.emplace_back("OV_GPU_AfterProc", "Run inference after the specified process PIDs are finished, separated by space."
@ -156,6 +157,7 @@ debug_configuration::debug_configuration()
        , dump_layers_dst_only(0)
        , dump_layers_result(0)
        , dump_layers_limit_batch(std::numeric_limits<int>::max())
        , dump_layers_raw(0)
        , base_batch_for_memory_estimation(-1)
        , serialize_compile(0)
        , max_kernels_per_batch(0) {
@ -168,6 +170,7 @@ debug_configuration::debug_configuration()
    get_gpu_debug_env_var("DumpSources", dump_sources);
    get_gpu_debug_env_var("DumpLayersPath", dump_layers_path);
    get_gpu_debug_env_var("DumpLayersLimitBatch", dump_layers_limit_batch);
    get_gpu_debug_env_var("DumpLayersRaw", dump_layers_raw);
    get_gpu_debug_env_var("DumpLayersDstOnly", dump_layers_dst_only);
    get_gpu_debug_env_var("DumpLayersResult", dump_layers_result);
    get_gpu_debug_env_var("DisableOnednn", disable_onednn);