[GPU] Improvement for buffer dump (#18542)

* [GPU] Improvement for buffer dump + added OV_GPU_DumpLayersInput to support dump input layers + added OV_GPU_DumpLayersRawBinary to make binary dump + added OV_GPU_LoadDumpRawBinary to use binary dump as input + binary dump naming rule layername_datatype_tensor_format.bin Signed-off-by: Min, Byungil <byungil.min@intel.com>
2023-07-27 11:11:34 +09:00 · 2023-07-27 11:11:34 +09:00 · 83a78eb559
commit 83a78eb559
parent c145d8f1e9
5 changed files with 274 additions and 56 deletions
--- a/src/common/util/include/openvino/util/file_util.hpp
+++ b/src/common/util/include/openvino/util/file_util.hpp
@ -337,6 +337,7 @@ std::vector<uint8_t> load_binary(const std::string& path);
 * @param path - binary file path to store
 */
 void save_binary(const std::string& path, std::vector<uint8_t> binary);
+/**
+ * @brief save a raw byte buffer to a binary file
+ * @param path - binary file path to store
+ * @param binary - pointer to the first byte to write
+ * @param bin_size - number of bytes to write starting at binary
+ * @note throws std::runtime_error when the file cannot be opened for writing
+ */
+void save_binary(const std::string& path, const char* binary, size_t bin_size);

 /**
 * @brief Trim OpenVINO project file name path if OpenVINO project directory found.
--- a/src/common/util/src/file_util.cpp
+++ b/src/common/util/src/file_util.cpp
@ -621,6 +621,11 @@ std::vector<uint8_t> ov::util::load_binary(const std::string& path) {
 }

 void ov::util::save_binary(const std::string& path, std::vector<uint8_t> binary) {
+    // Convenience overload: forwards the vector's bytes to the raw-buffer overload,
+    // which opens 'path' and throws std::runtime_error if that fails.
+    // data() is used instead of &binary[0] because indexing an empty vector is
+    // undefined behavior, whereas data() may always be called (the returned pointer
+    // is never dereferenced when size() == 0).
+    save_binary(path, reinterpret_cast<const char*>(binary.data()), binary.size());
+}
+
+void ov::util::save_binary(const std::string& path, const char* binary, size_t bin_size) {
 #if defined(OPENVINO_ENABLE_UNICODE_PATH_SUPPORT) && defined(_WIN32)
    std::wstring widefilename = ov::util::string_to_wstring(path);
    const wchar_t* filename = widefilename.c_str();
@ -629,7 +634,7 @@ void ov::util::save_binary(const std::string& path, std::vector<uint8_t> binary)
 #endif
    std::ofstream out_file(filename, std::ios::out | std::ios::binary);
    if (out_file.is_open()) {
-        out_file.write(reinterpret_cast<const char*>(&binary[0]), binary.size());
+        out_file.write(binary, bin_size);
    } else {
        throw std::runtime_error("Could not save binary to " + path);
    }
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp
@ -94,36 +94,42 @@ private:

 public:
    static const char *prefix;
-    int help;                                   // Print help messages
-    int verbose;                                // Verbose execution
-    int verbose_color;                          // Print verbose color
-    int list_layers;                            // Print list layers
-    int print_multi_kernel_perf;                // Print execution time of each kernel in multi-kernel primitimive
-    int disable_usm;                            // Disable usm usage
-    int disable_onednn;                         // Disable onednn for discrete GPU (no effect for integrated GPU)
-    int disable_onednn_opt_post_ops;            // Disable onednn optimize post operators
-    std::string dump_profiling_data;            // Enables dump of extended performance profiling to specified dir
-    std::string dump_graphs;                    // Dump optimized graph
-    std::string dump_sources;                   // Dump opencl sources
-    std::string dump_layers_path;               // Enable dumping intermediate buffers and set the dest path
-    std::vector<std::string> dump_layers;       // Dump intermediate buffers of specified layers only
-    std::string dry_run_path;                   // Dry run and serialize execution graph into the specified path
-    int dump_layers_dst_only;                   // Dump only output of layers
-    int dump_layers_result;                     // Dump result layers
-    int dump_layers_limit_batch;                // Limit the size of batch to dump
-    int dump_layers_raw;                        // Dump raw data.
-    int base_batch_for_memory_estimation;       // Base batch size to be used in memory estimation
-    std::vector<std::string> after_proc;        // Start inference after the listed processes
-    int serialize_compile;                      // Serialize creating primitives and compiling kernels
-    std::vector<std::string> forced_impl_types; // Force implementation type either ocl or onednn
-    int max_kernels_per_batch;                  // Maximum number of kernels in a batch during compiling kernels
-    int disable_async_compilation;              // Disable async compilation
-    int disable_dynamic_impl;                   // Disable dynamic implementation
-    int disable_runtime_buffer_fusing;          // Disable runtime buffer fusing
-    std::set<int64_t> dump_iteration;           // Dump n-th execution of network.
+    int help;                                       // Print help messages
+    int verbose;                                    // Verbose execution
+    int verbose_color;                              // Print verbose color
+    int list_layers;                                // Print list layers
+    int print_multi_kernel_perf;                    // Print execution time of each kernel in multi-kernel primitimive
+    int disable_usm;                                // Disable usm usage
+    int disable_onednn;                             // Disable onednn for discrete GPU (no effect for integrated GPU)
+    int disable_onednn_opt_post_ops;                // Disable onednn optimize post operators
+    std::string dump_profiling_data;                // Enables dump of extended performance profiling to specified dir
+    std::string dump_graphs;                        // Dump optimized graph
+    std::string dump_sources;                       // Dump opencl sources
+    std::string dump_layers_path;                   // Enable dumping intermediate buffers and set the dest path
+    std::vector<std::string> dump_layers;           // Dump intermediate buffers of specified layers only
+    std::string dry_run_path;                       // Dry run and serialize execution graph into the specified path
+    int dump_layers_dst_only;                       // Dump only output of layers
+    int dump_layers_result;                         // Dump result layers
+    int dump_layers_input;                          // Dump input layers
+    int dump_layers_limit_batch;                    // Limit the size of batch to dump
+    int dump_layers_raw;                            // Dump raw data.
+    int dump_layers_binary;                         // Dump binary data.
+    int base_batch_for_memory_estimation;           // Base batch size to be used in memory estimation
+    std::vector<std::string> after_proc;            // Start inference after the listed processes
+    int serialize_compile;                          // Serialize creating primitives and compiling kernels
+    std::vector<std::string> forced_impl_types;     // Force implementation type either ocl or onednn
+    int max_kernels_per_batch;                      // Maximum number of kernels in a batch during compiling kernels
+    int disable_async_compilation;                  // Disable async compilation
+    int disable_dynamic_impl;                       // Disable dynamic implementation
+    int disable_runtime_buffer_fusing;              // Disable runtime buffer fusing
+    std::set<int64_t> dump_iteration;               // Dump n-th execution of network.
+    std::vector<std::string> load_layers_raw_dump;  // List of layers to load dumped raw binary and filenames
    static const debug_configuration *get_instance();
-    bool is_dumped_layer(const std::string& layerName, bool is_output = false) const;
+    std::vector<std::string> get_filenames_for_matched_layer_loading_binaries(const std::string& id) const;
+    std::string get_name_for_dump(const std::string& file_name) const;
+    bool is_layer_for_dumping(const std::string& layerName, bool is_output = false, bool is_input = false) const;
    bool is_target_iteration(int64_t iteration) const;
+    std::string get_matched_from_filelist(const std::vector<std::string>& file_names, std::string pattern) const;

    struct memory_preallocation_params {
        bool is_initialized = false;
--- a/src/plugins/intel_gpu/src/graph/network.cpp
+++ b/src/plugins/intel_gpu/src/graph/network.cpp
@ -2,6 +2,8 @@
 // SPDX-License-Identifier: Apache-2.0
 //

+#include "openvino/util/file_util.hpp"
+
 #include "intel_gpu/primitives/data.hpp"
 #include "intel_gpu/primitives/mutable_data.hpp"
 #include "intel_gpu/primitives/input_layout.hpp"
@ -247,11 +249,7 @@ void dump<uint32_t>(memory::ptr mem, stream& stream, std::ofstream& file_stream,
 void log_memory_to_file(memory::ptr mem, stream& stream, std::string layerName, bool dump_raw) {
    std::cout << "Dump " << (dump_raw ? "raw " : "") << layerName << std::endl;
    GPU_DEBUG_GET_INSTANCE(debug_config);
-    std::string filename = layerName;
-    std::replace(filename.begin(), filename.end(), '\\', '_');
-    std::replace(filename.begin(), filename.end(), '/', '_');
-    std::replace(filename.begin(), filename.end(), ' ', '_');
-    std::replace(filename.begin(), filename.end(), ':', '_');
+    std::string filename = debug_config->get_name_for_dump(layerName);
    filename = debug_config->dump_layers_path + filename + ".txt";
    std::ofstream file_stream(filename);
    if (!mem) {
@ -306,6 +304,25 @@ static uint32_t get_unique_net_id() {
    return ++id_gen;
 }

+static std::string get_file_path_for_binary_dump(cldnn::layout layout, std::string name) {
+    // Builds the destination path of a binary dump:
+    //   <dump_layers_path><sanitized name>__<data type>__<d0>_<d1>..._<dN>__<format>.bin
+    // Without GPU_DEBUG_CONFIG an empty string is returned.
+    std::string dump_path;
+    const std::string type_str = data_type_traits::name(layout.data_type);
+    const std::string format_str = layout.format.to_string();
+
+    // Every dimension is prefixed with '_', so the shape part itself starts with '_'
+    const auto shape = layout.get_dims();
+    std::string shape_str;
+    for (size_t idx = 0; idx < layout.get_rank(); ++idx) {
+        shape_str += ("_" + to_string(shape[idx]));
+    }
+
+#ifdef GPU_DEBUG_CONFIG
+    GPU_DEBUG_GET_INSTANCE(debug_config);
+    const std::string sanitized_name = debug_config->get_name_for_dump(name);
+    dump_path = debug_config->dump_layers_path + sanitized_name
+                + "__" + type_str + "_" + shape_str + "__" + format_str + ".bin";
+#endif
+    return dump_path;
+}
+
 /*
 Network will always have net_id = 0 when it will be cldnn internal micronetwork (created i.e by propagate_constants
 opt pass).
@ -1219,6 +1236,67 @@ void network::execute_impl(const std::vector<event::ptr>& events) {
    };

    for (auto& inst : _exec_order) {
+        // Load binary dump for input layers
+        GPU_DEBUG_IF(!debug_config->load_layers_raw_dump.empty()) {
+            const std::string layer_name = inst->id();
+            auto files = debug_config->get_filenames_for_matched_layer_loading_binaries(layer_name);
+            if (!files.empty()) {
+                if (inst->is_input()) {
+                    // Loading binary dumps for output tensors of input-layers : only one output exists or index(dstN) exists
+                    auto dump_file = debug_config->get_matched_from_filelist(files, "_dst0__");
+                    OPENVINO_ASSERT((files.size() == 1 || dump_file.length() != 0), "Unexpected binary dump for input layer");
+
+                    OPENVINO_ASSERT(files.size() == get_primitive(inst->id())->outputs_memory_count(), "Mis-match dump file count");
+
+                    for (size_t i = 0; i < get_primitive(inst->id())->outputs_memory_count(); i++) {
+                        auto dump_file = files[0];
+                        if (files.size() > 1 || get_primitive(inst->id())->outputs_memory_count() != 1) {
+                            std::string pattern = "_dst" + std::to_string(i) + "__";
+                            dump_file = debug_config->get_matched_from_filelist(files, pattern);
+                        }
+                        OPENVINO_ASSERT((dump_file.length() > 0), "Could not find expected pattern '_dst[N]__' for binary dump");
+                        GPU_DEBUG_COUT  << " Load binary dump : " << dump_file << " for " << layer_name << std::endl;
+
+                        std::vector<uint8_t> bin = ov::util::load_binary(dump_file);
+                        OPENVINO_ASSERT(!bin.empty(), "Failure loading binary from OV_GPU_LoadDumpRawBinary : " + dump_file);
+
+                        auto output_mem = get_primitive(layer_name)->output_memory_ptr(i);
+                        OPENVINO_ASSERT(output_mem->size() == bin.size(), "memory size mis-match for OV_GPU_LoadDumpRawBinary : " + layer_name);
+
+                        output_mem->copy_from(get_stream(), static_cast<void *>(&bin[0]), true);
+                    }
+                } else {
+                    auto check_dst = debug_config->get_matched_from_filelist(files, "_dst0__");
+                    OPENVINO_ASSERT(check_dst.length() == 0, "Expected to load binaries for inputs of " + layer_name);
+
+                    // Loading input tensors for any layer
+                    auto dump_file = debug_config->get_matched_from_filelist(files, "_src0__");
+                    OPENVINO_ASSERT(dump_file.length() != 0, "Could not find expected pattern '_src[N]__' for binary dump input : " + layer_name);
+
+                    OPENVINO_ASSERT(files.size() == get_primitive(inst->id())->dependencies().size(), "Mis-match dump file count");
+
+                    for (size_t i = 0; i < get_primitive(inst->id())->dependencies().size(); i++) {
+                        auto dump_file = files[0];
+                        if (files.size() > 1 || get_primitive(inst->id())->dependencies().size() != 1) {
+                            std::string pattern = "_src" + std::to_string(i) + "__";
+                            dump_file = debug_config->get_matched_from_filelist(files, pattern);
+                        }
+                        OPENVINO_ASSERT((dump_file.length() > 0), "Could not find expected pattern '_src[N]__' for binary dump input");
+                        GPU_DEBUG_COUT  << " Load binary dump : " << dump_file << " for input of " << layer_name << std::endl;
+
+                        std::vector<uint8_t> bin = ov::util::load_binary(dump_file);
+                        OPENVINO_ASSERT(!bin.empty(), "Failure loading binary from OV_GPU_LoadDumpRawBinary : " + dump_file);
+
+                        auto input_mem = get_primitive(inst->id())->dep_memory_ptr(i);
+                        OPENVINO_ASSERT(input_mem->size() == bin.size(), "memory size mis-match for OV_GPU_LoadDumpRawBinary : " + layer_name);
+
+                        input_mem->copy_from(get_stream(), static_cast<void *>(&bin[0]), true);
+                    }
+                }
+            }
+        }
+
+        // Dump input buffers of 'inst'
        GPU_DEBUG_IF(debug_config->dump_layers_path.length() > 0) {
            const std::string layer_name = inst->id();
            GPU_DEBUG_IF(debug_config->verbose >= 2) {
@ -1226,36 +1304,74 @@ void network::execute_impl(const std::vector<event::ptr>& events) {
            }

            GPU_DEBUG_IF(debug_config->is_target_iteration(curr_iter) &&
-                        debug_config->dump_layers_dst_only == 0 && debug_config->is_dumped_layer(layer_name)) {
+                        debug_config->dump_layers_dst_only == 0 && debug_config->is_layer_for_dumping(layer_name)) {
+                std::string debug_str_for_bin_load = " Command for loading : OV_GPU_LoadDumpRawBinary=\"" + layer_name + ":";
                for (size_t i = 0; i < get_primitive(inst->id())->dependencies().size(); i++) {
-                    log_memory_to_file(get_primitive(inst->id())->dep_memory_ptr(i),
-                                    get_stream(),
-                                    "program" + std::to_string((get_program() != nullptr) ? get_program()->get_id() : 0) +
-                                    "_network" + std::to_string(get_id()) +
-                                    "_" + get_iteration_prefix(curr_iter) +
-                                    layer_name + "_src" + std::to_string(i),
-                                    debug_config->dump_layers_raw);
+                    std::string name = "program" + std::to_string((get_program() != nullptr) ? get_program()->get_id() : 0) +
+                                        "_network" + std::to_string(get_id()) +
+                                        "_" + get_iteration_prefix(curr_iter) +
+                                        layer_name + "_src" + std::to_string(i);
+                    auto input_mem = get_primitive(inst->id())->dep_memory_ptr(i);
+                    GPU_DEBUG_IF(debug_config->dump_layers_binary) {
+                        // Binary dump : raw
+                        auto input_layout = inst->get_input_layout(i);
+                        auto filename = get_file_path_for_binary_dump(input_layout, name);
+
+                        mem_lock<char, mem_lock_type::read> lock(input_mem, get_stream());
+                        ov::util::save_binary(filename, lock.data(), input_mem->size());
+                        GPU_DEBUG_COUT  << " Dump layer src : " << layer_name << " to " << filename << std::endl;
+                        debug_str_for_bin_load += (filename + ",");
+                    } else {
+                        log_memory_to_file(input_mem,
+                                        get_stream(),
+                                        name,
+                                        debug_config->dump_layers_raw);
+                    }
+                }
+
+                GPU_DEBUG_IF(debug_config->dump_layers_binary && !inst->is_input()) {
+                    debug_str_for_bin_load[debug_str_for_bin_load.size()-1] = '\"';
+                    GPU_DEBUG_COUT << debug_str_for_bin_load << std::endl;;
                }
            }
        }

        execute_primitive(inst, events);

+        // Dump output buffers of 'inst'
        GPU_DEBUG_IF(debug_config->dump_layers_path.length() > 0) {
            get_stream().finish();
            const std::string layer_name = inst->id();
            auto prog_id = ((get_program() != nullptr) ? get_program()->get_id() : 0);
            auto net_id = get_id();
            GPU_DEBUG_IF(debug_config->is_target_iteration(curr_iter) &&
-                        debug_config->is_dumped_layer(layer_name, inst->is_output())) {
-                for (size_t i = 0; i < get_primitive(inst->id())->outputs_memory_count(); i++) {
-                    log_memory_to_file(get_primitive(inst->id())->output_memory_ptr(i),
-                                    get_stream(),
-                                    "program" + std::to_string(prog_id) +
-                                    "_network" + std::to_string(net_id) +
-                                    "_" + get_iteration_prefix(curr_iter) +
-                                    layer_name + "_dst" + std::to_string(i),
-                                    debug_config->dump_layers_raw);
+                        debug_config->is_layer_for_dumping(layer_name, inst->is_output(), inst->is_input())) {
+                std::string debug_str_for_bin_load = " Command for loading : OV_GPU_LoadDumpRawBinary=\""
+                                                        + layer_name + ":";
+                for (size_t i = 0; i < get_primitive(layer_name)->outputs_memory_count(); i++) {
+                    std::string name = "program" + std::to_string(prog_id) +
+                                        "_network" + std::to_string(net_id) +
+                                        "_" + get_iteration_prefix(curr_iter) +
+                                        layer_name + "_dst" + std::to_string(i);
+                    auto output_mem = get_primitive(layer_name)->output_memory_ptr(i);
+                    GPU_DEBUG_IF(debug_config->dump_layers_binary) {
+                        // Binary dump : raw
+                        auto output_layout = inst->get_output_layout(i);
+                        auto filename = get_file_path_for_binary_dump(output_layout, name);
+
+                        mem_lock<char, mem_lock_type::read> lock(output_mem, get_stream());
+                        ov::util::save_binary(filename, lock.data(), output_mem->size());
+                        GPU_DEBUG_COUT  << " Dump layer dst : " << layer_name << " to " << filename << std::endl;
+                        debug_str_for_bin_load += (filename + ",");
+                    } else {
+                        // Text dump
+                        log_memory_to_file(output_mem, get_stream(), name, debug_config->dump_layers_raw);
+                    }
+                }
+
+                GPU_DEBUG_IF(debug_config->dump_layers_binary && inst->is_input()) {
+                    debug_str_for_bin_load[debug_str_for_bin_load.size()-1] = '\"';
+                    GPU_DEBUG_COUT << debug_str_for_bin_load << std::endl;;
                }
            }
        }
--- a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp
+++ b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp
@ -119,15 +119,17 @@ static void print_help_messages() {
    message_list.emplace_back("OV_GPU_DumpLayers", "Dump intermediate buffers of specified layers only, separated by space."
                               " Support case-insensitive and regular expression. For example .*conv.*");
    message_list.emplace_back("OV_GPU_DumpLayersResult", "Dump output buffers of result layers only");
+    message_list.emplace_back("OV_GPU_DumpLayersInput",  "Dump intermediate buffers of input layers only");
    message_list.emplace_back("OV_GPU_DumpLayersDstOnly", "Dump only output of layers");
    message_list.emplace_back("OV_GPU_DumpLayersLimitBatch", "Limit the size of batch to dump");
    message_list.emplace_back("OV_GPU_DumpLayersRaw", "If true, dump data is stored in raw memory format.");
+    message_list.emplace_back("OV_GPU_DumpLayersRawBinary", "If true, dump data is stored in binary format.");
    message_list.emplace_back("OV_GPU_DryRunPath", "Dry run and serialize execution graph into the specified path");
    message_list.emplace_back("OV_GPU_BaseBatchForMemEstimation", "Base batch size to be used in memory estimation");
    message_list.emplace_back("OV_GPU_AfterProc", "Run inference after the specified process PIDs are finished, separated by space."
                              " Supported on only on linux.");
    message_list.emplace_back("OV_GPU_SerialCompile", "Serialize creating primitives and compiling kernels");
-    message_list.emplace_back("OV_GPU_ForceImplTypes", "Force implementation type of a target primitive or layer. [primitive or layout_name]:[impl_type]"
+    message_list.emplace_back("OV_GPU_ForceImplTypes", "Force implementation type of a target primitive or layer. [primitive or layer_name]:[impl_type]"
                              " For example fc:onednn gemm:onednn reduce:ocl do:cpu"
                              " For primitives fc, gemm, do, reduce, concat are supported. Separated by space.");
    message_list.emplace_back("OV_GPU_MaxKernelsPerBatch", "Maximum number of kernels in a batch during compiling kernels");
@ -139,6 +141,11 @@ static void print_help_messages() {
                              "the following order: number of iterations for pre-allocation(int), max size of single iteration in bytes(int), "
                              "max per-dim allowed diff(int), unconditional buffers preallocation ratio(float). For example for disabling memory"
                              "preallocation at all, you can use OV_GPU_MemPreallocationOptions='0 0 0 1.0'");
+    message_list.emplace_back("OV_GPU_LoadDumpRawBinary",
+                               "Specified layers which are loading dumped binary files generated by OV_GPU_DumpLayersRawBinary debug-config."
+                               " Currently, other layers except input-layer('parameter' type) are loading binaries for only input."
+                               " Different input or output tensors are seperated by ','. Different layers are separated by space. For example, "
+                               " \"[input_layer_name1]:[binary_dumped_file1],[binary_dump_file2] [input_layer_name2]:[binary_dump_1],[binary_dump_2]\"");

    auto max_name_length_item = std::max_element(message_list.begin(), message_list.end(),
        [](std::pair<std::string, std::string>& a, std::pair<std::string, std::string>& b){
@ -170,8 +177,10 @@ debug_configuration::debug_configuration()
        , dry_run_path(std::string())
        , dump_layers_dst_only(0)
        , dump_layers_result(0)
+        , dump_layers_input(0)
        , dump_layers_limit_batch(std::numeric_limits<int>::max())
        , dump_layers_raw(0)
+        , dump_layers_binary(0)
        , base_batch_for_memory_estimation(-1)
        , serialize_compile(0)
        , max_kernels_per_batch(0)
@ -190,8 +199,10 @@ debug_configuration::debug_configuration()
    get_gpu_debug_env_var("DumpLayersPath", dump_layers_path);
    get_gpu_debug_env_var("DumpLayersLimitBatch", dump_layers_limit_batch);
    get_gpu_debug_env_var("DumpLayersRaw", dump_layers_raw);
+    get_gpu_debug_env_var("DumpLayersRawBinary", dump_layers_binary);
    get_gpu_debug_env_var("DumpLayersDstOnly", dump_layers_dst_only);
    get_gpu_debug_env_var("DumpLayersResult", dump_layers_result);
+    get_gpu_debug_env_var("DumpLayersInput", dump_layers_input);
    get_gpu_debug_env_var("DisableOnednn", disable_onednn);
    get_gpu_debug_env_var("DisableOnednnOptPostOps", disable_onednn_opt_post_ops);
    get_gpu_debug_env_var("DumpProfilingData", dump_profiling_data);
@ -212,6 +223,8 @@ debug_configuration::debug_configuration()
    get_gpu_debug_env_var("DumpIteration", dump_iteration_str);
    std::string mem_preallocation_params_str;
    get_gpu_debug_env_var("MemPreallocationOptions", mem_preallocation_params_str);
+    std::string load_dump_raw_bin_str;
+    get_gpu_debug_env_var("LoadDumpRawBinary", load_dump_raw_bin_str);

    if (help > 0) {
        print_help_messages();
@ -219,7 +232,8 @@ debug_configuration::debug_configuration()
    }

    if (dump_layers_str.length() > 0) {
-        dump_layers_str = " " + dump_layers_str + " "; // Insert delimiter for easier parsing when used
+        // Insert delimiter for easier parsing when used
+        dump_layers_str = " " + dump_layers_str + " ";
        std::stringstream ss(dump_layers_str);
        std::string layer;
        while (ss >> layer) {
@ -228,7 +242,7 @@ debug_configuration::debug_configuration()
    }

    if (forced_impl_types_str.length() > 0) {
-        forced_impl_types_str = " " + forced_impl_types_str + " "; // Insert delimiter for easier parsing when used
+        forced_impl_types_str = " " + forced_impl_types_str + " ";
        std::stringstream ss(forced_impl_types_str);
        std::string type;
        while (ss >> type) {
@ -236,6 +250,16 @@ debug_configuration::debug_configuration()
        }
    }

+    // Parsing for loading binary files
+    if (load_dump_raw_bin_str.length() > 0) {
+        load_dump_raw_bin_str = " " + load_dump_raw_bin_str + " ";
+        std::stringstream ss(load_dump_raw_bin_str);
+        std::string type;
+        while (ss >> type) {
+            load_layers_raw_dump.push_back(type);
+        }
+    }
+
    if (dump_iteration_str.size() > 0) {
        dump_iteration_str = " " + dump_iteration_str + " ";
        std::istringstream ss(dump_iteration_str);
@ -305,12 +329,77 @@ const debug_configuration *debug_configuration::get_instance() {
 #endif
 }

-bool debug_configuration::is_dumped_layer(const std::string& layer_name, bool is_output) const {
+std::vector<std::string> debug_configuration::get_filenames_for_matched_layer_loading_binaries(const std::string& id) const {
+    // Looks up 'id' in load_layers_raw_dump, whose entries are parsed as "<layer>:<file>[,<file>...]".
+    // Returns the file list of the first entry whose layer part equals 'id'; empty when nothing matches.
+    std::vector<std::string> file_names;
 #ifdef GPU_DEBUG_CONFIG
+    for (const auto& entry : load_layers_raw_dump) {
+        // The entry is split at its last ':' : left part is the layer, right part the file list
+        const size_t separator = entry.rfind(":");
+        if (separator == std::string::npos)
+            continue;
+
+        const auto entry_layer = entry.substr(0, separator);
+        if (id != entry_layer)
+            continue;
+
+        // Split the comma separated file list; the piece after the last ',' is taken as is
+        const auto file_list = entry.substr(separator + 1);
+        size_t begin = 0;
+        while (true) {
+            const size_t comma = file_list.find(",", begin);
+            const bool is_last = (comma == std::string::npos);
+            if (is_last)
+                file_names.push_back(file_list.substr(begin));
+            else
+                file_names.push_back(file_list.substr(begin, (comma - begin)));
+
+            GPU_DEBUG_LOG << " Layer name loading raw dump : " << entry_layer << " / the dump file : "
+                        << file_names.back() << std::endl;
+            if (is_last)
+                break;
+            begin = comma + 1;
+        }
+
+        return file_names;
+    }
+#endif
+
+    return file_names;
+}
+
+std::string debug_configuration::get_matched_from_filelist(const std::vector<std::string>& file_names, std::string pattern) const {
+    // Returns the first entry of 'file_names' that contains 'pattern' as a substring,
+    // or an empty string when no entry matches (always empty without GPU_DEBUG_CONFIG).
+#ifdef GPU_DEBUG_CONFIG
+    const auto hit = std::find_if(file_names.begin(), file_names.end(), [&pattern](const std::string& candidate) {
+        return candidate.find(pattern) != std::string::npos;
+    });
+    if (hit != file_names.end())
+        return *hit;
+#endif
+    return std::string();
+}
+
+std::string debug_configuration::get_name_for_dump(const std::string& file_name) const {
+    // Returns a copy of 'file_name' in which '\\', '/', ' ' and ':' are replaced by '_'.
+    // Without GPU_DEBUG_CONFIG the name is returned unchanged.
+    std::string sanitized = file_name;
+#ifdef GPU_DEBUG_CONFIG
+    for (char& ch : sanitized) {
+        switch (ch) {
+        case '\\':
+        case '/':
+        case ' ':
+        case ':':
+            ch = '_';
+            break;
+        default:
+            break;
+        }
+    }
+#endif
+    return sanitized;
+}
+
+bool debug_configuration::is_layer_for_dumping(const std::string& layer_name, bool is_output, bool is_input) const {
+#ifdef GPU_DEBUG_CONFIG
+    // Dump result layer
    if (is_output == true && dump_layers_result == 1 &&
        (layer_name.find("constant:") == std::string::npos))
        return true;
-    if (dump_layers.empty() && dump_layers_result == 0)
+    // Dump all layers
+    if (dump_layers.empty() && dump_layers_result == 0 && dump_layers_input == 0)
+        return true;
+
+    // Dump input layers
+    size_t pos = layer_name.find(':');
+    auto type = layer_name.substr(0, pos);
+    if (is_input == true && type == "parameter" && dump_layers_input == 1)
        return true;

    auto is_match = [](const std::string& layer_name, const std::string& pattern) -> bool {
@ -328,6 +417,7 @@ bool debug_configuration::is_dumped_layer(const std::string& layer_name, bool is
        std::regex re(upper_pattern);
        return std::regex_match(upper_layer_name, re);
    };
+
    auto iter = std::find_if(dump_layers.begin(), dump_layers.end(), [&](const std::string& dl){
        return is_match(layer_name, dl);
    });