[GPU] Refactored GPU verbose utils (#14549)
* [GPU] Refactored GPU verbose utils * [GPU] More refactoring * Fix build
This commit is contained in:
parent e306cbc67a
commit 6bc4f00695
@@ -7,22 +7,63 @@
#include <mutex>
#include <vector>

namespace ov {
namespace intel_gpu {

// Verbose log levels:
// DISABLED - silent mode (Default)
// INFO - Minimal verbose:
//     * May log basic info about device, plugin configuration, model and execution statistics
//     * Mustn't log any info that depends on the number of iterations or the number of layers in the model
//     * Minimal impact on both load time and inference time
// LOG - Enables graph optimization verbose:
//     * Includes info from log level INFO
//     * May log info about applied graph transformations, memory allocations and other model compilation time steps
//     * May impact compile_model() execution time
//     * Minimal impact on inference time
// TRACE - Enables basic execution time verbose
//     * Includes info from log level LOG
//     * May log info during model execution
//     * May log short info about primitive execution
//     * May impact network execution time
// TRACE_DETAIL - Max verbosity
//     * Includes info from log level TRACE
//     * May log any stage and print detailed info about each execution step
enum class LogLevel : int8_t {
    DISABLED = 0,
    INFO = 1,
    LOG = 2,
    TRACE = 3,
    TRACE_DETAIL = 4
};

} // namespace intel_gpu
} // namespace ov

#ifdef GPU_DEBUG_CONFIG
#define GPU_DEBUG_IF(cond) if (cond)
#define GPU_DEBUG_PROFILED_STAGE(stage) \
    auto stage_prof = cldnn::instrumentation::profiled_stage<primitive_inst>(\
        !cldnn::debug_configuration::get_instance()->dump_profiling_data.empty(), *this, stage)
#define GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(val) stage_prof.set_cache_hit(val)

#define GPU_DEBUG_LOG_RAW_INT(min_verbose_level) if (cldnn::debug_configuration::get_instance()->verbose >= min_verbose_level) \
    std::cout << cldnn::debug_configuration::prefix
#define GPU_DEBUG_LOG_RAW(min_verbose_level) GPU_DEBUG_LOG_RAW_INT(static_cast<std::underlying_type<ov::intel_gpu::LogLevel>::type>(min_verbose_level))
#else
#define GPU_DEBUG_IF(cond) if (0)
#define GPU_DEBUG_PROFILED_STAGE(stage)
#define GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(val)
#define GPU_DEBUG_LOG_RAW(min_verbose_level) if (0) std::cout << cldnn::debug_configuration::prefix
#endif

#define GPU_DEBUG_COUT std::cout << cldnn::debug_configuration::prefix
// Macro below is inserted to avoid unused variable warning when GPU_DEBUG_CONFIG is OFF
#define GPU_DEBUG_GET_INSTANCE(name) auto name = cldnn::debug_configuration::get_instance(); (void)(name);

#define GPU_DEBUG_INFO GPU_DEBUG_LOG_RAW(ov::intel_gpu::LogLevel::INFO)
#define GPU_DEBUG_LOG GPU_DEBUG_LOG_RAW(ov::intel_gpu::LogLevel::LOG)
#define GPU_DEBUG_TRACE GPU_DEBUG_LOG_RAW(ov::intel_gpu::LogLevel::TRACE)
#define GPU_DEBUG_TRACE_DETAIL GPU_DEBUG_LOG_RAW(ov::intel_gpu::LogLevel::TRACE_DETAIL)

namespace cldnn {
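As an informal illustration (not part of the patch), a call site migrates from the old numeric verbose checks to the new level-named macros as sketched below; the numeric levels used by the old code map onto the enum as 1 = INFO, 2 = LOG, 3 = TRACE, 4 = TRACE_DETAIL, and node_id is a hypothetical variable:

// Sketch only, assuming this header is included and GPU_DEBUG_CONFIG is enabled.
void log_example(const std::string& node_id) {
    // Old pattern: fetch the debug_configuration singleton and compare against a raw number.
    GPU_DEBUG_GET_INSTANCE(debug_config);
    GPU_DEBUG_IF(debug_config->verbose >= 2) {
        GPU_DEBUG_COUT << node_id << ": removed" << std::endl;
    }
    // New pattern: the level is named by the macro, no local debug_config variable needed.
    GPU_DEBUG_LOG << node_id << ": removed" << std::endl;
}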
@@ -22,10 +22,8 @@

using namespace cldnn;

#define LOG_NODE_REMOVAL(id) GPU_DEBUG_IF(debug_config->verbose >= 2) { \
    GPU_DEBUG_COUT << "[remove_redundant_reorders:" << __LINE__ << "] " << "Remove node: " << (id) << std::endl; }
#define LOG_NODE_REPLACEMENT(id) GPU_DEBUG_IF(debug_config->verbose >= 2) { \
    GPU_DEBUG_COUT << "[remove_redundant_reorders:" << __LINE__ << "] " << "Replace node: " << (id) << std::endl; }
#define LOG_NODE_REMOVAL(id) GPU_DEBUG_LOG_PASS << "Remove node: " << (id) << std::endl;
#define LOG_NODE_REPLACEMENT(id) GPU_DEBUG_LOG_PASS << "Replace node: " << (id) << std::endl;

remove_redundant_reorders::remove_redundant_reorders(layout_optimizer& lo_ref, bool enable_reorder_fusing, bool update_implementations,
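For illustration only (using the GPU_DEBUG_LOG_PASS helper defined in pass_manager.h later in this diff), the refactored macro expands roughly as follows inside a pass, where get_name() returns the pass name:

// Rough expansion sketch, not part of the patch:
LOG_NODE_REMOVAL(node.id());
//  -> GPU_DEBUG_LOG_PASS << "Remove node: " << node.id() << std::endl;
//  -> GPU_DEBUG_LOG << "[" << get_name() << "] " << "Remove node: " << node.id() << std::endl;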
@@ -34,7 +32,6 @@ remove_redundant_reorders::remove_redundant_reorders(layout_optimizer& lo_ref, b
    remove_output_reorders(remove_output_reorders) {}

void remove_redundant_reorders::run(program& p) {
    GPU_DEBUG_GET_INSTANCE(debug_config);
    auto update_implementation = [&](program_node& node) {
        if (!update_implementations)
            return;
@@ -34,8 +34,6 @@ void reorder_inputs::run(program& p) { run(p, _lo, _rf); }
namespace {

std::map<program_node*, format::type> get_preferred_formats(program& p, layout_optimizer& lo) {
    GPU_DEBUG_GET_INSTANCE(debug_config);

    std::map<program_node*, format::type> fmt_map;

#ifdef ENABLE_ONEDNN_FOR_GPU
@@ -59,9 +57,7 @@ std::map<program_node*, format::type> get_preferred_formats(program& p, layout_o
    if (onednn_impls_counter < 1 && lo.get_optimization_attributes().use_onednn_impls) {
        should_update_fmt_map = true;
        lo.set_optimization_attribute(layout_optimizer::optimization_attributes_type::use_onednn_impls, 0);
        GPU_DEBUG_IF(debug_config->verbose >= 1) {
            GPU_DEBUG_COUT << "The return to clDNN implementations" << std::endl;
        }
        GPU_DEBUG_LOG << "Disable oneDNN implementations globally" << std::endl;
    }

    if (should_update_fmt_map)
@@ -277,10 +273,7 @@ void propagate_formats_rec(std::map<program_node*, format::type>& fmt_map,

    fmt = travel_direction_wrapper<dir>::first(first_fmt, second_fmt);
    fmt_map.at(node) = fmt;
    GPU_DEBUG_GET_INSTANCE(debug_config);
    GPU_DEBUG_IF(debug_config->verbose >= 2) {
        GPU_DEBUG_COUT << "[clDNN][reorder_inputs] propagate_formats_rec: " << node->id() << " - " << fmt_to_str(fmt) << std::endl;
    }
    GPU_DEBUG_LOG << "Propagate_formats_rec: " << node->id() << " - " << fmt_to_str(fmt) << std::endl;

    for (auto next : travel_direction_wrapper<dir>::next_nodes(node)) {
        if (!next->is_in_data_flow())
@@ -312,10 +305,7 @@ void propagate_formats_rec<direction_e::backwards>(std::map<program_node*, forma

    fmt = travel_direction_wrapper<direction_e::backwards>::first(first_fmt, second_fmt);
    fmt_map.at(node) = fmt;
    GPU_DEBUG_GET_INSTANCE(debug_config);
    GPU_DEBUG_IF(debug_config->verbose >= 2) {
        GPU_DEBUG_COUT << "[clDNN][reorder_inputs] propagate_formats_rec: " << node->id() << " - " << fmt_to_str(fmt) << std::endl;
    }
    GPU_DEBUG_LOG << "Propagate_formats_rec: " << node->id() << " - " << fmt_to_str(fmt) << std::endl;

    for (auto next : travel_direction_wrapper<direction_e::backwards>::next_nodes(node)) {
        if (!next.first->is_in_data_flow())
@@ -541,7 +531,6 @@ const char *dir_msg(direction_e dir) {
// If there is layout mismatch between two layers, add reorder
template <direction_e dir>
void insert_reorders_in_dir(program& p, const std::map<program_node*, format::type>& fmt_map, reorder_factory& rf, layout_optimizer& lo, program_node* node) {
    GPU_DEBUG_GET_INSTANCE(debug_config);
    auto fmt = fmt_map.at(node);

    auto next_cpy = travel_direction_wrapper<dir>::next_nodes(node);
@@ -565,10 +554,8 @@ void insert_reorders_in_dir(program& p, const std::map<program_node*, format::ty
        in_layout.format = get_target_output_format(lo, fmt_map, predecessor, successor);
        out_layout.format = get_target_input_format(lo, fmt_map, successor, predecessor);

        GPU_DEBUG_IF(debug_config->verbose >= 2) {
            GPU_DEBUG_COUT << __func__ << ":" << __LINE__ << ":" << dir_msg(dir) << " " << node->id() << " --> " << next->id() << " ## "
                << fmt_to_str(in_layout.format) << " --> " << fmt_to_str(out_layout.format) << std::endl;
        }
        GPU_DEBUG_LOG << __func__ << ":" << __LINE__ << ":" << dir_msg(dir) << " " << node->id() << " --> " << next->id() << " ## "
            << fmt_to_str(in_layout.format) << " --> " << fmt_to_str(out_layout.format) << std::endl;

        if (in_layout.format == format::any || out_layout.format == format::any)
            continue;
@@ -580,10 +567,8 @@ void insert_reorders_in_dir(program& p, const std::map<program_node*, format::ty
        auto reorder = reorder_pair.first;
        if (reorder && (in_layout.format != format::any && out_layout.format != format::any)) {
            auto& reorder_node = p.get_or_create(reorder);
            GPU_DEBUG_IF(debug_config->verbose >= 2) {
                GPU_DEBUG_COUT << __func__ << ":" << __LINE__ << ":" << dir_msg(dir) << " " << reorder_node.id()
                    << " Reorder is added" << std::endl;
            }
            GPU_DEBUG_LOG << __func__ << ":" << __LINE__ << ":" << dir_msg(dir) << " " << reorder_node.id()
                << " Reorder is added" << std::endl;
            p.add_intermediate(reorder_node,
                *travel_direction_wrapper<dir>::second(node, next),
                *travel_direction_wrapper<dir>::first(node, next),
@@ -595,7 +580,6 @@ void insert_reorders_in_dir(program& p, const std::map<program_node*, format::ty
template <>
void insert_reorders_in_dir<direction_e::backwards>(program& p, const std::map<program_node*, format::type>& fmt_map,
    reorder_factory& rf, layout_optimizer& lo, program_node* node) {
    GPU_DEBUG_GET_INSTANCE(debug_config);
    auto fmt = fmt_map.at(node);

    auto next_cpy = travel_direction_wrapper<direction_e::backwards>::next_nodes(node);
@@ -619,11 +603,9 @@ void insert_reorders_in_dir<direction_e::backwards>(program& p, const std::map<p
        in_layout.format = get_target_output_format(lo, fmt_map, predecessor, successor);
        out_layout.format = get_target_input_format(lo, fmt_map, successor, predecessor);

        GPU_DEBUG_IF(debug_config->verbose >= 2) {
            GPU_DEBUG_COUT << __func__ << ":" << __LINE__ << ":" << dir_msg(direction_e::backwards)
                << " " << node->id() << " --> " << next.first->id() << " ## "
                << fmt_to_str(in_layout.format) << " --> " << fmt_to_str(out_layout.format) << std::endl;
        }
        GPU_DEBUG_LOG << __func__ << ":" << __LINE__ << ":" << dir_msg(direction_e::backwards)
            << " " << node->id() << " --> " << next.first->id() << " ## "
            << fmt_to_str(in_layout.format) << " --> " << fmt_to_str(out_layout.format) << std::endl;

        if (in_layout.format == format::any || out_layout.format == format::any)
            continue;
@@ -635,10 +617,8 @@ void insert_reorders_in_dir<direction_e::backwards>(program& p, const std::map<p
        auto reorder = reorder_pair.first;
        if (reorder && (in_layout.format != format::any && out_layout.format != format::any)) {
            auto& reorder_node = p.get_or_create(reorder);
            GPU_DEBUG_IF(debug_config->verbose >= 2) {
                GPU_DEBUG_COUT << __func__ << ":" << __LINE__ << ":" << dir_msg(direction_e::backwards) << " " << reorder_node.id()
                    << " Reorder is added" << std::endl;
            }
            GPU_DEBUG_LOG << __func__ << ":" << __LINE__ << ":" << dir_msg(direction_e::backwards) << " " << reorder_node.id()
                << " Reorder is added" << std::endl;
            p.add_intermediate(reorder_node,
                *travel_direction_wrapper<direction_e::backwards>::second(node, next.first),
                *travel_direction_wrapper<direction_e::backwards>::first(node, next.first),
@@ -684,30 +664,26 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)

    auto fmt_map = get_preferred_formats(p, lo);

    GPU_DEBUG_IF(debug_config->verbose >= 2) {
        GPU_DEBUG_COUT << "[clDNN][reorder_inputs] Preferred formats:" << std::endl;
        for (auto& node_fmt : fmt_map) {
            if (node_fmt.second != format::any) {
                GPU_DEBUG_COUT << "[clDNN][reorder_inputs] " << node_fmt.first->id() << " " << fmt_to_str(node_fmt.second) << std::endl;
            }
    GPU_DEBUG_LOG_PASS << "Preferred formats:" << std::endl;
    for (auto& node_fmt : fmt_map) {
        if (node_fmt.second != format::any) {
            GPU_DEBUG_LOG_PASS << " " << node_fmt.first->id() << " " << fmt_to_str(node_fmt.second) << std::endl;
        }
    }

    propagate_formats(p, fmt_map, lo);
    minimize_local_reorders(p, fmt_map, lo);

    GPU_DEBUG_IF(debug_config->verbose >= 2) {
        GPU_DEBUG_COUT << "[clDNN][reorder_inputs] Selected formats:" << std::endl;
        for (auto node_ptr : p.get_processing_order()) {
            if (fmt_map.count(node_ptr) == 0)
                continue;
    GPU_DEBUG_LOG_PASS << "Selected formats:" << std::endl;
    for (auto node_ptr : p.get_processing_order()) {
        if (fmt_map.count(node_ptr) == 0)
            continue;

            auto fmt = fmt_map.at(node_ptr);
            GPU_DEBUG_COUT << "[clDNN][reorder_inputs] " << node_ptr->id() << " " << fmt_to_str(fmt) << std::endl;
        }
        auto fmt = fmt_map.at(node_ptr);
        GPU_DEBUG_LOG_PASS << " " << node_ptr->id() << " " << fmt_to_str(fmt) << std::endl;
    }

    GPU_DEBUG_IF(debug_config->verbose >= 1) {
    GPU_DEBUG_IF(debug_config->verbose >= 2) {
        reorder_cnt total_reorder_count = std::accumulate(
            p.get_processing_order().begin(),
            p.get_processing_order().end(),
@@ -719,8 +695,8 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
                return reorder_cnt{ total.number + count.number, total.total_sizes + count.total_sizes };
            });
        // Divide results by two as above function will each reorder from both sides
        GPU_DEBUG_COUT << "[clDNN][reorder_inputs] Total number of reorders: " << total_reorder_count.number / 2 << std::endl;
        GPU_DEBUG_COUT << "[clDNN][reorder_inputs] Total elements count of all reorders: " << total_reorder_count.total_sizes / 2 << std::endl;
        GPU_DEBUG_LOG_PASS << "Total number of reorders: " << total_reorder_count.number / 2 << std::endl;
        GPU_DEBUG_LOG_PASS << "Total elements count of all reorders: " << total_reorder_count.total_sizes / 2 << std::endl;

        // Count number of reorders that will be fused
        size_t nodes_with_fusing = 0;
@@ -736,8 +712,8 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
                }
            }
        }
        GPU_DEBUG_COUT << "[clDNN][reorder_inputs] Number of nodes with fused reorders: " << nodes_with_fusing << std::endl;
        GPU_DEBUG_COUT << "----------------------------------------------" << std::endl;
        GPU_DEBUG_LOG_PASS << "Number of nodes with fused reorders: " << nodes_with_fusing << std::endl;
        GPU_DEBUG_LOG_PASS << "----------------------------------------------" << std::endl;
    }

    insert_reorders(p, fmt_map, rf, lo);
@@ -802,7 +778,7 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
        }
    };

    const auto reorder_convolution = [&p, &lo, &rf, &debug_config](typed_program_node<convolution>& conv_node) {
    const auto reorder_convolution = [&p, &lo, &rf](typed_program_node<convolution>& conv_node) {
        {
            // reorder weights convolution
            auto& weights = conv_node.weights();
@@ -32,7 +32,6 @@ void select_preferred_formats::run(program& p) {
        return;

#ifdef ENABLE_ONEDNN_FOR_GPU
    GPU_DEBUG_GET_INSTANCE(debug_config);
    for (auto n : p.get_processing_order()) {
        // Onednn primitive descriptor creation may fail, for example, due to asymmetric weight.
        try {
@@ -47,9 +46,7 @@ void select_preferred_formats::run(program& p) {

            _lo.select_preferred_formats_for_onednn(*n, prim_desc);
        } catch(std::exception &exception) {
            GPU_DEBUG_IF(debug_config->verbose >= 1) {
                std::cout << "WARNING(select_preferred_formats): " << exception.what() << std::endl;
            }
            GPU_DEBUG_INFO << "WARNING(select_preferred_formats): " << exception.what() << std::endl;
        }
    }
#endif // ENABLE_ONEDNN_FOR_GPU
@@ -68,18 +68,15 @@ protected:
            dnnl::memory::desc desc = onednn::layout_to_memory_desc(a_zp->get_layout(), dnnl::memory::format_tag::a, true);
            args.insert({DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC, a_zp->get_onednn_memory(desc)});

            GPU_DEBUG_GET_INSTANCE(debug_config);
            GPU_DEBUG_IF(debug_config->verbose >= 3) {
                auto dnnl_mem = a_zp->get_onednn_memory(desc);
                void *mapped_ptr = dnnl_mem.map_data();
                if (mapped_ptr) {
                    GPU_DEBUG_COUT << instance.id() << " activations_zero_points: ";
                    for (size_t i = 0; i < desc.get_size(); ++i) {
                        std::cout << static_cast<int32_t*>(mapped_ptr)[i] << " ";
                    }
                    std::cout << std::endl;
                    dnnl_mem.unmap_data(mapped_ptr);
            auto dnnl_mem = a_zp->get_onednn_memory(desc);
            void *mapped_ptr = dnnl_mem.map_data();
            if (mapped_ptr) {
                GPU_DEBUG_TRACE_DETAIL << instance.id() << " activations_zero_points: ";
                for (size_t i = 0; i < desc.get_size(); ++i) {
                    GPU_DEBUG_TRACE_DETAIL << static_cast<int32_t*>(mapped_ptr)[i] << " ";
                }
                GPU_DEBUG_TRACE_DETAIL << std::endl;
                dnnl_mem.unmap_data(mapped_ptr);
            }
        }

@@ -88,18 +85,15 @@ protected:
            dnnl::memory::desc desc = onednn::layout_to_memory_desc(w_zp->get_layout(), dnnl::memory::format_tag::a, true);
            args.insert({DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_WEIGHTS, w_zp->get_onednn_memory(desc)});

            GPU_DEBUG_GET_INSTANCE(debug_config);
            GPU_DEBUG_IF(debug_config->verbose >= 3) {
                auto dnnl_mem = w_zp->get_onednn_memory(desc);
                void *mapped_ptr = dnnl_mem.map_data();
                if (mapped_ptr) {
                    GPU_DEBUG_COUT << instance.id() << " weights_zero_points: ";
                    for (size_t i = 0; i < desc.get_size(); ++i) {
                        std::cout << static_cast<int32_t*>(mapped_ptr)[i] << " ";
                    }
                    std::cout << std::endl;
                    dnnl_mem.unmap_data(mapped_ptr);
            auto dnnl_mem = w_zp->get_onednn_memory(desc);
            void *mapped_ptr = dnnl_mem.map_data();
            if (mapped_ptr) {
                GPU_DEBUG_TRACE_DETAIL << instance.id() << " weights_zero_points: ";
                for (size_t i = 0; i < desc.get_size(); ++i) {
                    GPU_DEBUG_TRACE_DETAIL << static_cast<int32_t*>(mapped_ptr)[i] << " ";
                }
                GPU_DEBUG_TRACE_DETAIL << std::endl;
                dnnl_mem.unmap_data(mapped_ptr);
            }
        }

@@ -23,6 +23,8 @@

#include <fstream>

#define GPU_DEBUG_LOG_PASS GPU_DEBUG_LOG << "[" << get_name() << "] "

namespace cldnn {
class base_pass {
    friend class pass_manager;
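A minimal sketch of how a pass can use this helper (hypothetical pass; it assumes base_pass stores the name passed to its constructor, which is what get_name() suggests):

// Sketch only: hypothetical pass using the pass-scoped logging helper.
class dummy_logging_pass : public base_pass {
public:
    dummy_logging_pass() : base_pass("dummy_logging_pass") {}
    void run(program& p) override {
        // Printed only when verbose >= LOG; the "[dummy_logging_pass] " prefix comes from GPU_DEBUG_LOG_PASS.
        GPU_DEBUG_LOG_PASS << "processing " << p.get_processing_order().size() << " nodes" << std::endl;
    }
};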
@@ -1350,8 +1350,8 @@ impl_types layout_optimizer::get_forced_impl_type_by_config(program_node& node)
            preferred_type = impl_types::cpu;

        if (node.id() == forced_impl_type.substr(0, found_type)) {
            GPU_DEBUG_COUT << " Forced implementation type : " << forced_impl_type.substr(0, found_type) << " : "
                << forced_impl_type.substr(found_type + 1) << std::endl;
            GPU_DEBUG_LOG << " Forced implementation type : " << forced_impl_type.substr(0, found_type) << " : "
                << forced_impl_type.substr(found_type + 1) << std::endl;
            return preferred_type;
        }
    }
@@ -1819,7 +1819,6 @@ format layout_optimizer::get_preferred_format(program_node& node) {

#ifdef ENABLE_ONEDNN_FOR_GPU
void layout_optimizer::select_preferred_formats_for_onednn(program_node& node, dnnl::primitive_desc prim_desc) {
    GPU_DEBUG_GET_INSTANCE(debug_config);
    if (node.is_input() || !are_data_types_suitable_for_onednn(node)) {
        return;
    }
@@ -1868,10 +1867,8 @@ void layout_optimizer::select_preferred_formats_for_onednn(program_node& node, d
            node.set_preferred_output_fmt(usr, dst_fmt);
        }

        GPU_DEBUG_IF(debug_config->verbose >= 2) {
            std::cout << "select_preferred_formats:" << node.id() << ": " << fmt_to_str(src_fmt) << " --> " << fmt_to_str(dst_fmt)
                << " For index : " << idx << std::endl;
        }
        GPU_DEBUG_LOG << "select_preferred_formats:" << node.id() << ": " << fmt_to_str(src_fmt) << " --> " << fmt_to_str(dst_fmt)
            << " For index : " << idx << std::endl;
    }
}

@@ -890,9 +890,8 @@ void network::execute_impl(const std::vector<event::ptr>& events) {
    OV_ITT_SCOPED_TASK(itt::domains::CLDNN, "NetworkImpl::Execute");
    // Wait for previous execution completion
    reset_execution(false);
    GPU_DEBUG_GET_INSTANCE(debug_config);
    GPU_DEBUG_IF(debug_config->verbose >= 1)
        GPU_DEBUG_COUT << "----------------------------------------------" << std::endl;
    GPU_DEBUG_TRACE << "----------------------------------------------" << std::endl;
    GPU_DEBUG_TRACE << "Start network execution" << std::endl;

    std::vector<memory::ptr> in_out_mem;
    auto is_surface_lock_check_needed = [&](const shared_mem_type& shared_mem_type) {
@@ -928,6 +927,7 @@ void network::execute_impl(const std::vector<event::ptr>& events) {
    auto surf_lock = surfaces_lock::create(get_engine().type(), in_out_mem, get_stream());

    set_arguments();
    GPU_DEBUG_GET_INSTANCE(debug_config);
    for (auto& inst : _exec_order) {
        GPU_DEBUG_IF(debug_config->dump_layers_path.length() > 0) {
            const std::string layer_name = inst->id();
@@ -1120,10 +1120,7 @@ void network::allocate_primitive_instance(program_node const& node) {
    if (_primitives.count(node.id()))
        return;

    GPU_DEBUG_GET_INSTANCE(debug_config);
    GPU_DEBUG_IF(debug_config->verbose >= 4) {
        GPU_DEBUG_COUT << node.id() << ": allocate primitive instance" << std::endl;
    }
    GPU_DEBUG_TRACE_DETAIL << node.id() << ": allocate primitive instance" << std::endl;

    auto inst = node.type()->create_instance(*this, node);

@@ -1191,10 +1188,7 @@ void network::transfer_memory_to_device(std::shared_ptr<primitive_inst> instance
        // Allocate and transfer memory
        auto device_mem = inst_mem.get_engine()->allocate_memory(inst_mem.get_layout(), allocation_type::usm_device, false);
        device_mem->copy_from(get_stream(), inst_mem);
        GPU_DEBUG_GET_INSTANCE(debug_config);
        GPU_DEBUG_IF(debug_config->verbose >= 2) {
            GPU_DEBUG_COUT << "[" << node.id() << ": constant]" << std::endl;
        }
        GPU_DEBUG_LOG << "[" << node.id() << ": constant]" << std::endl;
        _memory_pool->release_memory(&inst_mem, node.id(), get_id());
        instance->set_output_memory(device_mem);
    }
@@ -48,11 +48,13 @@ void pass_manager::run(program& p, base_pass& pass) {
    using ms = std::chrono::duration<double, std::ratio<1, 1000>>;
    using Time = std::chrono::high_resolution_clock;

    GPU_DEBUG_LOG << "Run pass " << pass.get_name() << std::endl;
    auto start = Time::now();
    pass.run(p);
    auto stop = Time::now();
    std::chrono::duration<float> fs = stop - start;
    ms opt_pass_time = std::chrono::duration_cast<ms>(fs);
    GPU_DEBUG_LOG << "Pass " << pass.get_name() << " execution time: " << opt_pass_time.count() << " ms" << std::endl;

    p.save_pass_info(pass.get_name());
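With verbosity at LOG or higher, each pass now produces a pair of lines roughly like the following (illustrative pass name and timing; the actual line prefix is cldnn::debug_configuration::prefix):

// Illustrative output only, not captured from a real run:
//   <prefix>Run pass remove_redundant_reorders
//   <prefix>Pass remove_redundant_reorders execution time: 1.27 ms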
@@ -142,7 +142,6 @@ void primitive_inst::set_output_memory(memory::ptr mem_new, bool check, size_t i
}

void primitive_inst::update_shape() {
    GPU_DEBUG_GET_INSTANCE(debug_config);
    GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::shape_inference);

    bool input_shape_changed = false;
@@ -199,9 +198,7 @@ void primitive_inst::update_shape() {
        // Events may be not created for in-order queue, so take them for OOO queue only
        if (_network.has_event(dep.id()) && queue_type == queue_types::out_of_order) {
            dependencies_events.push_back(_network.get_primitive_event(dep_id));
            GPU_DEBUG_IF(debug_config->verbose >= 4) {
                GPU_DEBUG_COUT << id() << ": shape infer waits for " << i << " dependency\n";
            }
            GPU_DEBUG_TRACE_DETAIL << id() << ": shape infer waits for " << i << " dependency\n";
        }
        auto dep_mem = _network.get_output_memory(dep_id);
        memory_deps.insert({i, dep_mem});
@@ -221,10 +218,8 @@ void primitive_inst::update_shape() {
    auto update_output_layout = [&](layout& layout, size_t idx) {
        layout.data_padding = padding::max(_node->get_primitive()->output_paddings[idx], layout.data_padding);
        if (_impl_params->get_output_layout(idx) != layout) {
            GPU_DEBUG_IF(debug_config->verbose >= 4) {
                GPU_DEBUG_COUT << id() << ": update shape: was: " << _impl_params->get_output_layout(idx).to_short_string()
                    << " now: " << layout.to_short_string() << std::endl;
            }
            GPU_DEBUG_TRACE_DETAIL << id() << ": update shape: was: " << _impl_params->get_output_layout(idx).to_short_string()
                << " now: " << layout.to_short_string() << std::endl;
            set_shape_change();
        }
        _impl_params->output_layouts[idx] = layout;
@@ -265,16 +260,12 @@ void primitive_inst::realloc_if_needed() {
    bool can_reuse_buffer = _outputs[0] && actual_layout.count() <= max_output_layout_size;

    if (can_reuse_buffer) {
        GPU_DEBUG_IF(debug_config->verbose >= 4) {
            GPU_DEBUG_COUT << id() << ": reuse previously allocated output buffer" << std::endl;
        }
        GPU_DEBUG_TRACE_DETAIL << id() << ": reuse previously allocated output buffer" << std::endl;
        _outputs[0] = _network.get_engine().reinterpret_buffer(*_outputs[0], actual_layout);
    } else {
        GPU_DEBUG_IF(debug_config->verbose >= 4) {
            GPU_DEBUG_COUT << id() << ": realloc output memory. "
                << " Current buffer_size=" << max_output_layout_size
                << " Requested buffer_size=" << actual_layout.count() << std::endl;
        }
        GPU_DEBUG_TRACE_DETAIL << id() << ": realloc output memory. "
            << " Current buffer_size=" << max_output_layout_size
            << " Requested buffer_size=" << actual_layout.count() << std::endl;
        _outputs = allocate_outputs(&updated_params);
        // TODO : need to handle multiple outputs
        max_output_layout_size = updated_params.output_layouts[0].count();
@@ -285,7 +276,6 @@ void primitive_inst::realloc_if_needed() {

void primitive_inst::update_impl() {
    GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::update_implementation);
    GPU_DEBUG_GET_INSTANCE(debug_config);
    auto prev_impl_str = _impl != nullptr ? _impl->get_kernel_name() : "nullptr";
    auto extend_to_6d = [this](ov::PartialShape ps) -> std::vector<size_t> {
        // For shape_of we extend shape with 1-s to 6d rank to make kernel simpler
@@ -321,7 +311,7 @@ void primitive_inst::update_impl() {
        return seed;
    };

    auto update_shape_info = [this, extend_to_6d, debug_config, prev_impl_str](const kernel_impl_params& params) {
    auto update_shape_info = [this, extend_to_6d, prev_impl_str](const kernel_impl_params& params) {
        mem_lock<int32_t> lock(_shape_info_memory, _network.get_stream());
        size_t offset = 0;
        for (size_t i = 0; i < _node->get_dependencies().size(); i++) {
@@ -337,17 +327,14 @@ void primitive_inst::update_impl() {
            for (size_t j = 0; j < output_shape.size(); j++)
                lock[offset++] = static_cast<int32_t>(output_shape[j]);
        }
        GPU_DEBUG_IF(debug_config->verbose >= 4) {
            std::stringstream s;
            s << "shapes: ";
            for (size_t i = 0; i < offset; i++)
                s << lock[i] << " ";
            GPU_DEBUG_COUT << id() << ": update dynamic impl " << prev_impl_str << " to new shape: " << s.str() << std::endl;
        }
        std::stringstream s;
        s << "shapes: ";
        for (size_t i = 0; i < offset; i++)
            s << lock[i] << " ";
        GPU_DEBUG_TRACE_DETAIL << id() << ": update dynamic impl " << prev_impl_str << " to new shape: " << s.str() << std::endl;
    };

    if (!_node->is_type<data>() && !(_node->is_type<mutable_data>() && _node->get_dependencies().empty())) {
        GPU_DEBUG_GET_INSTANCE(debug_config);
        // Update param if fake_alignment is available
        auto updated_params = _node->type()->get_fake_aligned_params(*_impl_params);
        auto layout_key = get_layout_key(updated_params);
@@ -359,10 +346,7 @@ void primitive_inst::update_impl() {
        if (has_cached_impl) {
            _impl = cache.get(layout_key)->clone();
            GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(true);

            GPU_DEBUG_IF(debug_config->verbose >= 4) {
                GPU_DEBUG_COUT << id() << ": get impl from cache " << _impl->get_kernel_name() << std::endl;
            }
            GPU_DEBUG_TRACE_DETAIL << id() << ": get impl from cache " << _impl->get_kernel_name() << std::endl;
        }
    }
    if (!has_cached_impl) {
@@ -402,10 +386,9 @@ void primitive_inst::update_impl() {
            kernels_cache.reset();
            std::lock_guard<std::mutex> lock(get_network().get_impl_cache_mutex());
            cache.add(layout_key, _impl->clone());
            GPU_DEBUG_IF(debug_config->verbose >= 4) {
                auto new_impl_str = _impl != nullptr ? _impl->get_kernel_name() : "nullptr";
                GPU_DEBUG_COUT << id() << ": update impl from " << prev_impl_str << " to " << new_impl_str << std::endl;
            }

            auto new_impl_str = _impl != nullptr ? _impl->get_kernel_name() : "nullptr";
            GPU_DEBUG_TRACE_DETAIL << id() << ": update impl from " << prev_impl_str << " to " << new_impl_str << std::endl;
        }
    }

@@ -436,14 +419,9 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
                subgraph->set_input_data(d.first->id(), actual_mem);
            }
        }
        GPU_DEBUG_IF(debug_config->verbose >= 4) {
            GPU_DEBUG_COUT << "[Start] Executing unfused subgraph of " << id() << std::endl;
        }

        GPU_DEBUG_TRACE_DETAIL << "[Start] Executing unfused subgraph of " << id() << std::endl;
        auto outputs = subgraph->execute(events);
        GPU_DEBUG_IF(debug_config->verbose >= 4) {
            GPU_DEBUG_COUT << "[End] Finished executing unfused subgraph of " << id() << std::endl;
        }
        GPU_DEBUG_TRACE_DETAIL << "[End] Finished executing unfused subgraph of " << id() << std::endl;

        auto last_fd = _impl_params->fused_desc.back();
        auto last_prim_id = last_fd.desc->id;
@@ -476,28 +454,7 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
    }
    on_execute();

    GPU_DEBUG_IF(debug_config->verbose >= 1) {
        std::ostringstream in_addr;
        // buffer_ptr() only support usm_memory
        for (size_t i = 0; i < this->dependencies().size(); i++) {
            auto in_mem = dep_memory_ptr(i);
            if (in_mem) {
                in_addr << in_mem->buffer_ptr();
                if (i < this->dependencies().size() - 1) {
                    in_addr << ", ";
                }
            }
        }
        auto out_mem = output_memory_ptr();
        auto out_alloc_type = out_mem ? out_mem->get_allocation_type() : allocation_type::unknown;
        auto out_ptr = out_mem ? out_mem->buffer_ptr() : nullptr;
        auto impl_name = _impl->get_kernel_name();

        GPU_DEBUG_COUT << id() << ": execute " << impl_name << ". Memory type: "
            << out_alloc_type << ", in_usm("
            << in_addr.str() << "), out_usm("
            << out_ptr << ")" << std::endl;
    }
    GPU_DEBUG_TRACE << id() << ": execute " << _impl->get_kernel_name() << std::endl;

    if (_exec_deps.empty() && dependencies.empty()) {
        dependencies = events;
@@ -695,10 +652,7 @@ void primitive_inst::allocate_internal_buffers(void) {
    // allocate intermediate memory for the updated layout of buffer
    std::vector<memory::cptr> intermediates_memory;
    for (auto layout : ibuf_layouts) {
        GPU_DEBUG_GET_INSTANCE(debug_config);
        GPU_DEBUG_IF(debug_config->verbose >= 2) {
            GPU_DEBUG_COUT << "[" << _node->id() << ": internal buf]" << std::endl;
        }
        GPU_DEBUG_LOG << "[" << _node->id() << ": internal buf]" << std::endl;
        auto alloc_type = allocation_type::unknown;
        if (input_device_mem && (available_device_mem_size - (int64_t)layout.bytes_count() >= 0)) {
            alloc_type = engine.get_preferred_memory_allocation_type();
@@ -719,8 +673,6 @@ event::ptr primitive_inst::update_weights() {
    if (!weightable_node)
        return nullptr;

    GPU_DEBUG_GET_INSTANCE(debug_config);

    auto& weights_params = _impl->_weights_reorder_params;
    bool requires_reorder = weights_params.engine != kernel_selector::GenericKernelParams::Engine::NONE &&
        (!_impl_params->reordered_weights || _impl_params->reordered_weights->get_layout() != from_weights_tensor(weights_params.dest));
@@ -744,17 +696,13 @@ event::ptr primitive_inst::update_weights() {
        if (layout_key != "") {
            auto& cache = get_network().get_in_mem_kernels_cache();
            if (cache.has(layout_key)) {
                GPU_DEBUG_IF(debug_config->verbose >= 4) {
                    GPU_DEBUG_COUT << id() << ": reorder weights (cached) from " << original_layout.to_short_string()
                        << " to " << expected_layout.to_short_string() << std::endl;
                }
                GPU_DEBUG_TRACE_DETAIL << id() << ": reorder weights (cached) from " << original_layout.to_short_string()
                    << " to " << expected_layout.to_short_string() << std::endl;
                GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(true);
                kernel = cache.get(layout_key);
            } else {
                GPU_DEBUG_IF(debug_config->verbose >= 4) {
                    GPU_DEBUG_COUT << id() << ": reorder weights from " << original_layout.to_short_string()
                        << " to " << expected_layout.to_short_string() << std::endl;
                }
                GPU_DEBUG_TRACE_DETAIL << id() << ": reorder weights from " << original_layout.to_short_string()
                    << " to " << expected_layout.to_short_string() << std::endl;
                auto& kernels_cache = get_network().get_kernels_cache();
                auto kernel_id = kernels_cache.set_kernel_source(weights_params.clKernel->code.kernelString, false);
                kernels_cache.compile();
@@ -768,9 +716,7 @@ event::ptr primitive_inst::update_weights() {

        bool can_reuse = _impl_params->reordered_weights != nullptr && _impl_params->reordered_weights->size() <= expected_layout.bytes_count();
        if (can_reuse) {
            GPU_DEBUG_IF(debug_config->verbose >= 4) {
                GPU_DEBUG_COUT << id() << ": reuse weights memory" << std::endl;
            }
            GPU_DEBUG_TRACE_DETAIL << id() << ": reuse weights memory" << std::endl;
            _impl_params->reordered_weights = engine.reinterpret_buffer(*_impl_params->reordered_weights, expected_layout);
        } else {
            auto alloc_type = engine.get_preferred_memory_allocation_type();
@@ -783,6 +729,7 @@ event::ptr primitive_inst::update_weights() {
        stream.set_arguments(*kernel, weights_params.clKernel->params, args);
        auto ev = stream.enqueue_kernel(*kernel, weights_params.clKernel->params, args, {}, true);

        GPU_DEBUG_GET_INSTANCE(debug_config);
        GPU_DEBUG_IF(!debug_config->dump_profiling_data.empty()) {
            stream.wait_for_events({ev});
        }
@@ -851,15 +798,12 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool,
    bool is_cpu = _node.get_selected_impl() ? _node.get_selected_impl()->is_cpu() : false;
    auto use_lockable_memory = is_output_buffer(_node) || is_cpu || is_any_user_cpu(_node.get_users()) ||
        !_engine.supports_allocation(allocation_type::usm_device);
    GPU_DEBUG_GET_INSTANCE(debug_config);
    const auto& lockable_mem_type = _engine.get_lockable_preferred_memory_allocation_type(layout.format.is_image_2d());
    const auto& alloc_type = use_lockable_memory ? lockable_mem_type
        : usm_device_allocatable ? allocation_type::usm_device : lockable_mem_type;

    if ((is_internal && (_node.can_be_optimized() || _node.is_type<generic_layer>())) || (memory_reuse_by_user == false)) {
        GPU_DEBUG_IF(debug_config->verbose >= 2) {
            GPU_DEBUG_COUT << "[" << _node.id() << ": output]" << std::endl;
        }
        GPU_DEBUG_LOG << "[" << _node.id() << ": output]" << std::endl;
        return get_memory_from_pool(_engine,
            layout,
            _node.id(),
@@ -868,21 +812,15 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool,
            false);
    } else if (is_internal && _node.is_output() && _node.is_type<generic_layer>() &&
               _engine.supports_allocation(allocation_type::usm_device) && usm_device_allocatable) {
        GPU_DEBUG_IF(debug_config->verbose >= 2) {
            GPU_DEBUG_COUT << "[" << _node.id() << ": output]" << std::endl;
        }
        GPU_DEBUG_LOG << "[" << _node.id() << ": output]" << std::endl;
        return _engine.allocate_memory(layout, allocation_type::usm_device, false);
    } else if (is_internal && !_node.is_output() && _node.is_type<input_layout>()) {
        // Skip memory reset for input_layout primitives, since data will be copied from cldnn::data primitive
        // or just reuse primitive's memory
        GPU_DEBUG_IF(debug_config->verbose >= 2) {
            GPU_DEBUG_COUT << "[" << _node.id() << ": constant]" << std::endl;
        }
        GPU_DEBUG_LOG << "[" << _node.id() << ": constant]" << std::endl;
        return _engine.allocate_memory(layout, alloc_type, false);
    } else if (is_internal || (!_node.can_share_buffer()) || _node.can_be_optimized() || _node.is_output()) {
        GPU_DEBUG_IF(debug_config->verbose >= 2) {
            GPU_DEBUG_COUT << "[" << _node.id() << ": output]" << std::endl;
        }
        GPU_DEBUG_LOG << "[" << _node.id() << ": output]" << std::endl;
        return _engine.allocate_memory(layout, alloc_type);
    } else {
        return get_memory_from_pool(_engine,
@@ -940,10 +878,7 @@ std::string primitive_inst::generic_to_string(program_node const& node, const ch
}

cldnn::network::ptr primitive_inst::get_unfused_subgraph() {
    GPU_DEBUG_GET_INSTANCE(debug_config);
    GPU_DEBUG_IF(debug_config->verbose >= 4) {
        GPU_DEBUG_COUT << id() << ": Use unfused subgraph due to unexpected fusions\n";
    }
    GPU_DEBUG_TRACE_DETAIL << id() << ": Use unfused subgraph due to unexpected fusions\n";
    if (!_unfused_subgraph) {
        topology t;

@@ -719,17 +719,12 @@ void program::transfer_memory_to_device() {

        if (alloc_type == allocation_type::usm_host || alloc_type == allocation_type::usm_shared) {
            GPU_DEBUG_GET_INSTANCE(debug_config);
            GPU_DEBUG_IF(debug_config->verbose >= 2) {
                GPU_DEBUG_COUT << "[" << data_node.id() << ": constant]" << std::endl;
            }
            GPU_DEBUG_LOG << "[" << data_node.id() << ": constant]" << std::endl;
            // Allocate and transfer memory
            auto device_mem = mem.get_engine()->allocate_memory(data_node_layout, allocation_type::usm_device, false);
            device_mem->copy_from(get_stream(), mem);
            data_node.attach_memory(device_mem);
            GPU_DEBUG_IF(debug_config->verbose >= 2) {
                GPU_DEBUG_COUT << "[" << data_node.id() << ": constant]" << std::endl;
            }
            GPU_DEBUG_LOG << "[" << data_node.id() << ": constant]" << std::endl;
            const_cast<memory::ptr&>(data_node.get_primitive()->mem).reset();
            // TODO: Do we need finish call here? Maybe call it in network::execute() ?
            get_stream().finish();
@@ -1656,7 +1651,6 @@ std::pair<int64_t, int64_t> program::get_estimated_device_mem_usage() {
    });
    auto& engine = get_engine();
    int64_t host_alloc = 0;
    GPU_DEBUG_GET_INSTANCE(debug_config);
    // just to prevent the memories from being freed during allocation
    std::unordered_set<memory::ptr> allocated_mem_ptrs;
    for (const auto& node : nodes_to_allocate) {
@@ -1676,10 +1670,8 @@ std::pair<int64_t, int64_t> program::get_estimated_device_mem_usage() {
    if (engine.get_device_info().dev_type == cldnn::device_type::integrated_gpu)
        total_host_alloc_size += engine.get_used_device_memory(allocation_type::usm_device);
    if ((cur_vmem != -1 && total_host_alloc_size > cur_vmem * 0.5) || (total_host_alloc_size >= max_global_mem_size)) {
        GPU_DEBUG_IF(debug_config->verbose >= 1) {
            GPU_DEBUG_COUT << "Estimated host mem usage calculated with default base batch size(16) exceeds the available memory ("
                << cur_vmem << ")" << std::endl;
        }
        GPU_DEBUG_INFO << "Estimated host mem usage calculated with default base batch size(16) exceeds the available memory ("
            << cur_vmem << ")" << std::endl;
        return {-1L, -1L};
    }
#endif
@@ -246,23 +246,18 @@ bool program_node::is_detached(bool whole_branch) {
}

layout program_node::calc_output_layout() const {
    GPU_DEBUG_GET_INSTANCE(debug_config);
    bool allow_new_shape_infer =
        get_program().get_options().get<build_option_type::allow_new_shape_infer>()->enabled();
    if (allow_new_shape_infer) {
        auto out_layouts = type()->calc_output_layouts(*this, *get_kernel_impl_params());
        if (!out_layouts.empty()) {
            GPU_DEBUG_IF(debug_config->verbose >= 4) {
                GPU_DEBUG_COUT << id() << ": calc_output_layout(new):" << out_layouts[0] << std::endl;
            }
            GPU_DEBUG_TRACE_DETAIL << id() << ": calc_output_layout(new):" << out_layouts[0] << std::endl;
            return out_layouts[0];
        }
    }

    auto res = type()->calc_output_layout(*this, *get_kernel_impl_params());
    GPU_DEBUG_IF(debug_config->verbose >= 4) {
        GPU_DEBUG_COUT << id() << ": calc_output_layout:" << res << std::endl;
    }
    GPU_DEBUG_TRACE_DETAIL << id() << ": calc_output_layout:" << res << std::endl;

    return res;
}
@@ -528,8 +523,6 @@ bool program_node::has_out_scales(const std::shared_ptr<dnnl::primitive_attr>& a

dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const std::shared_ptr<dnnl::primitive_attr>& attr,
    bool& optimization_is_completed) {
    GPU_DEBUG_GET_INSTANCE(debug_config);

    // Create new dnnl::post_ops object which will be filled inside the optimization process
    dnnl::post_ops optimized_p_ops;

@@ -673,22 +666,18 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const
    int64_t prev_post_op_idx = 0;
    bool optimization_done = false;

    GPU_DEBUG_IF(debug_config->verbose >= 3) {
        GPU_DEBUG_COUT << "================================================" << std::endl;
        GPU_DEBUG_COUT << " " << id() << ", num of post_ops " << p_ops.len() << std::endl;
        for (size_t i = 0; i < cur_post_ops.size(); i++)
            GPU_DEBUG_COUT << " " << i << ": " << cur_post_ops[i].op_type << std::endl;
    }
    GPU_DEBUG_TRACE << "================================================" << std::endl;
    GPU_DEBUG_TRACE << " " << id() << ", num of post_ops " << p_ops.len() << std::endl;
    for (size_t i = 0; i < cur_post_ops.size(); i++)
        GPU_DEBUG_TRACE << " " << i << ": " << cur_post_ops[i].op_type << std::endl;

    remove_optimized_prefix(cur_post_ops);

    GPU_DEBUG_IF(debug_config->verbose >= 3) {
        GPU_DEBUG_COUT << "remove optimized prefix ------------------------" << std::endl;
        GPU_DEBUG_COUT << " " << id() << ", num of post_ops " << p_ops.len() << std::endl;
        for (size_t i = 0; i < cur_post_ops.size(); i++)
            GPU_DEBUG_COUT << " " << i << ": " << cur_post_ops[i].op_type << std::endl;
        GPU_DEBUG_COUT << "----------------------------------->>>>>>>>>>>>>" << std::endl;
    }
    GPU_DEBUG_TRACE << "remove optimized prefix ------------------------" << std::endl;
    GPU_DEBUG_TRACE << " " << id() << ", num of post_ops " << p_ops.len() << std::endl;
    for (size_t i = 0; i < cur_post_ops.size(); i++)
        GPU_DEBUG_TRACE << " " << i << ": " << cur_post_ops[i].op_type << std::endl;
    GPU_DEBUG_TRACE << "----------------------------------->>>>>>>>>>>>>" << std::endl;

    // Get post-ops size for current node
    int64_t post_ops_size = cur_post_ops.size();
@@ -710,8 +699,7 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const
        auto cur_type = cur_post_ops[cur_post_op_idx].op_type;
        auto prev_type = cur_post_ops[prev_post_op_idx].op_type;

        GPU_DEBUG_IF(debug_config->verbose >= 3)
            GPU_DEBUG_COUT << "before prev_post_op_idx: " << prev_post_op_idx << ", cur_post_op_idx: " << cur_post_op_idx << std::endl;
        GPU_DEBUG_TRACE << "before prev_post_op_idx: " << prev_post_op_idx << ", cur_post_op_idx: " << cur_post_op_idx << std::endl;

        // Ignore optimized operations for "previous" operation in our operation pair
        while (type_is_any_optimized(prev_type) && prev_post_op_idx < post_ops_size - 1) {
@@ -728,8 +716,7 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const
            cur_type = cur_post_ops[cur_post_op_idx].op_type;
        }

        GPU_DEBUG_IF(debug_config->verbose >= 3)
            GPU_DEBUG_COUT << "after prev_post_op_idx: " << prev_post_op_idx << ", cur_post_op_idx: " << cur_post_op_idx << std::endl;
        GPU_DEBUG_TRACE << "after prev_post_op_idx: " << prev_post_op_idx << ", cur_post_op_idx: " << cur_post_op_idx << std::endl;

        auto cur_idx = static_cast<int>(has_out_scales(attr) ? (cur_post_op_idx >= 1 ? cur_post_op_idx - 1 : 0) : cur_post_op_idx);
        auto prev_idx = static_cast<int>(has_out_scales(attr) ? (prev_post_op_idx >= 1 ? prev_post_op_idx - 1 : 0) : prev_post_op_idx);
@@ -766,10 +753,8 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const

        bool cur_ops_pair_is_optimized = false;

        GPU_DEBUG_IF(debug_config->verbose >= 3) {
            GPU_DEBUG_COUT << "prev_idx: " << prev_idx << " " << prev_type
                << ", cur_idx: " << cur_idx << " " << cur_type << std::endl;
        }
        GPU_DEBUG_TRACE << "prev_idx: " << prev_idx << " " << prev_type
            << ", cur_idx: " << cur_idx << " " << cur_type << std::endl;

        if (can_try_optimize) {
            if (eltw_and_eltw) {
@@ -1005,12 +990,10 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const
        remove_optimized_prefix(cur_post_ops);
    }

    GPU_DEBUG_IF(debug_config->verbose >= 3) {
        GPU_DEBUG_COUT << ">>>>>>>>>>>>>-----------------------------------" << std::endl;
        for (size_t i = 0; i < cur_post_ops.size(); i++)
            GPU_DEBUG_COUT << " " << i << ": " << cur_post_ops[i].op_type << std::endl;
        GPU_DEBUG_COUT << "------------------------------------------------" << std::endl;
    }
    GPU_DEBUG_TRACE << ">>>>>>>>>>>>>-----------------------------------" << std::endl;
    for (size_t i = 0; i < cur_post_ops.size(); i++)
        GPU_DEBUG_TRACE << " " << i << ": " << cur_post_ops[i].op_type << std::endl;
    GPU_DEBUG_TRACE << "------------------------------------------------" << std::endl;

    add_onednn_fused_primitives(cur_post_ops);

@@ -107,12 +107,8 @@ KernelsData kernel_selector_base::GetNaiveBestKernel(const Params& params,
            }
        } catch (std::runtime_error& ex) {
            // we have to handle it in order to avoid exception in KernelSelector as much we can
            GPU_DEBUG_GET_INSTANCE(debug_config);
            GPU_DEBUG_IF(debug_config->verbose >= 3) {
                kernelName = (implementation != nullptr)? implementation->GetName() : "[impl is null]";
                GPU_DEBUG_COUT << "layerID: " << params.layerID << " kenrel: "
                    << kernelName << " - " << ex.what() << std::endl;
            }
            kernelName = (implementation != nullptr)? implementation->GetName() : "[impl is null]";
            GPU_DEBUG_TRACE << "layerID: " << params.layerID << " kenrel: " << kernelName << " - " << ex.what() << std::endl;
        }
    }

@@ -202,12 +198,8 @@ KernelsData kernel_selector_base::GetAutoTuneBestKernel(const Params& params,
                }
            } catch (std::runtime_error& ex) {
                // we have to handle it in order to avoid exception in KernelSelector as much we can
                GPU_DEBUG_GET_INSTANCE(debug_config);
                GPU_DEBUG_IF(debug_config->verbose >= 3) {
                    kernelName = (implementation != nullptr)? implementation->GetName() : "[impl is null]";
                    GPU_DEBUG_COUT << "layerID: " << params.layerID << " kenrel: "
                        << kernelName << " - " << ex.what() << std::endl;
                }
                kernelName = (implementation != nullptr)? implementation->GetName() : "[impl is null]";
                GPU_DEBUG_TRACE << "layerID: " << params.layerID << " kenrel: " << kernelName << " - " << ex.what() << std::endl;
            }
        }
    }
@@ -231,12 +223,8 @@ KernelsData kernel_selector_base::GetAutoTuneBestKernel(const Params& params,
                }
            } catch (std::runtime_error& ex) {
                // we have to handle it in order to avoid exception in KernelSelector as much we can
                GPU_DEBUG_GET_INSTANCE(debug_config);
                GPU_DEBUG_IF(debug_config->verbose >= 3) {
                    kernelName = (implementation != nullptr)? implementation->GetName() : "[impl is null]";
                    GPU_DEBUG_COUT << "layerID: " << params.layerID << " kenrel: "
                        << kernelName << " - " << ex.what() << std::endl;
                }
                kernelName = (implementation != nullptr)? implementation->GetName() : "[impl is null]";
                GPU_DEBUG_TRACE << "layerID: " << params.layerID << " kenrel: " << kernelName << " - " << ex.what() << std::endl;
            }
        }
    }
@@ -493,7 +493,7 @@ void InferRequest::enqueue() {
        // If dump layers path is set, only runs first inference.
        GPU_DEBUG_GET_INSTANCE(debug_config);
        GPU_DEBUG_IF(debug_config->dump_layers_path.length() > 0) {
            GPU_DEBUG_COUT << "Only run first inference to dump layers." << std::endl;
            GPU_DEBUG_INFO << "Only run first inference to dump layers." << std::endl;
            exit(0);
        }
    }
@@ -760,10 +760,7 @@ void InferRequest::allocate_inputs() {
        OPENVINO_ASSERT(litr != inputLayouts.end(), "[GPU] Input layout for ", name, " is not found");
        const auto input_layout = litr->second;

        GPU_DEBUG_GET_INSTANCE(debug_config);
        GPU_DEBUG_IF(debug_config->verbose >= 2) {
            GPU_DEBUG_COUT << "[" << name << ": input blob]" << std::endl;
        }
        GPU_DEBUG_LOG << "[" << name << ": input blob]" << std::endl;
        if (desc.getPrecision() == Precision::I16 || desc.getPrecision() == Precision::U16) {
            TensorDesc desc_fp32 = desc;
            desc_fp32.setPrecision(Precision::FP32);
@@ -796,10 +793,7 @@ void InferRequest::allocate_outputs() {
        if (output_layout.is_static())
            desc.setDims(m_graph->GetOutputSize(no.first));

        GPU_DEBUG_GET_INSTANCE(debug_config);
        GPU_DEBUG_IF(debug_config->verbose >= 2) {
            GPU_DEBUG_COUT << "[" << no.first << ": output blob]" << std::endl;
        }
        GPU_DEBUG_LOG << "[" << no.first << ": output blob]" << std::endl;

        outputsMap[no.first] = outputID;
        if (desc.getPrecision() == Precision::I16 || desc.getPrecision() == Precision::U16 ||
@@ -769,7 +769,7 @@ void InferRequestLegacy::enqueue() {
        // If dump layers path is set, only runs first inference.
        GPU_DEBUG_GET_INSTANCE(debug_config);
        GPU_DEBUG_IF(debug_config->dump_layers_path.length() > 0) {
            GPU_DEBUG_COUT << "Only run first inference to dump layers." << std::endl;
            GPU_DEBUG_INFO << "Only run first inference to dump layers." << std::endl;
            exit(0);
        }
    }
@@ -1031,10 +1031,7 @@ void InferRequestLegacy::allocate_inputs() {
            IE_THROW() << "Input layout for " << name << " is not found";
        }

        GPU_DEBUG_GET_INSTANCE(debug_config);
        GPU_DEBUG_IF(debug_config->verbose >= 2) {
            GPU_DEBUG_COUT << "[" << name << ": input blob]" << std::endl;
        }
        GPU_DEBUG_LOG << "[" << name << ": input blob]" << std::endl;
        if (desc.getPrecision() == Precision::I16 || desc.getPrecision() == Precision::U16) {
            TensorDesc desc_fp32 = desc;
            desc_fp32.setPrecision(Precision::FP32);
@@ -1088,10 +1085,7 @@ void InferRequestLegacy::allocate_outputs() {
        // Can be removed once 76176 is resolved.
        desc.setDims(m_graph->GetOutputSize(no.first));

        GPU_DEBUG_GET_INSTANCE(debug_config);
        GPU_DEBUG_IF(debug_config->verbose >= 2) {
            GPU_DEBUG_COUT << "[" << no.first << ": output blob]" << std::endl;
        }
        GPU_DEBUG_LOG << "[" << no.first << ": output blob]" << std::endl;

        outputsMap[no.first] = outputID;
        if (desc.getPrecision() == Precision::I16 || desc.getPrecision() == Precision::U16 ||
@@ -201,10 +201,7 @@ void createClDnnConstant(Program& p, const ngraph::Shape& constDims, const std::
        p.primitive_ids[initialconstPrimID] = constPrimID;
        p.profiling_ids.push_back(initialconstPrimID);
    } else {
        GPU_DEBUG_GET_INSTANCE(debug_config);
        GPU_DEBUG_IF(debug_config->verbose >= 2) {
            GPU_DEBUG_COUT << "[" << initialconstPrimID << ": constant]" << std::endl;
        }
        GPU_DEBUG_LOG << "[" << initialconstPrimID << ": constant]" << std::endl;
        cldnn::memory::ptr mem = p.GetEngine().allocate_memory(constLayout, false);
        auto& stream = p.GetEngine().get_program_stream();
        cldnn::mem_lock<char> lock{mem, stream};
@@ -73,10 +73,7 @@ static void CreateCommonCTCGreedyDecoderOp(Program& p, const std::shared_ptr<ngr
            cldnn::format::get_default_format(op->get_output_shape(1).size()),
            tensor_from_dims(op->get_output_shape(1)));

        GPU_DEBUG_GET_INSTANCE(debug_config);
        GPU_DEBUG_IF(debug_config->verbose >= 2) {
            GPU_DEBUG_COUT << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl;
        }
        GPU_DEBUG_LOG << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl;
        shared_memory.emplace_back(p.GetEngine().allocate_memory(mutableLayout));

        cldnn::primitive_id ctc_gd_mutable_id_w = layer_type_name_ID(op) + "_md_write";
@@ -93,7 +93,6 @@ static void CreateNonMaxSuppressionIEInternalOp(Program& p, const std::shared_pt
    auto outputIndices = op->get_output_partial_shape(0)[0].get_length();

    std::vector<cldnn::memory::ptr> shared_memory;
    GPU_DEBUG_GET_INSTANCE(debug_config);
    switch (num_outputs) {
        case 3: {
            auto mutable_precision_second = op->get_output_element_type(2);
@@ -105,9 +104,7 @@ static void CreateNonMaxSuppressionIEInternalOp(Program& p, const std::shared_pt
                cldnn::format::get_default_format(op->get_output_shape(2).size()),
                tensor_from_dims(op->get_output_shape(2)));

            GPU_DEBUG_IF(debug_config->verbose >= 2) {
                GPU_DEBUG_COUT << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl;
            }
            GPU_DEBUG_LOG << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl;
            shared_memory.emplace_back(p.GetEngine().allocate_memory(mutableLayoutSecond));

            cldnn::primitive_id non_max_supression_mutable_id_w_second = layer_type_name_ID(op) + "_md_write_second";
@@ -123,9 +120,7 @@ static void CreateNonMaxSuppressionIEInternalOp(Program& p, const std::shared_pt
                cldnn::format::bfyx,
                cldnn::tensor(static_cast<int32_t>(outputIndices), 3, 1, 1));

            GPU_DEBUG_IF(debug_config->verbose >= 2) {
                GPU_DEBUG_COUT << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl;
            }
            GPU_DEBUG_LOG << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl;
            shared_memory.emplace_back(p.GetEngine().allocate_memory(mutableLayoutFirst));

            cldnn::primitive_id non_max_supression_mutable_id_w_first = layer_type_name_ID(op) + "_md_write_first";
|
@ -64,10 +64,7 @@ static void CreateProposalOp(Program& p, const std::shared_ptr<ngraph::op::v0::P
cldnn::format::get_default_format(op->get_output_shape(1).size()),
tensor_from_dims(op->get_output_shape(1)));

GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl;
}
GPU_DEBUG_LOG << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl;
auto shared_memory = p.GetEngine().allocate_memory(mutableLayout);

cldnn::primitive_id proposal_mutable_id_w = layer_type_name_ID(op) + "_md_write";
@ -65,10 +65,7 @@ static void CreateTopKOp(Program& p, const std::shared_ptr<ngraph::op::v1::TopK>
cldnn::format::get_default_format(op->get_output_shape(1).size()),
tensor_from_dims(op->get_output_shape(1)));

GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl;
}
GPU_DEBUG_LOG << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl;
auto shared_memory = p.GetEngine().allocate_memory(mutableLayout);

cldnn::primitive_id argmax_mutable_id_w = layer_type_name_ID(op) + "_md_write";
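Each of the op-creation hunks above collapses the same guarded-print pattern into a single statement. A minimal before/after sketch of that call-site shape, reusing the mutable-data message from these hunks:

// Before: fetch the debug_configuration singleton and check the level by hand
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 2) {
    GPU_DEBUG_COUT << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl;
}
// After: one streaming statement; the macro carries the level check itself
GPU_DEBUG_LOG << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl;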
@ -686,25 +686,20 @@ Parameter Plugin::GetMetric(const std::string& name, const std::map<std::string,
auto closest_pow_of_2 = [] (float x) {
return pow(2, floor(std::log(x)/std::log(2)));
};
GPU_DEBUG_GET_INSTANCE(debug_config);
auto model_param = options.find(ov::hint::model.name());
if (model_param == options.end()) {
GPU_DEBUG_IF(debug_config->verbose >= 1) {
GPU_DEBUG_COUT << "[GPU_OPTIMAL_BATCH_SIZE] ov::hint::model is not set: return 1" << std::endl;
}
GPU_DEBUG_INFO << "[OPTIMAL_BATCH_SIZE] ov::hint::model is not set: return 1" << std::endl;
return decltype(ov::optimal_batch_size)::value_type {static_cast<unsigned int>(1)};
}
std::shared_ptr<ngraph::Function> model;
try {
model = model_param->second.as<std::shared_ptr<ngraph::Function>>();
} catch (...) {
IE_THROW() << "[GPU_OPTIMAL_BATCH_SIZE] ov::hint::model should be std::shared_ptr<ov::Model> type";
}
GPU_DEBUG_IF(debug_config->verbose >= 1) {
GPU_DEBUG_COUT << "DEVICE_INFO:"
<< "gfx_version.major, " << device_info.gfx_ver.major
<< "gfx_version.minor " << std::to_string(device_info.gfx_ver.minor) << std::endl;
IE_THROW() << "[OPTIMAL_BATCH_SIZE] ov::hint::model should be std::shared_ptr<ov::Model> type";
}
GPU_DEBUG_INFO << "DEVICE_INFO:"
<< "gfx_version.major, " << device_info.gfx_ver.major
<< "gfx_version.minor " << std::to_string(device_info.gfx_ver.minor) << std::endl;
static std::map<cldnn::gfx_version, size_t> gen_kbytes_per_bank = {
{{12, 0, 0}, 480}, // TGL
{{12, 1, 0}, 2048}, // DG1
@ -722,14 +717,12 @@ Parameter Plugin::GetMetric(const std::string& name, const std::map<std::string,
? next_pow_of_2(device_info.num_sub_slices_per_slice)
: 2 * device_info.num_sub_slices_per_slice;
L3_cache_size = kbytes_per_bank * 1024 * num_banks_per_slice * device_info.num_slices;
GPU_DEBUG_IF(debug_config->verbose >= 1) {
GPU_DEBUG_COUT << "DEVICE_INFO:"
<< "num_slices " << device_info.num_slices
<< ", num_sub_slices_per_slice " << device_info.num_sub_slices_per_slice
<< ", num_banks_per_slice " << num_banks_per_slice
<< ", gen_kbytes_per_bank : " << kbytes_per_bank
<< ", L3_cache_size is (MB): " << float(L3_cache_size) / 1024 / 1024 << std::endl;
}
GPU_DEBUG_INFO << "DEVICE_INFO:"
<< "num_slices " << device_info.num_slices
<< ", num_sub_slices_per_slice " << device_info.num_sub_slices_per_slice
<< ", num_banks_per_slice " << num_banks_per_slice
<< ", gen_kbytes_per_bank : " << kbytes_per_bank
<< ", L3_cache_size is (MB): " << float(L3_cache_size) / 1024 / 1024 << std::endl;
}
Config config = _impl->m_configs.GetConfig(device_id);
auto networkCloned = CloneAndTransformNetwork(CNNNetwork(model), config);
@ -744,11 +737,9 @@ Parameter Plugin::GetMetric(const std::string& name, const std::map<std::string,
unsigned int closest = closest_pow_of_2(max_batch_size);
batch = std::min(closest, batch);
batch = std::min(256u, batch); //batch 256 is a max
GPU_DEBUG_IF(debug_config->verbose >= 1) {
GPU_DEBUG_COUT << memPressure.max_mem_tolerance << std::endl;
GPU_DEBUG_COUT << "MAX_BATCH: " << max_batch_size << std::endl;
GPU_DEBUG_COUT << "ACTUAL OPTIMAL BATCH: " << batch << std::endl;
}
GPU_DEBUG_INFO << memPressure.max_mem_tolerance << std::endl;
GPU_DEBUG_INFO << "MAX_BATCH: " << max_batch_size << std::endl;
GPU_DEBUG_INFO << "ACTUAL OPTIMAL BATCH: " << batch << std::endl;
return decltype(ov::optimal_batch_size)::value_type {batch};
} else if (name == ov::device::uuid) {
ov::device::UUID uuid = {};
@ -815,17 +806,13 @@ Parameter Plugin::GetMetric(const std::string& name, const std::map<std::string,
}

int64_t available_device_mem = device_info.max_global_mem_size - occupied_device_mem;
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[GPU_MAX_BATCH_SIZE] available memory is " << available_device_mem
<< " (occupied: " << occupied_device_mem << ")" << std::endl;
}
GPU_DEBUG_LOG << "[GPU_MAX_BATCH_SIZE] available memory is " << available_device_mem
<< " (occupied: " << occupied_device_mem << ")" << std::endl;

int64_t max_batch_size = 1;

if (options.find(ov::hint::model.name()) == options.end()) {
GPU_DEBUG_IF(debug_config->verbose >= 1) {
GPU_DEBUG_COUT << "[GPU_MAX_BATCH_SIZE] MODELS_PTR is not set: return 1" << std::endl;
}
GPU_DEBUG_INFO << "[GPU_MAX_BATCH_SIZE] MODELS_PTR is not set: return 1" << std::endl;
return decltype(ov::max_batch_size)::value_type {static_cast<uint32_t>(max_batch_size)};
}

@ -849,17 +836,13 @@ Parameter Plugin::GetMetric(const std::string& name, const std::map<std::string,
}
}

GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[GPU_MAX_BATCH_SIZE] n_streams : " << n_streams << std::endl;
}
GPU_DEBUG_INFO << "[GPU_MAX_BATCH_SIZE] n_streams : " << n_streams << std::endl;

auto available_device_mem_it = options.find(ov::intel_gpu::hint::available_device_mem.name());
if (available_device_mem_it != options.end()) {
if (available_device_mem_it->second.is<int64_t>()) {
available_device_mem = std::min(static_cast<int64_t>(available_device_mem), available_device_mem_it->second.as<int64_t>());
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[GPU_MAX_BATCH_SIZE] available memory is reset by user " << available_device_mem << std::endl;
}
GPU_DEBUG_LOG << "[GPU_MAX_BATCH_SIZE] available memory is reset by user " << available_device_mem << std::endl;
} else {
IE_THROW() << "[GPU_MAX_BATCH_SIZE] bad casting: ov::intel_gpu::hint::available_device_mem should be int64_t type";
}
@ -910,9 +893,7 @@ Parameter Plugin::GetMetric(const std::string& name, const std::map<std::string,
const auto& shape = input->get_partial_shape();
// currently no plugin support batched execution for dynamic networks
if (shape.is_dynamic()) {
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[MAX_BATCH_SIZE] does not support dynamic networks" << std::endl;
}
GPU_DEBUG_LOG << "[MAX_BATCH_SIZE] does not support dynamic networks" << std::endl;
return decltype(ov::max_batch_size)::value_type {static_cast<uint32_t>(max_batch_size)};
}

@ -921,10 +902,8 @@ Parameter Plugin::GetMetric(const std::string& name, const std::map<std::string,
if (ov::DimensionTracker::get_label(shape[s])) {
// batched dim for the input
auto batched_input_id = ngraph::op::util::get_ie_output_name(params[input_id]->output(0));
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[MAX_BATCH_SIZE] detected batched input " << batched_input_id
<< "[" << s << "]" << std::endl;
}
GPU_DEBUG_LOG << "[MAX_BATCH_SIZE] detected batched input " << batched_input_id
<< "[" << s << "]" << std::endl;
batched_inputs.insert(std::make_pair(batched_input_id, s));
}
}
@ -932,9 +911,7 @@ Parameter Plugin::GetMetric(const std::string& name, const std::map<std::string,
}

if (!batched_inputs.size()) {
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[MAX_BATCH_SIZE] MAX_BATCH_SIZE supports only networks with inputs/outputs featuring batched dim." << std::endl;
}
GPU_DEBUG_LOG << "[MAX_BATCH_SIZE] MAX_BATCH_SIZE supports only networks with inputs/outputs featuring batched dim." << std::endl;
return decltype(ov::max_batch_size)::value_type {static_cast<uint32_t>(max_batch_size)};
}

@ -944,9 +921,7 @@ Parameter Plugin::GetMetric(const std::string& name, const std::map<std::string,
shapes[input.first][input.second] = base_batch_size;
cloned_network.reshape(shapes);
} catch (...) {
GPU_DEBUG_IF(debug_config->verbose >= 1) {
GPU_DEBUG_COUT << "[MAX_BATCH_SIZE] Error at reshape to " << base_batch_size << std::endl;
}
GPU_DEBUG_INFO << "[MAX_BATCH_SIZE] Error at reshape to " << base_batch_size << std::endl;
return decltype(ov::max_batch_size)::value_type {static_cast<uint32_t>(max_batch_size)};
}

@ -962,15 +937,11 @@ Parameter Plugin::GetMetric(const std::string& name, const std::map<std::string,
static_cast<int64_t>(static_cast<int64_t>(available_device_mem) - device_memory_usage.first));
int64_t mem_per_batch = std::max(static_cast<int64_t>(1L), (device_memory_usage.second / static_cast<int64_t>(base_batch_size)));
max_batch_size = mem_for_general / (mem_per_batch * static_cast<int64_t>(n_streams));
GPU_DEBUG_IF(debug_config->verbose >= 1) {
GPU_DEBUG_COUT << "[GPU_MAX_BATCH_SIZE] Base batch size: " << base_batch_size << std::endl;
GPU_DEBUG_COUT << "[GPU_MAX_BATCH_SIZE] Const mem usage: " << device_memory_usage.first << std::endl;
GPU_DEBUG_COUT << "[GPU_MAX_BATCH_SIZE] General mem usage: " << device_memory_usage.second << std::endl;
}
GPU_DEBUG_INFO << "[GPU_MAX_BATCH_SIZE] Base batch size: " << base_batch_size << std::endl;
GPU_DEBUG_INFO << "[GPU_MAX_BATCH_SIZE] Const mem usage: " << device_memory_usage.first << std::endl;
GPU_DEBUG_INFO << "[GPU_MAX_BATCH_SIZE] General mem usage: " << device_memory_usage.second << std::endl;
} catch (std::exception& e) {
GPU_DEBUG_IF(debug_config->verbose >= 1) {
GPU_DEBUG_COUT << "[GPU_MAX_BATCH_SIZE] Failed in reshape or build program " << e.what() << std::endl;
}
GPU_DEBUG_INFO << "[GPU_MAX_BATCH_SIZE] Failed in reshape or build program " << e.what() << std::endl;
}
return decltype(ov::max_batch_size)::value_type {static_cast<uint32_t>(max_batch_size)};
} else if (isModelCachingEnabled && name == METRIC_KEY(IMPORT_EXPORT_SUPPORT)) {
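The plugin-metric hunks apply the same simplification to the level-1 messages. A minimal sketch, reusing the batch message from the hunks above:

// Before
GPU_DEBUG_IF(debug_config->verbose >= 1) {
    GPU_DEBUG_COUT << "ACTUAL OPTIMAL BATCH: " << batch << std::endl;
}
// After: GPU_DEBUG_INFO performs the verbosity check internally
GPU_DEBUG_INFO << "ACTUAL OPTIMAL BATCH: " << batch << std::endl;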
@ -375,11 +375,8 @@ bool Program::IsOpSupported(const InferenceEngine::CNNNetwork& network, const st

void Program::CreateSingleLayerPrimitive(cldnn::topology& topology, const std::shared_ptr<ngraph::Node>& op) {
OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "Program::CreateSingleLayerPrimitive");
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "Process " << "op::v" << op->get_type_info().version << "::" << op->get_type_name() << " operation "
<< "(friendly_name=" << op->get_friendly_name() << ")" << std::endl;
}
GPU_DEBUG_LOG << "Process " << "op::v" << op->get_type_info().version << "::" << op->get_type_name() << " operation "
<< "(friendly_name=" << op->get_friendly_name() << ")" << std::endl;

bool is_created = false;
const ngraph::NodeTypeInfo* op_type_info = &op->get_type_info();

@ -17,6 +17,8 @@ const char *debug_configuration::prefix = "GPU_Debug: ";

#ifdef GPU_DEBUG_CONFIG

#define GPU_DEBUG_COUT std::cout << cldnn::debug_configuration::prefix

template<typename T>
void print_option(std::string option_name, T option_value) {
GPU_DEBUG_COUT << "Config " << option_name << " = " << option_value << std::endl;
@ -265,11 +265,8 @@ std::shared_ptr<cldnn::engine> engine::create(engine_types engine_type,
default:
throw std::runtime_error("Invalid engine type");
}
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 1) {
const auto& info = device->get_info();
GPU_DEBUG_COUT << "Selected Device: " << info.dev_name << std::endl;
}
const auto& info = device->get_info();
GPU_DEBUG_INFO << "Selected Device: " << info.dev_name << std::endl;
return ret;
}

@ -299,12 +299,9 @@ void kernels_cache::build_batch(const engine& build_engine, const batch_program&
dump_file << "*/\n";
}
if (!err_log.empty()) {
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose) {
std::cout << "-------- OpenCL build error" << std::endl;
std::cout << err_log << std::endl;
std::cout << "-------- End of OpenCL build error" << std::endl;
}
GPU_DEBUG_INFO << "-------- OpenCL build error" << std::endl;
GPU_DEBUG_INFO << err_log << std::endl;
GPU_DEBUG_INFO << "-------- End of OpenCL build error" << std::endl;
std::stringstream err_ss(err_log);
std::string line;
int cnt = 0;
@ -21,12 +21,9 @@ memory::memory(engine* engine, const layout& layout, allocation_type type, bool
: _engine(engine), _layout(layout), _bytes_count(_layout.bytes_count()), _type(type), _reused(reused) {
if (!_reused && _engine) {
_engine->add_memory_used(_bytes_count, type);
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 1) {
GPU_DEBUG_COUT << "Allocate " << _bytes_count << " bytes of " << type << " allocation type"
<< " (current=" << _engine->get_used_device_memory(type) << ";"
<< " max=" << _engine->get_max_used_device_memory(type) << ")" << std::endl;
}
GPU_DEBUG_LOG << "Allocate " << _bytes_count << " bytes of " << type << " allocation type"
<< " (current=" << _engine->get_used_device_memory(type) << ";"
<< " max=" << _engine->get_max_used_device_memory(type) << ")" << std::endl;
}
}

@ -35,12 +32,9 @@ memory::~memory() {
try {
_engine->subtract_memory_used(_bytes_count, _type);
} catch (...) {}
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 1) {
GPU_DEBUG_COUT << "Free " << _bytes_count << " bytes of " << _type << " allocation type"
<< " (current=" << _engine->get_used_device_memory(_type) << ";"
<< " max=" << _engine->get_max_used_device_memory(_type) << ")" << std::endl;
}
GPU_DEBUG_LOG << "Free " << _bytes_count << " bytes of " << _type << " allocation type"
<< " (current=" << _engine->get_used_device_memory(_type) << ";"
<< " max=" << _engine->get_max_used_device_memory(_type) << ")" << std::endl;
}
}

@ -137,10 +137,7 @@ memory::ptr memory_pool::get_from_non_padded_pool(const layout& layout,
++it;
}
}
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[" << id << ": output]" << std::endl;
}
GPU_DEBUG_LOG << "[" << id << ": output]" << std::endl;
// didn't find anything for you? create new resource
auto mem = alloc_memory(layout, type);
{
@ -179,10 +176,7 @@ memory::ptr memory_pool::get_from_padded_pool(const layout& layout,
memory_record({{id, network_id}}, mem, network_id, type));
return mem;
}
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[" << id << ": output]" << std::endl;
}
GPU_DEBUG_LOG << "[" << id << ": output]" << std::endl;
auto mem = alloc_memory(layout, type);
std::list<memory_record> list = {memory_record({{id, network_id}}, mem, network_id, type)};
_padded_pool.emplace(layout, std::move(list));
@ -76,10 +76,7 @@ std::vector<cl_queue_properties> command_queues_builder::get_properties(const cl
using cmp_t = std::common_type<decltype(queue_properties), typename std::underlying_type<cl::QueueProperties>::type>::type;
if (!(static_cast<cmp_t>(queue_properties) & static_cast<cmp_t>(cl::QueueProperties::OutOfOrder))) {
out_of_order = false;
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 1) {
GPU_DEBUG_COUT << "Requested out-of-order queue is not supported by current device. Use in-order instead";
}
GPU_DEBUG_INFO << "Requested out-of-order queue is not supported by current device. Use in-order instead\n";
}
}

@ -180,10 +180,10 @@ bool ocl_events::get_profiling_info_impl(std::list<instrumentation::profiling_in
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->print_multi_kernel_perf) {
if (period.stage == instrumentation::profiling_stage::executing) {
GPU_DEBUG_COUT << "Multi-kernel time: ";
GPU_DEBUG_TRACE << "Multi-kernel time: ";
for (auto& duration : all_durations[period.stage])
std::cout << " " << (duration.second - duration.first) / 1000;
std::cout << " Total " << sum / 1000 << std::endl;
GPU_DEBUG_TRACE << " " << (duration.second - duration.first) / 1000;
GPU_DEBUG_TRACE << " Total " << sum / 1000 << std::endl;
}
}

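GPU_DEBUG_IF is still used where an entire block is guarded by a dedicated flag rather than by a verbosity level, with the individual messages now routed through the leveled macros. A short sketch of that shape (the per-kernel loop is elided):

GPU_DEBUG_IF(debug_config->print_multi_kernel_perf) {
    GPU_DEBUG_TRACE << "Multi-kernel time: ";
    // ... per-kernel durations streamed via GPU_DEBUG_TRACE ...
    GPU_DEBUG_TRACE << " Total " << sum / 1000 << std::endl;
}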
@ -338,10 +338,7 @@ void* gpu_usm::lock(const stream& stream, mem_lock_type type) {
if (type != mem_lock_type::read) {
throw std::runtime_error("Unable to lock allocation_type::usm_device with write lock_type.");
}
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "Copy usm_device buffer to host buffer." << std::endl;
}
GPU_DEBUG_LOG << "Copy usm_device buffer to host buffer." << std::endl;
_host_buffer.allocateHost(_bytes_count);
cl_stream.get_usm_helper().enqueue_memcpy(cl_stream.get_cl_queue(), _host_buffer.get(), _buffer.get(), _bytes_count, CL_TRUE);
_mapped_ptr = _host_buffer.get();

@ -6,6 +6,7 @@
#include "ocl_event.hpp"
#include "ocl_user_event.hpp"
#include "ocl_command_queues_builder.hpp"
#include "intel_gpu/runtime/debug_configuration.hpp"
#include "ocl_kernel.hpp"
#include "ocl_common.hpp"

@ -49,6 +50,27 @@ inline cl::NDRange toNDRange(const std::vector<size_t>& v) {
}
}

cl_int set_kernel_arg(ocl_kernel_type& kernel, uint32_t idx, cldnn::memory::cptr mem) {
if (!mem)
return CL_INVALID_ARG_VALUE;

if (mem->get_layout().format.is_image_2d()) {
auto buf = std::dynamic_pointer_cast<const ocl::gpu_image2d>(mem)->get_buffer();
GPU_DEBUG_TRACE_DETAIL << "kernel: " << kernel.get() << " set arg (image) " << idx << " mem: " << buf.get() << " size: " << mem->size() << std::endl;
return kernel.setArg(idx, buf);
} else if (memory_capabilities::is_usm_type(mem->get_allocation_type())) {
auto buf = std::dynamic_pointer_cast<const ocl::gpu_usm>(mem)->get_buffer();
GPU_DEBUG_TRACE_DETAIL << "kernel: " << kernel.get() << " set arg (usm) " << idx << " mem: " << buf.get() << " size: " << mem->size() << std::endl;
return kernel.setArgUsm(idx, buf);
} else {
auto buf = std::dynamic_pointer_cast<const ocl::gpu_buffer>(mem)->get_buffer();
GPU_DEBUG_TRACE_DETAIL << "kernel: " << kernel.get() << " set arg (buffer) " << idx << " mem: " << buf.get() << " size: " << mem->size() << std::endl;
return kernel.setArg(idx, buf);
}

return CL_INVALID_ARG_VALUE;
}

void set_arguments_impl(ocl_kernel_type& kernel,
const arguments_desc& args,
const kernel_arguments_data& data) {
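The new set_kernel_arg helper centralizes the image/USM/buffer dispatch and the per-argument TRACE_DETAIL logging. A minimal caller sketch (my_kernel, arg_idx and mem are illustrative names, not from this change):

cl_int status = set_kernel_arg(my_kernel, arg_idx, mem);  // returns CL_INVALID_ARG_VALUE for a null memory pointer
if (status != CL_SUCCESS) {
    // report or handle the failed argument binding
}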
@ -59,121 +81,44 @@ void set_arguments_impl(ocl_kernel_type& kernel,
switch (args[i].t) {
case args_t::INPUT:
if (args[i].index < data.inputs.size() && data.inputs[args[i].index]) {
const auto& input_mem = data.inputs[args[i].index];
if (input_mem) {
if (input_mem->get_layout().format.is_image_2d())
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_image2d>(input_mem)->get_buffer());
else if (memory_capabilities::is_usm_type(input_mem->get_allocation_type()))
status = kernel.setArgUsm(i, std::dynamic_pointer_cast<const ocl::gpu_usm>(input_mem)->get_buffer());
else
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_buffer>(input_mem)->get_buffer());
}
status = set_kernel_arg(kernel, i, data.inputs[args[i].index]);
}
break;
case args_t::INPUT_OF_FUSED_PRIMITIVE:
if (args[i].index < data.fused_op_inputs.size() && data.fused_op_inputs[args[i].index]) {
const auto& input_mem = data.fused_op_inputs[args[i].index];
if (input_mem) {
if (memory_capabilities::is_usm_type(input_mem->get_allocation_type()))
status = kernel.setArgUsm(i, std::dynamic_pointer_cast<const ocl::gpu_usm>(input_mem)->get_buffer());
else
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_buffer>(input_mem)->get_buffer());
}
status = set_kernel_arg(kernel, i, data.fused_op_inputs[args[i].index]);
}
break;
case args_t::INTERNAL_BUFFER:
if (args[i].index < data.intermediates.size() && data.intermediates[args[i].index]) {
const auto& input_mem = data.intermediates[args[i].index];
if (input_mem) {
if (memory_capabilities::is_usm_type(input_mem->get_allocation_type()))
status = kernel.setArgUsm(i, std::dynamic_pointer_cast<const ocl::gpu_usm>(input_mem)->get_buffer());
else
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_buffer>(input_mem)->get_buffer());
}
status = set_kernel_arg(kernel, i, data.intermediates[args[i].index]);
}
break;
case args_t::OUTPUT:
if (args[i].index < data.outputs.size() && data.outputs[args[i].index]) {
const auto& output_mem = data.outputs[args[i].index];
if (output_mem) {
if (output_mem->get_layout().format.is_image_2d())
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_image2d>(output_mem)->get_buffer());
else if (memory_capabilities::is_usm_type(output_mem->get_allocation_type()))
status = kernel.setArgUsm(i, std::dynamic_pointer_cast<const ocl::gpu_usm>(output_mem)->get_buffer());
else
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_buffer>(output_mem)->get_buffer());
}
status = set_kernel_arg(kernel, i, data.outputs[args[i].index]);
}
break;
case args_t::WEIGHTS:
if (data.weights) {
if (data.weights->get_layout().format.is_image_2d())
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_image2d>(data.weights)->get_buffer());
else if (memory_capabilities::is_usm_type(data.weights->get_allocation_type()))
status = kernel.setArgUsm(i, std::dynamic_pointer_cast<const ocl::gpu_usm>(data.weights)->get_buffer());
else
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_buffer>(data.weights)->get_buffer());
}
status = set_kernel_arg(kernel, i, data.weights);
break;
case args_t::BIAS:
if (data.bias) {
if (memory_capabilities::is_usm_type(data.bias->get_allocation_type()))
status = kernel.setArgUsm(i, std::dynamic_pointer_cast<const ocl::gpu_usm>(data.bias)->get_buffer());
else
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_buffer>(data.bias)->get_buffer());
}
status = set_kernel_arg(kernel, i, data.bias);
break;
case args_t::WEIGHTS_ZERO_POINTS:
if (data.weights_zero_points) {
if (memory_capabilities::is_usm_type(data.weights_zero_points->get_allocation_type()))
status = kernel.setArgUsm(
i,
std::dynamic_pointer_cast<const ocl::gpu_usm>(data.weights_zero_points)->get_buffer());
else
status = kernel.setArg(
i,
std::dynamic_pointer_cast<const ocl::gpu_buffer>(data.weights_zero_points)->get_buffer());
}
status = set_kernel_arg(kernel, i, data.weights_zero_points);
break;
case args_t::ACTIVATIONS_ZERO_POINTS:
if (data.activations_zero_points) {
if (memory_capabilities::is_usm_type(data.activations_zero_points->get_allocation_type()))
status = kernel.setArgUsm(
i,
std::dynamic_pointer_cast<const ocl::gpu_usm>(data.activations_zero_points)->get_buffer());
else
status = kernel.setArg(
i,
std::dynamic_pointer_cast<const ocl::gpu_buffer>(data.activations_zero_points)->get_buffer());
}
status = set_kernel_arg(kernel, i, data.activations_zero_points);
break;
case args_t::COMPENSATION:
if (data.compensation) {
if (memory_capabilities::is_usm_type(data.compensation->get_allocation_type()))
status = kernel.setArgUsm(
i,
std::dynamic_pointer_cast<const ocl::gpu_usm>(data.compensation)->get_buffer());
else
status = kernel.setArg(
i,
std::dynamic_pointer_cast<const ocl::gpu_buffer>(data.compensation)->get_buffer());
}
status = set_kernel_arg(kernel, i, data.compensation);
break;
case args_t::SCALE_TABLE:
if (data.scale_table) {
if (memory_capabilities::is_usm_type(data.scale_table->get_allocation_type()))
status = kernel.setArgUsm(i, std::dynamic_pointer_cast<const ocl::gpu_usm>(data.scale_table)->get_buffer());
else
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_buffer>(data.scale_table)->get_buffer());
}
status = set_kernel_arg(kernel, i, data.scale_table);
break;
case args_t::SLOPE:
if (data.slope) {
if (memory_capabilities::is_usm_type(data.slope->get_allocation_type()))
status = kernel.setArgUsm(i, std::dynamic_pointer_cast<const ocl::gpu_usm>(data.slope)->get_buffer());
else
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_buffer>(data.slope)->get_buffer());
}
status = set_kernel_arg(kernel, i, data.slope);
break;
case args_t::SPLIT:
status = kernel.setArg(i, data.split);
@ -217,48 +162,17 @@ void set_arguments_impl(ocl_kernel_type& kernel,
}
}
break;
case args_t::RECURRENT: // RNN/LSTM/GRU layers
if (data.recurrent) {
if (data.recurrent->get_layout().format.is_image_2d())
status = kernel.setArg(i, dynamic_cast<const ocl::gpu_image2d&>(*data.recurrent).get_buffer());
else if (memory_capabilities::is_usm_type(data.recurrent->get_allocation_type()))
status = kernel.setArgUsm(i, dynamic_cast<const ocl::gpu_usm&>(*data.recurrent).get_buffer());
else
status = kernel.setArg(i, dynamic_cast<const ocl::gpu_buffer&>(*data.recurrent).get_buffer());
}
case args_t::RECURRENT:
status = set_kernel_arg(kernel, i, data.recurrent);
break;
case args_t::HIDDEN: // RNN/LSTM/GRU layers
if (data.hidden) {
if (data.hidden->get_layout().format.is_image_2d())
status = kernel.setArg(i, dynamic_cast<const ocl::gpu_image2d&>(*data.hidden).get_buffer());
else if (memory_capabilities::is_usm_type(data.hidden->get_allocation_type()))
status = kernel.setArgUsm(i, dynamic_cast<const ocl::gpu_usm&>(*data.hidden).get_buffer());
else
status = kernel.setArg(i, dynamic_cast<const ocl::gpu_buffer&>(*data.hidden).get_buffer());
}
case args_t::HIDDEN:
status = set_kernel_arg(kernel, i, data.hidden);
break;
case args_t::CELL: // LSTMlayers
if (data.cell) {
if (data.cell->get_layout().format.is_image_2d())
status = kernel.setArg(i, dynamic_cast<const ocl::gpu_image2d&>(*data.cell).get_buffer());
else if (memory_capabilities::is_usm_type(data.cell->get_allocation_type()))
status = kernel.setArgUsm(i, dynamic_cast<const ocl::gpu_usm&>(*data.cell).get_buffer());
else
status = kernel.setArg(i, dynamic_cast<const ocl::gpu_buffer&>(*data.cell).get_buffer());
}
case args_t::CELL:
status = set_kernel_arg(kernel, i, data.cell);
break;
case args_t::SHAPE_INFO:
if (args[i].index == 0 && data.shape_info) {
const auto& shape_info_mem = data.shape_info;
if (shape_info_mem) {
if (shape_info_mem->get_layout().format.is_image_2d())
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_image2d>(shape_info_mem)->get_buffer());
else if (memory_capabilities::is_usm_type(shape_info_mem->get_allocation_type()))
status = kernel.setArgUsm(i, std::dynamic_pointer_cast<const ocl::gpu_usm>(shape_info_mem)->get_buffer());
else
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_buffer>(shape_info_mem)->get_buffer());
}
}
status = set_kernel_arg(kernel, i, data.shape_info);
break;
default:
break;
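With the per-type dispatch folded into set_kernel_arg, adding another memory-backed argument kind reduces to a single case. A sketch with hypothetical names (args_t::MY_ARG and data.my_arg are not part of this change):

case args_t::MY_ARG:
    status = set_kernel_arg(kernel, i, data.my_arg);
    break;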
@ -359,6 +273,7 @@ void ocl_stream::set_arguments(kernel& kernel, const kernel_arguments_desc& args
auto& kern = ocl_kernel.get_handle();

try {
GPU_DEBUG_TRACE_DETAIL << "Set arguments for primitive: " << args_desc.layerID << " (" << kern.get() << ")\n";
set_arguments_impl(kern, args_desc.arguments, args);
} catch (cl::Error const& err) {
throw ocl_error(err);