diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp index 337cb04c420..91806b6ff89 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp @@ -7,22 +7,63 @@ #include #include +namespace ov { +namespace intel_gpu { + +// Verbose log levels: +// DISABLED - silent mode (Default) +// INFO - Minimal verbose: +// * May log basic info about device, plugin configuration, model and execution statistics +// * Mustn't log any info that depends on the number of iterations or the number of layers in the model +// * Minimal impact on both load time and inference time +// LOG - Enables graph optimization verbose: +// * Includes info from log level INFO +// * May log info about applied graph transformations, memory allocations and other model compilation time steps +// * May impact compile_model() execution time +// * Minimal impact on inference time +// TRACE - Enables basic execution time verbose +// * Includes info from log level LOG +// * May log info during model execution +// * May log short info about primitive execution +// * May impact network execution time +// TRACE_DETAIL - Max verbosity +// * Includes info from log level TRACE +// * May log any stage and print detailed info about each execution step +enum class LogLevel : int8_t { + DISABLED = 0, + INFO = 1, + LOG = 2, + TRACE = 3, + TRACE_DETAIL = 4 +}; + +} // namespace intel_gpu +} // namespace ov + #ifdef GPU_DEBUG_CONFIG #define GPU_DEBUG_IF(cond) if (cond) #define GPU_DEBUG_PROFILED_STAGE(stage) \ auto stage_prof = cldnn::instrumentation::profiled_stage(\ !cldnn::debug_configuration::get_instance()->dump_profiling_data.empty(), *this, stage) #define GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(val) stage_prof.set_cache_hit(val) + +#define GPU_DEBUG_LOG_RAW_INT(min_verbose_level) if (cldnn::debug_configuration::get_instance()->verbose >= min_verbose_level) \ + std::cout << cldnn::debug_configuration::prefix +#define GPU_DEBUG_LOG_RAW(min_verbose_level) GPU_DEBUG_LOG_RAW_INT(static_cast<std::underlying_type<ov::intel_gpu::LogLevel>::type>(min_verbose_level)) #else #define GPU_DEBUG_IF(cond) if (0) #define GPU_DEBUG_PROFILED_STAGE(stage) #define GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(val) +#define GPU_DEBUG_LOG_RAW(min_verbose_level) if (0) std::cout << cldnn::debug_configuration::prefix #endif -#define GPU_DEBUG_COUT std::cout << cldnn::debug_configuration::prefix // Macro below is inserted to avoid unused variable warning when GPU_DEBUG_CONFIG is OFF #define GPU_DEBUG_GET_INSTANCE(name) auto name = cldnn::debug_configuration::get_instance(); (void)(name); +#define GPU_DEBUG_INFO GPU_DEBUG_LOG_RAW(ov::intel_gpu::LogLevel::INFO) +#define GPU_DEBUG_LOG GPU_DEBUG_LOG_RAW(ov::intel_gpu::LogLevel::LOG) +#define GPU_DEBUG_TRACE GPU_DEBUG_LOG_RAW(ov::intel_gpu::LogLevel::TRACE) +#define GPU_DEBUG_TRACE_DETAIL GPU_DEBUG_LOG_RAW(ov::intel_gpu::LogLevel::TRACE_DETAIL) namespace cldnn { diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp index 97cc6b9d772..c0209b1a3ce 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp @@ -22,10 +22,8 @@ using namespace cldnn; -#define LOG_NODE_REMOVAL(id) GPU_DEBUG_IF(debug_config->verbose >= 2) { \ - GPU_DEBUG_COUT << 
"[remove_redundant_reorders:" << __LINE__ << "] " << "Remove node: " << (id) << std::endl; } -#define LOG_NODE_REPLACEMENT(id) GPU_DEBUG_IF(debug_config->verbose >= 2) { \ - GPU_DEBUG_COUT << "[remove_redundant_reorders:" << __LINE__ << "] " << "Replace node: " << (id) << std::endl; } +#define LOG_NODE_REMOVAL(id) GPU_DEBUG_LOG_PASS << "Remove node: " << (id) << std::endl; +#define LOG_NODE_REPLACEMENT(id) GPU_DEBUG_LOG_PASS << "Replace node: " << (id) << std::endl; remove_redundant_reorders::remove_redundant_reorders(layout_optimizer& lo_ref, bool enable_reorder_fusing, bool update_implementations, @@ -34,7 +32,6 @@ remove_redundant_reorders::remove_redundant_reorders(layout_optimizer& lo_ref, b remove_output_reorders(remove_output_reorders) {} void remove_redundant_reorders::run(program& p) { - GPU_DEBUG_GET_INSTANCE(debug_config); auto update_implementation = [&](program_node& node) { if (!update_implementations) return; diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp index 794abdfd072..63673c844dd 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp @@ -34,8 +34,6 @@ void reorder_inputs::run(program& p) { run(p, _lo, _rf); } namespace { std::map get_preferred_formats(program& p, layout_optimizer& lo) { - GPU_DEBUG_GET_INSTANCE(debug_config); - std::map fmt_map; #ifdef ENABLE_ONEDNN_FOR_GPU @@ -59,9 +57,7 @@ std::map get_preferred_formats(program& p, layout_o if (onednn_impls_counter < 1 && lo.get_optimization_attributes().use_onednn_impls) { should_update_fmt_map = true; lo.set_optimization_attribute(layout_optimizer::optimization_attributes_type::use_onednn_impls, 0); - GPU_DEBUG_IF(debug_config->verbose >= 1) { - GPU_DEBUG_COUT << "The return to clDNN implementations" << std::endl; - } + GPU_DEBUG_LOG << "Disable oneDNN implementations globally" << std::endl; } if (should_update_fmt_map) @@ -277,10 +273,7 @@ void propagate_formats_rec(std::map& fmt_map, fmt = travel_direction_wrapper::first(first_fmt, second_fmt); fmt_map.at(node) = fmt; - GPU_DEBUG_GET_INSTANCE(debug_config); - GPU_DEBUG_IF(debug_config->verbose >= 2) { - GPU_DEBUG_COUT << "[clDNN][reorder_inputs] propagate_formats_rec: " << node->id() << " - " << fmt_to_str(fmt) << std::endl; - } + GPU_DEBUG_LOG << "Propagate_formats_rec: " << node->id() << " - " << fmt_to_str(fmt) << std::endl; for (auto next : travel_direction_wrapper::next_nodes(node)) { if (!next->is_in_data_flow()) @@ -312,10 +305,7 @@ void propagate_formats_rec(std::map::first(first_fmt, second_fmt); fmt_map.at(node) = fmt; - GPU_DEBUG_GET_INSTANCE(debug_config); - GPU_DEBUG_IF(debug_config->verbose >= 2) { - GPU_DEBUG_COUT << "[clDNN][reorder_inputs] propagate_formats_rec: " << node->id() << " - " << fmt_to_str(fmt) << std::endl; - } + GPU_DEBUG_LOG << "Propagate_formats_rec: " << node->id() << " - " << fmt_to_str(fmt) << std::endl; for (auto next : travel_direction_wrapper::next_nodes(node)) { if (!next.first->is_in_data_flow()) @@ -541,7 +531,6 @@ const char *dir_msg(direction_e dir) { // If there is layout mismatch between two layers, add reorder template void insert_reorders_in_dir(program& p, const std::map& fmt_map, reorder_factory& rf, layout_optimizer& lo, program_node* node) { - GPU_DEBUG_GET_INSTANCE(debug_config); auto fmt = fmt_map.at(node); auto next_cpy = travel_direction_wrapper::next_nodes(node); @@ -565,10 +554,8 @@ void 
insert_reorders_in_dir(program& p, const std::mapverbose >= 2) { - GPU_DEBUG_COUT << __func__ << ":" << __LINE__ << ":" << dir_msg(dir) << " " << node->id() << " --> " << next->id() << " ## " - << fmt_to_str(in_layout.format) << " --> " << fmt_to_str(out_layout.format) << std::endl; - } + GPU_DEBUG_LOG << __func__ << ":" << __LINE__ << ":" << dir_msg(dir) << " " << node->id() << " --> " << next->id() << " ## " + << fmt_to_str(in_layout.format) << " --> " << fmt_to_str(out_layout.format) << std::endl; if (in_layout.format == format::any || out_layout.format == format::any) continue; @@ -580,10 +567,8 @@ void insert_reorders_in_dir(program& p, const std::mapverbose >= 2) { - GPU_DEBUG_COUT << __func__ << ":" << __LINE__ << ":" << dir_msg(dir) << " " << reorder_node.id() - << " Reorder is added" << std::endl; - } + GPU_DEBUG_LOG << __func__ << ":" << __LINE__ << ":" << dir_msg(dir) << " " << reorder_node.id() + << " Reorder is added" << std::endl; p.add_intermediate(reorder_node, *travel_direction_wrapper::second(node, next), *travel_direction_wrapper::first(node, next), @@ -595,7 +580,6 @@ void insert_reorders_in_dir(program& p, const std::map void insert_reorders_in_dir(program& p, const std::map& fmt_map, reorder_factory& rf, layout_optimizer& lo, program_node* node) { - GPU_DEBUG_GET_INSTANCE(debug_config); auto fmt = fmt_map.at(node); auto next_cpy = travel_direction_wrapper::next_nodes(node); @@ -619,11 +603,9 @@ void insert_reorders_in_dir(program& p, const std::map

verbose >= 2) { - GPU_DEBUG_COUT << __func__ << ":" << __LINE__ << ":" << dir_msg(direction_e::backwards) - << " " << node->id() << " --> " << next.first->id() << " ## " - << fmt_to_str(in_layout.format) << " --> " << fmt_to_str(out_layout.format) << std::endl; - } + GPU_DEBUG_LOG << __func__ << ":" << __LINE__ << ":" << dir_msg(direction_e::backwards) + << " " << node->id() << " --> " << next.first->id() << " ## " + << fmt_to_str(in_layout.format) << " --> " << fmt_to_str(out_layout.format) << std::endl; if (in_layout.format == format::any || out_layout.format == format::any) continue; @@ -635,10 +617,8 @@ void insert_reorders_in_dir(program& p, const std::map

verbose >= 2) { - GPU_DEBUG_COUT << __func__ << ":" << __LINE__ << ":" << dir_msg(direction_e::backwards) << " " << reorder_node.id() - << " Reorder is added" << std::endl; - } + GPU_DEBUG_LOG << __func__ << ":" << __LINE__ << ":" << dir_msg(direction_e::backwards) << " " << reorder_node.id() + << " Reorder is added" << std::endl; p.add_intermediate(reorder_node, *travel_direction_wrapper::second(node, next.first), *travel_direction_wrapper::first(node, next.first), @@ -684,30 +664,26 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf) auto fmt_map = get_preferred_formats(p, lo); - GPU_DEBUG_IF(debug_config->verbose >= 2) { - GPU_DEBUG_COUT << "[clDNN][reorder_inputs] Preferred formats:" << std::endl; - for (auto& node_fmt : fmt_map) { - if (node_fmt.second != format::any) { - GPU_DEBUG_COUT << "[clDNN][reorder_inputs] " << node_fmt.first->id() << " " << fmt_to_str(node_fmt.second) << std::endl; - } + GPU_DEBUG_LOG_PASS << "Preferred formats:" << std::endl; + for (auto& node_fmt : fmt_map) { + if (node_fmt.second != format::any) { + GPU_DEBUG_LOG_PASS << " " << node_fmt.first->id() << " " << fmt_to_str(node_fmt.second) << std::endl; } } propagate_formats(p, fmt_map, lo); minimize_local_reorders(p, fmt_map, lo); - GPU_DEBUG_IF(debug_config->verbose >= 2) { - GPU_DEBUG_COUT << "[clDNN][reorder_inputs] Selected formats:" << std::endl; - for (auto node_ptr : p.get_processing_order()) { - if (fmt_map.count(node_ptr) == 0) - continue; + GPU_DEBUG_LOG_PASS << "Selected formats:" << std::endl; + for (auto node_ptr : p.get_processing_order()) { + if (fmt_map.count(node_ptr) == 0) + continue; - auto fmt = fmt_map.at(node_ptr); - GPU_DEBUG_COUT << "[clDNN][reorder_inputs] " << node_ptr->id() << " " << fmt_to_str(fmt) << std::endl; - } + auto fmt = fmt_map.at(node_ptr); + GPU_DEBUG_LOG_PASS << " " << node_ptr->id() << " " << fmt_to_str(fmt) << std::endl; } - GPU_DEBUG_IF(debug_config->verbose >= 1) { + GPU_DEBUG_IF(debug_config->verbose >= 2) { reorder_cnt total_reorder_count = std::accumulate( p.get_processing_order().begin(), p.get_processing_order().end(), @@ -719,8 +695,8 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf) return reorder_cnt{ total.number + count.number, total.total_sizes + count.total_sizes }; }); // Divide results by two as above function will each reorder from both sides - GPU_DEBUG_COUT << "[clDNN][reorder_inputs] Total number of reorders: " << total_reorder_count.number / 2 << std::endl; - GPU_DEBUG_COUT << "[clDNN][reorder_inputs] Total elements count of all reorders: " << total_reorder_count.total_sizes / 2 << std::endl; + GPU_DEBUG_LOG_PASS << "Total number of reorders: " << total_reorder_count.number / 2 << std::endl; + GPU_DEBUG_LOG_PASS << "Total elements count of all reorders: " << total_reorder_count.total_sizes / 2 << std::endl; // Count number of reorders that will be fused size_t nodes_with_fusing = 0; @@ -736,8 +712,8 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf) } } } - GPU_DEBUG_COUT << "[clDNN][reorder_inputs] Number of nodes with fused reorders: " << nodes_with_fusing << std::endl; - GPU_DEBUG_COUT << "----------------------------------------------" << std::endl; + GPU_DEBUG_LOG_PASS << "Number of nodes with fused reorders: " << nodes_with_fusing << std::endl; + GPU_DEBUG_LOG_PASS << "----------------------------------------------" << std::endl; } insert_reorders(p, fmt_map, rf, lo); @@ -802,7 +778,7 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, 
reorder_factory& rf) } }; - const auto reorder_convolution = [&p, &lo, &rf, &debug_config](typed_program_node& conv_node) { + const auto reorder_convolution = [&p, &lo, &rf](typed_program_node& conv_node) { { // reorder weights convolution auto& weights = conv_node.weights(); diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/select_preferred_formats.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/select_preferred_formats.cpp index 61695cc4e88..c9932a91e3f 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/select_preferred_formats.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/select_preferred_formats.cpp @@ -32,7 +32,6 @@ void select_preferred_formats::run(program& p) { return; #ifdef ENABLE_ONEDNN_FOR_GPU - GPU_DEBUG_GET_INSTANCE(debug_config); for (auto n : p.get_processing_order()) { // Onednn primitive descriptor creation may fail, for example, due to asymmetric weight. try { @@ -47,9 +46,7 @@ void select_preferred_formats::run(program& p) { _lo.select_preferred_formats_for_onednn(*n, prim_desc); } catch(std::exception &exception) { - GPU_DEBUG_IF(debug_config->verbose >= 1) { - std::cout << "WARNING(select_preferred_formats): " << exception.what() << std::endl; - } + GPU_DEBUG_INFO << "WARNING(select_preferred_formats): " << exception.what() << std::endl; } } #endif // ENABLE_ONEDNN_FOR_GPU diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/convolution_onednn.cpp b/src/plugins/intel_gpu/src/graph/impls/onednn/convolution_onednn.cpp index abba0259aeb..ed5b70b9830 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/convolution_onednn.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/convolution_onednn.cpp @@ -68,18 +68,15 @@ protected: dnnl::memory::desc desc = onednn::layout_to_memory_desc(a_zp->get_layout(), dnnl::memory::format_tag::a, true); args.insert({DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC, a_zp->get_onednn_memory(desc)}); - GPU_DEBUG_GET_INSTANCE(debug_config); - GPU_DEBUG_IF(debug_config->verbose >= 3) { - auto dnnl_mem = a_zp->get_onednn_memory(desc); - void *mapped_ptr = dnnl_mem.map_data(); - if (mapped_ptr) { - GPU_DEBUG_COUT << instance.id() << " activations_zero_points: "; - for (size_t i = 0; i < desc.get_size(); ++i) { - std::cout << static_cast(mapped_ptr)[i] << " "; - } - std::cout << std::endl; - dnnl_mem.unmap_data(mapped_ptr); + auto dnnl_mem = a_zp->get_onednn_memory(desc); + void *mapped_ptr = dnnl_mem.map_data(); + if (mapped_ptr) { + GPU_DEBUG_TRACE_DETAIL << instance.id() << " activations_zero_points: "; + for (size_t i = 0; i < desc.get_size(); ++i) { + GPU_DEBUG_TRACE_DETAIL << static_cast(mapped_ptr)[i] << " "; } + GPU_DEBUG_TRACE_DETAIL << std::endl; + dnnl_mem.unmap_data(mapped_ptr); } } @@ -88,18 +85,15 @@ protected: dnnl::memory::desc desc = onednn::layout_to_memory_desc(w_zp->get_layout(), dnnl::memory::format_tag::a, true); args.insert({DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_WEIGHTS, w_zp->get_onednn_memory(desc)}); - GPU_DEBUG_GET_INSTANCE(debug_config); - GPU_DEBUG_IF(debug_config->verbose >= 3) { - auto dnnl_mem = w_zp->get_onednn_memory(desc); - void *mapped_ptr = dnnl_mem.map_data(); - if (mapped_ptr) { - GPU_DEBUG_COUT << instance.id() << " weights_zero_points: "; - for (size_t i = 0; i < desc.get_size(); ++i) { - std::cout << static_cast(mapped_ptr)[i] << " "; - } - std::cout << std::endl; - dnnl_mem.unmap_data(mapped_ptr); + auto dnnl_mem = w_zp->get_onednn_memory(desc); + void *mapped_ptr = dnnl_mem.map_data(); + if (mapped_ptr) { + GPU_DEBUG_TRACE_DETAIL << instance.id() << " 
weights_zero_points: "; + for (size_t i = 0; i < desc.get_size(); ++i) { + GPU_DEBUG_TRACE_DETAIL << static_cast(mapped_ptr)[i] << " "; } + GPU_DEBUG_TRACE_DETAIL << std::endl; + dnnl_mem.unmap_data(mapped_ptr); } } diff --git a/src/plugins/intel_gpu/src/graph/include/pass_manager.h b/src/plugins/intel_gpu/src/graph/include/pass_manager.h index c305e98595f..fec91bdb680 100644 --- a/src/plugins/intel_gpu/src/graph/include/pass_manager.h +++ b/src/plugins/intel_gpu/src/graph/include/pass_manager.h @@ -23,6 +23,8 @@ #include +#define GPU_DEBUG_LOG_PASS GPU_DEBUG_LOG << "[" << get_name() << "] " + namespace cldnn { class base_pass { friend class pass_manager; diff --git a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp index b031e7ade69..436db5228bc 100644 --- a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp +++ b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp @@ -1350,8 +1350,8 @@ impl_types layout_optimizer::get_forced_impl_type_by_config(program_node& node) preferred_type = impl_types::cpu; if (node.id() == forced_impl_type.substr(0, found_type)) { - GPU_DEBUG_COUT << " Forced implementation type : " << forced_impl_type.substr(0, found_type) << " : " - << forced_impl_type.substr(found_type + 1) << std::endl; + GPU_DEBUG_LOG << " Forced implementation type : " << forced_impl_type.substr(0, found_type) << " : " + << forced_impl_type.substr(found_type + 1) << std::endl; return preferred_type; } } @@ -1819,7 +1819,6 @@ format layout_optimizer::get_preferred_format(program_node& node) { #ifdef ENABLE_ONEDNN_FOR_GPU void layout_optimizer::select_preferred_formats_for_onednn(program_node& node, dnnl::primitive_desc prim_desc) { - GPU_DEBUG_GET_INSTANCE(debug_config); if (node.is_input() || !are_data_types_suitable_for_onednn(node)) { return; } @@ -1868,10 +1867,8 @@ void layout_optimizer::select_preferred_formats_for_onednn(program_node& node, d node.set_preferred_output_fmt(usr, dst_fmt); } - GPU_DEBUG_IF(debug_config->verbose >= 2) { - std::cout << "select_preferred_formats:" << node.id() << ": " << fmt_to_str(src_fmt) << " --> " << fmt_to_str(dst_fmt) - << " For index : " << idx << std::endl; - } + GPU_DEBUG_LOG << "select_preferred_formats:" << node.id() << ": " << fmt_to_str(src_fmt) << " --> " << fmt_to_str(dst_fmt) + << " For index : " << idx << std::endl; } } diff --git a/src/plugins/intel_gpu/src/graph/network.cpp b/src/plugins/intel_gpu/src/graph/network.cpp index be43981c33e..e7036268e52 100644 --- a/src/plugins/intel_gpu/src/graph/network.cpp +++ b/src/plugins/intel_gpu/src/graph/network.cpp @@ -890,9 +890,8 @@ void network::execute_impl(const std::vector& events) { OV_ITT_SCOPED_TASK(itt::domains::CLDNN, "NetworkImpl::Execute"); // Wait for previous execution completion reset_execution(false); - GPU_DEBUG_GET_INSTANCE(debug_config); - GPU_DEBUG_IF(debug_config->verbose >= 1) - GPU_DEBUG_COUT << "----------------------------------------------" << std::endl; + GPU_DEBUG_TRACE << "----------------------------------------------" << std::endl; + GPU_DEBUG_TRACE << "Start network execution" << std::endl; std::vector in_out_mem; auto is_surface_lock_check_needed = [&](const shared_mem_type& shared_mem_type) { @@ -928,6 +927,7 @@ void network::execute_impl(const std::vector& events) { auto surf_lock = surfaces_lock::create(get_engine().type(), in_out_mem, get_stream()); set_arguments(); + GPU_DEBUG_GET_INSTANCE(debug_config); for (auto& inst : _exec_order) { GPU_DEBUG_IF(debug_config->dump_layers_path.length() > 0) { 
const std::string layer_name = inst->id(); @@ -1120,10 +1120,7 @@ void network::allocate_primitive_instance(program_node const& node) { if (_primitives.count(node.id())) return; - GPU_DEBUG_GET_INSTANCE(debug_config); - GPU_DEBUG_IF(debug_config->verbose >= 4) { - GPU_DEBUG_COUT << node.id() << ": allocate primitive instance" << std::endl; - } + GPU_DEBUG_TRACE_DETAIL << node.id() << ": allocate primitive instance" << std::endl; auto inst = node.type()->create_instance(*this, node); @@ -1191,10 +1188,7 @@ void network::transfer_memory_to_device(std::shared_ptr instance // Allocate and transfer memory auto device_mem = inst_mem.get_engine()->allocate_memory(inst_mem.get_layout(), allocation_type::usm_device, false); device_mem->copy_from(get_stream(), inst_mem); - GPU_DEBUG_GET_INSTANCE(debug_config); - GPU_DEBUG_IF(debug_config->verbose >= 2) { - GPU_DEBUG_COUT << "[" << node.id() << ": constant]" << std::endl; - } + GPU_DEBUG_LOG << "[" << node.id() << ": constant]" << std::endl; _memory_pool->release_memory(&inst_mem, node.id(), get_id()); instance->set_output_memory(device_mem); } diff --git a/src/plugins/intel_gpu/src/graph/pass_manager.cpp b/src/plugins/intel_gpu/src/graph/pass_manager.cpp index f217d3b6224..53bec3925a5 100644 --- a/src/plugins/intel_gpu/src/graph/pass_manager.cpp +++ b/src/plugins/intel_gpu/src/graph/pass_manager.cpp @@ -48,11 +48,13 @@ void pass_manager::run(program& p, base_pass& pass) { using ms = std::chrono::duration>; using Time = std::chrono::high_resolution_clock; + GPU_DEBUG_LOG << "Run pass " << pass.get_name() << std::endl; auto start = Time::now(); pass.run(p); auto stop = Time::now(); std::chrono::duration fs = stop - start; ms opt_pass_time = std::chrono::duration_cast(fs); + GPU_DEBUG_LOG << "Pass " << pass.get_name() << " execution time: " << opt_pass_time.count() << " ms" << std::endl; p.save_pass_info(pass.get_name()); diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index e814dafa91d..b1868de9169 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -142,7 +142,6 @@ void primitive_inst::set_output_memory(memory::ptr mem_new, bool check, size_t i } void primitive_inst::update_shape() { - GPU_DEBUG_GET_INSTANCE(debug_config); GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::shape_inference); bool input_shape_changed = false; @@ -199,9 +198,7 @@ void primitive_inst::update_shape() { // Events may be not created for in-order queue, so take them for OOO queue only if (_network.has_event(dep.id()) && queue_type == queue_types::out_of_order) { dependencies_events.push_back(_network.get_primitive_event(dep_id)); - GPU_DEBUG_IF(debug_config->verbose >= 4) { - GPU_DEBUG_COUT << id() << ": shape infer waits for " << i << " dependency\n"; - } + GPU_DEBUG_TRACE_DETAIL << id() << ": shape infer waits for " << i << " dependency\n"; } auto dep_mem = _network.get_output_memory(dep_id); memory_deps.insert({i, dep_mem}); @@ -221,10 +218,8 @@ void primitive_inst::update_shape() { auto update_output_layout = [&](layout& layout, size_t idx) { layout.data_padding = padding::max(_node->get_primitive()->output_paddings[idx], layout.data_padding); if (_impl_params->get_output_layout(idx) != layout) { - GPU_DEBUG_IF(debug_config->verbose >= 4) { - GPU_DEBUG_COUT << id() << ": update shape: was: " << _impl_params->get_output_layout(idx).to_short_string() - << " now: " << layout.to_short_string() << std::endl; - } + 
GPU_DEBUG_TRACE_DETAIL << id() << ": update shape: was: " << _impl_params->get_output_layout(idx).to_short_string() + << " now: " << layout.to_short_string() << std::endl; set_shape_change(); } _impl_params->output_layouts[idx] = layout; @@ -265,16 +260,12 @@ void primitive_inst::realloc_if_needed() { bool can_reuse_buffer = _outputs[0] && actual_layout.count() <= max_output_layout_size; if (can_reuse_buffer) { - GPU_DEBUG_IF(debug_config->verbose >= 4) { - GPU_DEBUG_COUT << id() << ": reuse previously allocated output buffer" << std::endl; - } + GPU_DEBUG_TRACE_DETAIL << id() << ": reuse previously allocated output buffer" << std::endl; _outputs[0] = _network.get_engine().reinterpret_buffer(*_outputs[0], actual_layout); } else { - GPU_DEBUG_IF(debug_config->verbose >= 4) { - GPU_DEBUG_COUT << id() << ": realloc output memory. " - << " Current buffer_size=" << max_output_layout_size - << " Requested buffer_size=" << actual_layout.count() << std::endl; - } + GPU_DEBUG_TRACE_DETAIL << id() << ": realloc output memory. " + << " Current buffer_size=" << max_output_layout_size + << " Requested buffer_size=" << actual_layout.count() << std::endl; _outputs = allocate_outputs(&updated_params); // TODO : need to handle multiple outputs max_output_layout_size = updated_params.output_layouts[0].count(); @@ -285,7 +276,6 @@ void primitive_inst::realloc_if_needed() { void primitive_inst::update_impl() { GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::update_implementation); - GPU_DEBUG_GET_INSTANCE(debug_config); auto prev_impl_str = _impl != nullptr ? _impl->get_kernel_name() : "nullptr"; auto extend_to_6d = [this](ov::PartialShape ps) -> std::vector { // For shape_of we extend shape with 1-s to 6d rank to make kernel simpler @@ -321,7 +311,7 @@ void primitive_inst::update_impl() { return seed; }; - auto update_shape_info = [this, extend_to_6d, debug_config, prev_impl_str](const kernel_impl_params& params) { + auto update_shape_info = [this, extend_to_6d, prev_impl_str](const kernel_impl_params& params) { mem_lock lock(_shape_info_memory, _network.get_stream()); size_t offset = 0; for (size_t i = 0; i < _node->get_dependencies().size(); i++) { @@ -337,17 +327,14 @@ void primitive_inst::update_impl() { for (size_t j = 0; j < output_shape.size(); j++) lock[offset++] = static_cast(output_shape[j]); } - GPU_DEBUG_IF(debug_config->verbose >= 4) { - std::stringstream s; - s << "shapes: "; - for (size_t i = 0; i < offset; i++) - s << lock[i] << " "; - GPU_DEBUG_COUT << id() << ": update dynamic impl " << prev_impl_str << " to new shape: " << s.str() << std::endl; - } + std::stringstream s; + s << "shapes: "; + for (size_t i = 0; i < offset; i++) + s << lock[i] << " "; + GPU_DEBUG_TRACE_DETAIL << id() << ": update dynamic impl " << prev_impl_str << " to new shape: " << s.str() << std::endl; }; if (!_node->is_type() && !(_node->is_type() && _node->get_dependencies().empty())) { - GPU_DEBUG_GET_INSTANCE(debug_config); // Update param if fake_alignment is available auto updated_params = _node->type()->get_fake_aligned_params(*_impl_params); auto layout_key = get_layout_key(updated_params); @@ -359,10 +346,7 @@ void primitive_inst::update_impl() { if (has_cached_impl) { _impl = cache.get(layout_key)->clone(); GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(true); - - GPU_DEBUG_IF(debug_config->verbose >= 4) { - GPU_DEBUG_COUT << id() << ": get impl from cache " << _impl->get_kernel_name() << std::endl; - } + GPU_DEBUG_TRACE_DETAIL << id() << ": get impl from cache " << _impl->get_kernel_name() << std::endl; } } if 
(!has_cached_impl) { @@ -402,10 +386,9 @@ void primitive_inst::update_impl() { kernels_cache.reset(); std::lock_guard lock(get_network().get_impl_cache_mutex()); cache.add(layout_key, _impl->clone()); - GPU_DEBUG_IF(debug_config->verbose >= 4) { - auto new_impl_str = _impl != nullptr ? _impl->get_kernel_name() : "nullptr"; - GPU_DEBUG_COUT << id() << ": update impl from " << prev_impl_str << " to " << new_impl_str << std::endl; - } + + auto new_impl_str = _impl != nullptr ? _impl->get_kernel_name() : "nullptr"; + GPU_DEBUG_TRACE_DETAIL << id() << ": update impl from " << prev_impl_str << " to " << new_impl_str << std::endl; } } @@ -436,14 +419,9 @@ event::ptr primitive_inst::execute(const std::vector& events) { subgraph->set_input_data(d.first->id(), actual_mem); } } - GPU_DEBUG_IF(debug_config->verbose >= 4) { - GPU_DEBUG_COUT << "[Start] Executing unfused subgraph of " << id() << std::endl; - } - + GPU_DEBUG_TRACE_DETAIL << "[Start] Executing unfused subgraph of " << id() << std::endl; auto outputs = subgraph->execute(events); - GPU_DEBUG_IF(debug_config->verbose >= 4) { - GPU_DEBUG_COUT << "[End] Finished executing unfused subgraph of " << id() << std::endl; - } + GPU_DEBUG_TRACE_DETAIL << "[End] Finished executing unfused subgraph of " << id() << std::endl; auto last_fd = _impl_params->fused_desc.back(); auto last_prim_id = last_fd.desc->id; @@ -476,28 +454,7 @@ event::ptr primitive_inst::execute(const std::vector& events) { } on_execute(); - GPU_DEBUG_IF(debug_config->verbose >= 1) { - std::ostringstream in_addr; - // buffer_ptr() only support usm_memory - for (size_t i = 0; i < this->dependencies().size(); i++) { - auto in_mem = dep_memory_ptr(i); - if (in_mem) { - in_addr << in_mem->buffer_ptr(); - if (i < this->dependencies().size() - 1) { - in_addr << ", "; - } - } - } - auto out_mem = output_memory_ptr(); - auto out_alloc_type = out_mem ? out_mem->get_allocation_type() : allocation_type::unknown; - auto out_ptr = out_mem ? out_mem->buffer_ptr() : nullptr; - auto impl_name = _impl->get_kernel_name(); - - GPU_DEBUG_COUT << id() << ": execute " << impl_name << ". 
Memory type: " - << out_alloc_type << ", in_usm(" - << in_addr.str() << "), out_usm(" - << out_ptr << ")" << std::endl; - } + GPU_DEBUG_TRACE << id() << ": execute " << _impl->get_kernel_name() << std::endl; if (_exec_deps.empty() && dependencies.empty()) { dependencies = events; @@ -695,10 +652,7 @@ void primitive_inst::allocate_internal_buffers(void) { // allocate intermediate memory for the updated layout of buffer std::vector intermediates_memory; for (auto layout : ibuf_layouts) { - GPU_DEBUG_GET_INSTANCE(debug_config); - GPU_DEBUG_IF(debug_config->verbose >= 2) { - GPU_DEBUG_COUT << "[" << _node->id() << ": internal buf]" << std::endl; - } + GPU_DEBUG_LOG << "[" << _node->id() << ": internal buf]" << std::endl; auto alloc_type = allocation_type::unknown; if (input_device_mem && (available_device_mem_size - (int64_t)layout.bytes_count() >= 0)) { alloc_type = engine.get_preferred_memory_allocation_type(); @@ -719,8 +673,6 @@ event::ptr primitive_inst::update_weights() { if (!weightable_node) return nullptr; - GPU_DEBUG_GET_INSTANCE(debug_config); - auto& weights_params = _impl->_weights_reorder_params; bool requires_reorder = weights_params.engine != kernel_selector::GenericKernelParams::Engine::NONE && (!_impl_params->reordered_weights || _impl_params->reordered_weights->get_layout() != from_weights_tensor(weights_params.dest)); @@ -744,17 +696,13 @@ event::ptr primitive_inst::update_weights() { if (layout_key != "") { auto& cache = get_network().get_in_mem_kernels_cache(); if (cache.has(layout_key)) { - GPU_DEBUG_IF(debug_config->verbose >= 4) { - GPU_DEBUG_COUT << id() << ": reorder weights (cached) from " << original_layout.to_short_string() - << " to " << expected_layout.to_short_string() << std::endl; - } + GPU_DEBUG_TRACE_DETAIL << id() << ": reorder weights (cached) from " << original_layout.to_short_string() + << " to " << expected_layout.to_short_string() << std::endl; GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(true); kernel = cache.get(layout_key); } else { - GPU_DEBUG_IF(debug_config->verbose >= 4) { - GPU_DEBUG_COUT << id() << ": reorder weights from " << original_layout.to_short_string() - << " to " << expected_layout.to_short_string() << std::endl; - } + GPU_DEBUG_TRACE_DETAIL << id() << ": reorder weights from " << original_layout.to_short_string() + << " to " << expected_layout.to_short_string() << std::endl; auto& kernels_cache = get_network().get_kernels_cache(); auto kernel_id = kernels_cache.set_kernel_source(weights_params.clKernel->code.kernelString, false); kernels_cache.compile(); @@ -768,9 +716,7 @@ event::ptr primitive_inst::update_weights() { bool can_reuse = _impl_params->reordered_weights != nullptr && _impl_params->reordered_weights->size() <= expected_layout.bytes_count(); if (can_reuse) { - GPU_DEBUG_IF(debug_config->verbose >= 4) { - GPU_DEBUG_COUT << id() << ": reuse weights memory" << std::endl; - } + GPU_DEBUG_TRACE_DETAIL << id() << ": reuse weights memory" << std::endl; _impl_params->reordered_weights = engine.reinterpret_buffer(*_impl_params->reordered_weights, expected_layout); } else { auto alloc_type = engine.get_preferred_memory_allocation_type(); @@ -783,6 +729,7 @@ event::ptr primitive_inst::update_weights() { stream.set_arguments(*kernel, weights_params.clKernel->params, args); auto ev = stream.enqueue_kernel(*kernel, weights_params.clKernel->params, args, {}, true); + GPU_DEBUG_GET_INSTANCE(debug_config); GPU_DEBUG_IF(!debug_config->dump_profiling_data.empty()) { stream.wait_for_events({ev}); } @@ -851,15 +798,12 @@ memory::ptr 
primitive_inst::allocate_output(engine& _engine, memory_pool& pool, bool is_cpu = _node.get_selected_impl() ? _node.get_selected_impl()->is_cpu() : false; auto use_lockable_memory = is_output_buffer(_node) || is_cpu || is_any_user_cpu(_node.get_users()) || !_engine.supports_allocation(allocation_type::usm_device); - GPU_DEBUG_GET_INSTANCE(debug_config); const auto& lockable_mem_type = _engine.get_lockable_preferred_memory_allocation_type(layout.format.is_image_2d()); const auto& alloc_type = use_lockable_memory ? lockable_mem_type : usm_device_allocatable ? allocation_type::usm_device : lockable_mem_type; if ((is_internal && (_node.can_be_optimized() || _node.is_type())) || (memory_reuse_by_user == false)) { - GPU_DEBUG_IF(debug_config->verbose >= 2) { - GPU_DEBUG_COUT << "[" << _node.id() << ": output]" << std::endl; - } + GPU_DEBUG_LOG << "[" << _node.id() << ": output]" << std::endl; return get_memory_from_pool(_engine, layout, _node.id(), @@ -868,21 +812,15 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool, false); } else if (is_internal && _node.is_output() && _node.is_type() && _engine.supports_allocation(allocation_type::usm_device) && usm_device_allocatable) { - GPU_DEBUG_IF(debug_config->verbose >= 2) { - GPU_DEBUG_COUT << "[" << _node.id() << ": output]" << std::endl; - } + GPU_DEBUG_LOG << "[" << _node.id() << ": output]" << std::endl; return _engine.allocate_memory(layout, allocation_type::usm_device, false); } else if (is_internal && !_node.is_output() && _node.is_type()) { // Skip memory reset for input_layout primitives, since data will be copied from cldnn::data primitive // or just reuse primitive's memory - GPU_DEBUG_IF(debug_config->verbose >= 2) { - GPU_DEBUG_COUT << "[" << _node.id() << ": constant]" << std::endl; - } + GPU_DEBUG_LOG << "[" << _node.id() << ": constant]" << std::endl; return _engine.allocate_memory(layout, alloc_type, false); } else if (is_internal || (!_node.can_share_buffer()) || _node.can_be_optimized() || _node.is_output()) { - GPU_DEBUG_IF(debug_config->verbose >= 2) { - GPU_DEBUG_COUT << "[" << _node.id() << ": output]" << std::endl; - } + GPU_DEBUG_LOG << "[" << _node.id() << ": output]" << std::endl; return _engine.allocate_memory(layout, alloc_type); } else { return get_memory_from_pool(_engine, @@ -940,10 +878,7 @@ std::string primitive_inst::generic_to_string(program_node const& node, const ch } cldnn::network::ptr primitive_inst::get_unfused_subgraph() { - GPU_DEBUG_GET_INSTANCE(debug_config); - GPU_DEBUG_IF(debug_config->verbose >= 4) { - GPU_DEBUG_COUT << id() << ": Use unfused subgraph due to unexpected fusions\n"; - } + GPU_DEBUG_TRACE_DETAIL << id() << ": Use unfused subgraph due to unexpected fusions\n"; if (!_unfused_subgraph) { topology t; diff --git a/src/plugins/intel_gpu/src/graph/program.cpp b/src/plugins/intel_gpu/src/graph/program.cpp index 5d13b23bbef..8ac2ad8f073 100644 --- a/src/plugins/intel_gpu/src/graph/program.cpp +++ b/src/plugins/intel_gpu/src/graph/program.cpp @@ -719,17 +719,12 @@ void program::transfer_memory_to_device() { if (alloc_type == allocation_type::usm_host || alloc_type == allocation_type::usm_shared) { - GPU_DEBUG_GET_INSTANCE(debug_config); - GPU_DEBUG_IF(debug_config->verbose >= 2) { - GPU_DEBUG_COUT << "[" << data_node.id() << ": constant]" << std::endl; - } + GPU_DEBUG_LOG << "[" << data_node.id() << ": constant]" << std::endl; // Allocate and transfer memory auto device_mem = mem.get_engine()->allocate_memory(data_node_layout, allocation_type::usm_device, false); 
device_mem->copy_from(get_stream(), mem); data_node.attach_memory(device_mem); - GPU_DEBUG_IF(debug_config->verbose >= 2) { - GPU_DEBUG_COUT << "[" << data_node.id() << ": constant]" << std::endl; - } + GPU_DEBUG_LOG << "[" << data_node.id() << ": constant]" << std::endl; const_cast(data_node.get_primitive()->mem).reset(); // TODO: Do we need finish call here? Maybe call it in network::execute() ? get_stream().finish(); @@ -1656,7 +1651,6 @@ std::pair program::get_estimated_device_mem_usage() { }); auto& engine = get_engine(); int64_t host_alloc = 0; - GPU_DEBUG_GET_INSTANCE(debug_config); // just to prevent the memories from being freed during allocation std::unordered_set allocated_mem_ptrs; for (const auto& node : nodes_to_allocate) { @@ -1676,10 +1670,8 @@ std::pair program::get_estimated_device_mem_usage() { if (engine.get_device_info().dev_type == cldnn::device_type::integrated_gpu) total_host_alloc_size += engine.get_used_device_memory(allocation_type::usm_device); if ((cur_vmem != -1 && total_host_alloc_size > cur_vmem * 0.5) || (total_host_alloc_size >= max_global_mem_size)) { - GPU_DEBUG_IF(debug_config->verbose >= 1) { - GPU_DEBUG_COUT << "Estimated host mem usage calculated with default base batch size(16) exceeds the available memory (" - << cur_vmem << ")" << std::endl; - } + GPU_DEBUG_INFO << "Estimated host mem usage calculated with default base batch size(16) exceeds the available memory (" + << cur_vmem << ")" << std::endl; return {-1L, -1L}; } #endif diff --git a/src/plugins/intel_gpu/src/graph/program_node.cpp b/src/plugins/intel_gpu/src/graph/program_node.cpp index d658aa38f71..62a1b53f24f 100644 --- a/src/plugins/intel_gpu/src/graph/program_node.cpp +++ b/src/plugins/intel_gpu/src/graph/program_node.cpp @@ -246,23 +246,18 @@ bool program_node::is_detached(bool whole_branch) { } layout program_node::calc_output_layout() const { - GPU_DEBUG_GET_INSTANCE(debug_config); bool allow_new_shape_infer = get_program().get_options().get()->enabled(); if (allow_new_shape_infer) { auto out_layouts = type()->calc_output_layouts(*this, *get_kernel_impl_params()); if (!out_layouts.empty()) { - GPU_DEBUG_IF(debug_config->verbose >= 4) { - GPU_DEBUG_COUT << id() << ": calc_output_layout(new):" << out_layouts[0] << std::endl; - } + GPU_DEBUG_TRACE_DETAIL << id() << ": calc_output_layout(new):" << out_layouts[0] << std::endl; return out_layouts[0]; } } auto res = type()->calc_output_layout(*this, *get_kernel_impl_params()); - GPU_DEBUG_IF(debug_config->verbose >= 4) { - GPU_DEBUG_COUT << id() << ": calc_output_layout:" << res << std::endl; - } + GPU_DEBUG_TRACE_DETAIL << id() << ": calc_output_layout:" << res << std::endl; return res; } @@ -528,8 +523,6 @@ bool program_node::has_out_scales(const std::shared_ptr& a dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const std::shared_ptr& attr, bool& optimization_is_completed) { - GPU_DEBUG_GET_INSTANCE(debug_config); - // Create new dnnl::post_ops object which will be filled inside the optimization process dnnl::post_ops optimized_p_ops; @@ -673,22 +666,18 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const int64_t prev_post_op_idx = 0; bool optimization_done = false; - GPU_DEBUG_IF(debug_config->verbose >= 3) { - GPU_DEBUG_COUT << "================================================" << std::endl; - GPU_DEBUG_COUT << " " << id() << ", num of post_ops " << p_ops.len() << std::endl; - for (size_t i = 0; i < cur_post_ops.size(); i++) - GPU_DEBUG_COUT << " " << i << ": " << 
cur_post_ops[i].op_type << std::endl; - } + GPU_DEBUG_TRACE << "================================================" << std::endl; + GPU_DEBUG_TRACE << " " << id() << ", num of post_ops " << p_ops.len() << std::endl; + for (size_t i = 0; i < cur_post_ops.size(); i++) + GPU_DEBUG_TRACE << " " << i << ": " << cur_post_ops[i].op_type << std::endl; remove_optimized_prefix(cur_post_ops); - GPU_DEBUG_IF(debug_config->verbose >= 3) { - GPU_DEBUG_COUT << "remove optimized prefix ------------------------" << std::endl; - GPU_DEBUG_COUT << " " << id() << ", num of post_ops " << p_ops.len() << std::endl; - for (size_t i = 0; i < cur_post_ops.size(); i++) - GPU_DEBUG_COUT << " " << i << ": " << cur_post_ops[i].op_type << std::endl; - GPU_DEBUG_COUT << "----------------------------------->>>>>>>>>>>>>" << std::endl; - } + GPU_DEBUG_TRACE << "remove optimized prefix ------------------------" << std::endl; + GPU_DEBUG_TRACE << " " << id() << ", num of post_ops " << p_ops.len() << std::endl; + for (size_t i = 0; i < cur_post_ops.size(); i++) + GPU_DEBUG_TRACE << " " << i << ": " << cur_post_ops[i].op_type << std::endl; + GPU_DEBUG_TRACE << "----------------------------------->>>>>>>>>>>>>" << std::endl; // Get post-ops size for current node int64_t post_ops_size = cur_post_ops.size(); @@ -710,8 +699,7 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const auto cur_type = cur_post_ops[cur_post_op_idx].op_type; auto prev_type = cur_post_ops[prev_post_op_idx].op_type; - GPU_DEBUG_IF(debug_config->verbose >= 3) - GPU_DEBUG_COUT << "before prev_post_op_idx: " << prev_post_op_idx << ", cur_post_op_idx: " << cur_post_op_idx << std::endl; + GPU_DEBUG_TRACE << "before prev_post_op_idx: " << prev_post_op_idx << ", cur_post_op_idx: " << cur_post_op_idx << std::endl; // Ignore optimized operations for "previous" operation in our operation pair while (type_is_any_optimized(prev_type) && prev_post_op_idx < post_ops_size - 1) { @@ -728,8 +716,7 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const cur_type = cur_post_ops[cur_post_op_idx].op_type; } - GPU_DEBUG_IF(debug_config->verbose >= 3) - GPU_DEBUG_COUT << "after prev_post_op_idx: " << prev_post_op_idx << ", cur_post_op_idx: " << cur_post_op_idx << std::endl; + GPU_DEBUG_TRACE << "after prev_post_op_idx: " << prev_post_op_idx << ", cur_post_op_idx: " << cur_post_op_idx << std::endl; auto cur_idx = static_cast(has_out_scales(attr) ? (cur_post_op_idx >= 1 ? cur_post_op_idx - 1 : 0) : cur_post_op_idx); auto prev_idx = static_cast(has_out_scales(attr) ? (prev_post_op_idx >= 1 ? 
prev_post_op_idx - 1 : 0) : prev_post_op_idx); @@ -766,10 +753,8 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const bool cur_ops_pair_is_optimized = false; - GPU_DEBUG_IF(debug_config->verbose >= 3) { - GPU_DEBUG_COUT << "prev_idx: " << prev_idx << " " << prev_type - << ", cur_idx: " << cur_idx << " " << cur_type << std::endl; - } + GPU_DEBUG_TRACE << "prev_idx: " << prev_idx << " " << prev_type + << ", cur_idx: " << cur_idx << " " << cur_type << std::endl; if (can_try_optimize) { if (eltw_and_eltw) { @@ -1005,12 +990,10 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const remove_optimized_prefix(cur_post_ops); } - GPU_DEBUG_IF(debug_config->verbose >= 3) { - GPU_DEBUG_COUT << ">>>>>>>>>>>>>-----------------------------------" << std::endl; - for (size_t i = 0; i < cur_post_ops.size(); i++) - GPU_DEBUG_COUT << " " << i << ": " << cur_post_ops[i].op_type << std::endl; - GPU_DEBUG_COUT << "------------------------------------------------" << std::endl; - } + GPU_DEBUG_TRACE << ">>>>>>>>>>>>>-----------------------------------" << std::endl; + for (size_t i = 0; i < cur_post_ops.size(); i++) + GPU_DEBUG_TRACE << " " << i << ": " << cur_post_ops[i].op_type << std::endl; + GPU_DEBUG_TRACE << "------------------------------------------------" << std::endl; add_onednn_fused_primitives(cur_post_ops); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector.cpp index b86c91cc657..9e83418934a 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector.cpp @@ -107,12 +107,8 @@ KernelsData kernel_selector_base::GetNaiveBestKernel(const Params& params, } } catch (std::runtime_error& ex) { // we have to handle it in order to avoid exception in KernelSelector as much we can - GPU_DEBUG_GET_INSTANCE(debug_config); - GPU_DEBUG_IF(debug_config->verbose >= 3) { - kernelName = (implementation != nullptr)? implementation->GetName() : "[impl is null]"; - GPU_DEBUG_COUT << "layerID: " << params.layerID << " kenrel: " - << kernelName << " - " << ex.what() << std::endl; - } + kernelName = (implementation != nullptr)? implementation->GetName() : "[impl is null]"; + GPU_DEBUG_TRACE << "layerID: " << params.layerID << " kenrel: " << kernelName << " - " << ex.what() << std::endl; } } @@ -202,12 +198,8 @@ KernelsData kernel_selector_base::GetAutoTuneBestKernel(const Params& params, } } catch (std::runtime_error& ex) { // we have to handle it in order to avoid exception in KernelSelector as much we can - GPU_DEBUG_GET_INSTANCE(debug_config); - GPU_DEBUG_IF(debug_config->verbose >= 3) { - kernelName = (implementation != nullptr)? implementation->GetName() : "[impl is null]"; - GPU_DEBUG_COUT << "layerID: " << params.layerID << " kenrel: " - << kernelName << " - " << ex.what() << std::endl; - } + kernelName = (implementation != nullptr)? implementation->GetName() : "[impl is null]"; + GPU_DEBUG_TRACE << "layerID: " << params.layerID << " kenrel: " << kernelName << " - " << ex.what() << std::endl; } } } @@ -231,12 +223,8 @@ KernelsData kernel_selector_base::GetAutoTuneBestKernel(const Params& params, } } catch (std::runtime_error& ex) { // we have to handle it in order to avoid exception in KernelSelector as much we can - GPU_DEBUG_GET_INSTANCE(debug_config); - GPU_DEBUG_IF(debug_config->verbose >= 3) { - kernelName = (implementation != nullptr)? 
implementation->GetName() : "[impl is null]"; - GPU_DEBUG_COUT << "layerID: " << params.layerID << " kenrel: " - << kernelName << " - " << ex.what() << std::endl; - } + kernelName = (implementation != nullptr)? implementation->GetName() : "[impl is null]"; + GPU_DEBUG_TRACE << "layerID: " << params.layerID << " kenrel: " << kernelName << " - " << ex.what() << std::endl; } } } diff --git a/src/plugins/intel_gpu/src/plugin/infer_request.cpp b/src/plugins/intel_gpu/src/plugin/infer_request.cpp index 713b325d6fc..84c3c5660b8 100644 --- a/src/plugins/intel_gpu/src/plugin/infer_request.cpp +++ b/src/plugins/intel_gpu/src/plugin/infer_request.cpp @@ -493,7 +493,7 @@ void InferRequest::enqueue() { // If dump layers path is set, only runs first inference. GPU_DEBUG_GET_INSTANCE(debug_config); GPU_DEBUG_IF(debug_config->dump_layers_path.length() > 0) { - GPU_DEBUG_COUT << "Only run first inference to dump layers." << std::endl; + GPU_DEBUG_INFO << "Only run first inference to dump layers." << std::endl; exit(0); } } @@ -760,10 +760,7 @@ void InferRequest::allocate_inputs() { OPENVINO_ASSERT(litr != inputLayouts.end(), "[GPU] Input layout for ", name, " is not found"); const auto input_layout = litr->second; - GPU_DEBUG_GET_INSTANCE(debug_config); - GPU_DEBUG_IF(debug_config->verbose >= 2) { - GPU_DEBUG_COUT << "[" << name << ": input blob]" << std::endl; - } + GPU_DEBUG_LOG << "[" << name << ": input blob]" << std::endl; if (desc.getPrecision() == Precision::I16 || desc.getPrecision() == Precision::U16) { TensorDesc desc_fp32 = desc; desc_fp32.setPrecision(Precision::FP32); @@ -796,10 +793,7 @@ void InferRequest::allocate_outputs() { if (output_layout.is_static()) desc.setDims(m_graph->GetOutputSize(no.first)); - GPU_DEBUG_GET_INSTANCE(debug_config); - GPU_DEBUG_IF(debug_config->verbose >= 2) { - GPU_DEBUG_COUT << "[" << no.first << ": output blob]" << std::endl; - } + GPU_DEBUG_LOG << "[" << no.first << ": output blob]" << std::endl; outputsMap[no.first] = outputID; if (desc.getPrecision() == Precision::I16 || desc.getPrecision() == Precision::U16 || diff --git a/src/plugins/intel_gpu/src/plugin/infer_request_legacy.cpp b/src/plugins/intel_gpu/src/plugin/infer_request_legacy.cpp index 58bd488c2af..9d1cd0a2283 100644 --- a/src/plugins/intel_gpu/src/plugin/infer_request_legacy.cpp +++ b/src/plugins/intel_gpu/src/plugin/infer_request_legacy.cpp @@ -769,7 +769,7 @@ void InferRequestLegacy::enqueue() { // If dump layers path is set, only runs first inference. GPU_DEBUG_GET_INSTANCE(debug_config); GPU_DEBUG_IF(debug_config->dump_layers_path.length() > 0) { - GPU_DEBUG_COUT << "Only run first inference to dump layers." << std::endl; + GPU_DEBUG_INFO << "Only run first inference to dump layers." << std::endl; exit(0); } } @@ -1031,10 +1031,7 @@ void InferRequestLegacy::allocate_inputs() { IE_THROW() << "Input layout for " << name << " is not found"; } - GPU_DEBUG_GET_INSTANCE(debug_config); - GPU_DEBUG_IF(debug_config->verbose >= 2) { - GPU_DEBUG_COUT << "[" << name << ": input blob]" << std::endl; - } + GPU_DEBUG_LOG << "[" << name << ": input blob]" << std::endl; if (desc.getPrecision() == Precision::I16 || desc.getPrecision() == Precision::U16) { TensorDesc desc_fp32 = desc; desc_fp32.setPrecision(Precision::FP32); @@ -1088,10 +1085,7 @@ void InferRequestLegacy::allocate_outputs() { // Can be removed once 76176 is resolved. 
desc.setDims(m_graph->GetOutputSize(no.first)); - GPU_DEBUG_GET_INSTANCE(debug_config); - GPU_DEBUG_IF(debug_config->verbose >= 2) { - GPU_DEBUG_COUT << "[" << no.first << ": output blob]" << std::endl; - } + GPU_DEBUG_LOG << "[" << no.first << ": output blob]" << std::endl; outputsMap[no.first] = outputID; if (desc.getPrecision() == Precision::I16 || desc.getPrecision() == Precision::U16 || diff --git a/src/plugins/intel_gpu/src/plugin/ops/constant.cpp b/src/plugins/intel_gpu/src/plugin/ops/constant.cpp index 4d05d455743..b023c191c9e 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/constant.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/constant.cpp @@ -201,10 +201,7 @@ void createClDnnConstant(Program& p, const ngraph::Shape& constDims, const std:: p.primitive_ids[initialconstPrimID] = constPrimID; p.profiling_ids.push_back(initialconstPrimID); } else { - GPU_DEBUG_GET_INSTANCE(debug_config); - GPU_DEBUG_IF(debug_config->verbose >= 2) { - GPU_DEBUG_COUT << "[" << initialconstPrimID << ": constant]" << std::endl; - } + GPU_DEBUG_LOG << "[" << initialconstPrimID << ": constant]" << std::endl; cldnn::memory::ptr mem = p.GetEngine().allocate_memory(constLayout, false); auto& stream = p.GetEngine().get_program_stream(); cldnn::mem_lock lock{mem, stream}; diff --git a/src/plugins/intel_gpu/src/plugin/ops/ctc_greedy_decoder.cpp b/src/plugins/intel_gpu/src/plugin/ops/ctc_greedy_decoder.cpp index d0581a2970b..21fc053f7e4 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/ctc_greedy_decoder.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/ctc_greedy_decoder.cpp @@ -73,10 +73,7 @@ static void CreateCommonCTCGreedyDecoderOp(Program& p, const std::shared_ptrget_output_shape(1).size()), tensor_from_dims(op->get_output_shape(1))); - GPU_DEBUG_GET_INSTANCE(debug_config); - GPU_DEBUG_IF(debug_config->verbose >= 2) { - GPU_DEBUG_COUT << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl; - } + GPU_DEBUG_LOG << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl; shared_memory.emplace_back(p.GetEngine().allocate_memory(mutableLayout)); cldnn::primitive_id ctc_gd_mutable_id_w = layer_type_name_ID(op) + "_md_write"; diff --git a/src/plugins/intel_gpu/src/plugin/ops/non_max_suppression.cpp b/src/plugins/intel_gpu/src/plugin/ops/non_max_suppression.cpp index 7dd7ca8cecb..c583254513d 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/non_max_suppression.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/non_max_suppression.cpp @@ -93,7 +93,6 @@ static void CreateNonMaxSuppressionIEInternalOp(Program& p, const std::shared_pt auto outputIndices = op->get_output_partial_shape(0)[0].get_length(); std::vector shared_memory; - GPU_DEBUG_GET_INSTANCE(debug_config); switch (num_outputs) { case 3: { auto mutable_precision_second = op->get_output_element_type(2); @@ -105,9 +104,7 @@ static void CreateNonMaxSuppressionIEInternalOp(Program& p, const std::shared_pt cldnn::format::get_default_format(op->get_output_shape(2).size()), tensor_from_dims(op->get_output_shape(2))); - GPU_DEBUG_IF(debug_config->verbose >= 2) { - GPU_DEBUG_COUT << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl; - } + GPU_DEBUG_LOG << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl; shared_memory.emplace_back(p.GetEngine().allocate_memory(mutableLayoutSecond)); cldnn::primitive_id non_max_supression_mutable_id_w_second = layer_type_name_ID(op) + "_md_write_second"; @@ -123,9 +120,7 @@ static void CreateNonMaxSuppressionIEInternalOp(Program& p, const std::shared_pt cldnn::format::bfyx, 
         cldnn::tensor(static_cast(outputIndices), 3, 1, 1));
-        GPU_DEBUG_IF(debug_config->verbose >= 2) {
-            GPU_DEBUG_COUT << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl;
-        }
+        GPU_DEBUG_LOG << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl;
         shared_memory.emplace_back(p.GetEngine().allocate_memory(mutableLayoutFirst));
         cldnn::primitive_id non_max_supression_mutable_id_w_first = layer_type_name_ID(op) + "_md_write_first";
diff --git a/src/plugins/intel_gpu/src/plugin/ops/proposal.cpp b/src/plugins/intel_gpu/src/plugin/ops/proposal.cpp
index 99d28e412f6..93bade2040a 100644
--- a/src/plugins/intel_gpu/src/plugin/ops/proposal.cpp
+++ b/src/plugins/intel_gpu/src/plugin/ops/proposal.cpp
@@ -64,10 +64,7 @@ static void CreateProposalOp(Program& p, const std::shared_ptrget_output_shape(1).size()),
                                     tensor_from_dims(op->get_output_shape(1)));
-        GPU_DEBUG_GET_INSTANCE(debug_config);
-        GPU_DEBUG_IF(debug_config->verbose >= 2) {
-            GPU_DEBUG_COUT << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl;
-        }
+        GPU_DEBUG_LOG << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl;
         auto shared_memory = p.GetEngine().allocate_memory(mutableLayout);
         cldnn::primitive_id proposal_mutable_id_w = layer_type_name_ID(op) + "_md_write";
diff --git a/src/plugins/intel_gpu/src/plugin/ops/topk.cpp b/src/plugins/intel_gpu/src/plugin/ops/topk.cpp
index 618013160d7..f7132c83007 100644
--- a/src/plugins/intel_gpu/src/plugin/ops/topk.cpp
+++ b/src/plugins/intel_gpu/src/plugin/ops/topk.cpp
@@ -65,10 +65,7 @@ static void CreateTopKOp(Program& p, const std::shared_ptr
                                         cldnn::format::get_default_format(op->get_output_shape(1).size()),
                                         tensor_from_dims(op->get_output_shape(1)));
-            GPU_DEBUG_GET_INSTANCE(debug_config);
-            GPU_DEBUG_IF(debug_config->verbose >= 2) {
-                GPU_DEBUG_COUT << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl;
-            }
+            GPU_DEBUG_LOG << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl;
             auto shared_memory = p.GetEngine().allocate_memory(mutableLayout);
             cldnn::primitive_id argmax_mutable_id_w = layer_type_name_ID(op) + "_md_write";
diff --git a/src/plugins/intel_gpu/src/plugin/plugin.cpp b/src/plugins/intel_gpu/src/plugin/plugin.cpp
index 84b842e99a9..cb7fa99fa8a 100644
--- a/src/plugins/intel_gpu/src/plugin/plugin.cpp
+++ b/src/plugins/intel_gpu/src/plugin/plugin.cpp
@@ -686,25 +686,20 @@ Parameter Plugin::GetMetric(const std::string& name, const std::mapverbose >= 1) {
-                GPU_DEBUG_COUT << "[GPU_OPTIMAL_BATCH_SIZE] ov::hint::model is not set: return 1" << std::endl;
-            }
+            GPU_DEBUG_INFO << "[OPTIMAL_BATCH_SIZE] ov::hint::model is not set: return 1" << std::endl;
             return decltype(ov::optimal_batch_size)::value_type {static_cast(1)};
         }
         std::shared_ptr model;
         try {
             model = model_param->second.as>();
         } catch (...) {
-            IE_THROW() << "[GPU_OPTIMAL_BATCH_SIZE] ov::hint::model should be std::shared_ptr type";
-        }
-        GPU_DEBUG_IF(debug_config->verbose >= 1) {
-            GPU_DEBUG_COUT << "DEVICE_INFO:"
-                           << "gfx_version.major, " << device_info.gfx_ver.major
-                           << "gfx_version.minor " << std::to_string(device_info.gfx_ver.minor) << std::endl;
+            IE_THROW() << "[OPTIMAL_BATCH_SIZE] ov::hint::model should be std::shared_ptr type";
         }
+        GPU_DEBUG_INFO << "DEVICE_INFO:"
+                       << "gfx_version.major, " << device_info.gfx_ver.major
+                       << ", gfx_version.minor " << std::to_string(device_info.gfx_ver.minor) << std::endl;
         static std::map gen_kbytes_per_bank = {
                 {{12, 0, 0}, 480}, // TGL
                 {{12, 1, 0}, 2048}, // DG1
@@ -722,14 +717,12 @@ Parameter Plugin::GetMetric(const std::string& name, const std::mapverbose >= 1) {
-            GPU_DEBUG_COUT << "DEVICE_INFO:"
-                           << "num_slices " << device_info.num_slices
-                           << ", num_sub_slices_per_slice " << device_info.num_sub_slices_per_slice
-                           << ", num_banks_per_slice " << num_banks_per_slice
-                           << ", gen_kbytes_per_bank : " << kbytes_per_bank
-                           << ", L3_cache_size is (MB): " << float(L3_cache_size) / 1024 / 1024 << std::endl;
-        }
+        GPU_DEBUG_INFO << "DEVICE_INFO:"
+                       << "num_slices " << device_info.num_slices
+                       << ", num_sub_slices_per_slice " << device_info.num_sub_slices_per_slice
+                       << ", num_banks_per_slice " << num_banks_per_slice
+                       << ", gen_kbytes_per_bank : " << kbytes_per_bank
+                       << ", L3_cache_size is (MB): " << float(L3_cache_size) / 1024 / 1024 << std::endl;
     }
     Config config = _impl->m_configs.GetConfig(device_id);
     auto networkCloned = CloneAndTransformNetwork(CNNNetwork(model), config);
@@ -744,11 +737,9 @@ Parameter Plugin::GetMetric(const std::string& name, const std::mapverbose >= 1) {
-            GPU_DEBUG_COUT << memPressure.max_mem_tolerance << std::endl;
-            GPU_DEBUG_COUT << "MAX_BATCH: " << max_batch_size << std::endl;
-            GPU_DEBUG_COUT << "ACTUAL OPTIMAL BATCH: " << batch << std::endl;
-        }
+        GPU_DEBUG_INFO << memPressure.max_mem_tolerance << std::endl;
+        GPU_DEBUG_INFO << "MAX_BATCH: " << max_batch_size << std::endl;
+        GPU_DEBUG_INFO << "ACTUAL OPTIMAL BATCH: " << batch << std::endl;
         return decltype(ov::optimal_batch_size)::value_type {batch};
     } else if (name == ov::device::uuid) {
         ov::device::UUID uuid = {};
@@ -815,17 +806,13 @@ Parameter Plugin::GetMetric(const std::string& name, const std::mapverbose >= 2) {
-            GPU_DEBUG_COUT << "[GPU_MAX_BATCH_SIZE] available memory is " << available_device_mem
-                           << " (occupied: " << occupied_device_mem << ")" << std::endl;
-        }
+        GPU_DEBUG_LOG << "[GPU_MAX_BATCH_SIZE] available memory is " << available_device_mem
+                      << " (occupied: " << occupied_device_mem << ")" << std::endl;
         int64_t max_batch_size = 1;
         if (options.find(ov::hint::model.name()) == options.end()) {
-            GPU_DEBUG_IF(debug_config->verbose >= 1) {
-                GPU_DEBUG_COUT << "[GPU_MAX_BATCH_SIZE] MODELS_PTR is not set: return 1" << std::endl;
-            }
+            GPU_DEBUG_INFO << "[GPU_MAX_BATCH_SIZE] MODELS_PTR is not set: return 1" << std::endl;
             return decltype(ov::max_batch_size)::value_type {static_cast(max_batch_size)};
         }
@@ -849,17 +836,13 @@ Parameter Plugin::GetMetric(const std::string& name, const std::mapverbose >= 2) {
-            GPU_DEBUG_COUT << "[GPU_MAX_BATCH_SIZE] n_streams : " << n_streams << std::endl;
-        }
+        GPU_DEBUG_INFO << "[GPU_MAX_BATCH_SIZE] n_streams : " << n_streams << std::endl;
        auto available_device_mem_it = options.find(ov::intel_gpu::hint::available_device_mem.name());
        if (available_device_mem_it != options.end()) {
            if (available_device_mem_it->second.is()) {
                available_device_mem = std::min(static_cast(available_device_mem), available_device_mem_it->second.as());
-                GPU_DEBUG_IF(debug_config->verbose >= 2) {
-                    GPU_DEBUG_COUT << "[GPU_MAX_BATCH_SIZE] available memory is reset by user " << available_device_mem << std::endl;
-                }
+                GPU_DEBUG_LOG << "[GPU_MAX_BATCH_SIZE] available memory is reset by user " << available_device_mem << std::endl;
            } else {
                IE_THROW() << "[GPU_MAX_BATCH_SIZE] bad casting: ov::intel_gpu::hint::available_device_mem should be int64_t type";
            }
@@ -910,9 +893,7 @@ Parameter Plugin::GetMetric(const std::string& name, const std::mapget_partial_shape();
            // currently no plugin support batched execution for dynamic networks
            if (shape.is_dynamic()) {
-                GPU_DEBUG_IF(debug_config->verbose >= 2) {
-                    GPU_DEBUG_COUT << "[MAX_BATCH_SIZE] does not support dynamic networks" << std::endl;
-                }
+                GPU_DEBUG_LOG << "[MAX_BATCH_SIZE] does not support dynamic networks" << std::endl;
                return decltype(ov::max_batch_size)::value_type {static_cast(max_batch_size)};
            }
@@ -921,10 +902,8 @@ Parameter Plugin::GetMetric(const std::string& name, const std::mapoutput(0));
-                    GPU_DEBUG_IF(debug_config->verbose >= 2) {
-                        GPU_DEBUG_COUT << "[MAX_BATCH_SIZE] detected batched input " << batched_input_id
-                                       << "[" << s << "]" << std::endl;
-                    }
+                    GPU_DEBUG_LOG << "[MAX_BATCH_SIZE] detected batched input " << batched_input_id
+                                  << "[" << s << "]" << std::endl;
                    batched_inputs.insert(std::make_pair(batched_input_id, s));
                }
            }
@@ -932,9 +911,7 @@ Parameter Plugin::GetMetric(const std::string& name, const std::mapverbose >= 2) {
-                GPU_DEBUG_COUT << "[MAX_BATCH_SIZE] MAX_BATCH_SIZE supports only networks with inputs/outputs featuring batched dim." << std::endl;
            return decltype(ov::max_batch_size)::value_type {static_cast(max_batch_size)};
        }
@@ -944,9 +921,7 @@ Parameter Plugin::GetMetric(const std::string& name, const std::mapverbose >= 1) {
-                GPU_DEBUG_COUT << "[MAX_BATCH_SIZE] Error at reshape to " << base_batch_size << std::endl;
-            }
+            GPU_DEBUG_INFO << "[MAX_BATCH_SIZE] Error at reshape to " << base_batch_size << std::endl;
            return decltype(ov::max_batch_size)::value_type {static_cast(max_batch_size)};
        }
@@ -962,15 +937,11 @@ Parameter Plugin::GetMetric(const std::string& name, const std::map(static_cast(available_device_mem) - device_memory_usage.first));
            int64_t mem_per_batch = std::max(static_cast(1L), (device_memory_usage.second / static_cast(base_batch_size)));
            max_batch_size = mem_for_general / (mem_per_batch * static_cast(n_streams));
-            GPU_DEBUG_IF(debug_config->verbose >= 1) {
-                GPU_DEBUG_COUT << "[GPU_MAX_BATCH_SIZE] Base batch size: " << base_batch_size << std::endl;
-                GPU_DEBUG_COUT << "[GPU_MAX_BATCH_SIZE] Const mem usage: " << device_memory_usage.first << std::endl;
-                GPU_DEBUG_COUT << "[GPU_MAX_BATCH_SIZE] General mem usage: " << device_memory_usage.second << std::endl;
-            }
+            GPU_DEBUG_INFO << "[GPU_MAX_BATCH_SIZE] Base batch size: " << base_batch_size << std::endl;
+            GPU_DEBUG_INFO << "[GPU_MAX_BATCH_SIZE] Const mem usage: " << device_memory_usage.first << std::endl;
+            GPU_DEBUG_INFO << "[GPU_MAX_BATCH_SIZE] General mem usage: " << device_memory_usage.second << std::endl;
        } catch (std::exception& e) {
-            GPU_DEBUG_IF(debug_config->verbose >= 1) {
-                GPU_DEBUG_COUT << "[GPU_MAX_BATCH_SIZE] Failed in reshape or build program " << e.what() << std::endl;
-            }
+            GPU_DEBUG_INFO << "[GPU_MAX_BATCH_SIZE] Failed in reshape or build program " << e.what() << std::endl;
        }
        return decltype(ov::max_batch_size)::value_type {static_cast(max_batch_size)};
    } else if (isModelCachingEnabled && name == METRIC_KEY(IMPORT_EXPORT_SUPPORT)) {
diff --git a/src/plugins/intel_gpu/src/plugin/program.cpp b/src/plugins/intel_gpu/src/plugin/program.cpp
index 930c8bf6092..a726dc10db1 100644
--- a/src/plugins/intel_gpu/src/plugin/program.cpp
+++ b/src/plugins/intel_gpu/src/plugin/program.cpp
@@ -375,11 +375,8 @@ bool Program::IsOpSupported(const InferenceEngine::CNNNetwork& network, const st
 void Program::CreateSingleLayerPrimitive(cldnn::topology& topology, const std::shared_ptr& op) {
     OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "Program::CreateSingleLayerPrimitive");
-    GPU_DEBUG_GET_INSTANCE(debug_config);
-    GPU_DEBUG_IF(debug_config->verbose >= 2) {
-        GPU_DEBUG_COUT << "Process " << "op::v" << op->get_type_info().version << "::" << op->get_type_name() << " operation "
-                       << "(friendly_name=" << op->get_friendly_name() << ")" << std::endl;
-    }
+    GPU_DEBUG_LOG << "Process " << "op::v" << op->get_type_info().version << "::" << op->get_type_name() << " operation "
+                  << "(friendly_name=" << op->get_friendly_name() << ")" << std::endl;
     bool is_created = false;
     const ngraph::NodeTypeInfo* op_type_info = &op->get_type_info();
diff --git a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp
index d50f090327e..abb45e877b2 100644
--- a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp
+++ b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp
@@ -17,6 +17,8 @@ const char *debug_configuration::prefix = "GPU_Debug: ";
 #ifdef GPU_DEBUG_CONFIG
+#define GPU_DEBUG_COUT std::cout << cldnn::debug_configuration::prefix
+
 template
 void print_option(std::string option_name, T option_value) {
     GPU_DEBUG_COUT << "Config " << option_name << " = " << option_value << std::endl;
diff --git a/src/plugins/intel_gpu/src/runtime/engine.cpp b/src/plugins/intel_gpu/src/runtime/engine.cpp
index f3e7304bd1b..d66bedbab82 100644
--- a/src/plugins/intel_gpu/src/runtime/engine.cpp
+++ b/src/plugins/intel_gpu/src/runtime/engine.cpp
@@ -265,11 +265,8 @@ std::shared_ptr engine::create(engine_types engine_type,
     default: throw std::runtime_error("Invalid engine type");
     }
-    GPU_DEBUG_GET_INSTANCE(debug_config);
-    GPU_DEBUG_IF(debug_config->verbose >= 1) {
-        const auto& info = device->get_info();
-        GPU_DEBUG_COUT << "Selected Device: " << info.dev_name << std::endl;
-    }
+    const auto& info = device->get_info();
+    GPU_DEBUG_INFO << "Selected Device: " << info.dev_name << std::endl;
     return ret;
 }
diff --git a/src/plugins/intel_gpu/src/runtime/kernels_cache.cpp b/src/plugins/intel_gpu/src/runtime/kernels_cache.cpp
index 4c630ee07ba..12cabd542f4 100644
--- a/src/plugins/intel_gpu/src/runtime/kernels_cache.cpp
+++ b/src/plugins/intel_gpu/src/runtime/kernels_cache.cpp
@@ -299,12 +299,9 @@ void kernels_cache::build_batch(const engine& build_engine, const batch_program&
             dump_file << "*/\n";
         }
         if (!err_log.empty()) {
-            GPU_DEBUG_GET_INSTANCE(debug_config);
-            GPU_DEBUG_IF(debug_config->verbose) {
-                std::cout << "-------- OpenCL build error" << std::endl;
-                std::cout << err_log << std::endl;
-                std::cout << "-------- End of OpenCL build error" << std::endl;
-            }
+            GPU_DEBUG_INFO << "-------- OpenCL build error" << std::endl;
+            GPU_DEBUG_INFO << err_log << std::endl;
+            GPU_DEBUG_INFO << "-------- End of OpenCL build error" << std::endl;
             std::stringstream err_ss(err_log);
             std::string line;
             int cnt = 0;
diff --git a/src/plugins/intel_gpu/src/runtime/memory.cpp b/src/plugins/intel_gpu/src/runtime/memory.cpp
index 63511ce9228..e097946f204 100644
--- a/src/plugins/intel_gpu/src/runtime/memory.cpp
+++ b/src/plugins/intel_gpu/src/runtime/memory.cpp
@@ -21,12 +21,9 @@ memory::memory(engine* engine, const layout& layout, allocation_type type, bool
     : _engine(engine), _layout(layout), _bytes_count(_layout.bytes_count()), _type(type), _reused(reused) {
     if (!_reused && _engine) {
         _engine->add_memory_used(_bytes_count, type);
-        GPU_DEBUG_GET_INSTANCE(debug_config);
-        GPU_DEBUG_IF(debug_config->verbose >= 1) {
-            GPU_DEBUG_COUT << "Allocate " << _bytes_count << " bytes of " << type << " allocation type"
-                           << " (current=" << _engine->get_used_device_memory(type) << ";"
-                           << " max=" << _engine->get_max_used_device_memory(type) << ")" << std::endl;
-        }
+        GPU_DEBUG_LOG << "Allocate " << _bytes_count << " bytes of " << type << " allocation type"
+                      << " (current=" << _engine->get_used_device_memory(type) << ";"
+                      << " max=" << _engine->get_max_used_device_memory(type) << ")" << std::endl;
     }
 }
@@ -35,12 +32,9 @@ memory::~memory() {
     try {
         _engine->subtract_memory_used(_bytes_count, _type);
     } catch (...) {}
-        GPU_DEBUG_GET_INSTANCE(debug_config);
-        GPU_DEBUG_IF(debug_config->verbose >= 1) {
-            GPU_DEBUG_COUT << "Free " << _bytes_count << " bytes of " << _type << " allocation type"
-                           << " (current=" << _engine->get_used_device_memory(_type) << ";"
-                           << " max=" << _engine->get_max_used_device_memory(_type) << ")" << std::endl;
-        }
+        GPU_DEBUG_LOG << "Free " << _bytes_count << " bytes of " << _type << " allocation type"
+                      << " (current=" << _engine->get_used_device_memory(_type) << ";"
+                      << " max=" << _engine->get_max_used_device_memory(_type) << ")" << std::endl;
     }
 }
diff --git a/src/plugins/intel_gpu/src/runtime/memory_pool.cpp b/src/plugins/intel_gpu/src/runtime/memory_pool.cpp
index 5bf970f22c7..04bb08888f1 100644
--- a/src/plugins/intel_gpu/src/runtime/memory_pool.cpp
+++ b/src/plugins/intel_gpu/src/runtime/memory_pool.cpp
@@ -137,10 +137,7 @@ memory::ptr memory_pool::get_from_non_padded_pool(const layout& layout,
             ++it;
         }
     }
-    GPU_DEBUG_GET_INSTANCE(debug_config);
-    GPU_DEBUG_IF(debug_config->verbose >= 2) {
-        GPU_DEBUG_COUT << "[" << id << ": output]" << std::endl;
-    }
+    GPU_DEBUG_LOG << "[" << id << ": output]" << std::endl;
     // didn't find anything for you? create new resource
     auto mem = alloc_memory(layout, type);
     {
@@ -179,10 +176,7 @@ memory::ptr memory_pool::get_from_padded_pool(const layout& layout,
                 memory_record({{id, network_id}}, mem, network_id, type));
             return mem;
         }
-        GPU_DEBUG_GET_INSTANCE(debug_config);
-        GPU_DEBUG_IF(debug_config->verbose >= 2) {
-            GPU_DEBUG_COUT << "[" << id << ": output]" << std::endl;
-        }
+        GPU_DEBUG_LOG << "[" << id << ": output]" << std::endl;
         auto mem = alloc_memory(layout, type);
         std::list list = {memory_record({{id, network_id}}, mem, network_id, type)};
         _padded_pool.emplace(layout, std::move(list));
diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_command_queues_builder.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_command_queues_builder.cpp
index d2dca452ca4..a501e935e7c 100644
--- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_command_queues_builder.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_command_queues_builder.cpp
@@ -76,10 +76,7 @@ std::vector command_queues_builder::get_properties(const cl
         using cmp_t = std::common_type::type>::type;
         if (!(static_cast(queue_properties) & static_cast(cl::QueueProperties::OutOfOrder))) {
             out_of_order = false;
-            GPU_DEBUG_GET_INSTANCE(debug_config);
-            GPU_DEBUG_IF(debug_config->verbose >= 1) {
-                GPU_DEBUG_COUT << "Requested out-of-order queue is not supported by current device. Use in-order instead";
-            }
+            GPU_DEBUG_INFO << "Requested out-of-order queue is not supported by the current device. Use in-order instead\n";
         }
     }
diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_event.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_event.cpp
index 872a715838f..52e1b1b5c42 100644
--- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_event.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_event.cpp
@@ -180,10 +180,10 @@ bool ocl_events::get_profiling_info_impl(std::listprint_multi_kernel_perf) {
             if (period.stage == instrumentation::profiling_stage::executing) {
-                GPU_DEBUG_COUT << "Multi-kernel time: ";
+                GPU_DEBUG_TRACE << "Multi-kernel time: ";
                 for (auto& duration : all_durations[period.stage])
-                    std::cout << " " << (duration.second - duration.first) / 1000;
-                std::cout << " Total " << sum / 1000 << std::endl;
+                    GPU_DEBUG_TRACE << " " << (duration.second - duration.first) / 1000;
+                GPU_DEBUG_TRACE << " Total " << sum / 1000 << std::endl;
             }
         }
diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.cpp
index 0963e1b5f4f..7238ec822dc 100644
--- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.cpp
@@ -338,10 +338,7 @@ void* gpu_usm::lock(const stream& stream, mem_lock_type type) {
         if (type != mem_lock_type::read) {
             throw std::runtime_error("Unable to lock allocation_type::usm_device with write lock_type.");
         }
-        GPU_DEBUG_GET_INSTANCE(debug_config);
-        GPU_DEBUG_IF(debug_config->verbose >= 2) {
-            GPU_DEBUG_COUT << "Copy usm_device buffer to host buffer." << std::endl;
-        }
+        GPU_DEBUG_LOG << "Copy usm_device buffer to host buffer." << std::endl;
         _host_buffer.allocateHost(_bytes_count);
         cl_stream.get_usm_helper().enqueue_memcpy(cl_stream.get_cl_queue(), _host_buffer.get(), _buffer.get(), _bytes_count, CL_TRUE);
         _mapped_ptr = _host_buffer.get();
diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.cpp
index 78fd6a28894..de03847b82f 100644
--- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.cpp
@@ -6,6 +6,7 @@
 #include "ocl_event.hpp"
 #include "ocl_user_event.hpp"
 #include "ocl_command_queues_builder.hpp"
+#include "intel_gpu/runtime/debug_configuration.hpp"
 #include "ocl_kernel.hpp"
 #include "ocl_common.hpp"
@@ -49,6 +50,27 @@ inline cl::NDRange toNDRange(const std::vector& v) {
     }
 }
+cl_int set_kernel_arg(ocl_kernel_type& kernel, uint32_t idx, cldnn::memory::cptr mem) {
+    if (!mem)
+        return CL_INVALID_ARG_VALUE;
+
+    if (mem->get_layout().format.is_image_2d()) {
+        auto buf = std::dynamic_pointer_cast(mem)->get_buffer();
+        GPU_DEBUG_TRACE_DETAIL << "kernel: " << kernel.get() << " set arg (image) " << idx << " mem: " << buf.get() << " size: " << mem->size() << std::endl;
+        return kernel.setArg(idx, buf);
+    } else if (memory_capabilities::is_usm_type(mem->get_allocation_type())) {
+        auto buf = std::dynamic_pointer_cast(mem)->get_buffer();
+        GPU_DEBUG_TRACE_DETAIL << "kernel: " << kernel.get() << " set arg (usm) " << idx << " mem: " << buf.get() << " size: " << mem->size() << std::endl;
+        return kernel.setArgUsm(idx, buf);
+    } else {
+        auto buf = std::dynamic_pointer_cast(mem)->get_buffer();
+        GPU_DEBUG_TRACE_DETAIL << "kernel: " << kernel.get() << " set arg (buffer) " << idx << " mem: " << buf.get() << " size: " << mem->size() << std::endl;
+        return kernel.setArg(idx, buf);
+    }
+
+    return CL_INVALID_ARG_VALUE;
+}
+
 void set_arguments_impl(ocl_kernel_type& kernel,
                         const arguments_desc& args,
                         const kernel_arguments_data& data) {
@@ -59,121 +81,44 @@ void set_arguments_impl(ocl_kernel_type& kernel,
         switch (args[i].t) {
             case args_t::INPUT:
                 if (args[i].index < data.inputs.size() && data.inputs[args[i].index]) {
-                    const auto& input_mem = data.inputs[args[i].index];
-                    if (input_mem) {
-                        if (input_mem->get_layout().format.is_image_2d())
-                            status = kernel.setArg(i, std::dynamic_pointer_cast(input_mem)->get_buffer());
-                        else if (memory_capabilities::is_usm_type(input_mem->get_allocation_type()))
-                            status = kernel.setArgUsm(i, std::dynamic_pointer_cast(input_mem)->get_buffer());
-                        else
-                            status = kernel.setArg(i, std::dynamic_pointer_cast(input_mem)->get_buffer());
-                    }
+                    status = set_kernel_arg(kernel, i, data.inputs[args[i].index]);
                 }
                 break;
            case args_t::INPUT_OF_FUSED_PRIMITIVE:
                if (args[i].index < data.fused_op_inputs.size() && data.fused_op_inputs[args[i].index]) {
-                    const auto& input_mem = data.fused_op_inputs[args[i].index];
-                    if (input_mem) {
-                        if (memory_capabilities::is_usm_type(input_mem->get_allocation_type()))
-                            status = kernel.setArgUsm(i, std::dynamic_pointer_cast(input_mem)->get_buffer());
-                        else
-                            status = kernel.setArg(i, std::dynamic_pointer_cast(input_mem)->get_buffer());
-                    }
+                    status = set_kernel_arg(kernel, i, data.fused_op_inputs[args[i].index]);
                }
                break;
            case args_t::INTERNAL_BUFFER:
                if (args[i].index < data.intermediates.size() && data.intermediates[args[i].index]) {
-                    const auto& input_mem = data.intermediates[args[i].index];
-                    if (input_mem) {
-                        if (memory_capabilities::is_usm_type(input_mem->get_allocation_type()))
-                            status = kernel.setArgUsm(i, std::dynamic_pointer_cast(input_mem)->get_buffer());
-                        else
-                            status = kernel.setArg(i, std::dynamic_pointer_cast(input_mem)->get_buffer());
-                    }
+                    status = set_kernel_arg(kernel, i, data.intermediates[args[i].index]);
                }
                break;
            case args_t::OUTPUT:
                if (args[i].index < data.outputs.size() && data.outputs[args[i].index]) {
-                    const auto& output_mem = data.outputs[args[i].index];
-                    if (output_mem) {
-                        if (output_mem->get_layout().format.is_image_2d())
-                            status = kernel.setArg(i, std::dynamic_pointer_cast(output_mem)->get_buffer());
-                        else if (memory_capabilities::is_usm_type(output_mem->get_allocation_type()))
-                            status = kernel.setArgUsm(i, std::dynamic_pointer_cast(output_mem)->get_buffer());
-                        else
-                            status = kernel.setArg(i, std::dynamic_pointer_cast(output_mem)->get_buffer());
-                    }
+                    status = set_kernel_arg(kernel, i, data.outputs[args[i].index]);
                }
                break;
            case args_t::WEIGHTS:
-                if (data.weights) {
-                    if (data.weights->get_layout().format.is_image_2d())
-                        status = kernel.setArg(i, std::dynamic_pointer_cast(data.weights)->get_buffer());
-                    else if (memory_capabilities::is_usm_type(data.weights->get_allocation_type()))
-                        status = kernel.setArgUsm(i, std::dynamic_pointer_cast(data.weights)->get_buffer());
-                    else
-                        status = kernel.setArg(i, std::dynamic_pointer_cast(data.weights)->get_buffer());
-                }
+                status = set_kernel_arg(kernel, i, data.weights);
                break;
            case args_t::BIAS:
-                if (data.bias) {
-                    if (memory_capabilities::is_usm_type(data.bias->get_allocation_type()))
-                        status = kernel.setArgUsm(i, std::dynamic_pointer_cast(data.bias)->get_buffer());
-                    else
-                        status = kernel.setArg(i, std::dynamic_pointer_cast(data.bias)->get_buffer());
-                }
+                status = set_kernel_arg(kernel, i, data.bias);
                break;
            case args_t::WEIGHTS_ZERO_POINTS:
-                if (data.weights_zero_points) {
-                    if (memory_capabilities::is_usm_type(data.weights_zero_points->get_allocation_type()))
-                        status = kernel.setArgUsm(
-                            i,
-                            std::dynamic_pointer_cast(data.weights_zero_points)->get_buffer());
-                    else
-                        status = kernel.setArg(
-                            i,
-                            std::dynamic_pointer_cast(data.weights_zero_points)->get_buffer());
-                }
+                status = set_kernel_arg(kernel, i, data.weights_zero_points);
                break;
            case args_t::ACTIVATIONS_ZERO_POINTS:
-                if (data.activations_zero_points) {
-                    if (memory_capabilities::is_usm_type(data.activations_zero_points->get_allocation_type()))
-                        status = kernel.setArgUsm(
-                            i,
-                            std::dynamic_pointer_cast(data.activations_zero_points)->get_buffer());
-                    else
-                        status = kernel.setArg(
-                            i,
-                            std::dynamic_pointer_cast(data.activations_zero_points)->get_buffer());
-                }
+                status = set_kernel_arg(kernel, i, data.activations_zero_points);
                break;
            case args_t::COMPENSATION:
-                if (data.compensation) {
-                    if (memory_capabilities::is_usm_type(data.compensation->get_allocation_type()))
-                        status = kernel.setArgUsm(
-                            i,
-                            std::dynamic_pointer_cast(data.compensation)->get_buffer());
-                    else
-                        status = kernel.setArg(
-                            i,
-                            std::dynamic_pointer_cast(data.compensation)->get_buffer());
-                }
+                status = set_kernel_arg(kernel, i, data.compensation);
                break;
            case args_t::SCALE_TABLE:
-                if (data.scale_table) {
-                    if (memory_capabilities::is_usm_type(data.scale_table->get_allocation_type()))
-                        status = kernel.setArgUsm(i, std::dynamic_pointer_cast(data.scale_table)->get_buffer());
-                    else
-                        status = kernel.setArg(i, std::dynamic_pointer_cast(data.scale_table)->get_buffer());
-                }
+                status = set_kernel_arg(kernel, i, data.scale_table);
                break;
            case args_t::SLOPE:
-                if (data.slope) {
-                    if (memory_capabilities::is_usm_type(data.slope->get_allocation_type()))
-                        status = kernel.setArgUsm(i, std::dynamic_pointer_cast(data.slope)->get_buffer());
-                    else
-                        status = kernel.setArg(i, std::dynamic_pointer_cast(data.slope)->get_buffer());
-                }
+                status = set_kernel_arg(kernel, i, data.slope);
                break;
            case args_t::SPLIT:
                status = kernel.setArg(i, data.split);
@@ -217,48 +162,17 @@ void set_arguments_impl(ocl_kernel_type& kernel,
                    }
                }
                break;
-            case args_t::RECURRENT:  // RNN/LSTM/GRU layers
-                if (data.recurrent) {
-                    if (data.recurrent->get_layout().format.is_image_2d())
-                        status = kernel.setArg(i, dynamic_cast(*data.recurrent).get_buffer());
-                    else if (memory_capabilities::is_usm_type(data.recurrent->get_allocation_type()))
-                        status = kernel.setArgUsm(i, dynamic_cast(*data.recurrent).get_buffer());
-                    else
-                        status = kernel.setArg(i, dynamic_cast(*data.recurrent).get_buffer());
-                }
+            case args_t::RECURRENT:
+                status = set_kernel_arg(kernel, i, data.recurrent);
                break;
-            case args_t::HIDDEN:  // RNN/LSTM/GRU layers
-                if (data.hidden) {
-                    if (data.hidden->get_layout().format.is_image_2d())
-                        status = kernel.setArg(i, dynamic_cast(*data.hidden).get_buffer());
-                    else if (memory_capabilities::is_usm_type(data.hidden->get_allocation_type()))
-                        status = kernel.setArgUsm(i, dynamic_cast(*data.hidden).get_buffer());
-                    else
-                        status = kernel.setArg(i, dynamic_cast(*data.hidden).get_buffer());
-                }
+            case args_t::HIDDEN:
+                status = set_kernel_arg(kernel, i, data.hidden);
                break;
-            case args_t::CELL:  // LSTMlayers
-                if (data.cell) {
-                    if (data.cell->get_layout().format.is_image_2d())
-                        status = kernel.setArg(i, dynamic_cast(*data.cell).get_buffer());
-                    else if (memory_capabilities::is_usm_type(data.cell->get_allocation_type()))
-                        status = kernel.setArgUsm(i, dynamic_cast(*data.cell).get_buffer());
-                    else
-                        status = kernel.setArg(i, dynamic_cast(*data.cell).get_buffer());
-                }
+            case args_t::CELL:
+                status = set_kernel_arg(kernel, i, data.cell);
                break;
            case args_t::SHAPE_INFO:
-                if (args[i].index == 0 && data.shape_info) {
-                    const auto& shape_info_mem = data.shape_info;
-                    if (shape_info_mem) {
-                        if (shape_info_mem->get_layout().format.is_image_2d())
-                            status = kernel.setArg(i, std::dynamic_pointer_cast(shape_info_mem)->get_buffer());
-                        else if (memory_capabilities::is_usm_type(shape_info_mem->get_allocation_type()))
-                            status = kernel.setArgUsm(i, std::dynamic_pointer_cast(shape_info_mem)->get_buffer());
-                        else
-                            status = kernel.setArg(i, std::dynamic_pointer_cast(shape_info_mem)->get_buffer());
-                    }
-                }
+                status = set_kernel_arg(kernel, i, data.shape_info);
                break;
            default:
                break;
@@ -359,6 +273,7 @@ void ocl_stream::set_arguments(kernel& kernel, const kernel_arguments_desc& args
     auto& kern = ocl_kernel.get_handle();
     try {
+        GPU_DEBUG_TRACE_DETAIL << "Set arguments for primitive: " << args_desc.layerID << " (" << kern.get() << ")\n";
         set_arguments_impl(kern, args_desc.arguments, args);
     } catch (cl::Error const& err) {
         throw ocl_error(err);