[GPU] Refactored GPU verbose utils (#14549)

* [GPU] Refactored GPU verbose utils

* [GPU] More refactoring

* Fix build
Vladimir Paramuzov 2022-12-20 11:06:01 +04:00 committed by GitHub
parent e306cbc67a
commit 6bc4f00695
31 changed files with 266 additions and 536 deletions

View File

@ -7,22 +7,63 @@
#include <mutex>
#include <vector>
namespace ov {
namespace intel_gpu {
// Verbose log levels:
// DISABLED - silent mode (Default)
// INFO - Minimal verbose:
// * May log basic info about device, plugin configuration, model and execution statistics
// * Mustn't log any info that depends on the number of iterations or the number of layers in the model
// * Minimal impact on both load time and inference time
// LOG - Enables graph optimization verbose:
// * Includes info from log level INFO
// * May log info about applied graph transformations, memory allocations and other model compilation time steps
// * May impact compile_model() execution time
// * Minimal impact on inference time
// TRACE - Enables basic execution time verbose
// * Includes info from log level LOG
// * May log info during model execution
// * May log short info about primitive execution
// * May impact network execution time
// TRACE_DETAIL - Max verbosity
// * Includes info from log level TRACE
// * May log any stage and print detailed info about each execution step
enum class LogLevel : int8_t {
DISABLED = 0,
INFO = 1,
LOG = 2,
TRACE = 3,
TRACE_DETAIL = 4
};
} // namespace intel_gpu
} // namespace ov
#ifdef GPU_DEBUG_CONFIG
#define GPU_DEBUG_IF(cond) if (cond)
#define GPU_DEBUG_PROFILED_STAGE(stage) \
auto stage_prof = cldnn::instrumentation::profiled_stage<primitive_inst>(\
!cldnn::debug_configuration::get_instance()->dump_profiling_data.empty(), *this, stage)
#define GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(val) stage_prof.set_cache_hit(val)
#define GPU_DEBUG_LOG_RAW_INT(min_verbose_level) if (cldnn::debug_configuration::get_instance()->verbose >= min_verbose_level) \
std::cout << cldnn::debug_configuration::prefix
#define GPU_DEBUG_LOG_RAW(min_verbose_level) GPU_DEBUG_LOG_RAW_INT(static_cast<std::underlying_type<ov::intel_gpu::LogLevel>::type>(min_verbose_level))
#else
#define GPU_DEBUG_IF(cond) if (0)
#define GPU_DEBUG_PROFILED_STAGE(stage)
#define GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(val)
#define GPU_DEBUG_LOG_RAW(min_verbose_level) if (0) std::cout << cldnn::debug_configuration::prefix
#endif
#define GPU_DEBUG_COUT std::cout << cldnn::debug_configuration::prefix
// The macro below is used to avoid an unused variable warning when GPU_DEBUG_CONFIG is OFF
#define GPU_DEBUG_GET_INSTANCE(name) auto name = cldnn::debug_configuration::get_instance(); (void)(name);
#define GPU_DEBUG_INFO GPU_DEBUG_LOG_RAW(ov::intel_gpu::LogLevel::INFO)
#define GPU_DEBUG_LOG GPU_DEBUG_LOG_RAW(ov::intel_gpu::LogLevel::LOG)
#define GPU_DEBUG_TRACE GPU_DEBUG_LOG_RAW(ov::intel_gpu::LogLevel::TRACE)
#define GPU_DEBUG_TRACE_DETAIL GPU_DEBUG_LOG_RAW(ov::intel_gpu::LogLevel::TRACE_DETAIL)
namespace cldnn {
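
A minimal usage sketch (not part of this diff) of the refactored macros: each call site now picks a macro by log level instead of comparing debug_config->verbose against magic numbers. The function and node id below are hypothetical, and the sketch assumes the debug-utils header shown above is included.

// Illustrative only: severity-based logging with the refactored utils.
void log_example(const std::string& node_id) {
    GPU_DEBUG_INFO << "compile_model started" << std::endl;                      // verbose >= 1 (INFO)
    GPU_DEBUG_LOG << "Remove node: " << node_id << std::endl;                    // verbose >= 2 (LOG)
    GPU_DEBUG_TRACE << node_id << ": execute" << std::endl;                      // verbose >= 3 (TRACE)
    GPU_DEBUG_TRACE_DETAIL << node_id << ": realloc output memory" << std::endl; // verbose >= 4 (TRACE_DETAIL)

    // Conditions unrelated to verbosity still go through GPU_DEBUG_IF,
    // which compiles to if (0) when GPU_DEBUG_CONFIG is off.
    GPU_DEBUG_IF(cldnn::debug_configuration::get_instance()->dump_layers_path.length() > 0) {
        GPU_DEBUG_COUT << "Dumping layers for " << node_id << std::endl;
    }
}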

View File

@ -22,10 +22,8 @@
using namespace cldnn;
#define LOG_NODE_REMOVAL(id) GPU_DEBUG_IF(debug_config->verbose >= 2) { \
GPU_DEBUG_COUT << "[remove_redundant_reorders:" << __LINE__ << "] " << "Remove node: " << (id) << std::endl; }
#define LOG_NODE_REPLACEMENT(id) GPU_DEBUG_IF(debug_config->verbose >= 2) { \
GPU_DEBUG_COUT << "[remove_redundant_reorders:" << __LINE__ << "] " << "Replace node: " << (id) << std::endl; }
#define LOG_NODE_REMOVAL(id) GPU_DEBUG_LOG_PASS << "Remove node: " << (id) << std::endl;
#define LOG_NODE_REPLACEMENT(id) GPU_DEBUG_LOG_PASS << "Replace node: " << (id) << std::endl;
remove_redundant_reorders::remove_redundant_reorders(layout_optimizer& lo_ref, bool enable_reorder_fusing, bool update_implementations,
@ -34,7 +32,6 @@ remove_redundant_reorders::remove_redundant_reorders(layout_optimizer& lo_ref, b
remove_output_reorders(remove_output_reorders) {}
void remove_redundant_reorders::run(program& p) {
GPU_DEBUG_GET_INSTANCE(debug_config);
auto update_implementation = [&](program_node& node) {
if (!update_implementations)
return;

View File

@ -34,8 +34,6 @@ void reorder_inputs::run(program& p) { run(p, _lo, _rf); }
namespace {
std::map<program_node*, format::type> get_preferred_formats(program& p, layout_optimizer& lo) {
GPU_DEBUG_GET_INSTANCE(debug_config);
std::map<program_node*, format::type> fmt_map;
#ifdef ENABLE_ONEDNN_FOR_GPU
@ -59,9 +57,7 @@ std::map<program_node*, format::type> get_preferred_formats(program& p, layout_o
if (onednn_impls_counter < 1 && lo.get_optimization_attributes().use_onednn_impls) {
should_update_fmt_map = true;
lo.set_optimization_attribute(layout_optimizer::optimization_attributes_type::use_onednn_impls, 0);
GPU_DEBUG_IF(debug_config->verbose >= 1) {
GPU_DEBUG_COUT << "The return to clDNN implementations" << std::endl;
}
GPU_DEBUG_LOG << "Disable oneDNN implementations globally" << std::endl;
}
if (should_update_fmt_map)
@ -277,10 +273,7 @@ void propagate_formats_rec(std::map<program_node*, format::type>& fmt_map,
fmt = travel_direction_wrapper<dir>::first(first_fmt, second_fmt);
fmt_map.at(node) = fmt;
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[clDNN][reorder_inputs] propagate_formats_rec: " << node->id() << " - " << fmt_to_str(fmt) << std::endl;
}
GPU_DEBUG_LOG << "Propagate_formats_rec: " << node->id() << " - " << fmt_to_str(fmt) << std::endl;
for (auto next : travel_direction_wrapper<dir>::next_nodes(node)) {
if (!next->is_in_data_flow())
@ -312,10 +305,7 @@ void propagate_formats_rec<direction_e::backwards>(std::map<program_node*, forma
fmt = travel_direction_wrapper<direction_e::backwards>::first(first_fmt, second_fmt);
fmt_map.at(node) = fmt;
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[clDNN][reorder_inputs] propagate_formats_rec: " << node->id() << " - " << fmt_to_str(fmt) << std::endl;
}
GPU_DEBUG_LOG << "Propagate_formats_rec: " << node->id() << " - " << fmt_to_str(fmt) << std::endl;
for (auto next : travel_direction_wrapper<direction_e::backwards>::next_nodes(node)) {
if (!next.first->is_in_data_flow())
@ -541,7 +531,6 @@ const char *dir_msg(direction_e dir) {
// If there is layout mismatch between two layers, add reorder
template <direction_e dir>
void insert_reorders_in_dir(program& p, const std::map<program_node*, format::type>& fmt_map, reorder_factory& rf, layout_optimizer& lo, program_node* node) {
GPU_DEBUG_GET_INSTANCE(debug_config);
auto fmt = fmt_map.at(node);
auto next_cpy = travel_direction_wrapper<dir>::next_nodes(node);
@ -565,10 +554,8 @@ void insert_reorders_in_dir(program& p, const std::map<program_node*, format::ty
in_layout.format = get_target_output_format(lo, fmt_map, predecessor, successor);
out_layout.format = get_target_input_format(lo, fmt_map, successor, predecessor);
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << __func__ << ":" << __LINE__ << ":" << dir_msg(dir) << " " << node->id() << " --> " << next->id() << " ## "
<< fmt_to_str(in_layout.format) << " --> " << fmt_to_str(out_layout.format) << std::endl;
}
GPU_DEBUG_LOG << __func__ << ":" << __LINE__ << ":" << dir_msg(dir) << " " << node->id() << " --> " << next->id() << " ## "
<< fmt_to_str(in_layout.format) << " --> " << fmt_to_str(out_layout.format) << std::endl;
if (in_layout.format == format::any || out_layout.format == format::any)
continue;
@ -580,10 +567,8 @@ void insert_reorders_in_dir(program& p, const std::map<program_node*, format::ty
auto reorder = reorder_pair.first;
if (reorder && (in_layout.format != format::any && out_layout.format != format::any)) {
auto& reorder_node = p.get_or_create(reorder);
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << __func__ << ":" << __LINE__ << ":" << dir_msg(dir) << " " << reorder_node.id()
<< " Reorder is added" << std::endl;
}
GPU_DEBUG_LOG << __func__ << ":" << __LINE__ << ":" << dir_msg(dir) << " " << reorder_node.id()
<< " Reorder is added" << std::endl;
p.add_intermediate(reorder_node,
*travel_direction_wrapper<dir>::second(node, next),
*travel_direction_wrapper<dir>::first(node, next),
@ -595,7 +580,6 @@ void insert_reorders_in_dir(program& p, const std::map<program_node*, format::ty
template <>
void insert_reorders_in_dir<direction_e::backwards>(program& p, const std::map<program_node*, format::type>& fmt_map,
reorder_factory& rf, layout_optimizer& lo, program_node* node) {
GPU_DEBUG_GET_INSTANCE(debug_config);
auto fmt = fmt_map.at(node);
auto next_cpy = travel_direction_wrapper<direction_e::backwards>::next_nodes(node);
@ -619,11 +603,9 @@ void insert_reorders_in_dir<direction_e::backwards>(program& p, const std::map<p
in_layout.format = get_target_output_format(lo, fmt_map, predecessor, successor);
out_layout.format = get_target_input_format(lo, fmt_map, successor, predecessor);
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << __func__ << ":" << __LINE__ << ":" << dir_msg(direction_e::backwards)
<< " " << node->id() << " --> " << next.first->id() << " ## "
<< fmt_to_str(in_layout.format) << " --> " << fmt_to_str(out_layout.format) << std::endl;
}
GPU_DEBUG_LOG << __func__ << ":" << __LINE__ << ":" << dir_msg(direction_e::backwards)
<< " " << node->id() << " --> " << next.first->id() << " ## "
<< fmt_to_str(in_layout.format) << " --> " << fmt_to_str(out_layout.format) << std::endl;
if (in_layout.format == format::any || out_layout.format == format::any)
continue;
@ -635,10 +617,8 @@ void insert_reorders_in_dir<direction_e::backwards>(program& p, const std::map<p
auto reorder = reorder_pair.first;
if (reorder && (in_layout.format != format::any && out_layout.format != format::any)) {
auto& reorder_node = p.get_or_create(reorder);
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << __func__ << ":" << __LINE__ << ":" << dir_msg(direction_e::backwards) << " " << reorder_node.id()
<< " Reorder is added" << std::endl;
}
GPU_DEBUG_LOG << __func__ << ":" << __LINE__ << ":" << dir_msg(direction_e::backwards) << " " << reorder_node.id()
<< " Reorder is added" << std::endl;
p.add_intermediate(reorder_node,
*travel_direction_wrapper<direction_e::backwards>::second(node, next.first),
*travel_direction_wrapper<direction_e::backwards>::first(node, next.first),
@ -684,30 +664,26 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
auto fmt_map = get_preferred_formats(p, lo);
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[clDNN][reorder_inputs] Preferred formats:" << std::endl;
for (auto& node_fmt : fmt_map) {
if (node_fmt.second != format::any) {
GPU_DEBUG_COUT << "[clDNN][reorder_inputs] " << node_fmt.first->id() << " " << fmt_to_str(node_fmt.second) << std::endl;
}
GPU_DEBUG_LOG_PASS << "Preferred formats:" << std::endl;
for (auto& node_fmt : fmt_map) {
if (node_fmt.second != format::any) {
GPU_DEBUG_LOG_PASS << " " << node_fmt.first->id() << " " << fmt_to_str(node_fmt.second) << std::endl;
}
}
propagate_formats(p, fmt_map, lo);
minimize_local_reorders(p, fmt_map, lo);
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[clDNN][reorder_inputs] Selected formats:" << std::endl;
for (auto node_ptr : p.get_processing_order()) {
if (fmt_map.count(node_ptr) == 0)
continue;
GPU_DEBUG_LOG_PASS << "Selected formats:" << std::endl;
for (auto node_ptr : p.get_processing_order()) {
if (fmt_map.count(node_ptr) == 0)
continue;
auto fmt = fmt_map.at(node_ptr);
GPU_DEBUG_COUT << "[clDNN][reorder_inputs] " << node_ptr->id() << " " << fmt_to_str(fmt) << std::endl;
}
auto fmt = fmt_map.at(node_ptr);
GPU_DEBUG_LOG_PASS << " " << node_ptr->id() << " " << fmt_to_str(fmt) << std::endl;
}
GPU_DEBUG_IF(debug_config->verbose >= 1) {
GPU_DEBUG_IF(debug_config->verbose >= 2) {
reorder_cnt total_reorder_count = std::accumulate(
p.get_processing_order().begin(),
p.get_processing_order().end(),
@ -719,8 +695,8 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
return reorder_cnt{ total.number + count.number, total.total_sizes + count.total_sizes };
});
// Divide results by two as the function above counts each reorder from both sides
GPU_DEBUG_COUT << "[clDNN][reorder_inputs] Total number of reorders: " << total_reorder_count.number / 2 << std::endl;
GPU_DEBUG_COUT << "[clDNN][reorder_inputs] Total elements count of all reorders: " << total_reorder_count.total_sizes / 2 << std::endl;
GPU_DEBUG_LOG_PASS << "Total number of reorders: " << total_reorder_count.number / 2 << std::endl;
GPU_DEBUG_LOG_PASS << "Total elements count of all reorders: " << total_reorder_count.total_sizes / 2 << std::endl;
// Count number of reorders that will be fused
size_t nodes_with_fusing = 0;
@ -736,8 +712,8 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
}
}
}
GPU_DEBUG_COUT << "[clDNN][reorder_inputs] Number of nodes with fused reorders: " << nodes_with_fusing << std::endl;
GPU_DEBUG_COUT << "----------------------------------------------" << std::endl;
GPU_DEBUG_LOG_PASS << "Number of nodes with fused reorders: " << nodes_with_fusing << std::endl;
GPU_DEBUG_LOG_PASS << "----------------------------------------------" << std::endl;
}
insert_reorders(p, fmt_map, rf, lo);
@ -802,7 +778,7 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
}
};
const auto reorder_convolution = [&p, &lo, &rf, &debug_config](typed_program_node<convolution>& conv_node) {
const auto reorder_convolution = [&p, &lo, &rf](typed_program_node<convolution>& conv_node) {
{
// reorder weights convolution
auto& weights = conv_node.weights();

View File

@ -32,7 +32,6 @@ void select_preferred_formats::run(program& p) {
return;
#ifdef ENABLE_ONEDNN_FOR_GPU
GPU_DEBUG_GET_INSTANCE(debug_config);
for (auto n : p.get_processing_order()) {
// Onednn primitive descriptor creation may fail, for example, due to asymmetric weights.
try {
@ -47,9 +46,7 @@ void select_preferred_formats::run(program& p) {
_lo.select_preferred_formats_for_onednn(*n, prim_desc);
} catch(std::exception &exception) {
GPU_DEBUG_IF(debug_config->verbose >= 1) {
std::cout << "WARNING(select_preferred_formats): " << exception.what() << std::endl;
}
GPU_DEBUG_INFO << "WARNING(select_preferred_formats): " << exception.what() << std::endl;
}
}
#endif // ENABLE_ONEDNN_FOR_GPU

View File

@ -68,18 +68,15 @@ protected:
dnnl::memory::desc desc = onednn::layout_to_memory_desc(a_zp->get_layout(), dnnl::memory::format_tag::a, true);
args.insert({DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC, a_zp->get_onednn_memory(desc)});
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 3) {
auto dnnl_mem = a_zp->get_onednn_memory(desc);
void *mapped_ptr = dnnl_mem.map_data();
if (mapped_ptr) {
GPU_DEBUG_COUT << instance.id() << " activations_zero_points: ";
for (size_t i = 0; i < desc.get_size(); ++i) {
std::cout << static_cast<int32_t*>(mapped_ptr)[i] << " ";
}
std::cout << std::endl;
dnnl_mem.unmap_data(mapped_ptr);
auto dnnl_mem = a_zp->get_onednn_memory(desc);
void *mapped_ptr = dnnl_mem.map_data();
if (mapped_ptr) {
GPU_DEBUG_TRACE_DETAIL << instance.id() << " activations_zero_points: ";
for (size_t i = 0; i < desc.get_size(); ++i) {
GPU_DEBUG_TRACE_DETAIL << static_cast<int32_t*>(mapped_ptr)[i] << " ";
}
GPU_DEBUG_TRACE_DETAIL << std::endl;
dnnl_mem.unmap_data(mapped_ptr);
}
}
@ -88,18 +85,15 @@ protected:
dnnl::memory::desc desc = onednn::layout_to_memory_desc(w_zp->get_layout(), dnnl::memory::format_tag::a, true);
args.insert({DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_WEIGHTS, w_zp->get_onednn_memory(desc)});
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 3) {
auto dnnl_mem = w_zp->get_onednn_memory(desc);
void *mapped_ptr = dnnl_mem.map_data();
if (mapped_ptr) {
GPU_DEBUG_COUT << instance.id() << " weights_zero_points: ";
for (size_t i = 0; i < desc.get_size(); ++i) {
std::cout << static_cast<int32_t*>(mapped_ptr)[i] << " ";
}
std::cout << std::endl;
dnnl_mem.unmap_data(mapped_ptr);
auto dnnl_mem = w_zp->get_onednn_memory(desc);
void *mapped_ptr = dnnl_mem.map_data();
if (mapped_ptr) {
GPU_DEBUG_TRACE_DETAIL << instance.id() << " weights_zero_points: ";
for (size_t i = 0; i < desc.get_size(); ++i) {
GPU_DEBUG_TRACE_DETAIL << static_cast<int32_t*>(mapped_ptr)[i] << " ";
}
GPU_DEBUG_TRACE_DETAIL << std::endl;
dnnl_mem.unmap_data(mapped_ptr);
}
}

View File

@ -23,6 +23,8 @@
#include <fstream>
#define GPU_DEBUG_LOG_PASS GPU_DEBUG_LOG << "[" << get_name() << "] "
namespace cldnn {
class base_pass {
friend class pass_manager;
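
An illustrative sketch (not from the diff) of GPU_DEBUG_LOG_PASS: it forwards to GPU_DEBUG_LOG and prefixes the message with whatever get_name() returns at the call site; in the real passes that is base_pass::get_name(). The standalone class below is hypothetical and only shows the mechanics.

struct demo_pass {                                        // hypothetical, not derived from base_pass
    std::string get_name() const { return "demo_pass"; }
    void run() {
        // Emitted as "[demo_pass] Remove node: node0" (after the debug prefix)
        // when verbose >= 2, i.e. LogLevel::LOG.
        GPU_DEBUG_LOG_PASS << "Remove node: " << "node0" << std::endl;
    }
};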

View File

@ -1350,8 +1350,8 @@ impl_types layout_optimizer::get_forced_impl_type_by_config(program_node& node)
preferred_type = impl_types::cpu;
if (node.id() == forced_impl_type.substr(0, found_type)) {
GPU_DEBUG_COUT << " Forced implementation type : " << forced_impl_type.substr(0, found_type) << " : "
<< forced_impl_type.substr(found_type + 1) << std::endl;
GPU_DEBUG_LOG << " Forced implementation type : " << forced_impl_type.substr(0, found_type) << " : "
<< forced_impl_type.substr(found_type + 1) << std::endl;
return preferred_type;
}
}
@ -1819,7 +1819,6 @@ format layout_optimizer::get_preferred_format(program_node& node) {
#ifdef ENABLE_ONEDNN_FOR_GPU
void layout_optimizer::select_preferred_formats_for_onednn(program_node& node, dnnl::primitive_desc prim_desc) {
GPU_DEBUG_GET_INSTANCE(debug_config);
if (node.is_input() || !are_data_types_suitable_for_onednn(node)) {
return;
}
@ -1868,10 +1867,8 @@ void layout_optimizer::select_preferred_formats_for_onednn(program_node& node, d
node.set_preferred_output_fmt(usr, dst_fmt);
}
GPU_DEBUG_IF(debug_config->verbose >= 2) {
std::cout << "select_preferred_formats:" << node.id() << ": " << fmt_to_str(src_fmt) << " --> " << fmt_to_str(dst_fmt)
<< " For index : " << idx << std::endl;
}
GPU_DEBUG_LOG << "select_preferred_formats:" << node.id() << ": " << fmt_to_str(src_fmt) << " --> " << fmt_to_str(dst_fmt)
<< " For index : " << idx << std::endl;
}
}

View File

@ -890,9 +890,8 @@ void network::execute_impl(const std::vector<event::ptr>& events) {
OV_ITT_SCOPED_TASK(itt::domains::CLDNN, "NetworkImpl::Execute");
// Wait for previous execution completion
reset_execution(false);
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 1)
GPU_DEBUG_COUT << "----------------------------------------------" << std::endl;
GPU_DEBUG_TRACE << "----------------------------------------------" << std::endl;
GPU_DEBUG_TRACE << "Start network execution" << std::endl;
std::vector<memory::ptr> in_out_mem;
auto is_surface_lock_check_needed = [&](const shared_mem_type& shared_mem_type) {
@ -928,6 +927,7 @@ void network::execute_impl(const std::vector<event::ptr>& events) {
auto surf_lock = surfaces_lock::create(get_engine().type(), in_out_mem, get_stream());
set_arguments();
GPU_DEBUG_GET_INSTANCE(debug_config);
for (auto& inst : _exec_order) {
GPU_DEBUG_IF(debug_config->dump_layers_path.length() > 0) {
const std::string layer_name = inst->id();
@ -1120,10 +1120,7 @@ void network::allocate_primitive_instance(program_node const& node) {
if (_primitives.count(node.id()))
return;
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 4) {
GPU_DEBUG_COUT << node.id() << ": allocate primitive instance" << std::endl;
}
GPU_DEBUG_TRACE_DETAIL << node.id() << ": allocate primitive instance" << std::endl;
auto inst = node.type()->create_instance(*this, node);
@ -1191,10 +1188,7 @@ void network::transfer_memory_to_device(std::shared_ptr<primitive_inst> instance
// Allocate and transfer memory
auto device_mem = inst_mem.get_engine()->allocate_memory(inst_mem.get_layout(), allocation_type::usm_device, false);
device_mem->copy_from(get_stream(), inst_mem);
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[" << node.id() << ": constant]" << std::endl;
}
GPU_DEBUG_LOG << "[" << node.id() << ": constant]" << std::endl;
_memory_pool->release_memory(&inst_mem, node.id(), get_id());
instance->set_output_memory(device_mem);
}

View File

@ -48,11 +48,13 @@ void pass_manager::run(program& p, base_pass& pass) {
using ms = std::chrono::duration<double, std::ratio<1, 1000>>;
using Time = std::chrono::high_resolution_clock;
GPU_DEBUG_LOG << "Run pass " << pass.get_name() << std::endl;
auto start = Time::now();
pass.run(p);
auto stop = Time::now();
std::chrono::duration<float> fs = stop - start;
ms opt_pass_time = std::chrono::duration_cast<ms>(fs);
GPU_DEBUG_LOG << "Pass " << pass.get_name() << " execution time: " << opt_pass_time.count() << " ms" << std::endl;
p.save_pass_info(pass.get_name());

View File

@ -142,7 +142,6 @@ void primitive_inst::set_output_memory(memory::ptr mem_new, bool check, size_t i
}
void primitive_inst::update_shape() {
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::shape_inference);
bool input_shape_changed = false;
@ -199,9 +198,7 @@ void primitive_inst::update_shape() {
// Events may not be created for an in-order queue, so take them for OOO queue only
if (_network.has_event(dep.id()) && queue_type == queue_types::out_of_order) {
dependencies_events.push_back(_network.get_primitive_event(dep_id));
GPU_DEBUG_IF(debug_config->verbose >= 4) {
GPU_DEBUG_COUT << id() << ": shape infer waits for " << i << " dependency\n";
}
GPU_DEBUG_TRACE_DETAIL << id() << ": shape infer waits for " << i << " dependency\n";
}
auto dep_mem = _network.get_output_memory(dep_id);
memory_deps.insert({i, dep_mem});
@ -221,10 +218,8 @@ void primitive_inst::update_shape() {
auto update_output_layout = [&](layout& layout, size_t idx) {
layout.data_padding = padding::max(_node->get_primitive()->output_paddings[idx], layout.data_padding);
if (_impl_params->get_output_layout(idx) != layout) {
GPU_DEBUG_IF(debug_config->verbose >= 4) {
GPU_DEBUG_COUT << id() << ": update shape: was: " << _impl_params->get_output_layout(idx).to_short_string()
<< " now: " << layout.to_short_string() << std::endl;
}
GPU_DEBUG_TRACE_DETAIL << id() << ": update shape: was: " << _impl_params->get_output_layout(idx).to_short_string()
<< " now: " << layout.to_short_string() << std::endl;
set_shape_change();
}
_impl_params->output_layouts[idx] = layout;
@ -265,16 +260,12 @@ void primitive_inst::realloc_if_needed() {
bool can_reuse_buffer = _outputs[0] && actual_layout.count() <= max_output_layout_size;
if (can_reuse_buffer) {
GPU_DEBUG_IF(debug_config->verbose >= 4) {
GPU_DEBUG_COUT << id() << ": reuse previously allocated output buffer" << std::endl;
}
GPU_DEBUG_TRACE_DETAIL << id() << ": reuse previously allocated output buffer" << std::endl;
_outputs[0] = _network.get_engine().reinterpret_buffer(*_outputs[0], actual_layout);
} else {
GPU_DEBUG_IF(debug_config->verbose >= 4) {
GPU_DEBUG_COUT << id() << ": realloc output memory. "
<< " Current buffer_size=" << max_output_layout_size
<< " Requested buffer_size=" << actual_layout.count() << std::endl;
}
GPU_DEBUG_TRACE_DETAIL << id() << ": realloc output memory. "
<< " Current buffer_size=" << max_output_layout_size
<< " Requested buffer_size=" << actual_layout.count() << std::endl;
_outputs = allocate_outputs(&updated_params);
// TODO : need to handle multiple outputs
max_output_layout_size = updated_params.output_layouts[0].count();
@ -285,7 +276,6 @@ void primitive_inst::realloc_if_needed() {
void primitive_inst::update_impl() {
GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::update_implementation);
GPU_DEBUG_GET_INSTANCE(debug_config);
auto prev_impl_str = _impl != nullptr ? _impl->get_kernel_name() : "nullptr";
auto extend_to_6d = [this](ov::PartialShape ps) -> std::vector<size_t> {
// For shape_of we extend shape with 1-s to 6d rank to make kernel simpler
@ -321,7 +311,7 @@ void primitive_inst::update_impl() {
return seed;
};
auto update_shape_info = [this, extend_to_6d, debug_config, prev_impl_str](const kernel_impl_params& params) {
auto update_shape_info = [this, extend_to_6d, prev_impl_str](const kernel_impl_params& params) {
mem_lock<int32_t> lock(_shape_info_memory, _network.get_stream());
size_t offset = 0;
for (size_t i = 0; i < _node->get_dependencies().size(); i++) {
@ -337,17 +327,14 @@ void primitive_inst::update_impl() {
for (size_t j = 0; j < output_shape.size(); j++)
lock[offset++] = static_cast<int32_t>(output_shape[j]);
}
GPU_DEBUG_IF(debug_config->verbose >= 4) {
std::stringstream s;
s << "shapes: ";
for (size_t i = 0; i < offset; i++)
s << lock[i] << " ";
GPU_DEBUG_COUT << id() << ": update dynamic impl " << prev_impl_str << " to new shape: " << s.str() << std::endl;
}
std::stringstream s;
s << "shapes: ";
for (size_t i = 0; i < offset; i++)
s << lock[i] << " ";
GPU_DEBUG_TRACE_DETAIL << id() << ": update dynamic impl " << prev_impl_str << " to new shape: " << s.str() << std::endl;
};
if (!_node->is_type<data>() && !(_node->is_type<mutable_data>() && _node->get_dependencies().empty())) {
GPU_DEBUG_GET_INSTANCE(debug_config);
// Update param if fake_alignment is available
auto updated_params = _node->type()->get_fake_aligned_params(*_impl_params);
auto layout_key = get_layout_key(updated_params);
@ -359,10 +346,7 @@ void primitive_inst::update_impl() {
if (has_cached_impl) {
_impl = cache.get(layout_key)->clone();
GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(true);
GPU_DEBUG_IF(debug_config->verbose >= 4) {
GPU_DEBUG_COUT << id() << ": get impl from cache " << _impl->get_kernel_name() << std::endl;
}
GPU_DEBUG_TRACE_DETAIL << id() << ": get impl from cache " << _impl->get_kernel_name() << std::endl;
}
}
if (!has_cached_impl) {
@ -402,10 +386,9 @@ void primitive_inst::update_impl() {
kernels_cache.reset();
std::lock_guard<std::mutex> lock(get_network().get_impl_cache_mutex());
cache.add(layout_key, _impl->clone());
GPU_DEBUG_IF(debug_config->verbose >= 4) {
auto new_impl_str = _impl != nullptr ? _impl->get_kernel_name() : "nullptr";
GPU_DEBUG_COUT << id() << ": update impl from " << prev_impl_str << " to " << new_impl_str << std::endl;
}
auto new_impl_str = _impl != nullptr ? _impl->get_kernel_name() : "nullptr";
GPU_DEBUG_TRACE_DETAIL << id() << ": update impl from " << prev_impl_str << " to " << new_impl_str << std::endl;
}
}
@ -436,14 +419,9 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
subgraph->set_input_data(d.first->id(), actual_mem);
}
}
GPU_DEBUG_IF(debug_config->verbose >= 4) {
GPU_DEBUG_COUT << "[Start] Executing unfused subgraph of " << id() << std::endl;
}
GPU_DEBUG_TRACE_DETAIL << "[Start] Executing unfused subgraph of " << id() << std::endl;
auto outputs = subgraph->execute(events);
GPU_DEBUG_IF(debug_config->verbose >= 4) {
GPU_DEBUG_COUT << "[End] Finished executing unfused subgraph of " << id() << std::endl;
}
GPU_DEBUG_TRACE_DETAIL << "[End] Finished executing unfused subgraph of " << id() << std::endl;
auto last_fd = _impl_params->fused_desc.back();
auto last_prim_id = last_fd.desc->id;
@ -476,28 +454,7 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
}
on_execute();
GPU_DEBUG_IF(debug_config->verbose >= 1) {
std::ostringstream in_addr;
// buffer_ptr() only support usm_memory
for (size_t i = 0; i < this->dependencies().size(); i++) {
auto in_mem = dep_memory_ptr(i);
if (in_mem) {
in_addr << in_mem->buffer_ptr();
if (i < this->dependencies().size() - 1) {
in_addr << ", ";
}
}
}
auto out_mem = output_memory_ptr();
auto out_alloc_type = out_mem ? out_mem->get_allocation_type() : allocation_type::unknown;
auto out_ptr = out_mem ? out_mem->buffer_ptr() : nullptr;
auto impl_name = _impl->get_kernel_name();
GPU_DEBUG_COUT << id() << ": execute " << impl_name << ". Memory type: "
<< out_alloc_type << ", in_usm("
<< in_addr.str() << "), out_usm("
<< out_ptr << ")" << std::endl;
}
GPU_DEBUG_TRACE << id() << ": execute " << _impl->get_kernel_name() << std::endl;
if (_exec_deps.empty() && dependencies.empty()) {
dependencies = events;
@ -695,10 +652,7 @@ void primitive_inst::allocate_internal_buffers(void) {
// allocate intermediate memory for the updated layout of buffer
std::vector<memory::cptr> intermediates_memory;
for (auto layout : ibuf_layouts) {
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[" << _node->id() << ": internal buf]" << std::endl;
}
GPU_DEBUG_LOG << "[" << _node->id() << ": internal buf]" << std::endl;
auto alloc_type = allocation_type::unknown;
if (input_device_mem && (available_device_mem_size - (int64_t)layout.bytes_count() >= 0)) {
alloc_type = engine.get_preferred_memory_allocation_type();
@ -719,8 +673,6 @@ event::ptr primitive_inst::update_weights() {
if (!weightable_node)
return nullptr;
GPU_DEBUG_GET_INSTANCE(debug_config);
auto& weights_params = _impl->_weights_reorder_params;
bool requires_reorder = weights_params.engine != kernel_selector::GenericKernelParams::Engine::NONE &&
(!_impl_params->reordered_weights || _impl_params->reordered_weights->get_layout() != from_weights_tensor(weights_params.dest));
@ -744,17 +696,13 @@ event::ptr primitive_inst::update_weights() {
if (layout_key != "") {
auto& cache = get_network().get_in_mem_kernels_cache();
if (cache.has(layout_key)) {
GPU_DEBUG_IF(debug_config->verbose >= 4) {
GPU_DEBUG_COUT << id() << ": reorder weights (cached) from " << original_layout.to_short_string()
<< " to " << expected_layout.to_short_string() << std::endl;
}
GPU_DEBUG_TRACE_DETAIL << id() << ": reorder weights (cached) from " << original_layout.to_short_string()
<< " to " << expected_layout.to_short_string() << std::endl;
GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(true);
kernel = cache.get(layout_key);
} else {
GPU_DEBUG_IF(debug_config->verbose >= 4) {
GPU_DEBUG_COUT << id() << ": reorder weights from " << original_layout.to_short_string()
<< " to " << expected_layout.to_short_string() << std::endl;
}
GPU_DEBUG_TRACE_DETAIL << id() << ": reorder weights from " << original_layout.to_short_string()
<< " to " << expected_layout.to_short_string() << std::endl;
auto& kernels_cache = get_network().get_kernels_cache();
auto kernel_id = kernels_cache.set_kernel_source(weights_params.clKernel->code.kernelString, false);
kernels_cache.compile();
@ -768,9 +716,7 @@ event::ptr primitive_inst::update_weights() {
bool can_reuse = _impl_params->reordered_weights != nullptr && _impl_params->reordered_weights->size() <= expected_layout.bytes_count();
if (can_reuse) {
GPU_DEBUG_IF(debug_config->verbose >= 4) {
GPU_DEBUG_COUT << id() << ": reuse weights memory" << std::endl;
}
GPU_DEBUG_TRACE_DETAIL << id() << ": reuse weights memory" << std::endl;
_impl_params->reordered_weights = engine.reinterpret_buffer(*_impl_params->reordered_weights, expected_layout);
} else {
auto alloc_type = engine.get_preferred_memory_allocation_type();
@ -783,6 +729,7 @@ event::ptr primitive_inst::update_weights() {
stream.set_arguments(*kernel, weights_params.clKernel->params, args);
auto ev = stream.enqueue_kernel(*kernel, weights_params.clKernel->params, args, {}, true);
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(!debug_config->dump_profiling_data.empty()) {
stream.wait_for_events({ev});
}
@ -851,15 +798,12 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool,
bool is_cpu = _node.get_selected_impl() ? _node.get_selected_impl()->is_cpu() : false;
auto use_lockable_memory = is_output_buffer(_node) || is_cpu || is_any_user_cpu(_node.get_users()) ||
!_engine.supports_allocation(allocation_type::usm_device);
GPU_DEBUG_GET_INSTANCE(debug_config);
const auto& lockable_mem_type = _engine.get_lockable_preferred_memory_allocation_type(layout.format.is_image_2d());
const auto& alloc_type = use_lockable_memory ? lockable_mem_type
: usm_device_allocatable ? allocation_type::usm_device : lockable_mem_type;
if ((is_internal && (_node.can_be_optimized() || _node.is_type<generic_layer>())) || (memory_reuse_by_user == false)) {
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[" << _node.id() << ": output]" << std::endl;
}
GPU_DEBUG_LOG << "[" << _node.id() << ": output]" << std::endl;
return get_memory_from_pool(_engine,
layout,
_node.id(),
@ -868,21 +812,15 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool,
false);
} else if (is_internal && _node.is_output() && _node.is_type<generic_layer>() &&
_engine.supports_allocation(allocation_type::usm_device) && usm_device_allocatable) {
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[" << _node.id() << ": output]" << std::endl;
}
GPU_DEBUG_LOG << "[" << _node.id() << ": output]" << std::endl;
return _engine.allocate_memory(layout, allocation_type::usm_device, false);
} else if (is_internal && !_node.is_output() && _node.is_type<input_layout>()) {
// Skip memory reset for input_layout primitives, since data will be copied from cldnn::data primitive
// or just reuse primitive's memory
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[" << _node.id() << ": constant]" << std::endl;
}
GPU_DEBUG_LOG << "[" << _node.id() << ": constant]" << std::endl;
return _engine.allocate_memory(layout, alloc_type, false);
} else if (is_internal || (!_node.can_share_buffer()) || _node.can_be_optimized() || _node.is_output()) {
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[" << _node.id() << ": output]" << std::endl;
}
GPU_DEBUG_LOG << "[" << _node.id() << ": output]" << std::endl;
return _engine.allocate_memory(layout, alloc_type);
} else {
return get_memory_from_pool(_engine,
@ -940,10 +878,7 @@ std::string primitive_inst::generic_to_string(program_node const& node, const ch
}
cldnn::network::ptr primitive_inst::get_unfused_subgraph() {
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 4) {
GPU_DEBUG_COUT << id() << ": Use unfused subgraph due to unexpected fusions\n";
}
GPU_DEBUG_TRACE_DETAIL << id() << ": Use unfused subgraph due to unexpected fusions\n";
if (!_unfused_subgraph) {
topology t;

View File

@ -719,17 +719,12 @@ void program::transfer_memory_to_device() {
if (alloc_type == allocation_type::usm_host || alloc_type == allocation_type::usm_shared) {
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[" << data_node.id() << ": constant]" << std::endl;
}
GPU_DEBUG_LOG << "[" << data_node.id() << ": constant]" << std::endl;
// Allocate and transfer memory
auto device_mem = mem.get_engine()->allocate_memory(data_node_layout, allocation_type::usm_device, false);
device_mem->copy_from(get_stream(), mem);
data_node.attach_memory(device_mem);
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[" << data_node.id() << ": constant]" << std::endl;
}
GPU_DEBUG_LOG << "[" << data_node.id() << ": constant]" << std::endl;
const_cast<memory::ptr&>(data_node.get_primitive()->mem).reset();
// TODO: Do we need finish call here? Maybe call it in network::execute() ?
get_stream().finish();
@ -1656,7 +1651,6 @@ std::pair<int64_t, int64_t> program::get_estimated_device_mem_usage() {
});
auto& engine = get_engine();
int64_t host_alloc = 0;
GPU_DEBUG_GET_INSTANCE(debug_config);
// just to prevent the memories from being freed during allocation
std::unordered_set<memory::ptr> allocated_mem_ptrs;
for (const auto& node : nodes_to_allocate) {
@ -1676,10 +1670,8 @@ std::pair<int64_t, int64_t> program::get_estimated_device_mem_usage() {
if (engine.get_device_info().dev_type == cldnn::device_type::integrated_gpu)
total_host_alloc_size += engine.get_used_device_memory(allocation_type::usm_device);
if ((cur_vmem != -1 && total_host_alloc_size > cur_vmem * 0.5) || (total_host_alloc_size >= max_global_mem_size)) {
GPU_DEBUG_IF(debug_config->verbose >= 1) {
GPU_DEBUG_COUT << "Estimated host mem usage calculated with default base batch size(16) exceeds the available memory ("
<< cur_vmem << ")" << std::endl;
}
GPU_DEBUG_INFO << "Estimated host mem usage calculated with default base batch size(16) exceeds the available memory ("
<< cur_vmem << ")" << std::endl;
return {-1L, -1L};
}
#endif

View File

@ -246,23 +246,18 @@ bool program_node::is_detached(bool whole_branch) {
}
layout program_node::calc_output_layout() const {
GPU_DEBUG_GET_INSTANCE(debug_config);
bool allow_new_shape_infer =
get_program().get_options().get<build_option_type::allow_new_shape_infer>()->enabled();
if (allow_new_shape_infer) {
auto out_layouts = type()->calc_output_layouts(*this, *get_kernel_impl_params());
if (!out_layouts.empty()) {
GPU_DEBUG_IF(debug_config->verbose >= 4) {
GPU_DEBUG_COUT << id() << ": calc_output_layout(new):" << out_layouts[0] << std::endl;
}
GPU_DEBUG_TRACE_DETAIL << id() << ": calc_output_layout(new):" << out_layouts[0] << std::endl;
return out_layouts[0];
}
}
auto res = type()->calc_output_layout(*this, *get_kernel_impl_params());
GPU_DEBUG_IF(debug_config->verbose >= 4) {
GPU_DEBUG_COUT << id() << ": calc_output_layout:" << res << std::endl;
}
GPU_DEBUG_TRACE_DETAIL << id() << ": calc_output_layout:" << res << std::endl;
return res;
}
@ -528,8 +523,6 @@ bool program_node::has_out_scales(const std::shared_ptr<dnnl::primitive_attr>& a
dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const std::shared_ptr<dnnl::primitive_attr>& attr,
bool& optimization_is_completed) {
GPU_DEBUG_GET_INSTANCE(debug_config);
// Create new dnnl::post_ops object which will be filled inside the optimization process
dnnl::post_ops optimized_p_ops;
@ -673,22 +666,18 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const
int64_t prev_post_op_idx = 0;
bool optimization_done = false;
GPU_DEBUG_IF(debug_config->verbose >= 3) {
GPU_DEBUG_COUT << "================================================" << std::endl;
GPU_DEBUG_COUT << " " << id() << ", num of post_ops " << p_ops.len() << std::endl;
for (size_t i = 0; i < cur_post_ops.size(); i++)
GPU_DEBUG_COUT << " " << i << ": " << cur_post_ops[i].op_type << std::endl;
}
GPU_DEBUG_TRACE << "================================================" << std::endl;
GPU_DEBUG_TRACE << " " << id() << ", num of post_ops " << p_ops.len() << std::endl;
for (size_t i = 0; i < cur_post_ops.size(); i++)
GPU_DEBUG_TRACE << " " << i << ": " << cur_post_ops[i].op_type << std::endl;
remove_optimized_prefix(cur_post_ops);
GPU_DEBUG_IF(debug_config->verbose >= 3) {
GPU_DEBUG_COUT << "remove optimized prefix ------------------------" << std::endl;
GPU_DEBUG_COUT << " " << id() << ", num of post_ops " << p_ops.len() << std::endl;
for (size_t i = 0; i < cur_post_ops.size(); i++)
GPU_DEBUG_COUT << " " << i << ": " << cur_post_ops[i].op_type << std::endl;
GPU_DEBUG_COUT << "----------------------------------->>>>>>>>>>>>>" << std::endl;
}
GPU_DEBUG_TRACE << "remove optimized prefix ------------------------" << std::endl;
GPU_DEBUG_TRACE << " " << id() << ", num of post_ops " << p_ops.len() << std::endl;
for (size_t i = 0; i < cur_post_ops.size(); i++)
GPU_DEBUG_TRACE << " " << i << ": " << cur_post_ops[i].op_type << std::endl;
GPU_DEBUG_TRACE << "----------------------------------->>>>>>>>>>>>>" << std::endl;
// Get post-ops size for current node
int64_t post_ops_size = cur_post_ops.size();
@ -710,8 +699,7 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const
auto cur_type = cur_post_ops[cur_post_op_idx].op_type;
auto prev_type = cur_post_ops[prev_post_op_idx].op_type;
GPU_DEBUG_IF(debug_config->verbose >= 3)
GPU_DEBUG_COUT << "before prev_post_op_idx: " << prev_post_op_idx << ", cur_post_op_idx: " << cur_post_op_idx << std::endl;
GPU_DEBUG_TRACE << "before prev_post_op_idx: " << prev_post_op_idx << ", cur_post_op_idx: " << cur_post_op_idx << std::endl;
// Ignore optimized operations for "previous" operation in our operation pair
while (type_is_any_optimized(prev_type) && prev_post_op_idx < post_ops_size - 1) {
@ -728,8 +716,7 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const
cur_type = cur_post_ops[cur_post_op_idx].op_type;
}
GPU_DEBUG_IF(debug_config->verbose >= 3)
GPU_DEBUG_COUT << "after prev_post_op_idx: " << prev_post_op_idx << ", cur_post_op_idx: " << cur_post_op_idx << std::endl;
GPU_DEBUG_TRACE << "after prev_post_op_idx: " << prev_post_op_idx << ", cur_post_op_idx: " << cur_post_op_idx << std::endl;
auto cur_idx = static_cast<int>(has_out_scales(attr) ? (cur_post_op_idx >= 1 ? cur_post_op_idx - 1 : 0) : cur_post_op_idx);
auto prev_idx = static_cast<int>(has_out_scales(attr) ? (prev_post_op_idx >= 1 ? prev_post_op_idx - 1 : 0) : prev_post_op_idx);
@ -766,10 +753,8 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const
bool cur_ops_pair_is_optimized = false;
GPU_DEBUG_IF(debug_config->verbose >= 3) {
GPU_DEBUG_COUT << "prev_idx: " << prev_idx << " " << prev_type
<< ", cur_idx: " << cur_idx << " " << cur_type << std::endl;
}
GPU_DEBUG_TRACE << "prev_idx: " << prev_idx << " " << prev_type
<< ", cur_idx: " << cur_idx << " " << cur_type << std::endl;
if (can_try_optimize) {
if (eltw_and_eltw) {
@ -1005,12 +990,10 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const
remove_optimized_prefix(cur_post_ops);
}
GPU_DEBUG_IF(debug_config->verbose >= 3) {
GPU_DEBUG_COUT << ">>>>>>>>>>>>>-----------------------------------" << std::endl;
for (size_t i = 0; i < cur_post_ops.size(); i++)
GPU_DEBUG_COUT << " " << i << ": " << cur_post_ops[i].op_type << std::endl;
GPU_DEBUG_COUT << "------------------------------------------------" << std::endl;
}
GPU_DEBUG_TRACE << ">>>>>>>>>>>>>-----------------------------------" << std::endl;
for (size_t i = 0; i < cur_post_ops.size(); i++)
GPU_DEBUG_TRACE << " " << i << ": " << cur_post_ops[i].op_type << std::endl;
GPU_DEBUG_TRACE << "------------------------------------------------" << std::endl;
add_onednn_fused_primitives(cur_post_ops);

View File

@ -107,12 +107,8 @@ KernelsData kernel_selector_base::GetNaiveBestKernel(const Params& params,
}
} catch (std::runtime_error& ex) {
// we have to handle it in order to avoid exceptions in KernelSelector as much as we can
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 3) {
kernelName = (implementation != nullptr)? implementation->GetName() : "[impl is null]";
GPU_DEBUG_COUT << "layerID: " << params.layerID << " kenrel: "
<< kernelName << " - " << ex.what() << std::endl;
}
kernelName = (implementation != nullptr)? implementation->GetName() : "[impl is null]";
GPU_DEBUG_TRACE << "layerID: " << params.layerID << " kenrel: " << kernelName << " - " << ex.what() << std::endl;
}
}
@ -202,12 +198,8 @@ KernelsData kernel_selector_base::GetAutoTuneBestKernel(const Params& params,
}
} catch (std::runtime_error& ex) {
// we have to handle it in order to avoid exceptions in KernelSelector as much as we can
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 3) {
kernelName = (implementation != nullptr)? implementation->GetName() : "[impl is null]";
GPU_DEBUG_COUT << "layerID: " << params.layerID << " kenrel: "
<< kernelName << " - " << ex.what() << std::endl;
}
kernelName = (implementation != nullptr)? implementation->GetName() : "[impl is null]";
GPU_DEBUG_TRACE << "layerID: " << params.layerID << " kenrel: " << kernelName << " - " << ex.what() << std::endl;
}
}
}
@ -231,12 +223,8 @@ KernelsData kernel_selector_base::GetAutoTuneBestKernel(const Params& params,
}
} catch (std::runtime_error& ex) {
// we have to handle it in order to avoid exceptions in KernelSelector as much as we can
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 3) {
kernelName = (implementation != nullptr)? implementation->GetName() : "[impl is null]";
GPU_DEBUG_COUT << "layerID: " << params.layerID << " kenrel: "
<< kernelName << " - " << ex.what() << std::endl;
}
kernelName = (implementation != nullptr)? implementation->GetName() : "[impl is null]";
GPU_DEBUG_TRACE << "layerID: " << params.layerID << " kenrel: " << kernelName << " - " << ex.what() << std::endl;
}
}
}

View File

@ -493,7 +493,7 @@ void InferRequest::enqueue() {
// If dump layers path is set, only run the first inference.
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->dump_layers_path.length() > 0) {
GPU_DEBUG_COUT << "Only run first inference to dump layers." << std::endl;
GPU_DEBUG_INFO << "Only run first inference to dump layers." << std::endl;
exit(0);
}
}
@ -760,10 +760,7 @@ void InferRequest::allocate_inputs() {
OPENVINO_ASSERT(litr != inputLayouts.end(), "[GPU] Input layout for ", name, " is not found");
const auto input_layout = litr->second;
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[" << name << ": input blob]" << std::endl;
}
GPU_DEBUG_LOG << "[" << name << ": input blob]" << std::endl;
if (desc.getPrecision() == Precision::I16 || desc.getPrecision() == Precision::U16) {
TensorDesc desc_fp32 = desc;
desc_fp32.setPrecision(Precision::FP32);
@ -796,10 +793,7 @@ void InferRequest::allocate_outputs() {
if (output_layout.is_static())
desc.setDims(m_graph->GetOutputSize(no.first));
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[" << no.first << ": output blob]" << std::endl;
}
GPU_DEBUG_LOG << "[" << no.first << ": output blob]" << std::endl;
outputsMap[no.first] = outputID;
if (desc.getPrecision() == Precision::I16 || desc.getPrecision() == Precision::U16 ||

View File

@ -769,7 +769,7 @@ void InferRequestLegacy::enqueue() {
// If dump layers path is set, only run the first inference.
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->dump_layers_path.length() > 0) {
GPU_DEBUG_COUT << "Only run first inference to dump layers." << std::endl;
GPU_DEBUG_INFO << "Only run first inference to dump layers." << std::endl;
exit(0);
}
}
@ -1031,10 +1031,7 @@ void InferRequestLegacy::allocate_inputs() {
IE_THROW() << "Input layout for " << name << " is not found";
}
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[" << name << ": input blob]" << std::endl;
}
GPU_DEBUG_LOG << "[" << name << ": input blob]" << std::endl;
if (desc.getPrecision() == Precision::I16 || desc.getPrecision() == Precision::U16) {
TensorDesc desc_fp32 = desc;
desc_fp32.setPrecision(Precision::FP32);
@ -1088,10 +1085,7 @@ void InferRequestLegacy::allocate_outputs() {
// Can be removed once 76176 is resolved.
desc.setDims(m_graph->GetOutputSize(no.first));
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[" << no.first << ": output blob]" << std::endl;
}
GPU_DEBUG_LOG << "[" << no.first << ": output blob]" << std::endl;
outputsMap[no.first] = outputID;
if (desc.getPrecision() == Precision::I16 || desc.getPrecision() == Precision::U16 ||

View File

@ -201,10 +201,7 @@ void createClDnnConstant(Program& p, const ngraph::Shape& constDims, const std::
p.primitive_ids[initialconstPrimID] = constPrimID;
p.profiling_ids.push_back(initialconstPrimID);
} else {
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[" << initialconstPrimID << ": constant]" << std::endl;
}
GPU_DEBUG_LOG << "[" << initialconstPrimID << ": constant]" << std::endl;
cldnn::memory::ptr mem = p.GetEngine().allocate_memory(constLayout, false);
auto& stream = p.GetEngine().get_program_stream();
cldnn::mem_lock<char> lock{mem, stream};

View File

@ -73,10 +73,7 @@ static void CreateCommonCTCGreedyDecoderOp(Program& p, const std::shared_ptr<ngr
cldnn::format::get_default_format(op->get_output_shape(1).size()),
tensor_from_dims(op->get_output_shape(1)));
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl;
}
GPU_DEBUG_LOG << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl;
shared_memory.emplace_back(p.GetEngine().allocate_memory(mutableLayout));
cldnn::primitive_id ctc_gd_mutable_id_w = layer_type_name_ID(op) + "_md_write";

View File

@ -93,7 +93,6 @@ static void CreateNonMaxSuppressionIEInternalOp(Program& p, const std::shared_pt
auto outputIndices = op->get_output_partial_shape(0)[0].get_length();
std::vector<cldnn::memory::ptr> shared_memory;
GPU_DEBUG_GET_INSTANCE(debug_config);
switch (num_outputs) {
case 3: {
auto mutable_precision_second = op->get_output_element_type(2);
@ -105,9 +104,7 @@ static void CreateNonMaxSuppressionIEInternalOp(Program& p, const std::shared_pt
cldnn::format::get_default_format(op->get_output_shape(2).size()),
tensor_from_dims(op->get_output_shape(2)));
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl;
}
GPU_DEBUG_LOG << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl;
shared_memory.emplace_back(p.GetEngine().allocate_memory(mutableLayoutSecond));
cldnn::primitive_id non_max_supression_mutable_id_w_second = layer_type_name_ID(op) + "_md_write_second";
@ -123,9 +120,7 @@ static void CreateNonMaxSuppressionIEInternalOp(Program& p, const std::shared_pt
cldnn::format::bfyx,
cldnn::tensor(static_cast<int32_t>(outputIndices), 3, 1, 1));
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl;
}
GPU_DEBUG_LOG << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl;
shared_memory.emplace_back(p.GetEngine().allocate_memory(mutableLayoutFirst));
cldnn::primitive_id non_max_supression_mutable_id_w_first = layer_type_name_ID(op) + "_md_write_first";

View File

@ -64,10 +64,7 @@ static void CreateProposalOp(Program& p, const std::shared_ptr<ngraph::op::v0::P
cldnn::format::get_default_format(op->get_output_shape(1).size()),
tensor_from_dims(op->get_output_shape(1)));
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl;
}
GPU_DEBUG_LOG << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl;
auto shared_memory = p.GetEngine().allocate_memory(mutableLayout);
cldnn::primitive_id proposal_mutable_id_w = layer_type_name_ID(op) + "_md_write";

View File

@ -65,10 +65,7 @@ static void CreateTopKOp(Program& p, const std::shared_ptr<ngraph::op::v1::TopK>
cldnn::format::get_default_format(op->get_output_shape(1).size()),
tensor_from_dims(op->get_output_shape(1)));
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl;
}
GPU_DEBUG_LOG << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl;
auto shared_memory = p.GetEngine().allocate_memory(mutableLayout);
cldnn::primitive_id argmax_mutable_id_w = layer_type_name_ID(op) + "_md_write";

View File

@ -686,25 +686,20 @@ Parameter Plugin::GetMetric(const std::string& name, const std::map<std::string,
auto closest_pow_of_2 = [] (float x) {
return pow(2, floor(std::log(x)/std::log(2)));
};
GPU_DEBUG_GET_INSTANCE(debug_config);
auto model_param = options.find(ov::hint::model.name());
if (model_param == options.end()) {
GPU_DEBUG_IF(debug_config->verbose >= 1) {
GPU_DEBUG_COUT << "[GPU_OPTIMAL_BATCH_SIZE] ov::hint::model is not set: return 1" << std::endl;
}
GPU_DEBUG_INFO << "[OPTIMAL_BATCH_SIZE] ov::hint::model is not set: return 1" << std::endl;
return decltype(ov::optimal_batch_size)::value_type {static_cast<unsigned int>(1)};
}
std::shared_ptr<ngraph::Function> model;
try {
model = model_param->second.as<std::shared_ptr<ngraph::Function>>();
} catch (...) {
IE_THROW() << "[GPU_OPTIMAL_BATCH_SIZE] ov::hint::model should be std::shared_ptr<ov::Model> type";
}
GPU_DEBUG_IF(debug_config->verbose >= 1) {
GPU_DEBUG_COUT << "DEVICE_INFO:"
<< "gfx_version.major, " << device_info.gfx_ver.major
<< "gfx_version.minor " << std::to_string(device_info.gfx_ver.minor) << std::endl;
IE_THROW() << "[OPTIMAL_BATCH_SIZE] ov::hint::model should be std::shared_ptr<ov::Model> type";
}
GPU_DEBUG_INFO << "DEVICE_INFO:"
<< "gfx_version.major, " << device_info.gfx_ver.major
<< "gfx_version.minor " << std::to_string(device_info.gfx_ver.minor) << std::endl;
static std::map<cldnn::gfx_version, size_t> gen_kbytes_per_bank = {
{{12, 0, 0}, 480}, // TGL
{{12, 1, 0}, 2048}, // DG1
@ -722,14 +717,12 @@ Parameter Plugin::GetMetric(const std::string& name, const std::map<std::string,
? next_pow_of_2(device_info.num_sub_slices_per_slice)
: 2 * device_info.num_sub_slices_per_slice;
L3_cache_size = kbytes_per_bank * 1024 * num_banks_per_slice * device_info.num_slices;
GPU_DEBUG_IF(debug_config->verbose >= 1) {
GPU_DEBUG_COUT << "DEVICE_INFO:"
<< "num_slices " << device_info.num_slices
<< ", num_sub_slices_per_slice " << device_info.num_sub_slices_per_slice
<< ", num_banks_per_slice " << num_banks_per_slice
<< ", gen_kbytes_per_bank : " << kbytes_per_bank
<< ", L3_cache_size is (MB): " << float(L3_cache_size) / 1024 / 1024 << std::endl;
}
GPU_DEBUG_INFO << "DEVICE_INFO:"
<< "num_slices " << device_info.num_slices
<< ", num_sub_slices_per_slice " << device_info.num_sub_slices_per_slice
<< ", num_banks_per_slice " << num_banks_per_slice
<< ", gen_kbytes_per_bank : " << kbytes_per_bank
<< ", L3_cache_size is (MB): " << float(L3_cache_size) / 1024 / 1024 << std::endl;
}
Config config = _impl->m_configs.GetConfig(device_id);
auto networkCloned = CloneAndTransformNetwork(CNNNetwork(model), config);
@ -744,11 +737,9 @@ Parameter Plugin::GetMetric(const std::string& name, const std::map<std::string,
unsigned int closest = closest_pow_of_2(max_batch_size);
batch = std::min(closest, batch);
batch = std::min(256u, batch); //batch 256 is a max
GPU_DEBUG_IF(debug_config->verbose >= 1) {
GPU_DEBUG_COUT << memPressure.max_mem_tolerance << std::endl;
GPU_DEBUG_COUT << "MAX_BATCH: " << max_batch_size << std::endl;
GPU_DEBUG_COUT << "ACTUAL OPTIMAL BATCH: " << batch << std::endl;
}
GPU_DEBUG_INFO << memPressure.max_mem_tolerance << std::endl;
GPU_DEBUG_INFO << "MAX_BATCH: " << max_batch_size << std::endl;
GPU_DEBUG_INFO << "ACTUAL OPTIMAL BATCH: " << batch << std::endl;
return decltype(ov::optimal_batch_size)::value_type {batch};
} else if (name == ov::device::uuid) {
ov::device::UUID uuid = {};
@ -815,17 +806,13 @@ Parameter Plugin::GetMetric(const std::string& name, const std::map<std::string,
}
int64_t available_device_mem = device_info.max_global_mem_size - occupied_device_mem;
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[GPU_MAX_BATCH_SIZE] available memory is " << available_device_mem
<< " (occupied: " << occupied_device_mem << ")" << std::endl;
}
GPU_DEBUG_LOG << "[GPU_MAX_BATCH_SIZE] available memory is " << available_device_mem
<< " (occupied: " << occupied_device_mem << ")" << std::endl;
int64_t max_batch_size = 1;
if (options.find(ov::hint::model.name()) == options.end()) {
GPU_DEBUG_IF(debug_config->verbose >= 1) {
GPU_DEBUG_COUT << "[GPU_MAX_BATCH_SIZE] MODELS_PTR is not set: return 1" << std::endl;
}
GPU_DEBUG_INFO << "[GPU_MAX_BATCH_SIZE] MODELS_PTR is not set: return 1" << std::endl;
return decltype(ov::max_batch_size)::value_type {static_cast<uint32_t>(max_batch_size)};
}
@ -849,17 +836,13 @@ Parameter Plugin::GetMetric(const std::string& name, const std::map<std::string,
}
}
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[GPU_MAX_BATCH_SIZE] n_streams : " << n_streams << std::endl;
}
GPU_DEBUG_INFO << "[GPU_MAX_BATCH_SIZE] n_streams : " << n_streams << std::endl;
auto available_device_mem_it = options.find(ov::intel_gpu::hint::available_device_mem.name());
if (available_device_mem_it != options.end()) {
if (available_device_mem_it->second.is<int64_t>()) {
available_device_mem = std::min(static_cast<int64_t>(available_device_mem), available_device_mem_it->second.as<int64_t>());
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[GPU_MAX_BATCH_SIZE] available memory is reset by user " << available_device_mem << std::endl;
}
GPU_DEBUG_LOG << "[GPU_MAX_BATCH_SIZE] available memory is reset by user " << available_device_mem << std::endl;
} else {
IE_THROW() << "[GPU_MAX_BATCH_SIZE] bad casting: ov::intel_gpu::hint::available_device_mem should be int64_t type";
}
@ -910,9 +893,7 @@ Parameter Plugin::GetMetric(const std::string& name, const std::map<std::string,
const auto& shape = input->get_partial_shape();
// currently no plugin supports batched execution for dynamic networks
if (shape.is_dynamic()) {
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[MAX_BATCH_SIZE] does not support dynamic networks" << std::endl;
}
GPU_DEBUG_LOG << "[MAX_BATCH_SIZE] does not support dynamic networks" << std::endl;
return decltype(ov::max_batch_size)::value_type {static_cast<uint32_t>(max_batch_size)};
}
@ -921,10 +902,8 @@ Parameter Plugin::GetMetric(const std::string& name, const std::map<std::string,
if (ov::DimensionTracker::get_label(shape[s])) {
// batched dim for the input
auto batched_input_id = ngraph::op::util::get_ie_output_name(params[input_id]->output(0));
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[MAX_BATCH_SIZE] detected batched input " << batched_input_id
<< "[" << s << "]" << std::endl;
}
GPU_DEBUG_LOG << "[MAX_BATCH_SIZE] detected batched input " << batched_input_id
<< "[" << s << "]" << std::endl;
batched_inputs.insert(std::make_pair(batched_input_id, s));
}
}
@ -932,9 +911,7 @@ Parameter Plugin::GetMetric(const std::string& name, const std::map<std::string,
}
if (!batched_inputs.size()) {
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[MAX_BATCH_SIZE] MAX_BATCH_SIZE supports only networks with inputs/outputs featuring batched dim." << std::endl;
}
GPU_DEBUG_LOG << "[MAX_BATCH_SIZE] MAX_BATCH_SIZE supports only networks with inputs/outputs featuring batched dim." << std::endl;
return decltype(ov::max_batch_size)::value_type {static_cast<uint32_t>(max_batch_size)};
}
@ -944,9 +921,7 @@ Parameter Plugin::GetMetric(const std::string& name, const std::map<std::string,
shapes[input.first][input.second] = base_batch_size;
cloned_network.reshape(shapes);
} catch (...) {
GPU_DEBUG_IF(debug_config->verbose >= 1) {
GPU_DEBUG_COUT << "[MAX_BATCH_SIZE] Error at reshape to " << base_batch_size << std::endl;
}
GPU_DEBUG_INFO << "[MAX_BATCH_SIZE] Error at reshape to " << base_batch_size << std::endl;
return decltype(ov::max_batch_size)::value_type {static_cast<uint32_t>(max_batch_size)};
}
@ -962,15 +937,11 @@ Parameter Plugin::GetMetric(const std::string& name, const std::map<std::string,
static_cast<int64_t>(static_cast<int64_t>(available_device_mem) - device_memory_usage.first));
int64_t mem_per_batch = std::max(static_cast<int64_t>(1L), (device_memory_usage.second / static_cast<int64_t>(base_batch_size)));
max_batch_size = mem_for_general / (mem_per_batch * static_cast<int64_t>(n_streams));
GPU_DEBUG_IF(debug_config->verbose >= 1) {
GPU_DEBUG_COUT << "[GPU_MAX_BATCH_SIZE] Base batch size: " << base_batch_size << std::endl;
GPU_DEBUG_COUT << "[GPU_MAX_BATCH_SIZE] Const mem usage: " << device_memory_usage.first << std::endl;
GPU_DEBUG_COUT << "[GPU_MAX_BATCH_SIZE] General mem usage: " << device_memory_usage.second << std::endl;
}
GPU_DEBUG_INFO << "[GPU_MAX_BATCH_SIZE] Base batch size: " << base_batch_size << std::endl;
GPU_DEBUG_INFO << "[GPU_MAX_BATCH_SIZE] Const mem usage: " << device_memory_usage.first << std::endl;
GPU_DEBUG_INFO << "[GPU_MAX_BATCH_SIZE] General mem usage: " << device_memory_usage.second << std::endl;
} catch (std::exception& e) {
GPU_DEBUG_IF(debug_config->verbose >= 1) {
GPU_DEBUG_COUT << "[GPU_MAX_BATCH_SIZE] Failed in reshape or build program " << e.what() << std::endl;
}
GPU_DEBUG_INFO << "[GPU_MAX_BATCH_SIZE] Failed in reshape or build program " << e.what() << std::endl;
}
return decltype(ov::max_batch_size)::value_type {static_cast<uint32_t>(max_batch_size)};
} else if (isModelCachingEnabled && name == METRIC_KEY(IMPORT_EXPORT_SUPPORT)) {
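The hunk above only changes how the MAX_BATCH_SIZE messages are emitted; the estimate itself stays the same: the memory left after constant data is divided by the per-batch memory cost times the number of streams. A self-contained sketch of that arithmetic with made-up numbers (estimate_max_batch and all values are illustrative, not part of the plugin):

#include <algorithm>
#include <cstdint>
#include <iostream>

// Rough shape of the estimate logged above: constant (per-network) memory is
// paid once, the remaining budget is split by per-batch cost and stream count.
int64_t estimate_max_batch(int64_t available_mem, int64_t const_mem,
                           int64_t general_mem_at_base, int64_t base_batch, int64_t n_streams) {
    int64_t mem_for_general = std::max<int64_t>(1, available_mem - const_mem);
    int64_t mem_per_batch   = std::max<int64_t>(1, general_mem_at_base / base_batch);
    return mem_for_general / (mem_per_batch * n_streams);
}

int main() {
    // e.g. 8 GiB free, 1 GiB of constants, 2 GiB of general memory at base batch 16, 2 streams
    std::cout << estimate_max_batch(8LL << 30, 1LL << 30, 2LL << 30, 16, 2) << std::endl;  // prints 28
}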


@ -375,11 +375,8 @@ bool Program::IsOpSupported(const InferenceEngine::CNNNetwork& network, const st
void Program::CreateSingleLayerPrimitive(cldnn::topology& topology, const std::shared_ptr<ngraph::Node>& op) {
OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "Program::CreateSingleLayerPrimitive");
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "Process " << "op::v" << op->get_type_info().version << "::" << op->get_type_name() << " operation "
<< "(friendly_name=" << op->get_friendly_name() << ")" << std::endl;
}
GPU_DEBUG_LOG << "Process " << "op::v" << op->get_type_info().version << "::" << op->get_type_name() << " operation "
<< "(friendly_name=" << op->get_friendly_name() << ")" << std::endl;
bool is_created = false;
const ngraph::NodeTypeInfo* op_type_info = &op->get_type_info();


@ -17,6 +17,8 @@ const char *debug_configuration::prefix = "GPU_Debug: ";
#ifdef GPU_DEBUG_CONFIG
#define GPU_DEBUG_COUT std::cout << cldnn::debug_configuration::prefix
template<typename T>
void print_option(std::string option_name, T option_value) {
GPU_DEBUG_COUT << "Config " << option_name << " = " << option_value << std::endl;


@ -265,11 +265,8 @@ std::shared_ptr<cldnn::engine> engine::create(engine_types engine_type,
default:
throw std::runtime_error("Invalid engine type");
}
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 1) {
const auto& info = device->get_info();
GPU_DEBUG_COUT << "Selected Device: " << info.dev_name << std::endl;
}
const auto& info = device->get_info();
GPU_DEBUG_INFO << "Selected Device: " << info.dev_name << std::endl;
return ret;
}
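After this refactor a call site is a single streaming statement: the explicit GPU_DEBUG_GET_INSTANCE / GPU_DEBUG_IF(debug_config->verbose >= N) wrapper around GPU_DEBUG_COUT disappears, as in the engine.cpp hunk above. A hypothetical new call site would look like this (report_queue_depth and queue_depth are invented names; the include path is the one added in the ocl_stream hunk further below):

#include "intel_gpu/runtime/debug_configuration.hpp"
#include <iostream>

void report_queue_depth(int queue_depth) {
    GPU_DEBUG_INFO << "queue depth: " << queue_depth << std::endl;                    // basic info
    GPU_DEBUG_TRACE_DETAIL << "queue depth (detail): " << queue_depth << std::endl;   // most verbose level
}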


@ -299,12 +299,9 @@ void kernels_cache::build_batch(const engine& build_engine, const batch_program&
dump_file << "*/\n";
}
if (!err_log.empty()) {
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose) {
std::cout << "-------- OpenCL build error" << std::endl;
std::cout << err_log << std::endl;
std::cout << "-------- End of OpenCL build error" << std::endl;
}
GPU_DEBUG_INFO << "-------- OpenCL build error" << std::endl;
GPU_DEBUG_INFO << err_log << std::endl;
GPU_DEBUG_INFO << "-------- End of OpenCL build error" << std::endl;
std::stringstream err_ss(err_log);
std::string line;
int cnt = 0;


@ -21,12 +21,9 @@ memory::memory(engine* engine, const layout& layout, allocation_type type, bool
: _engine(engine), _layout(layout), _bytes_count(_layout.bytes_count()), _type(type), _reused(reused) {
if (!_reused && _engine) {
_engine->add_memory_used(_bytes_count, type);
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 1) {
GPU_DEBUG_COUT << "Allocate " << _bytes_count << " bytes of " << type << " allocation type"
<< " (current=" << _engine->get_used_device_memory(type) << ";"
<< " max=" << _engine->get_max_used_device_memory(type) << ")" << std::endl;
}
GPU_DEBUG_LOG << "Allocate " << _bytes_count << " bytes of " << type << " allocation type"
<< " (current=" << _engine->get_used_device_memory(type) << ";"
<< " max=" << _engine->get_max_used_device_memory(type) << ")" << std::endl;
}
}
@ -35,12 +32,9 @@ memory::~memory() {
try {
_engine->subtract_memory_used(_bytes_count, _type);
} catch (...) {}
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 1) {
GPU_DEBUG_COUT << "Free " << _bytes_count << " bytes of " << _type << " allocation type"
<< " (current=" << _engine->get_used_device_memory(_type) << ";"
<< " max=" << _engine->get_max_used_device_memory(_type) << ")" << std::endl;
}
GPU_DEBUG_LOG << "Free " << _bytes_count << " bytes of " << _type << " allocation type"
<< " (current=" << _engine->get_used_device_memory(_type) << ";"
<< " max=" << _engine->get_max_used_device_memory(_type) << ")" << std::endl;
}
}
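Note that in both hunks the counters are updated before the message is built, so the logged current= value already includes the new allocation (or excludes the freed buffer). A minimal stand-alone illustration of that ordering (mock_engine and the byte count are hypothetical):

#include <algorithm>
#include <cstdint>
#include <iostream>

struct mock_engine {                         // stand-in for the engine's usage counters
    int64_t used = 0, peak = 0;
    void add_memory_used(int64_t b)      { used += b; peak = std::max(peak, used); }
    void subtract_memory_used(int64_t b) { used -= b; }
};

int main() {
    mock_engine e;
    const int64_t bytes = 4096;
    e.add_memory_used(bytes);                // counter first...
    std::cout << "Allocate " << bytes << " bytes (current=" << e.used
              << "; max=" << e.peak << ")\n";   // ...then log: current includes this buffer
    e.subtract_memory_used(bytes);
    std::cout << "Free " << bytes << " bytes (current=" << e.used
              << "; max=" << e.peak << ")\n";   // current no longer includes it
}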


@ -137,10 +137,7 @@ memory::ptr memory_pool::get_from_non_padded_pool(const layout& layout,
++it;
}
}
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[" << id << ": output]" << std::endl;
}
GPU_DEBUG_LOG << "[" << id << ": output]" << std::endl;
// didn't find anything for you? create new resource
auto mem = alloc_memory(layout, type);
{
@ -179,10 +176,7 @@ memory::ptr memory_pool::get_from_padded_pool(const layout& layout,
memory_record({{id, network_id}}, mem, network_id, type));
return mem;
}
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[" << id << ": output]" << std::endl;
}
GPU_DEBUG_LOG << "[" << id << ": output]" << std::endl;
auto mem = alloc_memory(layout, type);
std::list<memory_record> list = {memory_record({{id, network_id}}, mem, network_id, type)};
_padded_pool.emplace(layout, std::move(list));


@ -76,10 +76,7 @@ std::vector<cl_queue_properties> command_queues_builder::get_properties(const cl
using cmp_t = std::common_type<decltype(queue_properties), typename std::underlying_type<cl::QueueProperties>::type>::type;
if (!(static_cast<cmp_t>(queue_properties) & static_cast<cmp_t>(cl::QueueProperties::OutOfOrder))) {
out_of_order = false;
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 1) {
GPU_DEBUG_COUT << "Requested out-of-order queue is not supported by current device. Use in-order instead";
}
GPU_DEBUG_INFO << "Requested out-of-order queue is not supported by current device. Use in-order instead\n";
}
}


@ -180,10 +180,10 @@ bool ocl_events::get_profiling_info_impl(std::list<instrumentation::profiling_in
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->print_multi_kernel_perf) {
if (period.stage == instrumentation::profiling_stage::executing) {
GPU_DEBUG_COUT << "Multi-kernel time: ";
GPU_DEBUG_TRACE << "Multi-kernel time: ";
for (auto& duration : all_durations[period.stage])
std::cout << " " << (duration.second - duration.first) / 1000;
std::cout << " Total " << sum / 1000 << std::endl;
GPU_DEBUG_TRACE << " " << (duration.second - duration.first) / 1000;
GPU_DEBUG_TRACE << " Total " << sum / 1000 << std::endl;
}
}
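Assuming the usual OpenCL profiling timestamps in nanoseconds, the values printed by GPU_DEBUG_TRACE above are per-kernel and total times in microseconds ((end - start) / 1000). A small sketch with made-up timestamps that mirrors the printed format:

#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

int main() {
    // hypothetical (start, end) pairs in nanoseconds for two kernels of one primitive
    std::vector<std::pair<uint64_t, uint64_t>> durations = {{1000, 26000}, {30000, 41000}};
    uint64_t sum = 0;
    std::cout << "Multi-kernel time: ";
    for (auto& d : durations) {
        sum += d.second - d.first;
        std::cout << " " << (d.second - d.first) / 1000;   // per-kernel time, us
    }
    std::cout << " Total " << sum / 1000 << std::endl;      // whole line: "Multi-kernel time:  25 11 Total 36"
}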


@ -338,10 +338,7 @@ void* gpu_usm::lock(const stream& stream, mem_lock_type type) {
if (type != mem_lock_type::read) {
throw std::runtime_error("Unable to lock allocation_type::usm_device with write lock_type.");
}
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "Copy usm_device buffer to host buffer." << std::endl;
}
GPU_DEBUG_LOG << "Copy usm_device buffer to host buffer." << std::endl;
_host_buffer.allocateHost(_bytes_count);
cl_stream.get_usm_helper().enqueue_memcpy(cl_stream.get_cl_queue(), _host_buffer.get(), _buffer.get(), _bytes_count, CL_TRUE);
_mapped_ptr = _host_buffer.get();


@ -6,6 +6,7 @@
#include "ocl_event.hpp"
#include "ocl_user_event.hpp"
#include "ocl_command_queues_builder.hpp"
#include "intel_gpu/runtime/debug_configuration.hpp"
#include "ocl_kernel.hpp"
#include "ocl_common.hpp"
@ -49,6 +50,27 @@ inline cl::NDRange toNDRange(const std::vector<size_t>& v) {
}
}
cl_int set_kernel_arg(ocl_kernel_type& kernel, uint32_t idx, cldnn::memory::cptr mem) {
if (!mem)
return CL_INVALID_ARG_VALUE;
if (mem->get_layout().format.is_image_2d()) {
auto buf = std::dynamic_pointer_cast<const ocl::gpu_image2d>(mem)->get_buffer();
GPU_DEBUG_TRACE_DETAIL << "kernel: " << kernel.get() << " set arg (image) " << idx << " mem: " << buf.get() << " size: " << mem->size() << std::endl;
return kernel.setArg(idx, buf);
} else if (memory_capabilities::is_usm_type(mem->get_allocation_type())) {
auto buf = std::dynamic_pointer_cast<const ocl::gpu_usm>(mem)->get_buffer();
GPU_DEBUG_TRACE_DETAIL << "kernel: " << kernel.get() << " set arg (usm) " << idx << " mem: " << buf.get() << " size: " << mem->size() << std::endl;
return kernel.setArgUsm(idx, buf);
} else {
auto buf = std::dynamic_pointer_cast<const ocl::gpu_buffer>(mem)->get_buffer();
GPU_DEBUG_TRACE_DETAIL << "kernel: " << kernel.get() << " set arg (buffer) " << idx << " mem: " << buf.get() << " size: " << mem->size() << std::endl;
return kernel.setArg(idx, buf);
}
return CL_INVALID_ARG_VALUE;
}
void set_arguments_impl(ocl_kernel_type& kernel,
const arguments_desc& args,
const kernel_arguments_data& data) {
@ -59,121 +81,44 @@ void set_arguments_impl(ocl_kernel_type& kernel,
switch (args[i].t) {
case args_t::INPUT:
if (args[i].index < data.inputs.size() && data.inputs[args[i].index]) {
const auto& input_mem = data.inputs[args[i].index];
if (input_mem) {
if (input_mem->get_layout().format.is_image_2d())
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_image2d>(input_mem)->get_buffer());
else if (memory_capabilities::is_usm_type(input_mem->get_allocation_type()))
status = kernel.setArgUsm(i, std::dynamic_pointer_cast<const ocl::gpu_usm>(input_mem)->get_buffer());
else
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_buffer>(input_mem)->get_buffer());
}
status = set_kernel_arg(kernel, i, data.inputs[args[i].index]);
}
break;
case args_t::INPUT_OF_FUSED_PRIMITIVE:
if (args[i].index < data.fused_op_inputs.size() && data.fused_op_inputs[args[i].index]) {
const auto& input_mem = data.fused_op_inputs[args[i].index];
if (input_mem) {
if (memory_capabilities::is_usm_type(input_mem->get_allocation_type()))
status = kernel.setArgUsm(i, std::dynamic_pointer_cast<const ocl::gpu_usm>(input_mem)->get_buffer());
else
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_buffer>(input_mem)->get_buffer());
}
status = set_kernel_arg(kernel, i, data.fused_op_inputs[args[i].index]);
}
break;
case args_t::INTERNAL_BUFFER:
if (args[i].index < data.intermediates.size() && data.intermediates[args[i].index]) {
const auto& input_mem = data.intermediates[args[i].index];
if (input_mem) {
if (memory_capabilities::is_usm_type(input_mem->get_allocation_type()))
status = kernel.setArgUsm(i, std::dynamic_pointer_cast<const ocl::gpu_usm>(input_mem)->get_buffer());
else
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_buffer>(input_mem)->get_buffer());
}
status = set_kernel_arg(kernel, i, data.intermediates[args[i].index]);
}
break;
case args_t::OUTPUT:
if (args[i].index < data.outputs.size() && data.outputs[args[i].index]) {
const auto& output_mem = data.outputs[args[i].index];
if (output_mem) {
if (output_mem->get_layout().format.is_image_2d())
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_image2d>(output_mem)->get_buffer());
else if (memory_capabilities::is_usm_type(output_mem->get_allocation_type()))
status = kernel.setArgUsm(i, std::dynamic_pointer_cast<const ocl::gpu_usm>(output_mem)->get_buffer());
else
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_buffer>(output_mem)->get_buffer());
}
status = set_kernel_arg(kernel, i, data.outputs[args[i].index]);
}
break;
case args_t::WEIGHTS:
if (data.weights) {
if (data.weights->get_layout().format.is_image_2d())
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_image2d>(data.weights)->get_buffer());
else if (memory_capabilities::is_usm_type(data.weights->get_allocation_type()))
status = kernel.setArgUsm(i, std::dynamic_pointer_cast<const ocl::gpu_usm>(data.weights)->get_buffer());
else
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_buffer>(data.weights)->get_buffer());
}
status = set_kernel_arg(kernel, i, data.weights);
break;
case args_t::BIAS:
if (data.bias) {
if (memory_capabilities::is_usm_type(data.bias->get_allocation_type()))
status = kernel.setArgUsm(i, std::dynamic_pointer_cast<const ocl::gpu_usm>(data.bias)->get_buffer());
else
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_buffer>(data.bias)->get_buffer());
}
status = set_kernel_arg(kernel, i, data.bias);
break;
case args_t::WEIGHTS_ZERO_POINTS:
if (data.weights_zero_points) {
if (memory_capabilities::is_usm_type(data.weights_zero_points->get_allocation_type()))
status = kernel.setArgUsm(
i,
std::dynamic_pointer_cast<const ocl::gpu_usm>(data.weights_zero_points)->get_buffer());
else
status = kernel.setArg(
i,
std::dynamic_pointer_cast<const ocl::gpu_buffer>(data.weights_zero_points)->get_buffer());
}
status = set_kernel_arg(kernel, i, data.weights_zero_points);
break;
case args_t::ACTIVATIONS_ZERO_POINTS:
if (data.activations_zero_points) {
if (memory_capabilities::is_usm_type(data.activations_zero_points->get_allocation_type()))
status = kernel.setArgUsm(
i,
std::dynamic_pointer_cast<const ocl::gpu_usm>(data.activations_zero_points)->get_buffer());
else
status = kernel.setArg(
i,
std::dynamic_pointer_cast<const ocl::gpu_buffer>(data.activations_zero_points)->get_buffer());
}
status = set_kernel_arg(kernel, i, data.activations_zero_points);
break;
case args_t::COMPENSATION:
if (data.compensation) {
if (memory_capabilities::is_usm_type(data.compensation->get_allocation_type()))
status = kernel.setArgUsm(
i,
std::dynamic_pointer_cast<const ocl::gpu_usm>(data.compensation)->get_buffer());
else
status = kernel.setArg(
i,
std::dynamic_pointer_cast<const ocl::gpu_buffer>(data.compensation)->get_buffer());
}
status = set_kernel_arg(kernel, i, data.compensation);
break;
case args_t::SCALE_TABLE:
if (data.scale_table) {
if (memory_capabilities::is_usm_type(data.scale_table->get_allocation_type()))
status = kernel.setArgUsm(i, std::dynamic_pointer_cast<const ocl::gpu_usm>(data.scale_table)->get_buffer());
else
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_buffer>(data.scale_table)->get_buffer());
}
status = set_kernel_arg(kernel, i, data.scale_table);
break;
case args_t::SLOPE:
if (data.slope) {
if (memory_capabilities::is_usm_type(data.slope->get_allocation_type()))
status = kernel.setArgUsm(i, std::dynamic_pointer_cast<const ocl::gpu_usm>(data.slope)->get_buffer());
else
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_buffer>(data.slope)->get_buffer());
}
status = set_kernel_arg(kernel, i, data.slope);
break;
case args_t::SPLIT:
status = kernel.setArg(i, data.split);
@ -217,48 +162,17 @@ void set_arguments_impl(ocl_kernel_type& kernel,
}
}
break;
case args_t::RECURRENT: // RNN/LSTM/GRU layers
if (data.recurrent) {
if (data.recurrent->get_layout().format.is_image_2d())
status = kernel.setArg(i, dynamic_cast<const ocl::gpu_image2d&>(*data.recurrent).get_buffer());
else if (memory_capabilities::is_usm_type(data.recurrent->get_allocation_type()))
status = kernel.setArgUsm(i, dynamic_cast<const ocl::gpu_usm&>(*data.recurrent).get_buffer());
else
status = kernel.setArg(i, dynamic_cast<const ocl::gpu_buffer&>(*data.recurrent).get_buffer());
}
case args_t::RECURRENT:
status = set_kernel_arg(kernel, i, data.recurrent);
break;
case args_t::HIDDEN: // RNN/LSTM/GRU layers
if (data.hidden) {
if (data.hidden->get_layout().format.is_image_2d())
status = kernel.setArg(i, dynamic_cast<const ocl::gpu_image2d&>(*data.hidden).get_buffer());
else if (memory_capabilities::is_usm_type(data.hidden->get_allocation_type()))
status = kernel.setArgUsm(i, dynamic_cast<const ocl::gpu_usm&>(*data.hidden).get_buffer());
else
status = kernel.setArg(i, dynamic_cast<const ocl::gpu_buffer&>(*data.hidden).get_buffer());
}
case args_t::HIDDEN:
status = set_kernel_arg(kernel, i, data.hidden);
break;
case args_t::CELL: // LSTMlayers
if (data.cell) {
if (data.cell->get_layout().format.is_image_2d())
status = kernel.setArg(i, dynamic_cast<const ocl::gpu_image2d&>(*data.cell).get_buffer());
else if (memory_capabilities::is_usm_type(data.cell->get_allocation_type()))
status = kernel.setArgUsm(i, dynamic_cast<const ocl::gpu_usm&>(*data.cell).get_buffer());
else
status = kernel.setArg(i, dynamic_cast<const ocl::gpu_buffer&>(*data.cell).get_buffer());
}
case args_t::CELL:
status = set_kernel_arg(kernel, i, data.cell);
break;
case args_t::SHAPE_INFO:
if (args[i].index == 0 && data.shape_info) {
const auto& shape_info_mem = data.shape_info;
if (shape_info_mem) {
if (shape_info_mem->get_layout().format.is_image_2d())
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_image2d>(shape_info_mem)->get_buffer());
else if (memory_capabilities::is_usm_type(shape_info_mem->get_allocation_type()))
status = kernel.setArgUsm(i, std::dynamic_pointer_cast<const ocl::gpu_usm>(shape_info_mem)->get_buffer());
else
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_buffer>(shape_info_mem)->get_buffer());
}
}
status = set_kernel_arg(kernel, i, data.shape_info);
break;
default:
break;
@ -359,6 +273,7 @@ void ocl_stream::set_arguments(kernel& kernel, const kernel_arguments_desc& args
auto& kern = ocl_kernel.get_handle();
try {
GPU_DEBUG_TRACE_DETAIL << "Set arguments for primitive: " << args_desc.layerID << " (" << kern.get() << ")\n";
set_arguments_impl(kern, args_desc.arguments, args);
} catch (cl::Error const& err) {
throw ocl_error(err);
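The set_kernel_arg helper added in this file centralises the null check and the image / USM / plain-buffer dispatch, so every memory-backed case in set_arguments_impl collapses to one line (e.g. status = set_kernel_arg(kernel, i, data.weights);) and also gains the TRACE_DETAIL per-argument logging. A stand-alone sketch of that pattern with invented types (fake_kernel, fake_memory and set_arg_like are illustrative only, not the plugin's API):

#include <cstdint>
#include <iostream>
#include <memory>

enum class mem_kind { image2d, usm, buffer };
struct fake_memory { mem_kind kind; };

struct fake_kernel {
    int set_arg(uint32_t idx, const char* how)     { std::cout << "arg " << idx << ": " << how << "\n"; return 0; }
    int set_arg_usm(uint32_t idx, const char* how) { std::cout << "arg " << idx << ": " << how << "\n"; return 0; }
};

// Same shape as set_kernel_arg above: reject missing memory once, then pick
// the right setter in a single place instead of in every switch case.
int set_arg_like(fake_kernel& k, uint32_t idx, std::shared_ptr<const fake_memory> mem) {
    if (!mem)
        return -50;  // CL_INVALID_ARG_VALUE
    switch (mem->kind) {
    case mem_kind::image2d: return k.set_arg(idx, "image2d buffer");
    case mem_kind::usm:     return k.set_arg_usm(idx, "usm pointer");
    default:                return k.set_arg(idx, "cl buffer");
    }
}

int main() {
    fake_kernel k;
    set_arg_like(k, 0, std::make_shared<fake_memory>(fake_memory{mem_kind::usm}));
    set_arg_like(k, 1, nullptr);  // every caller gets the same error code for missing memory
}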