[GPU] Refactored GPU verbose utils (#14549)
* [GPU] Refactored GPU verbose utils * [GPU] More refactoring * Fix build
This commit is contained in:
parent e306cbc67a
commit 6bc4f00695
@@ -7,22 +7,63 @@
#include <mutex>
#include <vector>

namespace ov {
namespace intel_gpu {

// Verbose log levels:
// DISABLED - silent mode (Default)
// INFO - Minimal verbose:
//     * May log basic info about device, plugin configuration, model and execution statistics
//     * Mustn't log any info that depends on the number of iterations or the number of layers in the model
//     * Minimal impact on both load time and inference time
// LOG - Enables graph optimization verbose:
//     * Includes info from log level INFO
//     * May log info about applied graph transformations, memory allocations and other model compilation time steps
//     * May impact compile_model() execution time
//     * Minimal impact on inference time
// TRACE - Enables basic execution time verbose
//     * Includes info from log level LOG
//     * May log info during model execution
//     * May log short info about primitive execution
//     * May impact network execution time
// TRACE_DETAIL - Max verbosity
//     * Includes info from log level TRACE
//     * May log any stage and print detailed info about each execution step
enum class LogLevel : int8_t {
    DISABLED = 0,
    INFO = 1,
    LOG = 2,
    TRACE = 3,
    TRACE_DETAIL = 4
};

} // namespace intel_gpu
} // namespace ov

#ifdef GPU_DEBUG_CONFIG
#define GPU_DEBUG_IF(cond) if (cond)
#define GPU_DEBUG_PROFILED_STAGE(stage) \
    auto stage_prof = cldnn::instrumentation::profiled_stage<primitive_inst>(\
        !cldnn::debug_configuration::get_instance()->dump_profiling_data.empty(), *this, stage)
#define GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(val) stage_prof.set_cache_hit(val)

#define GPU_DEBUG_LOG_RAW_INT(min_verbose_level) if (cldnn::debug_configuration::get_instance()->verbose >= min_verbose_level) \
    std::cout << cldnn::debug_configuration::prefix
#define GPU_DEBUG_LOG_RAW(min_verbose_level) GPU_DEBUG_LOG_RAW_INT(static_cast<std::underlying_type<ov::intel_gpu::LogLevel>::type>(min_verbose_level))
#else
#define GPU_DEBUG_IF(cond) if (0)
#define GPU_DEBUG_PROFILED_STAGE(stage)
#define GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(val)
#define GPU_DEBUG_LOG_RAW(min_verbose_level) if (0) std::cout << cldnn::debug_configuration::prefix
#endif

#define GPU_DEBUG_COUT std::cout << cldnn::debug_configuration::prefix
// Macro below is inserted to avoid unused variable warning when GPU_DEBUG_CONFIG is OFF
#define GPU_DEBUG_GET_INSTANCE(name) auto name = cldnn::debug_configuration::get_instance(); (void)(name);

#define GPU_DEBUG_INFO GPU_DEBUG_LOG_RAW(ov::intel_gpu::LogLevel::INFO)
#define GPU_DEBUG_LOG GPU_DEBUG_LOG_RAW(ov::intel_gpu::LogLevel::LOG)
#define GPU_DEBUG_TRACE GPU_DEBUG_LOG_RAW(ov::intel_gpu::LogLevel::TRACE)
#define GPU_DEBUG_TRACE_DETAIL GPU_DEBUG_LOG_RAW(ov::intel_gpu::LogLevel::TRACE_DETAIL)

namespace cldnn {
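As an informal illustration (not part of the patch), a call site migrates from the old numeric verbose checks to the new level-named macros as sketched below; the numeric levels used by the old code map onto the enum as 1 = INFO, 2 = LOG, 3 = TRACE, 4 = TRACE_DETAIL, and node_id is a hypothetical variable:

// Sketch only, assuming this header is included and GPU_DEBUG_CONFIG is enabled.
void log_example(const std::string& node_id) {
    // Old pattern: fetch the debug_configuration singleton and compare against a raw number.
    GPU_DEBUG_GET_INSTANCE(debug_config);
    GPU_DEBUG_IF(debug_config->verbose >= 2) {
        GPU_DEBUG_COUT << node_id << ": removed" << std::endl;
    }
    // New pattern: the level is named by the macro, no local debug_config variable needed.
    GPU_DEBUG_LOG << node_id << ": removed" << std::endl;
}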
@@ -22,10 +22,8 @@

using namespace cldnn;

#define LOG_NODE_REMOVAL(id) GPU_DEBUG_IF(debug_config->verbose >= 2) { \
    GPU_DEBUG_COUT << "[remove_redundant_reorders:" << __LINE__ << "] " << "Remove node: " << (id) << std::endl; }
#define LOG_NODE_REPLACEMENT(id) GPU_DEBUG_IF(debug_config->verbose >= 2) { \
    GPU_DEBUG_COUT << "[remove_redundant_reorders:" << __LINE__ << "] " << "Replace node: " << (id) << std::endl; }
#define LOG_NODE_REMOVAL(id) GPU_DEBUG_LOG_PASS << "Remove node: " << (id) << std::endl;
#define LOG_NODE_REPLACEMENT(id) GPU_DEBUG_LOG_PASS << "Replace node: " << (id) << std::endl;

remove_redundant_reorders::remove_redundant_reorders(layout_optimizer& lo_ref, bool enable_reorder_fusing, bool update_implementations,
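For illustration only (using the GPU_DEBUG_LOG_PASS helper defined in pass_manager.h later in this diff), the refactored macro expands roughly as follows inside a pass, where get_name() returns the pass name:

// Rough expansion sketch, not part of the patch:
LOG_NODE_REMOVAL(node.id());
//  -> GPU_DEBUG_LOG_PASS << "Remove node: " << node.id() << std::endl;
//  -> GPU_DEBUG_LOG << "[" << get_name() << "] " << "Remove node: " << node.id() << std::endl;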
@@ -34,7 +32,6 @@ remove_redundant_reorders::remove_redundant_reorders(layout_optimizer& lo_ref, b
    remove_output_reorders(remove_output_reorders) {}

void remove_redundant_reorders::run(program& p) {
    GPU_DEBUG_GET_INSTANCE(debug_config);
    auto update_implementation = [&](program_node& node) {
        if (!update_implementations)
            return;
@@ -34,8 +34,6 @@ void reorder_inputs::run(program& p) { run(p, _lo, _rf); }
namespace {

std::map<program_node*, format::type> get_preferred_formats(program& p, layout_optimizer& lo) {
    GPU_DEBUG_GET_INSTANCE(debug_config);

    std::map<program_node*, format::type> fmt_map;

#ifdef ENABLE_ONEDNN_FOR_GPU
@@ -59,9 +57,7 @@ std::map<program_node*, format::type> get_preferred_formats(program& p, layout_o
    if (onednn_impls_counter < 1 && lo.get_optimization_attributes().use_onednn_impls) {
        should_update_fmt_map = true;
        lo.set_optimization_attribute(layout_optimizer::optimization_attributes_type::use_onednn_impls, 0);
        GPU_DEBUG_IF(debug_config->verbose >= 1) {
            GPU_DEBUG_COUT << "The return to clDNN implementations" << std::endl;
        }
        GPU_DEBUG_LOG << "Disable oneDNN implementations globally" << std::endl;
    }

    if (should_update_fmt_map)
@@ -277,10 +273,7 @@ void propagate_formats_rec(std::map<program_node*, format::type>& fmt_map,

    fmt = travel_direction_wrapper<dir>::first(first_fmt, second_fmt);
    fmt_map.at(node) = fmt;
    GPU_DEBUG_GET_INSTANCE(debug_config);
    GPU_DEBUG_IF(debug_config->verbose >= 2) {
        GPU_DEBUG_COUT << "[clDNN][reorder_inputs] propagate_formats_rec: " << node->id() << " - " << fmt_to_str(fmt) << std::endl;
    }
    GPU_DEBUG_LOG << "Propagate_formats_rec: " << node->id() << " - " << fmt_to_str(fmt) << std::endl;

    for (auto next : travel_direction_wrapper<dir>::next_nodes(node)) {
        if (!next->is_in_data_flow())
@@ -312,10 +305,7 @@ void propagate_formats_rec<direction_e::backwards>(std::map<program_node*, forma

    fmt = travel_direction_wrapper<direction_e::backwards>::first(first_fmt, second_fmt);
    fmt_map.at(node) = fmt;
    GPU_DEBUG_GET_INSTANCE(debug_config);
    GPU_DEBUG_IF(debug_config->verbose >= 2) {
        GPU_DEBUG_COUT << "[clDNN][reorder_inputs] propagate_formats_rec: " << node->id() << " - " << fmt_to_str(fmt) << std::endl;
    }
    GPU_DEBUG_LOG << "Propagate_formats_rec: " << node->id() << " - " << fmt_to_str(fmt) << std::endl;

    for (auto next : travel_direction_wrapper<direction_e::backwards>::next_nodes(node)) {
        if (!next.first->is_in_data_flow())
@@ -541,7 +531,6 @@ const char *dir_msg(direction_e dir) {
// If there is layout mismatch between two layers, add reorder
template <direction_e dir>
void insert_reorders_in_dir(program& p, const std::map<program_node*, format::type>& fmt_map, reorder_factory& rf, layout_optimizer& lo, program_node* node) {
    GPU_DEBUG_GET_INSTANCE(debug_config);
    auto fmt = fmt_map.at(node);

    auto next_cpy = travel_direction_wrapper<dir>::next_nodes(node);
@@ -565,10 +554,8 @@ void insert_reorders_in_dir(program& p, const std::map<program_node*, format::ty
        in_layout.format = get_target_output_format(lo, fmt_map, predecessor, successor);
        out_layout.format = get_target_input_format(lo, fmt_map, successor, predecessor);

        GPU_DEBUG_IF(debug_config->verbose >= 2) {
            GPU_DEBUG_COUT << __func__ << ":" << __LINE__ << ":" << dir_msg(dir) << " " << node->id() << " --> " << next->id() << " ## "
                << fmt_to_str(in_layout.format) << " --> " << fmt_to_str(out_layout.format) << std::endl;
        }
        GPU_DEBUG_LOG << __func__ << ":" << __LINE__ << ":" << dir_msg(dir) << " " << node->id() << " --> " << next->id() << " ## "
            << fmt_to_str(in_layout.format) << " --> " << fmt_to_str(out_layout.format) << std::endl;

        if (in_layout.format == format::any || out_layout.format == format::any)
            continue;
@@ -580,10 +567,8 @@ void insert_reorders_in_dir(program& p, const std::map<program_node*, format::ty
        auto reorder = reorder_pair.first;
        if (reorder && (in_layout.format != format::any && out_layout.format != format::any)) {
            auto& reorder_node = p.get_or_create(reorder);
            GPU_DEBUG_IF(debug_config->verbose >= 2) {
                GPU_DEBUG_COUT << __func__ << ":" << __LINE__ << ":" << dir_msg(dir) << " " << reorder_node.id()
                    << " Reorder is added" << std::endl;
            }
            GPU_DEBUG_LOG << __func__ << ":" << __LINE__ << ":" << dir_msg(dir) << " " << reorder_node.id()
                << " Reorder is added" << std::endl;
            p.add_intermediate(reorder_node,
                *travel_direction_wrapper<dir>::second(node, next),
                *travel_direction_wrapper<dir>::first(node, next),
@@ -595,7 +580,6 @@ void insert_reorders_in_dir(program& p, const std::map<program_node*, format::ty
template <>
void insert_reorders_in_dir<direction_e::backwards>(program& p, const std::map<program_node*, format::type>& fmt_map,
    reorder_factory& rf, layout_optimizer& lo, program_node* node) {
    GPU_DEBUG_GET_INSTANCE(debug_config);
    auto fmt = fmt_map.at(node);

    auto next_cpy = travel_direction_wrapper<direction_e::backwards>::next_nodes(node);
@@ -619,11 +603,9 @@ void insert_reorders_in_dir<direction_e::backwards>(program& p, const std::map<p
        in_layout.format = get_target_output_format(lo, fmt_map, predecessor, successor);
        out_layout.format = get_target_input_format(lo, fmt_map, successor, predecessor);

        GPU_DEBUG_IF(debug_config->verbose >= 2) {
            GPU_DEBUG_COUT << __func__ << ":" << __LINE__ << ":" << dir_msg(direction_e::backwards)
                << " " << node->id() << " --> " << next.first->id() << " ## "
                << fmt_to_str(in_layout.format) << " --> " << fmt_to_str(out_layout.format) << std::endl;
        }
        GPU_DEBUG_LOG << __func__ << ":" << __LINE__ << ":" << dir_msg(direction_e::backwards)
            << " " << node->id() << " --> " << next.first->id() << " ## "
            << fmt_to_str(in_layout.format) << " --> " << fmt_to_str(out_layout.format) << std::endl;

        if (in_layout.format == format::any || out_layout.format == format::any)
            continue;
@@ -635,10 +617,8 @@ void insert_reorders_in_dir<direction_e::backwards>(program& p, const std::map<p
        auto reorder = reorder_pair.first;
        if (reorder && (in_layout.format != format::any && out_layout.format != format::any)) {
            auto& reorder_node = p.get_or_create(reorder);
            GPU_DEBUG_IF(debug_config->verbose >= 2) {
                GPU_DEBUG_COUT << __func__ << ":" << __LINE__ << ":" << dir_msg(direction_e::backwards) << " " << reorder_node.id()
                    << " Reorder is added" << std::endl;
            }
            GPU_DEBUG_LOG << __func__ << ":" << __LINE__ << ":" << dir_msg(direction_e::backwards) << " " << reorder_node.id()
                << " Reorder is added" << std::endl;
            p.add_intermediate(reorder_node,
                *travel_direction_wrapper<direction_e::backwards>::second(node, next.first),
                *travel_direction_wrapper<direction_e::backwards>::first(node, next.first),
@@ -684,30 +664,26 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)

    auto fmt_map = get_preferred_formats(p, lo);

    GPU_DEBUG_IF(debug_config->verbose >= 2) {
        GPU_DEBUG_COUT << "[clDNN][reorder_inputs] Preferred formats:" << std::endl;
        for (auto& node_fmt : fmt_map) {
            if (node_fmt.second != format::any) {
                GPU_DEBUG_COUT << "[clDNN][reorder_inputs] " << node_fmt.first->id() << " " << fmt_to_str(node_fmt.second) << std::endl;
            }
    GPU_DEBUG_LOG_PASS << "Preferred formats:" << std::endl;
    for (auto& node_fmt : fmt_map) {
        if (node_fmt.second != format::any) {
            GPU_DEBUG_LOG_PASS << " " << node_fmt.first->id() << " " << fmt_to_str(node_fmt.second) << std::endl;
        }
    }

    propagate_formats(p, fmt_map, lo);
    minimize_local_reorders(p, fmt_map, lo);

    GPU_DEBUG_IF(debug_config->verbose >= 2) {
        GPU_DEBUG_COUT << "[clDNN][reorder_inputs] Selected formats:" << std::endl;
        for (auto node_ptr : p.get_processing_order()) {
            if (fmt_map.count(node_ptr) == 0)
                continue;
    GPU_DEBUG_LOG_PASS << "Selected formats:" << std::endl;
    for (auto node_ptr : p.get_processing_order()) {
        if (fmt_map.count(node_ptr) == 0)
            continue;

            auto fmt = fmt_map.at(node_ptr);
            GPU_DEBUG_COUT << "[clDNN][reorder_inputs] " << node_ptr->id() << " " << fmt_to_str(fmt) << std::endl;
        }
        auto fmt = fmt_map.at(node_ptr);
        GPU_DEBUG_LOG_PASS << " " << node_ptr->id() << " " << fmt_to_str(fmt) << std::endl;
    }

    GPU_DEBUG_IF(debug_config->verbose >= 1) {
    GPU_DEBUG_IF(debug_config->verbose >= 2) {
        reorder_cnt total_reorder_count = std::accumulate(
            p.get_processing_order().begin(),
            p.get_processing_order().end(),
@@ -719,8 +695,8 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
                return reorder_cnt{ total.number + count.number, total.total_sizes + count.total_sizes };
            });
        // Divide results by two as above function will each reorder from both sides
        GPU_DEBUG_COUT << "[clDNN][reorder_inputs] Total number of reorders: " << total_reorder_count.number / 2 << std::endl;
        GPU_DEBUG_COUT << "[clDNN][reorder_inputs] Total elements count of all reorders: " << total_reorder_count.total_sizes / 2 << std::endl;
        GPU_DEBUG_LOG_PASS << "Total number of reorders: " << total_reorder_count.number / 2 << std::endl;
        GPU_DEBUG_LOG_PASS << "Total elements count of all reorders: " << total_reorder_count.total_sizes / 2 << std::endl;

        // Count number of reorders that will be fused
        size_t nodes_with_fusing = 0;
@@ -736,8 +712,8 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
                }
            }
        }
        GPU_DEBUG_COUT << "[clDNN][reorder_inputs] Number of nodes with fused reorders: " << nodes_with_fusing << std::endl;
        GPU_DEBUG_COUT << "----------------------------------------------" << std::endl;
        GPU_DEBUG_LOG_PASS << "Number of nodes with fused reorders: " << nodes_with_fusing << std::endl;
        GPU_DEBUG_LOG_PASS << "----------------------------------------------" << std::endl;
    }

    insert_reorders(p, fmt_map, rf, lo);
@@ -802,7 +778,7 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
        }
    };

    const auto reorder_convolution = [&p, &lo, &rf, &debug_config](typed_program_node<convolution>& conv_node) {
    const auto reorder_convolution = [&p, &lo, &rf](typed_program_node<convolution>& conv_node) {
        {
            // reorder weights convolution
            auto& weights = conv_node.weights();
@@ -32,7 +32,6 @@ void select_preferred_formats::run(program& p) {
        return;

#ifdef ENABLE_ONEDNN_FOR_GPU
    GPU_DEBUG_GET_INSTANCE(debug_config);
    for (auto n : p.get_processing_order()) {
        // Onednn primitive descriptor creation may fail, for example, due to asymmetric weight.
        try {
@@ -47,9 +46,7 @@ void select_preferred_formats::run(program& p) {

            _lo.select_preferred_formats_for_onednn(*n, prim_desc);
        } catch(std::exception &exception) {
            GPU_DEBUG_IF(debug_config->verbose >= 1) {
                std::cout << "WARNING(select_preferred_formats): " << exception.what() << std::endl;
            }
            GPU_DEBUG_INFO << "WARNING(select_preferred_formats): " << exception.what() << std::endl;
        }
    }
#endif // ENABLE_ONEDNN_FOR_GPU
@@ -68,18 +68,15 @@ protected:
            dnnl::memory::desc desc = onednn::layout_to_memory_desc(a_zp->get_layout(), dnnl::memory::format_tag::a, true);
            args.insert({DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC, a_zp->get_onednn_memory(desc)});

            GPU_DEBUG_GET_INSTANCE(debug_config);
            GPU_DEBUG_IF(debug_config->verbose >= 3) {
                auto dnnl_mem = a_zp->get_onednn_memory(desc);
                void *mapped_ptr = dnnl_mem.map_data();
                if (mapped_ptr) {
                    GPU_DEBUG_COUT << instance.id() << " activations_zero_points: ";
                    for (size_t i = 0; i < desc.get_size(); ++i) {
                        std::cout << static_cast<int32_t*>(mapped_ptr)[i] << " ";
                    }
                    std::cout << std::endl;
                    dnnl_mem.unmap_data(mapped_ptr);
            auto dnnl_mem = a_zp->get_onednn_memory(desc);
            void *mapped_ptr = dnnl_mem.map_data();
            if (mapped_ptr) {
                GPU_DEBUG_TRACE_DETAIL << instance.id() << " activations_zero_points: ";
                for (size_t i = 0; i < desc.get_size(); ++i) {
                    GPU_DEBUG_TRACE_DETAIL << static_cast<int32_t*>(mapped_ptr)[i] << " ";
                }
                GPU_DEBUG_TRACE_DETAIL << std::endl;
                dnnl_mem.unmap_data(mapped_ptr);
            }
        }

@@ -88,18 +85,15 @@ protected:
            dnnl::memory::desc desc = onednn::layout_to_memory_desc(w_zp->get_layout(), dnnl::memory::format_tag::a, true);
            args.insert({DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_WEIGHTS, w_zp->get_onednn_memory(desc)});

            GPU_DEBUG_GET_INSTANCE(debug_config);
            GPU_DEBUG_IF(debug_config->verbose >= 3) {
                auto dnnl_mem = w_zp->get_onednn_memory(desc);
                void *mapped_ptr = dnnl_mem.map_data();
                if (mapped_ptr) {
                    GPU_DEBUG_COUT << instance.id() << " weights_zero_points: ";
                    for (size_t i = 0; i < desc.get_size(); ++i) {
                        std::cout << static_cast<int32_t*>(mapped_ptr)[i] << " ";
                    }
                    std::cout << std::endl;
                    dnnl_mem.unmap_data(mapped_ptr);
            auto dnnl_mem = w_zp->get_onednn_memory(desc);
            void *mapped_ptr = dnnl_mem.map_data();
            if (mapped_ptr) {
                GPU_DEBUG_TRACE_DETAIL << instance.id() << " weights_zero_points: ";
                for (size_t i = 0; i < desc.get_size(); ++i) {
                    GPU_DEBUG_TRACE_DETAIL << static_cast<int32_t*>(mapped_ptr)[i] << " ";
                }
                GPU_DEBUG_TRACE_DETAIL << std::endl;
                dnnl_mem.unmap_data(mapped_ptr);
            }
        }

@@ -23,6 +23,8 @@

#include <fstream>

#define GPU_DEBUG_LOG_PASS GPU_DEBUG_LOG << "[" << get_name() << "] "

namespace cldnn {
class base_pass {
    friend class pass_manager;
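A minimal sketch of how a pass can use this helper (hypothetical pass; it assumes base_pass stores the name passed to its constructor, which is what get_name() suggests):

// Sketch only: hypothetical pass using the pass-scoped logging helper.
class dummy_logging_pass : public base_pass {
public:
    dummy_logging_pass() : base_pass("dummy_logging_pass") {}
    void run(program& p) override {
        // Printed only when verbose >= LOG; the "[dummy_logging_pass] " prefix comes from GPU_DEBUG_LOG_PASS.
        GPU_DEBUG_LOG_PASS << "processing " << p.get_processing_order().size() << " nodes" << std::endl;
    }
};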
@@ -1350,8 +1350,8 @@ impl_types layout_optimizer::get_forced_impl_type_by_config(program_node& node)
            preferred_type = impl_types::cpu;

        if (node.id() == forced_impl_type.substr(0, found_type)) {
            GPU_DEBUG_COUT << " Forced implementation type : " << forced_impl_type.substr(0, found_type) << " : "
                << forced_impl_type.substr(found_type + 1) << std::endl;
            GPU_DEBUG_LOG << " Forced implementation type : " << forced_impl_type.substr(0, found_type) << " : "
                << forced_impl_type.substr(found_type + 1) << std::endl;
            return preferred_type;
        }
    }
@@ -1819,7 +1819,6 @@ format layout_optimizer::get_preferred_format(program_node& node) {

#ifdef ENABLE_ONEDNN_FOR_GPU
void layout_optimizer::select_preferred_formats_for_onednn(program_node& node, dnnl::primitive_desc prim_desc) {
    GPU_DEBUG_GET_INSTANCE(debug_config);
    if (node.is_input() || !are_data_types_suitable_for_onednn(node)) {
        return;
    }
@@ -1868,10 +1867,8 @@ void layout_optimizer::select_preferred_formats_for_onednn(program_node& node, d
            node.set_preferred_output_fmt(usr, dst_fmt);
        }

        GPU_DEBUG_IF(debug_config->verbose >= 2) {
            std::cout << "select_preferred_formats:" << node.id() << ": " << fmt_to_str(src_fmt) << " --> " << fmt_to_str(dst_fmt)
                << " For index : " << idx << std::endl;
        }
        GPU_DEBUG_LOG << "select_preferred_formats:" << node.id() << ": " << fmt_to_str(src_fmt) << " --> " << fmt_to_str(dst_fmt)
            << " For index : " << idx << std::endl;
    }
}

@@ -890,9 +890,8 @@ void network::execute_impl(const std::vector<event::ptr>& events) {
    OV_ITT_SCOPED_TASK(itt::domains::CLDNN, "NetworkImpl::Execute");
    // Wait for previous execution completion
    reset_execution(false);
    GPU_DEBUG_GET_INSTANCE(debug_config);
    GPU_DEBUG_IF(debug_config->verbose >= 1)
        GPU_DEBUG_COUT << "----------------------------------------------" << std::endl;
    GPU_DEBUG_TRACE << "----------------------------------------------" << std::endl;
    GPU_DEBUG_TRACE << "Start network execution" << std::endl;

    std::vector<memory::ptr> in_out_mem;
    auto is_surface_lock_check_needed = [&](const shared_mem_type& shared_mem_type) {
@@ -928,6 +927,7 @@ void network::execute_impl(const std::vector<event::ptr>& events) {
    auto surf_lock = surfaces_lock::create(get_engine().type(), in_out_mem, get_stream());

    set_arguments();
    GPU_DEBUG_GET_INSTANCE(debug_config);
    for (auto& inst : _exec_order) {
        GPU_DEBUG_IF(debug_config->dump_layers_path.length() > 0) {
            const std::string layer_name = inst->id();
@@ -1120,10 +1120,7 @@ void network::allocate_primitive_instance(program_node const& node) {
    if (_primitives.count(node.id()))
        return;

    GPU_DEBUG_GET_INSTANCE(debug_config);
    GPU_DEBUG_IF(debug_config->verbose >= 4) {
        GPU_DEBUG_COUT << node.id() << ": allocate primitive instance" << std::endl;
    }
    GPU_DEBUG_TRACE_DETAIL << node.id() << ": allocate primitive instance" << std::endl;

    auto inst = node.type()->create_instance(*this, node);

@@ -1191,10 +1188,7 @@ void network::transfer_memory_to_device(std::shared_ptr<primitive_inst> instance
        // Allocate and transfer memory
        auto device_mem = inst_mem.get_engine()->allocate_memory(inst_mem.get_layout(), allocation_type::usm_device, false);
        device_mem->copy_from(get_stream(), inst_mem);
        GPU_DEBUG_GET_INSTANCE(debug_config);
        GPU_DEBUG_IF(debug_config->verbose >= 2) {
            GPU_DEBUG_COUT << "[" << node.id() << ": constant]" << std::endl;
        }
        GPU_DEBUG_LOG << "[" << node.id() << ": constant]" << std::endl;
        _memory_pool->release_memory(&inst_mem, node.id(), get_id());
        instance->set_output_memory(device_mem);
    }
@@ -48,11 +48,13 @@ void pass_manager::run(program& p, base_pass& pass) {
    using ms = std::chrono::duration<double, std::ratio<1, 1000>>;
    using Time = std::chrono::high_resolution_clock;

    GPU_DEBUG_LOG << "Run pass " << pass.get_name() << std::endl;
    auto start = Time::now();
    pass.run(p);
    auto stop = Time::now();
    std::chrono::duration<float> fs = stop - start;
    ms opt_pass_time = std::chrono::duration_cast<ms>(fs);
    GPU_DEBUG_LOG << "Pass " << pass.get_name() << " execution time: " << opt_pass_time.count() << " ms" << std::endl;

    p.save_pass_info(pass.get_name());
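With verbosity at LOG or higher, each pass now produces a pair of lines roughly like the following (illustrative pass name and timing; the actual line prefix is cldnn::debug_configuration::prefix):

// Illustrative output only, not captured from a real run:
//   <prefix>Run pass remove_redundant_reorders
//   <prefix>Pass remove_redundant_reorders execution time: 1.27 ms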
@@ -142,7 +142,6 @@ void primitive_inst::set_output_memory(memory::ptr mem_new, bool check, size_t i
}

void primitive_inst::update_shape() {
    GPU_DEBUG_GET_INSTANCE(debug_config);
    GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::shape_inference);

    bool input_shape_changed = false;
@@ -199,9 +198,7 @@ void primitive_inst::update_shape() {
        // Events may be not created for in-order queue, so take them for OOO queue only
        if (_network.has_event(dep.id()) && queue_type == queue_types::out_of_order) {
            dependencies_events.push_back(_network.get_primitive_event(dep_id));
            GPU_DEBUG_IF(debug_config->verbose >= 4) {
                GPU_DEBUG_COUT << id() << ": shape infer waits for " << i << " dependency\n";
            }
            GPU_DEBUG_TRACE_DETAIL << id() << ": shape infer waits for " << i << " dependency\n";
        }
        auto dep_mem = _network.get_output_memory(dep_id);
        memory_deps.insert({i, dep_mem});
@@ -221,10 +218,8 @@ void primitive_inst::update_shape() {
    auto update_output_layout = [&](layout& layout, size_t idx) {
        layout.data_padding = padding::max(_node->get_primitive()->output_paddings[idx], layout.data_padding);
        if (_impl_params->get_output_layout(idx) != layout) {
            GPU_DEBUG_IF(debug_config->verbose >= 4) {
                GPU_DEBUG_COUT << id() << ": update shape: was: " << _impl_params->get_output_layout(idx).to_short_string()
                    << " now: " << layout.to_short_string() << std::endl;
            }
            GPU_DEBUG_TRACE_DETAIL << id() << ": update shape: was: " << _impl_params->get_output_layout(idx).to_short_string()
                << " now: " << layout.to_short_string() << std::endl;
            set_shape_change();
        }
        _impl_params->output_layouts[idx] = layout;
@@ -265,16 +260,12 @@ void primitive_inst::realloc_if_needed() {
    bool can_reuse_buffer = _outputs[0] && actual_layout.count() <= max_output_layout_size;

    if (can_reuse_buffer) {
        GPU_DEBUG_IF(debug_config->verbose >= 4) {
            GPU_DEBUG_COUT << id() << ": reuse previously allocated output buffer" << std::endl;
        }
        GPU_DEBUG_TRACE_DETAIL << id() << ": reuse previously allocated output buffer" << std::endl;
        _outputs[0] = _network.get_engine().reinterpret_buffer(*_outputs[0], actual_layout);
    } else {
        GPU_DEBUG_IF(debug_config->verbose >= 4) {
            GPU_DEBUG_COUT << id() << ": realloc output memory. "
                << " Current buffer_size=" << max_output_layout_size
                << " Requested buffer_size=" << actual_layout.count() << std::endl;
        }
        GPU_DEBUG_TRACE_DETAIL << id() << ": realloc output memory. "
            << " Current buffer_size=" << max_output_layout_size
            << " Requested buffer_size=" << actual_layout.count() << std::endl;
        _outputs = allocate_outputs(&updated_params);
        // TODO : need to handle multiple outputs
        max_output_layout_size = updated_params.output_layouts[0].count();
@@ -285,7 +276,6 @@ void primitive_inst::realloc_if_needed() {

void primitive_inst::update_impl() {
    GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::update_implementation);
    GPU_DEBUG_GET_INSTANCE(debug_config);
    auto prev_impl_str = _impl != nullptr ? _impl->get_kernel_name() : "nullptr";
    auto extend_to_6d = [this](ov::PartialShape ps) -> std::vector<size_t> {
        // For shape_of we extend shape with 1-s to 6d rank to make kernel simpler
@@ -321,7 +311,7 @@ void primitive_inst::update_impl() {
        return seed;
    };

    auto update_shape_info = [this, extend_to_6d, debug_config, prev_impl_str](const kernel_impl_params& params) {
    auto update_shape_info = [this, extend_to_6d, prev_impl_str](const kernel_impl_params& params) {
        mem_lock<int32_t> lock(_shape_info_memory, _network.get_stream());
        size_t offset = 0;
        for (size_t i = 0; i < _node->get_dependencies().size(); i++) {
@@ -337,17 +327,14 @@ void primitive_inst::update_impl() {
            for (size_t j = 0; j < output_shape.size(); j++)
                lock[offset++] = static_cast<int32_t>(output_shape[j]);
        }
        GPU_DEBUG_IF(debug_config->verbose >= 4) {
            std::stringstream s;
            s << "shapes: ";
            for (size_t i = 0; i < offset; i++)
                s << lock[i] << " ";
            GPU_DEBUG_COUT << id() << ": update dynamic impl " << prev_impl_str << " to new shape: " << s.str() << std::endl;
        }
        std::stringstream s;
        s << "shapes: ";
        for (size_t i = 0; i < offset; i++)
            s << lock[i] << " ";
        GPU_DEBUG_TRACE_DETAIL << id() << ": update dynamic impl " << prev_impl_str << " to new shape: " << s.str() << std::endl;
    };

    if (!_node->is_type<data>() && !(_node->is_type<mutable_data>() && _node->get_dependencies().empty())) {
        GPU_DEBUG_GET_INSTANCE(debug_config);
        // Update param if fake_alignment is available
        auto updated_params = _node->type()->get_fake_aligned_params(*_impl_params);
        auto layout_key = get_layout_key(updated_params);
@@ -359,10 +346,7 @@ void primitive_inst::update_impl() {
        if (has_cached_impl) {
            _impl = cache.get(layout_key)->clone();
            GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(true);

            GPU_DEBUG_IF(debug_config->verbose >= 4) {
                GPU_DEBUG_COUT << id() << ": get impl from cache " << _impl->get_kernel_name() << std::endl;
            }
            GPU_DEBUG_TRACE_DETAIL << id() << ": get impl from cache " << _impl->get_kernel_name() << std::endl;
        }
    }
    if (!has_cached_impl) {
@@ -402,10 +386,9 @@ void primitive_inst::update_impl() {
            kernels_cache.reset();
            std::lock_guard<std::mutex> lock(get_network().get_impl_cache_mutex());
            cache.add(layout_key, _impl->clone());
            GPU_DEBUG_IF(debug_config->verbose >= 4) {
                auto new_impl_str = _impl != nullptr ? _impl->get_kernel_name() : "nullptr";
                GPU_DEBUG_COUT << id() << ": update impl from " << prev_impl_str << " to " << new_impl_str << std::endl;
            }

            auto new_impl_str = _impl != nullptr ? _impl->get_kernel_name() : "nullptr";
            GPU_DEBUG_TRACE_DETAIL << id() << ": update impl from " << prev_impl_str << " to " << new_impl_str << std::endl;
        }
    }

@@ -436,14 +419,9 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
                subgraph->set_input_data(d.first->id(), actual_mem);
            }
        }
        GPU_DEBUG_IF(debug_config->verbose >= 4) {
            GPU_DEBUG_COUT << "[Start] Executing unfused subgraph of " << id() << std::endl;
        }

        GPU_DEBUG_TRACE_DETAIL << "[Start] Executing unfused subgraph of " << id() << std::endl;
        auto outputs = subgraph->execute(events);
        GPU_DEBUG_IF(debug_config->verbose >= 4) {
            GPU_DEBUG_COUT << "[End] Finished executing unfused subgraph of " << id() << std::endl;
        }
        GPU_DEBUG_TRACE_DETAIL << "[End] Finished executing unfused subgraph of " << id() << std::endl;

        auto last_fd = _impl_params->fused_desc.back();
        auto last_prim_id = last_fd.desc->id;
@@ -476,28 +454,7 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
    }
    on_execute();

    GPU_DEBUG_IF(debug_config->verbose >= 1) {
        std::ostringstream in_addr;
        // buffer_ptr() only support usm_memory
        for (size_t i = 0; i < this->dependencies().size(); i++) {
            auto in_mem = dep_memory_ptr(i);
            if (in_mem) {
                in_addr << in_mem->buffer_ptr();
                if (i < this->dependencies().size() - 1) {
                    in_addr << ", ";
                }
            }
        }
        auto out_mem = output_memory_ptr();
        auto out_alloc_type = out_mem ? out_mem->get_allocation_type() : allocation_type::unknown;
        auto out_ptr = out_mem ? out_mem->buffer_ptr() : nullptr;
        auto impl_name = _impl->get_kernel_name();

        GPU_DEBUG_COUT << id() << ": execute " << impl_name << ". Memory type: "
            << out_alloc_type << ", in_usm("
            << in_addr.str() << "), out_usm("
            << out_ptr << ")" << std::endl;
    }
    GPU_DEBUG_TRACE << id() << ": execute " << _impl->get_kernel_name() << std::endl;

    if (_exec_deps.empty() && dependencies.empty()) {
        dependencies = events;
@@ -695,10 +652,7 @@ void primitive_inst::allocate_internal_buffers(void) {
    // allocate intermediate memory for the updated layout of buffer
    std::vector<memory::cptr> intermediates_memory;
    for (auto layout : ibuf_layouts) {
        GPU_DEBUG_GET_INSTANCE(debug_config);
        GPU_DEBUG_IF(debug_config->verbose >= 2) {
            GPU_DEBUG_COUT << "[" << _node->id() << ": internal buf]" << std::endl;
        }
        GPU_DEBUG_LOG << "[" << _node->id() << ": internal buf]" << std::endl;
        auto alloc_type = allocation_type::unknown;
        if (input_device_mem && (available_device_mem_size - (int64_t)layout.bytes_count() >= 0)) {
            alloc_type = engine.get_preferred_memory_allocation_type();
@@ -719,8 +673,6 @@ event::ptr primitive_inst::update_weights() {
    if (!weightable_node)
        return nullptr;

    GPU_DEBUG_GET_INSTANCE(debug_config);

    auto& weights_params = _impl->_weights_reorder_params;
    bool requires_reorder = weights_params.engine != kernel_selector::GenericKernelParams::Engine::NONE &&
        (!_impl_params->reordered_weights || _impl_params->reordered_weights->get_layout() != from_weights_tensor(weights_params.dest));
@@ -744,17 +696,13 @@ event::ptr primitive_inst::update_weights() {
        if (layout_key != "") {
            auto& cache = get_network().get_in_mem_kernels_cache();
            if (cache.has(layout_key)) {
                GPU_DEBUG_IF(debug_config->verbose >= 4) {
                    GPU_DEBUG_COUT << id() << ": reorder weights (cached) from " << original_layout.to_short_string()
                        << " to " << expected_layout.to_short_string() << std::endl;
                }
                GPU_DEBUG_TRACE_DETAIL << id() << ": reorder weights (cached) from " << original_layout.to_short_string()
                    << " to " << expected_layout.to_short_string() << std::endl;
                GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(true);
                kernel = cache.get(layout_key);
            } else {
                GPU_DEBUG_IF(debug_config->verbose >= 4) {
                    GPU_DEBUG_COUT << id() << ": reorder weights from " << original_layout.to_short_string()
                        << " to " << expected_layout.to_short_string() << std::endl;
                }
                GPU_DEBUG_TRACE_DETAIL << id() << ": reorder weights from " << original_layout.to_short_string()
                    << " to " << expected_layout.to_short_string() << std::endl;
                auto& kernels_cache = get_network().get_kernels_cache();
                auto kernel_id = kernels_cache.set_kernel_source(weights_params.clKernel->code.kernelString, false);
                kernels_cache.compile();
@@ -768,9 +716,7 @@ event::ptr primitive_inst::update_weights() {

        bool can_reuse = _impl_params->reordered_weights != nullptr && _impl_params->reordered_weights->size() <= expected_layout.bytes_count();
        if (can_reuse) {
            GPU_DEBUG_IF(debug_config->verbose >= 4) {
                GPU_DEBUG_COUT << id() << ": reuse weights memory" << std::endl;
            }
            GPU_DEBUG_TRACE_DETAIL << id() << ": reuse weights memory" << std::endl;
            _impl_params->reordered_weights = engine.reinterpret_buffer(*_impl_params->reordered_weights, expected_layout);
        } else {
            auto alloc_type = engine.get_preferred_memory_allocation_type();
@@ -783,6 +729,7 @@ event::ptr primitive_inst::update_weights() {
        stream.set_arguments(*kernel, weights_params.clKernel->params, args);
        auto ev = stream.enqueue_kernel(*kernel, weights_params.clKernel->params, args, {}, true);

        GPU_DEBUG_GET_INSTANCE(debug_config);
        GPU_DEBUG_IF(!debug_config->dump_profiling_data.empty()) {
            stream.wait_for_events({ev});
        }
@@ -851,15 +798,12 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool,
    bool is_cpu = _node.get_selected_impl() ? _node.get_selected_impl()->is_cpu() : false;
    auto use_lockable_memory = is_output_buffer(_node) || is_cpu || is_any_user_cpu(_node.get_users()) ||
        !_engine.supports_allocation(allocation_type::usm_device);
    GPU_DEBUG_GET_INSTANCE(debug_config);
    const auto& lockable_mem_type = _engine.get_lockable_preferred_memory_allocation_type(layout.format.is_image_2d());
    const auto& alloc_type = use_lockable_memory ? lockable_mem_type
        : usm_device_allocatable ? allocation_type::usm_device : lockable_mem_type;

    if ((is_internal && (_node.can_be_optimized() || _node.is_type<generic_layer>())) || (memory_reuse_by_user == false)) {
        GPU_DEBUG_IF(debug_config->verbose >= 2) {
            GPU_DEBUG_COUT << "[" << _node.id() << ": output]" << std::endl;
        }
        GPU_DEBUG_LOG << "[" << _node.id() << ": output]" << std::endl;
        return get_memory_from_pool(_engine,
            layout,
            _node.id(),
@@ -868,21 +812,15 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool,
            false);
    } else if (is_internal && _node.is_output() && _node.is_type<generic_layer>() &&
               _engine.supports_allocation(allocation_type::usm_device) && usm_device_allocatable) {
        GPU_DEBUG_IF(debug_config->verbose >= 2) {
            GPU_DEBUG_COUT << "[" << _node.id() << ": output]" << std::endl;
        }
        GPU_DEBUG_LOG << "[" << _node.id() << ": output]" << std::endl;
        return _engine.allocate_memory(layout, allocation_type::usm_device, false);
    } else if (is_internal && !_node.is_output() && _node.is_type<input_layout>()) {
        // Skip memory reset for input_layout primitives, since data will be copied from cldnn::data primitive
        // or just reuse primitive's memory
        GPU_DEBUG_IF(debug_config->verbose >= 2) {
            GPU_DEBUG_COUT << "[" << _node.id() << ": constant]" << std::endl;
        }
        GPU_DEBUG_LOG << "[" << _node.id() << ": constant]" << std::endl;
        return _engine.allocate_memory(layout, alloc_type, false);
    } else if (is_internal || (!_node.can_share_buffer()) || _node.can_be_optimized() || _node.is_output()) {
        GPU_DEBUG_IF(debug_config->verbose >= 2) {
            GPU_DEBUG_COUT << "[" << _node.id() << ": output]" << std::endl;
        }
        GPU_DEBUG_LOG << "[" << _node.id() << ": output]" << std::endl;
        return _engine.allocate_memory(layout, alloc_type);
    } else {
        return get_memory_from_pool(_engine,
@@ -940,10 +878,7 @@ std::string primitive_inst::generic_to_string(program_node const& node, const ch
}

cldnn::network::ptr primitive_inst::get_unfused_subgraph() {
    GPU_DEBUG_GET_INSTANCE(debug_config);
    GPU_DEBUG_IF(debug_config->verbose >= 4) {
        GPU_DEBUG_COUT << id() << ": Use unfused subgraph due to unexpected fusions\n";
    }
    GPU_DEBUG_TRACE_DETAIL << id() << ": Use unfused subgraph due to unexpected fusions\n";
    if (!_unfused_subgraph) {
        topology t;

@@ -719,17 +719,12 @@ void program::transfer_memory_to_device() {

        if (alloc_type == allocation_type::usm_host || alloc_type == allocation_type::usm_shared) {
            GPU_DEBUG_GET_INSTANCE(debug_config);
            GPU_DEBUG_IF(debug_config->verbose >= 2) {
                GPU_DEBUG_COUT << "[" << data_node.id() << ": constant]" << std::endl;
            }
            GPU_DEBUG_LOG << "[" << data_node.id() << ": constant]" << std::endl;
            // Allocate and transfer memory
            auto device_mem = mem.get_engine()->allocate_memory(data_node_layout, allocation_type::usm_device, false);
            device_mem->copy_from(get_stream(), mem);
            data_node.attach_memory(device_mem);
            GPU_DEBUG_IF(debug_config->verbose >= 2) {
                GPU_DEBUG_COUT << "[" << data_node.id() << ": constant]" << std::endl;
            }
            GPU_DEBUG_LOG << "[" << data_node.id() << ": constant]" << std::endl;
            const_cast<memory::ptr&>(data_node.get_primitive()->mem).reset();
            // TODO: Do we need finish call here? Maybe call it in network::execute() ?
            get_stream().finish();
@@ -1656,7 +1651,6 @@ std::pair<int64_t, int64_t> program::get_estimated_device_mem_usage() {
    });
    auto& engine = get_engine();
    int64_t host_alloc = 0;
    GPU_DEBUG_GET_INSTANCE(debug_config);
    // just to prevent the memories from being freed during allocation
    std::unordered_set<memory::ptr> allocated_mem_ptrs;
    for (const auto& node : nodes_to_allocate) {
@@ -1676,10 +1670,8 @@ std::pair<int64_t, int64_t> program::get_estimated_device_mem_usage() {
    if (engine.get_device_info().dev_type == cldnn::device_type::integrated_gpu)
        total_host_alloc_size += engine.get_used_device_memory(allocation_type::usm_device);
    if ((cur_vmem != -1 && total_host_alloc_size > cur_vmem * 0.5) || (total_host_alloc_size >= max_global_mem_size)) {
        GPU_DEBUG_IF(debug_config->verbose >= 1) {
            GPU_DEBUG_COUT << "Estimated host mem usage calculated with default base batch size(16) exceeds the available memory ("
                << cur_vmem << ")" << std::endl;
        }
        GPU_DEBUG_INFO << "Estimated host mem usage calculated with default base batch size(16) exceeds the available memory ("
            << cur_vmem << ")" << std::endl;
        return {-1L, -1L};
    }
#endif
@@ -246,23 +246,18 @@ bool program_node::is_detached(bool whole_branch) {
}

layout program_node::calc_output_layout() const {
    GPU_DEBUG_GET_INSTANCE(debug_config);
    bool allow_new_shape_infer =
        get_program().get_options().get<build_option_type::allow_new_shape_infer>()->enabled();
    if (allow_new_shape_infer) {
        auto out_layouts = type()->calc_output_layouts(*this, *get_kernel_impl_params());
        if (!out_layouts.empty()) {
            GPU_DEBUG_IF(debug_config->verbose >= 4) {
                GPU_DEBUG_COUT << id() << ": calc_output_layout(new):" << out_layouts[0] << std::endl;
            }
            GPU_DEBUG_TRACE_DETAIL << id() << ": calc_output_layout(new):" << out_layouts[0] << std::endl;
            return out_layouts[0];
        }
    }

    auto res = type()->calc_output_layout(*this, *get_kernel_impl_params());
    GPU_DEBUG_IF(debug_config->verbose >= 4) {
        GPU_DEBUG_COUT << id() << ": calc_output_layout:" << res << std::endl;
    }
    GPU_DEBUG_TRACE_DETAIL << id() << ": calc_output_layout:" << res << std::endl;

    return res;
}
@@ -528,8 +523,6 @@ bool program_node::has_out_scales(const std::shared_ptr<dnnl::primitive_attr>& a

dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const std::shared_ptr<dnnl::primitive_attr>& attr,
    bool& optimization_is_completed) {
    GPU_DEBUG_GET_INSTANCE(debug_config);

    // Create new dnnl::post_ops object which will be filled inside the optimization process
    dnnl::post_ops optimized_p_ops;

@@ -673,22 +666,18 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const
    int64_t prev_post_op_idx = 0;
    bool optimization_done = false;

    GPU_DEBUG_IF(debug_config->verbose >= 3) {
        GPU_DEBUG_COUT << "================================================" << std::endl;
        GPU_DEBUG_COUT << " " << id() << ", num of post_ops " << p_ops.len() << std::endl;
        for (size_t i = 0; i < cur_post_ops.size(); i++)
            GPU_DEBUG_COUT << " " << i << ": " << cur_post_ops[i].op_type << std::endl;
    }
    GPU_DEBUG_TRACE << "================================================" << std::endl;
    GPU_DEBUG_TRACE << " " << id() << ", num of post_ops " << p_ops.len() << std::endl;
    for (size_t i = 0; i < cur_post_ops.size(); i++)
        GPU_DEBUG_TRACE << " " << i << ": " << cur_post_ops[i].op_type << std::endl;

    remove_optimized_prefix(cur_post_ops);

    GPU_DEBUG_IF(debug_config->verbose >= 3) {
        GPU_DEBUG_COUT << "remove optimized prefix ------------------------" << std::endl;
        GPU_DEBUG_COUT << " " << id() << ", num of post_ops " << p_ops.len() << std::endl;
        for (size_t i = 0; i < cur_post_ops.size(); i++)
            GPU_DEBUG_COUT << " " << i << ": " << cur_post_ops[i].op_type << std::endl;
        GPU_DEBUG_COUT << "----------------------------------->>>>>>>>>>>>>" << std::endl;
    }
    GPU_DEBUG_TRACE << "remove optimized prefix ------------------------" << std::endl;
    GPU_DEBUG_TRACE << " " << id() << ", num of post_ops " << p_ops.len() << std::endl;
    for (size_t i = 0; i < cur_post_ops.size(); i++)
        GPU_DEBUG_TRACE << " " << i << ": " << cur_post_ops[i].op_type << std::endl;
    GPU_DEBUG_TRACE << "----------------------------------->>>>>>>>>>>>>" << std::endl;

    // Get post-ops size for current node
    int64_t post_ops_size = cur_post_ops.size();
@@ -710,8 +699,7 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const
        auto cur_type = cur_post_ops[cur_post_op_idx].op_type;
        auto prev_type = cur_post_ops[prev_post_op_idx].op_type;

        GPU_DEBUG_IF(debug_config->verbose >= 3)
            GPU_DEBUG_COUT << "before prev_post_op_idx: " << prev_post_op_idx << ", cur_post_op_idx: " << cur_post_op_idx << std::endl;
        GPU_DEBUG_TRACE << "before prev_post_op_idx: " << prev_post_op_idx << ", cur_post_op_idx: " << cur_post_op_idx << std::endl;

        // Ignore optimized operations for "previous" operation in our operation pair
        while (type_is_any_optimized(prev_type) && prev_post_op_idx < post_ops_size - 1) {
@@ -728,8 +716,7 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const
            cur_type = cur_post_ops[cur_post_op_idx].op_type;
        }

        GPU_DEBUG_IF(debug_config->verbose >= 3)
            GPU_DEBUG_COUT << "after prev_post_op_idx: " << prev_post_op_idx << ", cur_post_op_idx: " << cur_post_op_idx << std::endl;
        GPU_DEBUG_TRACE << "after prev_post_op_idx: " << prev_post_op_idx << ", cur_post_op_idx: " << cur_post_op_idx << std::endl;

        auto cur_idx = static_cast<int>(has_out_scales(attr) ? (cur_post_op_idx >= 1 ? cur_post_op_idx - 1 : 0) : cur_post_op_idx);
        auto prev_idx = static_cast<int>(has_out_scales(attr) ? (prev_post_op_idx >= 1 ? prev_post_op_idx - 1 : 0) : prev_post_op_idx);
@@ -766,10 +753,8 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const

        bool cur_ops_pair_is_optimized = false;

        GPU_DEBUG_IF(debug_config->verbose >= 3) {
            GPU_DEBUG_COUT << "prev_idx: " << prev_idx << " " << prev_type
                << ", cur_idx: " << cur_idx << " " << cur_type << std::endl;
        }
        GPU_DEBUG_TRACE << "prev_idx: " << prev_idx << " " << prev_type
            << ", cur_idx: " << cur_idx << " " << cur_type << std::endl;

        if (can_try_optimize) {
            if (eltw_and_eltw) {
@@ -1005,12 +990,10 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const
        remove_optimized_prefix(cur_post_ops);
    }

    GPU_DEBUG_IF(debug_config->verbose >= 3) {
        GPU_DEBUG_COUT << ">>>>>>>>>>>>>-----------------------------------" << std::endl;
        for (size_t i = 0; i < cur_post_ops.size(); i++)
            GPU_DEBUG_COUT << " " << i << ": " << cur_post_ops[i].op_type << std::endl;
        GPU_DEBUG_COUT << "------------------------------------------------" << std::endl;
    }
    GPU_DEBUG_TRACE << ">>>>>>>>>>>>>-----------------------------------" << std::endl;
    for (size_t i = 0; i < cur_post_ops.size(); i++)
        GPU_DEBUG_TRACE << " " << i << ": " << cur_post_ops[i].op_type << std::endl;
    GPU_DEBUG_TRACE << "------------------------------------------------" << std::endl;

    add_onednn_fused_primitives(cur_post_ops);

@@ -107,12 +107,8 @@ KernelsData kernel_selector_base::GetNaiveBestKernel(const Params& params,
            }
        } catch (std::runtime_error& ex) {
            // we have to handle it in order to avoid exception in KernelSelector as much we can
            GPU_DEBUG_GET_INSTANCE(debug_config);
            GPU_DEBUG_IF(debug_config->verbose >= 3) {
                kernelName = (implementation != nullptr)? implementation->GetName() : "[impl is null]";
                GPU_DEBUG_COUT << "layerID: " << params.layerID << " kenrel: "
                    << kernelName << " - " << ex.what() << std::endl;
            }
            kernelName = (implementation != nullptr)? implementation->GetName() : "[impl is null]";
            GPU_DEBUG_TRACE << "layerID: " << params.layerID << " kenrel: " << kernelName << " - " << ex.what() << std::endl;
        }
    }

@@ -202,12 +198,8 @@ KernelsData kernel_selector_base::GetAutoTuneBestKernel(const Params& params,
                }
            } catch (std::runtime_error& ex) {
                // we have to handle it in order to avoid exception in KernelSelector as much we can
                GPU_DEBUG_GET_INSTANCE(debug_config);
                GPU_DEBUG_IF(debug_config->verbose >= 3) {
                    kernelName = (implementation != nullptr)? implementation->GetName() : "[impl is null]";
                    GPU_DEBUG_COUT << "layerID: " << params.layerID << " kenrel: "
                        << kernelName << " - " << ex.what() << std::endl;
                }
                kernelName = (implementation != nullptr)? implementation->GetName() : "[impl is null]";
                GPU_DEBUG_TRACE << "layerID: " << params.layerID << " kenrel: " << kernelName << " - " << ex.what() << std::endl;
            }
        }
    }
@@ -231,12 +223,8 @@ KernelsData kernel_selector_base::GetAutoTuneBestKernel(const Params& params,
                }
            } catch (std::runtime_error& ex) {
                // we have to handle it in order to avoid exception in KernelSelector as much we can
                GPU_DEBUG_GET_INSTANCE(debug_config);
                GPU_DEBUG_IF(debug_config->verbose >= 3) {
                    kernelName = (implementation != nullptr)? implementation->GetName() : "[impl is null]";
                    GPU_DEBUG_COUT << "layerID: " << params.layerID << " kenrel: "
                        << kernelName << " - " << ex.what() << std::endl;
                }
                kernelName = (implementation != nullptr)? implementation->GetName() : "[impl is null]";
                GPU_DEBUG_TRACE << "layerID: " << params.layerID << " kenrel: " << kernelName << " - " << ex.what() << std::endl;
            }
        }
    }
@@ -493,7 +493,7 @@ void InferRequest::enqueue() {
        // If dump layers path is set, only runs first inference.
        GPU_DEBUG_GET_INSTANCE(debug_config);
        GPU_DEBUG_IF(debug_config->dump_layers_path.length() > 0) {
            GPU_DEBUG_COUT << "Only run first inference to dump layers." << std::endl;
            GPU_DEBUG_INFO << "Only run first inference to dump layers." << std::endl;
            exit(0);
        }
    }
@@ -760,10 +760,7 @@ void InferRequest::allocate_inputs() {
        OPENVINO_ASSERT(litr != inputLayouts.end(), "[GPU] Input layout for ", name, " is not found");
        const auto input_layout = litr->second;

        GPU_DEBUG_GET_INSTANCE(debug_config);
        GPU_DEBUG_IF(debug_config->verbose >= 2) {
            GPU_DEBUG_COUT << "[" << name << ": input blob]" << std::endl;
        }
        GPU_DEBUG_LOG << "[" << name << ": input blob]" << std::endl;
        if (desc.getPrecision() == Precision::I16 || desc.getPrecision() == Precision::U16) {
            TensorDesc desc_fp32 = desc;
            desc_fp32.setPrecision(Precision::FP32);
@@ -796,10 +793,7 @@ void InferRequest::allocate_outputs() {
        if (output_layout.is_static())
            desc.setDims(m_graph->GetOutputSize(no.first));

        GPU_DEBUG_GET_INSTANCE(debug_config);
        GPU_DEBUG_IF(debug_config->verbose >= 2) {
            GPU_DEBUG_COUT << "[" << no.first << ": output blob]" << std::endl;
        }
        GPU_DEBUG_LOG << "[" << no.first << ": output blob]" << std::endl;

        outputsMap[no.first] = outputID;
        if (desc.getPrecision() == Precision::I16 || desc.getPrecision() == Precision::U16 ||
@@ -769,7 +769,7 @@ void InferRequestLegacy::enqueue() {
        // If dump layers path is set, only runs first inference.
        GPU_DEBUG_GET_INSTANCE(debug_config);
        GPU_DEBUG_IF(debug_config->dump_layers_path.length() > 0) {
            GPU_DEBUG_COUT << "Only run first inference to dump layers." << std::endl;
            GPU_DEBUG_INFO << "Only run first inference to dump layers." << std::endl;
            exit(0);
        }
    }
@@ -1031,10 +1031,7 @@ void InferRequestLegacy::allocate_inputs() {
            IE_THROW() << "Input layout for " << name << " is not found";
        }

        GPU_DEBUG_GET_INSTANCE(debug_config);
        GPU_DEBUG_IF(debug_config->verbose >= 2) {
            GPU_DEBUG_COUT << "[" << name << ": input blob]" << std::endl;
        }
        GPU_DEBUG_LOG << "[" << name << ": input blob]" << std::endl;
        if (desc.getPrecision() == Precision::I16 || desc.getPrecision() == Precision::U16) {
            TensorDesc desc_fp32 = desc;
            desc_fp32.setPrecision(Precision::FP32);
@@ -1088,10 +1085,7 @@ void InferRequestLegacy::allocate_outputs() {
        // Can be removed once 76176 is resolved.
        desc.setDims(m_graph->GetOutputSize(no.first));

        GPU_DEBUG_GET_INSTANCE(debug_config);
        GPU_DEBUG_IF(debug_config->verbose >= 2) {
            GPU_DEBUG_COUT << "[" << no.first << ": output blob]" << std::endl;
        }
        GPU_DEBUG_LOG << "[" << no.first << ": output blob]" << std::endl;

        outputsMap[no.first] = outputID;
        if (desc.getPrecision() == Precision::I16 || desc.getPrecision() == Precision::U16 ||
@@ -201,10 +201,7 @@ void createClDnnConstant(Program& p, const ngraph::Shape& constDims, const std::
        p.primitive_ids[initialconstPrimID] = constPrimID;
        p.profiling_ids.push_back(initialconstPrimID);
    } else {
        GPU_DEBUG_GET_INSTANCE(debug_config);
        GPU_DEBUG_IF(debug_config->verbose >= 2) {
            GPU_DEBUG_COUT << "[" << initialconstPrimID << ": constant]" << std::endl;
        }
        GPU_DEBUG_LOG << "[" << initialconstPrimID << ": constant]" << std::endl;
        cldnn::memory::ptr mem = p.GetEngine().allocate_memory(constLayout, false);
        auto& stream = p.GetEngine().get_program_stream();
        cldnn::mem_lock<char> lock{mem, stream};
@@ -73,10 +73,7 @@ static void CreateCommonCTCGreedyDecoderOp(Program& p, const std::shared_ptr<ngr
            cldnn::format::get_default_format(op->get_output_shape(1).size()),
            tensor_from_dims(op->get_output_shape(1)));

        GPU_DEBUG_GET_INSTANCE(debug_config);
        GPU_DEBUG_IF(debug_config->verbose >= 2) {
            GPU_DEBUG_COUT << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl;
        }
        GPU_DEBUG_LOG << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl;
        shared_memory.emplace_back(p.GetEngine().allocate_memory(mutableLayout));

        cldnn::primitive_id ctc_gd_mutable_id_w = layer_type_name_ID(op) + "_md_write";
@@ -93,7 +93,6 @@ static void CreateNonMaxSuppressionIEInternalOp(Program& p, const std::shared_pt
    auto outputIndices = op->get_output_partial_shape(0)[0].get_length();

    std::vector<cldnn::memory::ptr> shared_memory;
    GPU_DEBUG_GET_INSTANCE(debug_config);
    switch (num_outputs) {
        case 3: {
            auto mutable_precision_second = op->get_output_element_type(2);
@@ -105,9 +104,7 @@ static void CreateNonMaxSuppressionIEInternalOp(Program& p, const std::shared_pt
                cldnn::format::get_default_format(op->get_output_shape(2).size()),
                tensor_from_dims(op->get_output_shape(2)));

            GPU_DEBUG_IF(debug_config->verbose >= 2) {
                GPU_DEBUG_COUT << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl;
            }
            GPU_DEBUG_LOG << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl;
            shared_memory.emplace_back(p.GetEngine().allocate_memory(mutableLayoutSecond));

            cldnn::primitive_id non_max_supression_mutable_id_w_second = layer_type_name_ID(op) + "_md_write_second";
@@ -123,9 +120,7 @@ static void CreateNonMaxSuppressionIEInternalOp(Program& p, const std::shared_pt
                cldnn::format::bfyx,
                cldnn::tensor(static_cast<int32_t>(outputIndices), 3, 1, 1));

            GPU_DEBUG_IF(debug_config->verbose >= 2) {
                GPU_DEBUG_COUT << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl;
            }
            GPU_DEBUG_LOG << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl;
            shared_memory.emplace_back(p.GetEngine().allocate_memory(mutableLayoutFirst));

            cldnn::primitive_id non_max_supression_mutable_id_w_first = layer_type_name_ID(op) + "_md_write_first";
|
@ -64,10 +64,7 @@ static void CreateProposalOp(Program& p, const std::shared_ptr<ngraph::op::v0::P
cldnn::format::get_default_format(op->get_output_shape(1).size()),
tensor_from_dims(op->get_output_shape(1)));

GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl;
}
GPU_DEBUG_LOG << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl;
auto shared_memory = p.GetEngine().allocate_memory(mutableLayout);

cldnn::primitive_id proposal_mutable_id_w = layer_type_name_ID(op) + "_md_write";
@ -65,10 +65,7 @@ static void CreateTopKOp(Program& p, const std::shared_ptr<ngraph::op::v1::TopK>
cldnn::format::get_default_format(op->get_output_shape(1).size()),
tensor_from_dims(op->get_output_shape(1)));

GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl;
}
GPU_DEBUG_LOG << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl;
auto shared_memory = p.GetEngine().allocate_memory(mutableLayout);

cldnn::primitive_id argmax_mutable_id_w = layer_type_name_ID(op) + "_md_write";
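Each of the op-creation hunks above collapses the same guarded-print pattern into a single statement. A minimal before/after sketch of that call-site shape, reusing the mutable-data message from these hunks:

// Before: fetch the debug_configuration singleton and check the level by hand
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 2) {
    GPU_DEBUG_COUT << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl;
}
// After: one streaming statement; the macro carries the level check itself
GPU_DEBUG_LOG << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl;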
@ -686,25 +686,20 @@ Parameter Plugin::GetMetric(const std::string& name, const std::map<std::string,
auto closest_pow_of_2 = [] (float x) {
return pow(2, floor(std::log(x)/std::log(2)));
};
GPU_DEBUG_GET_INSTANCE(debug_config);
auto model_param = options.find(ov::hint::model.name());
if (model_param == options.end()) {
GPU_DEBUG_IF(debug_config->verbose >= 1) {
GPU_DEBUG_COUT << "[GPU_OPTIMAL_BATCH_SIZE] ov::hint::model is not set: return 1" << std::endl;
}
GPU_DEBUG_INFO << "[OPTIMAL_BATCH_SIZE] ov::hint::model is not set: return 1" << std::endl;
return decltype(ov::optimal_batch_size)::value_type {static_cast<unsigned int>(1)};
}
std::shared_ptr<ngraph::Function> model;
try {
model = model_param->second.as<std::shared_ptr<ngraph::Function>>();
} catch (...) {
IE_THROW() << "[GPU_OPTIMAL_BATCH_SIZE] ov::hint::model should be std::shared_ptr<ov::Model> type";
}
GPU_DEBUG_IF(debug_config->verbose >= 1) {
GPU_DEBUG_COUT << "DEVICE_INFO:"
<< "gfx_version.major, " << device_info.gfx_ver.major
<< "gfx_version.minor " << std::to_string(device_info.gfx_ver.minor) << std::endl;
IE_THROW() << "[OPTIMAL_BATCH_SIZE] ov::hint::model should be std::shared_ptr<ov::Model> type";
}
GPU_DEBUG_INFO << "DEVICE_INFO:"
<< "gfx_version.major, " << device_info.gfx_ver.major
<< "gfx_version.minor " << std::to_string(device_info.gfx_ver.minor) << std::endl;
static std::map<cldnn::gfx_version, size_t> gen_kbytes_per_bank = {
{{12, 0, 0}, 480}, // TGL
{{12, 1, 0}, 2048}, // DG1
@ -722,14 +717,12 @@ Parameter Plugin::GetMetric(const std::string& name, const std::map<std::string,
? next_pow_of_2(device_info.num_sub_slices_per_slice)
: 2 * device_info.num_sub_slices_per_slice;
L3_cache_size = kbytes_per_bank * 1024 * num_banks_per_slice * device_info.num_slices;
GPU_DEBUG_IF(debug_config->verbose >= 1) {
GPU_DEBUG_COUT << "DEVICE_INFO:"
<< "num_slices " << device_info.num_slices
<< ", num_sub_slices_per_slice " << device_info.num_sub_slices_per_slice
<< ", num_banks_per_slice " << num_banks_per_slice
<< ", gen_kbytes_per_bank : " << kbytes_per_bank
<< ", L3_cache_size is (MB): " << float(L3_cache_size) / 1024 / 1024 << std::endl;
}
GPU_DEBUG_INFO << "DEVICE_INFO:"
<< "num_slices " << device_info.num_slices
<< ", num_sub_slices_per_slice " << device_info.num_sub_slices_per_slice
<< ", num_banks_per_slice " << num_banks_per_slice
<< ", gen_kbytes_per_bank : " << kbytes_per_bank
<< ", L3_cache_size is (MB): " << float(L3_cache_size) / 1024 / 1024 << std::endl;
}
Config config = _impl->m_configs.GetConfig(device_id);
auto networkCloned = CloneAndTransformNetwork(CNNNetwork(model), config);
@ -744,11 +737,9 @@ Parameter Plugin::GetMetric(const std::string& name, const std::map<std::string,
unsigned int closest = closest_pow_of_2(max_batch_size);
batch = std::min(closest, batch);
batch = std::min(256u, batch); //batch 256 is a max
GPU_DEBUG_IF(debug_config->verbose >= 1) {
GPU_DEBUG_COUT << memPressure.max_mem_tolerance << std::endl;
GPU_DEBUG_COUT << "MAX_BATCH: " << max_batch_size << std::endl;
GPU_DEBUG_COUT << "ACTUAL OPTIMAL BATCH: " << batch << std::endl;
}
GPU_DEBUG_INFO << memPressure.max_mem_tolerance << std::endl;
GPU_DEBUG_INFO << "MAX_BATCH: " << max_batch_size << std::endl;
GPU_DEBUG_INFO << "ACTUAL OPTIMAL BATCH: " << batch << std::endl;
return decltype(ov::optimal_batch_size)::value_type {batch};
} else if (name == ov::device::uuid) {
ov::device::UUID uuid = {};
@ -815,17 +806,13 @@ Parameter Plugin::GetMetric(const std::string& name, const std::map<std::string,
}

int64_t available_device_mem = device_info.max_global_mem_size - occupied_device_mem;
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[GPU_MAX_BATCH_SIZE] available memory is " << available_device_mem
<< " (occupied: " << occupied_device_mem << ")" << std::endl;
}
GPU_DEBUG_LOG << "[GPU_MAX_BATCH_SIZE] available memory is " << available_device_mem
<< " (occupied: " << occupied_device_mem << ")" << std::endl;

int64_t max_batch_size = 1;

if (options.find(ov::hint::model.name()) == options.end()) {
GPU_DEBUG_IF(debug_config->verbose >= 1) {
GPU_DEBUG_COUT << "[GPU_MAX_BATCH_SIZE] MODELS_PTR is not set: return 1" << std::endl;
}
GPU_DEBUG_INFO << "[GPU_MAX_BATCH_SIZE] MODELS_PTR is not set: return 1" << std::endl;
return decltype(ov::max_batch_size)::value_type {static_cast<uint32_t>(max_batch_size)};
}

@ -849,17 +836,13 @@ Parameter Plugin::GetMetric(const std::string& name, const std::map<std::string,
}
}

GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[GPU_MAX_BATCH_SIZE] n_streams : " << n_streams << std::endl;
}
GPU_DEBUG_INFO << "[GPU_MAX_BATCH_SIZE] n_streams : " << n_streams << std::endl;

auto available_device_mem_it = options.find(ov::intel_gpu::hint::available_device_mem.name());
if (available_device_mem_it != options.end()) {
if (available_device_mem_it->second.is<int64_t>()) {
available_device_mem = std::min(static_cast<int64_t>(available_device_mem), available_device_mem_it->second.as<int64_t>());
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[GPU_MAX_BATCH_SIZE] available memory is reset by user " << available_device_mem << std::endl;
}
GPU_DEBUG_LOG << "[GPU_MAX_BATCH_SIZE] available memory is reset by user " << available_device_mem << std::endl;
} else {
IE_THROW() << "[GPU_MAX_BATCH_SIZE] bad casting: ov::intel_gpu::hint::available_device_mem should be int64_t type";
}
@ -910,9 +893,7 @@ Parameter Plugin::GetMetric(const std::string& name, const std::map<std::string,
const auto& shape = input->get_partial_shape();
// currently no plugin support batched execution for dynamic networks
if (shape.is_dynamic()) {
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[MAX_BATCH_SIZE] does not support dynamic networks" << std::endl;
}
GPU_DEBUG_LOG << "[MAX_BATCH_SIZE] does not support dynamic networks" << std::endl;
return decltype(ov::max_batch_size)::value_type {static_cast<uint32_t>(max_batch_size)};
}

@ -921,10 +902,8 @@ Parameter Plugin::GetMetric(const std::string& name, const std::map<std::string,
if (ov::DimensionTracker::get_label(shape[s])) {
// batched dim for the input
auto batched_input_id = ngraph::op::util::get_ie_output_name(params[input_id]->output(0));
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[MAX_BATCH_SIZE] detected batched input " << batched_input_id
<< "[" << s << "]" << std::endl;
}
GPU_DEBUG_LOG << "[MAX_BATCH_SIZE] detected batched input " << batched_input_id
<< "[" << s << "]" << std::endl;
batched_inputs.insert(std::make_pair(batched_input_id, s));
}
}
@ -932,9 +911,7 @@ Parameter Plugin::GetMetric(const std::string& name, const std::map<std::string,
}

if (!batched_inputs.size()) {
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[MAX_BATCH_SIZE] MAX_BATCH_SIZE supports only networks with inputs/outputs featuring batched dim." << std::endl;
}
GPU_DEBUG_LOG << "[MAX_BATCH_SIZE] MAX_BATCH_SIZE supports only networks with inputs/outputs featuring batched dim." << std::endl;
return decltype(ov::max_batch_size)::value_type {static_cast<uint32_t>(max_batch_size)};
}

@ -944,9 +921,7 @@ Parameter Plugin::GetMetric(const std::string& name, const std::map<std::string,
shapes[input.first][input.second] = base_batch_size;
cloned_network.reshape(shapes);
} catch (...) {
GPU_DEBUG_IF(debug_config->verbose >= 1) {
GPU_DEBUG_COUT << "[MAX_BATCH_SIZE] Error at reshape to " << base_batch_size << std::endl;
}
GPU_DEBUG_INFO << "[MAX_BATCH_SIZE] Error at reshape to " << base_batch_size << std::endl;
return decltype(ov::max_batch_size)::value_type {static_cast<uint32_t>(max_batch_size)};
}

@ -962,15 +937,11 @@ Parameter Plugin::GetMetric(const std::string& name, const std::map<std::string,
static_cast<int64_t>(static_cast<int64_t>(available_device_mem) - device_memory_usage.first));
int64_t mem_per_batch = std::max(static_cast<int64_t>(1L), (device_memory_usage.second / static_cast<int64_t>(base_batch_size)));
max_batch_size = mem_for_general / (mem_per_batch * static_cast<int64_t>(n_streams));
GPU_DEBUG_IF(debug_config->verbose >= 1) {
GPU_DEBUG_COUT << "[GPU_MAX_BATCH_SIZE] Base batch size: " << base_batch_size << std::endl;
GPU_DEBUG_COUT << "[GPU_MAX_BATCH_SIZE] Const mem usage: " << device_memory_usage.first << std::endl;
GPU_DEBUG_COUT << "[GPU_MAX_BATCH_SIZE] General mem usage: " << device_memory_usage.second << std::endl;
}
GPU_DEBUG_INFO << "[GPU_MAX_BATCH_SIZE] Base batch size: " << base_batch_size << std::endl;
GPU_DEBUG_INFO << "[GPU_MAX_BATCH_SIZE] Const mem usage: " << device_memory_usage.first << std::endl;
GPU_DEBUG_INFO << "[GPU_MAX_BATCH_SIZE] General mem usage: " << device_memory_usage.second << std::endl;
} catch (std::exception& e) {
GPU_DEBUG_IF(debug_config->verbose >= 1) {
GPU_DEBUG_COUT << "[GPU_MAX_BATCH_SIZE] Failed in reshape or build program " << e.what() << std::endl;
}
GPU_DEBUG_INFO << "[GPU_MAX_BATCH_SIZE] Failed in reshape or build program " << e.what() << std::endl;
}
return decltype(ov::max_batch_size)::value_type {static_cast<uint32_t>(max_batch_size)};
} else if (isModelCachingEnabled && name == METRIC_KEY(IMPORT_EXPORT_SUPPORT)) {
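The plugin-metric hunks apply the same simplification to the level-1 messages. A minimal sketch, reusing the batch message from the hunks above:

// Before
GPU_DEBUG_IF(debug_config->verbose >= 1) {
    GPU_DEBUG_COUT << "ACTUAL OPTIMAL BATCH: " << batch << std::endl;
}
// After: GPU_DEBUG_INFO performs the verbosity check internally
GPU_DEBUG_INFO << "ACTUAL OPTIMAL BATCH: " << batch << std::endl;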
@ -375,11 +375,8 @@ bool Program::IsOpSupported(const InferenceEngine::CNNNetwork& network, const st

void Program::CreateSingleLayerPrimitive(cldnn::topology& topology, const std::shared_ptr<ngraph::Node>& op) {
OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "Program::CreateSingleLayerPrimitive");
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "Process " << "op::v" << op->get_type_info().version << "::" << op->get_type_name() << " operation "
<< "(friendly_name=" << op->get_friendly_name() << ")" << std::endl;
}
GPU_DEBUG_LOG << "Process " << "op::v" << op->get_type_info().version << "::" << op->get_type_name() << " operation "
<< "(friendly_name=" << op->get_friendly_name() << ")" << std::endl;

bool is_created = false;
const ngraph::NodeTypeInfo* op_type_info = &op->get_type_info();

@ -17,6 +17,8 @@ const char *debug_configuration::prefix = "GPU_Debug: ";

#ifdef GPU_DEBUG_CONFIG

#define GPU_DEBUG_COUT std::cout << cldnn::debug_configuration::prefix

template<typename T>
void print_option(std::string option_name, T option_value) {
GPU_DEBUG_COUT << "Config " << option_name << " = " << option_value << std::endl;
@ -265,11 +265,8 @@ std::shared_ptr<cldnn::engine> engine::create(engine_types engine_type,
default:
throw std::runtime_error("Invalid engine type");
}
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 1) {
const auto& info = device->get_info();
GPU_DEBUG_COUT << "Selected Device: " << info.dev_name << std::endl;
}
const auto& info = device->get_info();
GPU_DEBUG_INFO << "Selected Device: " << info.dev_name << std::endl;
return ret;
}

@ -299,12 +299,9 @@ void kernels_cache::build_batch(const engine& build_engine, const batch_program&
dump_file << "*/\n";
}
if (!err_log.empty()) {
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose) {
std::cout << "-------- OpenCL build error" << std::endl;
std::cout << err_log << std::endl;
std::cout << "-------- End of OpenCL build error" << std::endl;
}
GPU_DEBUG_INFO << "-------- OpenCL build error" << std::endl;
GPU_DEBUG_INFO << err_log << std::endl;
GPU_DEBUG_INFO << "-------- End of OpenCL build error" << std::endl;
std::stringstream err_ss(err_log);
std::string line;
int cnt = 0;
@ -21,12 +21,9 @@ memory::memory(engine* engine, const layout& layout, allocation_type type, bool
: _engine(engine), _layout(layout), _bytes_count(_layout.bytes_count()), _type(type), _reused(reused) {
if (!_reused && _engine) {
_engine->add_memory_used(_bytes_count, type);
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 1) {
GPU_DEBUG_COUT << "Allocate " << _bytes_count << " bytes of " << type << " allocation type"
<< " (current=" << _engine->get_used_device_memory(type) << ";"
<< " max=" << _engine->get_max_used_device_memory(type) << ")" << std::endl;
}
GPU_DEBUG_LOG << "Allocate " << _bytes_count << " bytes of " << type << " allocation type"
<< " (current=" << _engine->get_used_device_memory(type) << ";"
<< " max=" << _engine->get_max_used_device_memory(type) << ")" << std::endl;
}
}

@ -35,12 +32,9 @@ memory::~memory() {
try {
_engine->subtract_memory_used(_bytes_count, _type);
} catch (...) {}
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 1) {
GPU_DEBUG_COUT << "Free " << _bytes_count << " bytes of " << _type << " allocation type"
<< " (current=" << _engine->get_used_device_memory(_type) << ";"
<< " max=" << _engine->get_max_used_device_memory(_type) << ")" << std::endl;
}
GPU_DEBUG_LOG << "Free " << _bytes_count << " bytes of " << _type << " allocation type"
<< " (current=" << _engine->get_used_device_memory(_type) << ";"
<< " max=" << _engine->get_max_used_device_memory(_type) << ")" << std::endl;
}
}

@ -137,10 +137,7 @@ memory::ptr memory_pool::get_from_non_padded_pool(const layout& layout,
++it;
}
}
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[" << id << ": output]" << std::endl;
}
GPU_DEBUG_LOG << "[" << id << ": output]" << std::endl;
// didn't find anything for you? create new resource
auto mem = alloc_memory(layout, type);
{
@ -179,10 +176,7 @@ memory::ptr memory_pool::get_from_padded_pool(const layout& layout,
memory_record({{id, network_id}}, mem, network_id, type));
return mem;
}
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "[" << id << ": output]" << std::endl;
}
GPU_DEBUG_LOG << "[" << id << ": output]" << std::endl;
auto mem = alloc_memory(layout, type);
std::list<memory_record> list = {memory_record({{id, network_id}}, mem, network_id, type)};
_padded_pool.emplace(layout, std::move(list));
@ -76,10 +76,7 @@ std::vector<cl_queue_properties> command_queues_builder::get_properties(const cl
using cmp_t = std::common_type<decltype(queue_properties), typename std::underlying_type<cl::QueueProperties>::type>::type;
if (!(static_cast<cmp_t>(queue_properties) & static_cast<cmp_t>(cl::QueueProperties::OutOfOrder))) {
out_of_order = false;
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 1) {
GPU_DEBUG_COUT << "Requested out-of-order queue is not supported by current device. Use in-order instead";
}
GPU_DEBUG_INFO << "Requested out-of-order queue is not supported by current device. Use in-order instead\n";
}
}

@ -180,10 +180,10 @@ bool ocl_events::get_profiling_info_impl(std::list<instrumentation::profiling_in
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->print_multi_kernel_perf) {
if (period.stage == instrumentation::profiling_stage::executing) {
GPU_DEBUG_COUT << "Multi-kernel time: ";
GPU_DEBUG_TRACE << "Multi-kernel time: ";
for (auto& duration : all_durations[period.stage])
std::cout << " " << (duration.second - duration.first) / 1000;
std::cout << " Total " << sum / 1000 << std::endl;
GPU_DEBUG_TRACE << " " << (duration.second - duration.first) / 1000;
GPU_DEBUG_TRACE << " Total " << sum / 1000 << std::endl;
}
}

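GPU_DEBUG_IF is still used where an entire block is guarded by a dedicated flag rather than by a verbosity level, with the individual messages now routed through the leveled macros. A short sketch of that shape (the per-kernel loop is elided):

GPU_DEBUG_IF(debug_config->print_multi_kernel_perf) {
    GPU_DEBUG_TRACE << "Multi-kernel time: ";
    // ... per-kernel durations streamed via GPU_DEBUG_TRACE ...
    GPU_DEBUG_TRACE << " Total " << sum / 1000 << std::endl;
}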
@ -338,10 +338,7 @@ void* gpu_usm::lock(const stream& stream, mem_lock_type type) {
if (type != mem_lock_type::read) {
throw std::runtime_error("Unable to lock allocation_type::usm_device with write lock_type.");
}
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose >= 2) {
GPU_DEBUG_COUT << "Copy usm_device buffer to host buffer." << std::endl;
}
GPU_DEBUG_LOG << "Copy usm_device buffer to host buffer." << std::endl;
_host_buffer.allocateHost(_bytes_count);
cl_stream.get_usm_helper().enqueue_memcpy(cl_stream.get_cl_queue(), _host_buffer.get(), _buffer.get(), _bytes_count, CL_TRUE);
_mapped_ptr = _host_buffer.get();

@ -6,6 +6,7 @@
#include "ocl_event.hpp"
#include "ocl_user_event.hpp"
#include "ocl_command_queues_builder.hpp"
#include "intel_gpu/runtime/debug_configuration.hpp"
#include "ocl_kernel.hpp"
#include "ocl_common.hpp"

@ -49,6 +50,27 @@ inline cl::NDRange toNDRange(const std::vector<size_t>& v) {
}
}

cl_int set_kernel_arg(ocl_kernel_type& kernel, uint32_t idx, cldnn::memory::cptr mem) {
if (!mem)
return CL_INVALID_ARG_VALUE;

if (mem->get_layout().format.is_image_2d()) {
auto buf = std::dynamic_pointer_cast<const ocl::gpu_image2d>(mem)->get_buffer();
GPU_DEBUG_TRACE_DETAIL << "kernel: " << kernel.get() << " set arg (image) " << idx << " mem: " << buf.get() << " size: " << mem->size() << std::endl;
return kernel.setArg(idx, buf);
} else if (memory_capabilities::is_usm_type(mem->get_allocation_type())) {
auto buf = std::dynamic_pointer_cast<const ocl::gpu_usm>(mem)->get_buffer();
GPU_DEBUG_TRACE_DETAIL << "kernel: " << kernel.get() << " set arg (usm) " << idx << " mem: " << buf.get() << " size: " << mem->size() << std::endl;
return kernel.setArgUsm(idx, buf);
} else {
auto buf = std::dynamic_pointer_cast<const ocl::gpu_buffer>(mem)->get_buffer();
GPU_DEBUG_TRACE_DETAIL << "kernel: " << kernel.get() << " set arg (buffer) " << idx << " mem: " << buf.get() << " size: " << mem->size() << std::endl;
return kernel.setArg(idx, buf);
}

return CL_INVALID_ARG_VALUE;
}

void set_arguments_impl(ocl_kernel_type& kernel,
const arguments_desc& args,
const kernel_arguments_data& data) {
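The new set_kernel_arg helper centralizes the image/USM/buffer dispatch and the per-argument TRACE_DETAIL logging. A minimal caller sketch (my_kernel, arg_idx and mem are illustrative names, not from this change):

cl_int status = set_kernel_arg(my_kernel, arg_idx, mem);  // returns CL_INVALID_ARG_VALUE for a null memory pointer
if (status != CL_SUCCESS) {
    // report or handle the failed argument binding
}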
@ -59,121 +81,44 @@ void set_arguments_impl(ocl_kernel_type& kernel,
switch (args[i].t) {
case args_t::INPUT:
if (args[i].index < data.inputs.size() && data.inputs[args[i].index]) {
const auto& input_mem = data.inputs[args[i].index];
if (input_mem) {
if (input_mem->get_layout().format.is_image_2d())
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_image2d>(input_mem)->get_buffer());
else if (memory_capabilities::is_usm_type(input_mem->get_allocation_type()))
status = kernel.setArgUsm(i, std::dynamic_pointer_cast<const ocl::gpu_usm>(input_mem)->get_buffer());
else
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_buffer>(input_mem)->get_buffer());
}
status = set_kernel_arg(kernel, i, data.inputs[args[i].index]);
}
break;
case args_t::INPUT_OF_FUSED_PRIMITIVE:
if (args[i].index < data.fused_op_inputs.size() && data.fused_op_inputs[args[i].index]) {
const auto& input_mem = data.fused_op_inputs[args[i].index];
if (input_mem) {
if (memory_capabilities::is_usm_type(input_mem->get_allocation_type()))
status = kernel.setArgUsm(i, std::dynamic_pointer_cast<const ocl::gpu_usm>(input_mem)->get_buffer());
else
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_buffer>(input_mem)->get_buffer());
}
status = set_kernel_arg(kernel, i, data.fused_op_inputs[args[i].index]);
}
break;
case args_t::INTERNAL_BUFFER:
if (args[i].index < data.intermediates.size() && data.intermediates[args[i].index]) {
const auto& input_mem = data.intermediates[args[i].index];
if (input_mem) {
if (memory_capabilities::is_usm_type(input_mem->get_allocation_type()))
status = kernel.setArgUsm(i, std::dynamic_pointer_cast<const ocl::gpu_usm>(input_mem)->get_buffer());
else
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_buffer>(input_mem)->get_buffer());
}
status = set_kernel_arg(kernel, i, data.intermediates[args[i].index]);
}
break;
case args_t::OUTPUT:
if (args[i].index < data.outputs.size() && data.outputs[args[i].index]) {
const auto& output_mem = data.outputs[args[i].index];
if (output_mem) {
if (output_mem->get_layout().format.is_image_2d())
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_image2d>(output_mem)->get_buffer());
else if (memory_capabilities::is_usm_type(output_mem->get_allocation_type()))
status = kernel.setArgUsm(i, std::dynamic_pointer_cast<const ocl::gpu_usm>(output_mem)->get_buffer());
else
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_buffer>(output_mem)->get_buffer());
}
status = set_kernel_arg(kernel, i, data.outputs[args[i].index]);
}
break;
case args_t::WEIGHTS:
if (data.weights) {
if (data.weights->get_layout().format.is_image_2d())
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_image2d>(data.weights)->get_buffer());
else if (memory_capabilities::is_usm_type(data.weights->get_allocation_type()))
status = kernel.setArgUsm(i, std::dynamic_pointer_cast<const ocl::gpu_usm>(data.weights)->get_buffer());
else
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_buffer>(data.weights)->get_buffer());
}
status = set_kernel_arg(kernel, i, data.weights);
break;
case args_t::BIAS:
if (data.bias) {
if (memory_capabilities::is_usm_type(data.bias->get_allocation_type()))
status = kernel.setArgUsm(i, std::dynamic_pointer_cast<const ocl::gpu_usm>(data.bias)->get_buffer());
else
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_buffer>(data.bias)->get_buffer());
}
status = set_kernel_arg(kernel, i, data.bias);
break;
case args_t::WEIGHTS_ZERO_POINTS:
if (data.weights_zero_points) {
if (memory_capabilities::is_usm_type(data.weights_zero_points->get_allocation_type()))
status = kernel.setArgUsm(
i,
std::dynamic_pointer_cast<const ocl::gpu_usm>(data.weights_zero_points)->get_buffer());
else
status = kernel.setArg(
i,
std::dynamic_pointer_cast<const ocl::gpu_buffer>(data.weights_zero_points)->get_buffer());
}
status = set_kernel_arg(kernel, i, data.weights_zero_points);
break;
case args_t::ACTIVATIONS_ZERO_POINTS:
if (data.activations_zero_points) {
if (memory_capabilities::is_usm_type(data.activations_zero_points->get_allocation_type()))
status = kernel.setArgUsm(
i,
std::dynamic_pointer_cast<const ocl::gpu_usm>(data.activations_zero_points)->get_buffer());
else
status = kernel.setArg(
i,
std::dynamic_pointer_cast<const ocl::gpu_buffer>(data.activations_zero_points)->get_buffer());
}
status = set_kernel_arg(kernel, i, data.activations_zero_points);
break;
case args_t::COMPENSATION:
if (data.compensation) {
if (memory_capabilities::is_usm_type(data.compensation->get_allocation_type()))
status = kernel.setArgUsm(
i,
std::dynamic_pointer_cast<const ocl::gpu_usm>(data.compensation)->get_buffer());
else
status = kernel.setArg(
i,
std::dynamic_pointer_cast<const ocl::gpu_buffer>(data.compensation)->get_buffer());
}
status = set_kernel_arg(kernel, i, data.compensation);
break;
case args_t::SCALE_TABLE:
if (data.scale_table) {
if (memory_capabilities::is_usm_type(data.scale_table->get_allocation_type()))
status = kernel.setArgUsm(i, std::dynamic_pointer_cast<const ocl::gpu_usm>(data.scale_table)->get_buffer());
else
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_buffer>(data.scale_table)->get_buffer());
}
status = set_kernel_arg(kernel, i, data.scale_table);
break;
case args_t::SLOPE:
if (data.slope) {
if (memory_capabilities::is_usm_type(data.slope->get_allocation_type()))
status = kernel.setArgUsm(i, std::dynamic_pointer_cast<const ocl::gpu_usm>(data.slope)->get_buffer());
else
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_buffer>(data.slope)->get_buffer());
}
status = set_kernel_arg(kernel, i, data.slope);
break;
case args_t::SPLIT:
status = kernel.setArg(i, data.split);
@ -217,48 +162,17 @@ void set_arguments_impl(ocl_kernel_type& kernel,
}
}
break;
case args_t::RECURRENT: // RNN/LSTM/GRU layers
if (data.recurrent) {
if (data.recurrent->get_layout().format.is_image_2d())
status = kernel.setArg(i, dynamic_cast<const ocl::gpu_image2d&>(*data.recurrent).get_buffer());
else if (memory_capabilities::is_usm_type(data.recurrent->get_allocation_type()))
status = kernel.setArgUsm(i, dynamic_cast<const ocl::gpu_usm&>(*data.recurrent).get_buffer());
else
status = kernel.setArg(i, dynamic_cast<const ocl::gpu_buffer&>(*data.recurrent).get_buffer());
}
case args_t::RECURRENT:
status = set_kernel_arg(kernel, i, data.recurrent);
break;
case args_t::HIDDEN: // RNN/LSTM/GRU layers
if (data.hidden) {
if (data.hidden->get_layout().format.is_image_2d())
status = kernel.setArg(i, dynamic_cast<const ocl::gpu_image2d&>(*data.hidden).get_buffer());
else if (memory_capabilities::is_usm_type(data.hidden->get_allocation_type()))
status = kernel.setArgUsm(i, dynamic_cast<const ocl::gpu_usm&>(*data.hidden).get_buffer());
else
status = kernel.setArg(i, dynamic_cast<const ocl::gpu_buffer&>(*data.hidden).get_buffer());
}
case args_t::HIDDEN:
status = set_kernel_arg(kernel, i, data.hidden);
break;
case args_t::CELL: // LSTMlayers
if (data.cell) {
if (data.cell->get_layout().format.is_image_2d())
status = kernel.setArg(i, dynamic_cast<const ocl::gpu_image2d&>(*data.cell).get_buffer());
else if (memory_capabilities::is_usm_type(data.cell->get_allocation_type()))
status = kernel.setArgUsm(i, dynamic_cast<const ocl::gpu_usm&>(*data.cell).get_buffer());
else
status = kernel.setArg(i, dynamic_cast<const ocl::gpu_buffer&>(*data.cell).get_buffer());
}
case args_t::CELL:
status = set_kernel_arg(kernel, i, data.cell);
break;
case args_t::SHAPE_INFO:
if (args[i].index == 0 && data.shape_info) {
const auto& shape_info_mem = data.shape_info;
if (shape_info_mem) {
if (shape_info_mem->get_layout().format.is_image_2d())
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_image2d>(shape_info_mem)->get_buffer());
else if (memory_capabilities::is_usm_type(shape_info_mem->get_allocation_type()))
status = kernel.setArgUsm(i, std::dynamic_pointer_cast<const ocl::gpu_usm>(shape_info_mem)->get_buffer());
else
status = kernel.setArg(i, std::dynamic_pointer_cast<const ocl::gpu_buffer>(shape_info_mem)->get_buffer());
}
}
status = set_kernel_arg(kernel, i, data.shape_info);
break;
default:
break;
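With the per-type dispatch folded into set_kernel_arg, adding another memory-backed argument kind reduces to a single case. A sketch with hypothetical names (args_t::MY_ARG and data.my_arg are not part of this change):

case args_t::MY_ARG:
    status = set_kernel_arg(kernel, i, data.my_arg);
    break;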
@ -359,6 +273,7 @@ void ocl_stream::set_arguments(kernel& kernel, const kernel_arguments_desc& args
auto& kern = ocl_kernel.get_handle();

try {
GPU_DEBUG_TRACE_DETAIL << "Set arguments for primitive: " << args_desc.layerID << " (" << kern.get() << ")\n";
set_arguments_impl(kern, args_desc.arguments, args);
} catch (cl::Error const& err) {
throw ocl_error(err);