diff --git a/src/plugins/intel_gpu/src/graph/convolution.cpp b/src/plugins/intel_gpu/src/graph/convolution.cpp index 208b1c45725..9105e987eb9 100644 --- a/src/plugins/intel_gpu/src/graph/convolution.cpp +++ b/src/plugins/intel_gpu/src/graph/convolution.cpp @@ -165,22 +165,6 @@ layout convolution_inst::calc_output_layout(convolution_node const& node) { input_layout.data_padding}; } - // Adjust output format for mixed precision case in onednn - auto out_fmt = input_layout.format; - if (node.get_preferred_impl_type() == impl_types::onednn) { - if (data_type_traits::is_i8_u8(output_type)) { - if (input_layout.format == format::b_fs_yx_fsv16) - out_fmt = format::b_fs_yx_fsv32; - else if (input_layout.format == format::bs_fs_yx_bsv32_fsv16) - out_fmt = format::bs_fs_yx_bsv32_fsv32; - } else if (data_type_traits::is_floating_point(output_type)) { - if (input_layout.format == format::b_fs_yx_fsv32) - out_fmt = format::b_fs_yx_fsv16; - else if (input_layout.format == format::bs_fs_yx_bsv32_fsv32) - out_fmt = format::bs_fs_yx_bsv32_fsv16; - } - } - // get output feature map from weights. It should be the same as number of biases. 
Will be verifed in // convolution::create() auto group = desc->groups; @@ -224,7 +208,7 @@ layout convolution_inst::calc_output_layout(convolution_node const& node) { return {output_type, format::b_fs_yx_32fp, output_size}; } - return {output_type, out_fmt, output_size}; + return {output_type, input_layout.format, output_size}; } auto output_range = calc_sliding_window_output_range(input_layout.size, @@ -247,7 +231,8 @@ layout convolution_inst::calc_output_layout(convolution_node const& node) { if (output_type == data_types::bin) { return {output_type, format::b_fs_yx_32fp, output_size}; } - return {output_type, out_fmt, output_size}; + + return {output_type, input_layout.format, output_size}; } std::string convolution_inst::to_string(convolution_node const& node) { diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp index eb6b8ca1ea6..00729427a12 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp @@ -16,21 +16,15 @@ #include "permute_inst.h" #include "depth_to_space_inst.h" #include "region_yolo_inst.h" -#include "intel_gpu/runtime/debug_configuration.hpp" using namespace cldnn; -#define LOG_NODE_REMOVAL(id) GPU_DEBUG_IF(debug_config->verbose >= 2) { \ - GPU_DEBUG_COUT << "[remove_redundant_reorders:" << __LINE__ << "] " << "Remove node: " << (id) << std::endl; } - - remove_redundant_reorders::remove_redundant_reorders(layout_optimizer& lo_ref, bool enable_reorder_fusing, bool update_implementations, bool remove_output_reorders) : base_pass("remove_redundant_reorders"), lo(lo_ref), enable_reorder_fusing(enable_reorder_fusing), update_implementations(update_implementations), remove_output_reorders(remove_output_reorders) {} void remove_redundant_reorders::run(program& p) { - GPU_DEBUG_GET_INSTANCE(debug_config); auto 
update_implementation = [&](program_node& node) { if (!update_implementations) return; @@ -119,7 +113,6 @@ void remove_redundant_reorders::run(program& p) { } node.can_be_optimized(true); - LOG_NODE_REMOVAL(node.id()); p.extract_and_remove(node); for (auto rl : recalc_list) { @@ -175,7 +168,6 @@ void remove_redundant_reorders::run(program& p) { dep_prim->output_format = output_layout.format; dep_prim->output_data_type = output_layout.data_type; - LOG_NODE_REMOVAL(r_node.id()); r_node.can_be_optimized(true); p.add_optimized_primitive_info(r_node.id()); p.extract_and_remove(r_node); @@ -254,8 +246,6 @@ void remove_redundant_reorders::run(program& p) { } else { p.add_optimized_primitive_info(r_node.get_primitive()->id); } - - LOG_NODE_REMOVAL(r_node.id()); p.extract_and_remove( r_node); // try to remove if possible (with respect to r_node not being marked as output) } @@ -302,8 +292,6 @@ void remove_redundant_reorders::run(program& p) { // pointing to, we should increment it again if (remove_reorder_node == *itr) itr++; - - LOG_NODE_REMOVAL(remove_reorder_node->id()); p.replace_all_usages(*remove_reorder_node, *node); p.add_optimized_primitive_info(remove_reorder_node->id()); p.remove_all_connections(*remove_reorder_node); @@ -348,8 +336,6 @@ void remove_redundant_reorders::run(program& p) { if (input.type()->does_possible_implementation_exist(input)) { node.can_be_optimized(true); p.add_optimized_primitive_info(node.id()); - - LOG_NODE_REMOVAL(node.id()); p.extract_and_remove(node); } else { input.set_output_layout(old_output_layout_of_input, false); @@ -377,8 +363,6 @@ void remove_redundant_reorders::run(program& p) { continue; dep.merge_output_padding(node.get_output_layout().data_padding); - - LOG_NODE_REMOVAL(node.id()); p.replace_all_usages(node, dep); p.add_optimized_primitive_info(node.id()); p.remove_all_connections(node); @@ -410,7 +394,6 @@ void remove_redundant_reorders::run(program& p) { return false; 
dep.merge_output_padding(node->get_output_layout().data_padding); - LOG_NODE_REMOVAL(node->id()); p.replace_all_usages(*node, dep); p.get_processing_order().erase(node); p.add_optimized_primitive_info(node->id()); @@ -472,7 +455,6 @@ void remove_redundant_reorders::run(program& p) { node->set_input_layout(local_desc.input_layout); // remove reorder node - LOG_NODE_REMOVAL(node->id()); node->can_be_optimized(true); p.add_optimized_primitive_info(node->id()); p.extract_and_remove(*node); @@ -540,14 +522,12 @@ void remove_redundant_reorders::run(program& p) { reshape_node.get_fused_activations_funcs().empty() && reshape_node.get_fused_primitives().empty(); if (remove_dep) { - LOG_NODE_REMOVAL(reshape_input_node.id()); reshape_input_node.can_be_optimized(true); p.add_optimized_primitive_info(reshape_input_node.id()); p.extract_and_remove(reshape_input_node); } if (remove_current) { - LOG_NODE_REMOVAL(reshape_node.id()); reshape_node.can_be_optimized(true); p.add_optimized_primitive_info(reshape_node.id()); p.extract_and_remove(reshape_node); diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp index 13421c69130..779eab64e39 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp @@ -20,7 +20,6 @@ #include #include #include -#include using namespace cldnn; @@ -563,7 +562,7 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf) } }; - const auto reorder_convolution = [&p, &lo, &rf, &debug_config](typed_program_node& conv_node) { + const auto reorder_convolution = [&p, &lo, &rf](typed_program_node& conv_node) { { // reorder weights convolution auto& weights = conv_node.weights(); @@ -603,43 +602,35 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf) conv_node.get_dependencies().front()->set_output_layout(new_layout, false); 
} - // reorder for onednn mixed-precision conv - // If the layouts are like below, change input layout to fsv32. - // From: - // (bsv32_fsv16.u8) --> conv --> (bsv32_fsv16.fp16) - // To: - // (bsv32_fsv16.u8) --> reorder --> (bsv32_fsv32.u8) --> conv --> (bsv32_fsv16.fp16) - // - // Do not apply such change for b=1 first conv - enum class __data_type {i8_u8, floating_point}; - // Errata for mixed precision in onednn - // data_type, wrong_format, correct_format - std::vector> errata = { - {__data_type::i8_u8, format::b_fs_yx_fsv16, format::b_fs_yx_fsv32}, - {__data_type::i8_u8, format::bs_fs_yx_bsv32_fsv16, format::bs_fs_yx_bsv32_fsv32}, - {__data_type::floating_point, format::b_fs_yx_fsv32, format::b_fs_yx_fsv16}, - {__data_type::floating_point, format::bs_fs_yx_bsv32_fsv32, format::bs_fs_yx_bsv32_fsv16}}; - for (auto &e : errata) { + std::vector wrong_format = {format::b_fs_yx_fsv16, format::bs_fs_yx_bsv32_fsv16}; + std::vector correct_format = {format::b_fs_yx_fsv32, format::bs_fs_yx_bsv32_fsv32}; + for (size_t i = 0; i < wrong_format.size(); ++i) { + // reorder for onednn mixed-precision conv + // If the layouts are like below, change input layout to fsv32. 
+ // From: + // (bsv32_fsv16.u8) --> conv --> (bsv32_fsv16.fp16) + // To: + // (bsv32_fsv16.u8) --> reorder --> (bsv32_fsv32.u8) --> conv --> (bsv32_fsv16.fp16) + // + // Do not apply such change for b=1 first conv + auto prev_node = conv_node.get_dependencies().front(); - auto prev_layout = prev_node->get_output_layout(); + auto old_layout = prev_node->get_output_layout(); auto conv_layout = conv_node.get_output_layout(); - auto is_target_dt_in_errata = (std::get<0>(e) == __data_type::i8_u8 && data_type_traits::is_i8_u8(prev_layout.data_type)) || - (std::get<0>(e) == __data_type::floating_point && data_type_traits::is_floating_point(prev_layout.data_type)); - auto wrong_format = std::get<1>(e); - auto correct_format = std::get<2>(e); if (lo.get_optimization_attributes().use_onednn_impls - && is_target_dt_in_errata - && conv_layout.format == wrong_format - && prev_layout.format == wrong_format - && !(prev_layout.size.batch[0] == 1 && prev_layout.size.feature[0] <= 4)) { - auto new_layout = prev_layout; - new_layout.format = correct_format; + && conv_layout.format == wrong_format[i] + && data_type_traits::is_i8_u8(old_layout.data_type) + && (old_layout.format == wrong_format[i]) + && !(old_layout.size.batch[0] == 1 && old_layout.size.feature[0] <= 4)) { + auto new_layout = old_layout; + new_layout.format = correct_format[i]; auto new_input = rf.get_reorder(prev_node->id(), - prev_layout, + old_layout, new_layout); - if (new_input.first) + if (new_input.first) { p.add_intermediate(new_input.first, conv_node, 0, !new_input.second); + } // Prevent layout propagation as we are using mixed precision for conv conv_node.get_dependencies().front()->set_output_layout(new_layout, false);