From 10ac5b280bef86a41f7226dae88c16464f5a2cd7 Mon Sep 17 00:00:00 2001 From: Mingyu Kim Date: Fri, 18 Feb 2022 10:27:54 +0900 Subject: [PATCH] [GPU] Mixed precision fix for mask rcnn (#10467) * Select proper layout for fp16-int8 mixed precision network * Set proper layout in layout propagation for mixed precision --- .../intel_gpu/src/graph/convolution.cpp | 21 ++++++- .../remove_redundant_reorders.cpp | 20 +++++++ .../graph/graph_optimizer/reorder_inputs.cpp | 55 +++++++++++-------- 3 files changed, 70 insertions(+), 26 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/convolution.cpp b/src/plugins/intel_gpu/src/graph/convolution.cpp index 9105e987eb9..208b1c45725 100644 --- a/src/plugins/intel_gpu/src/graph/convolution.cpp +++ b/src/plugins/intel_gpu/src/graph/convolution.cpp @@ -165,6 +165,22 @@ layout convolution_inst::calc_output_layout(convolution_node const& node) { input_layout.data_padding}; } + // Adjust output format for mixed precision case in onednn + auto out_fmt = input_layout.format; + if (node.get_preferred_impl_type() == impl_types::onednn) { + if (data_type_traits::is_i8_u8(output_type)) { + if (input_layout.format == format::b_fs_yx_fsv16) + out_fmt = format::b_fs_yx_fsv32; + else if (input_layout.format == format::bs_fs_yx_bsv32_fsv16) + out_fmt = format::bs_fs_yx_bsv32_fsv32; + } else if (data_type_traits::is_floating_point(output_type)) { + if (input_layout.format == format::b_fs_yx_fsv32) + out_fmt = format::b_fs_yx_fsv16; + else if (input_layout.format == format::bs_fs_yx_bsv32_fsv32) + out_fmt = format::bs_fs_yx_bsv32_fsv16; + } + } + // get output feature map from weights. It should be the same as number of biases. Will be verifed in // convolution::create() auto group = desc->groups; @@ -208,7 +224,7 @@ layout convolution_inst::calc_output_layout(convolution_node const& node) { return {output_type, format::b_fs_yx_32fp, output_size}; } - return {output_type, input_layout.format, output_size}; + return {output_type, out_fmt, output_size}; } auto output_range = calc_sliding_window_output_range(input_layout.size, @@ -231,8 +247,7 @@ layout convolution_inst::calc_output_layout(convolution_node const& node) { if (output_type == data_types::bin) { return {output_type, format::b_fs_yx_32fp, output_size}; } - - return {output_type, input_layout.format, output_size}; + return {output_type, out_fmt, output_size}; } std::string convolution_inst::to_string(convolution_node const& node) { diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp index 00729427a12..eb6b8ca1ea6 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp @@ -16,15 +16,21 @@ #include "permute_inst.h" #include "depth_to_space_inst.h" #include "region_yolo_inst.h" +#include "intel_gpu/runtime/debug_configuration.hpp" using namespace cldnn; +#define LOG_NODE_REMOVAL(id) GPU_DEBUG_IF(debug_config->verbose >= 2) { \ + GPU_DEBUG_COUT << "[remove_redundant_reorders:" << __LINE__ << "] " << "Remove node: " << (id) << std::endl; } + + remove_redundant_reorders::remove_redundant_reorders(layout_optimizer& lo_ref, bool enable_reorder_fusing, bool update_implementations, bool remove_output_reorders) : base_pass("remove_redundant_reorders"), lo(lo_ref), enable_reorder_fusing(enable_reorder_fusing), update_implementations(update_implementations), remove_output_reorders(remove_output_reorders) {} void remove_redundant_reorders::run(program& p) { + GPU_DEBUG_GET_INSTANCE(debug_config); auto update_implementation = [&](program_node& node) { if (!update_implementations) return; @@ -113,6 +119,7 @@ void remove_redundant_reorders::run(program& p) { } node.can_be_optimized(true); + LOG_NODE_REMOVAL(node.id()); p.extract_and_remove(node); for (auto rl : recalc_list) { @@ -168,6 +175,7 @@ void remove_redundant_reorders::run(program& p) { dep_prim->output_format = output_layout.format; dep_prim->output_data_type = output_layout.data_type; + LOG_NODE_REMOVAL(r_node.id()); r_node.can_be_optimized(true); p.add_optimized_primitive_info(r_node.id()); p.extract_and_remove(r_node); @@ -246,6 +254,8 @@ void remove_redundant_reorders::run(program& p) { } else { p.add_optimized_primitive_info(r_node.get_primitive()->id); } + + LOG_NODE_REMOVAL(r_node.id()); p.extract_and_remove( r_node); // try to remove if possible (with respect to r_node not being marked as output) } @@ -292,6 +302,8 @@ void remove_redundant_reorders::run(program& p) { // pointing to, we should increment it again if (remove_reorder_node == *itr) itr++; + + LOG_NODE_REMOVAL(remove_reorder_node->id()); p.replace_all_usages(*remove_reorder_node, *node); p.add_optimized_primitive_info(remove_reorder_node->id()); p.remove_all_connections(*remove_reorder_node); @@ -336,6 +348,8 @@ void remove_redundant_reorders::run(program& p) { if (input.type()->does_possible_implementation_exist(input)) { node.can_be_optimized(true); p.add_optimized_primitive_info(node.id()); + + LOG_NODE_REMOVAL(node.id()); p.extract_and_remove(node); } else { input.set_output_layout(old_output_layout_of_input, false); @@ -363,6 +377,8 @@ void remove_redundant_reorders::run(program& p) { continue; dep.merge_output_padding(node.get_output_layout().data_padding); + + LOG_NODE_REMOVAL(node.id()); p.replace_all_usages(node, dep); p.add_optimized_primitive_info(node.id()); p.remove_all_connections(node); @@ -394,6 +410,7 @@ void remove_redundant_reorders::run(program& p) { return false; dep.merge_output_padding(node->get_output_layout().data_padding); + LOG_NODE_REMOVAL(node->id()); p.replace_all_usages(*node, dep); p.get_processing_order().erase(node); p.add_optimized_primitive_info(node->id()); @@ -455,6 +472,7 @@ void remove_redundant_reorders::run(program& p) { node->set_input_layout(local_desc.input_layout); // remove reorder node + LOG_NODE_REMOVAL(node->id()); node->can_be_optimized(true); p.add_optimized_primitive_info(node->id()); p.extract_and_remove(*node); @@ -522,12 +540,14 @@ void remove_redundant_reorders::run(program& p) { reshape_node.get_fused_activations_funcs().empty() && reshape_node.get_fused_primitives().empty(); if (remove_dep) { + LOG_NODE_REMOVAL(reshape_input_node.id()); reshape_input_node.can_be_optimized(true); p.add_optimized_primitive_info(reshape_input_node.id()); p.extract_and_remove(reshape_input_node); } if (remove_current) { + LOG_NODE_REMOVAL(reshape_node.id()); reshape_node.can_be_optimized(true); p.add_optimized_primitive_info(reshape_node.id()); p.extract_and_remove(reshape_node); diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp index 779eab64e39..13421c69130 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp @@ -20,6 +20,7 @@ #include #include #include +#include using namespace cldnn; @@ -562,7 +563,7 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf) } }; - const auto reorder_convolution = [&p, &lo, &rf](typed_program_node& conv_node) { + const auto reorder_convolution = [&p, &lo, &rf, &debug_config](typed_program_node& conv_node) { { // reorder weights convolution auto& weights = conv_node.weights(); @@ -602,35 +603,43 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf) conv_node.get_dependencies().front()->set_output_layout(new_layout, false); } - std::vector wrong_format = {format::b_fs_yx_fsv16, format::bs_fs_yx_bsv32_fsv16}; - std::vector correct_format = {format::b_fs_yx_fsv32, format::bs_fs_yx_bsv32_fsv32}; - for (int i = 0; i < wrong_format.size(); i++) { - // reorder for onednn mixed-precision conv - // If the layouts are like below, change input layout to fsv32. - // From: - // (bsv32_fsv16.u8) --> conv --> (bsv32_fsv16.fp16) - // To: - // (bsv32_fsv16.u8) --> reorder --> (bsv32_fsv32.u8) --> conv --> (bsv32_fsv16.fp16) - // - // Do not apply such change for b=1 first conv - + // reorder for onednn mixed-precision conv + // If the layouts are like below, change input layout to fsv32. + // From: + // (bsv32_fsv16.u8) --> conv --> (bsv32_fsv16.fp16) + // To: + // (bsv32_fsv16.u8) --> reorder --> (bsv32_fsv32.u8) --> conv --> (bsv32_fsv16.fp16) + // + // Do not apply such change for b=1 first conv + enum class __data_type {i8_u8, floating_point}; + // Errata for mixed precision in onednn + // data_type, wrong_format, correct_format + std::vector> errata = { + {__data_type::i8_u8, format::b_fs_yx_fsv16, format::b_fs_yx_fsv32}, + {__data_type::i8_u8, format::bs_fs_yx_bsv32_fsv16, format::bs_fs_yx_bsv32_fsv32}, + {__data_type::floating_point, format::b_fs_yx_fsv32, format::b_fs_yx_fsv16}, + {__data_type::floating_point, format::bs_fs_yx_bsv32_fsv32, format::bs_fs_yx_bsv32_fsv16}}; + for (auto &e : errata) { auto prev_node = conv_node.get_dependencies().front(); - auto old_layout = prev_node->get_output_layout(); + auto prev_layout = prev_node->get_output_layout(); auto conv_layout = conv_node.get_output_layout(); + auto is_target_dt_in_errata = (std::get<0>(e) == __data_type::i8_u8 && data_type_traits::is_i8_u8(prev_layout.data_type)) || + (std::get<0>(e) == __data_type::floating_point && data_type_traits::is_floating_point(prev_layout.data_type)); + auto wrong_format = std::get<1>(e); + auto correct_format = std::get<2>(e); if (lo.get_optimization_attributes().use_onednn_impls - && conv_layout.format == wrong_format[i] - && data_type_traits::is_i8_u8(old_layout.data_type) - && (old_layout.format == wrong_format[i]) - && !(old_layout.size.batch[0] == 1 && old_layout.size.feature[0] <= 4)) { - auto new_layout = old_layout; - new_layout.format = correct_format[i]; + && is_target_dt_in_errata + && conv_layout.format == wrong_format + && prev_layout.format == wrong_format + && !(prev_layout.size.batch[0] == 1 && prev_layout.size.feature[0] <= 4)) { + auto new_layout = prev_layout; + new_layout.format = correct_format; auto new_input = rf.get_reorder(prev_node->id(), - old_layout, + prev_layout, new_layout); - if (new_input.first) { + if (new_input.first) p.add_intermediate(new_input.first, conv_node, 0, !new_input.second); - } // Prevent layout propagation as we are using mixed precision for conv conv_node.get_dependencies().front()->set_output_layout(new_layout, false);