From 10ac5b280bef86a41f7226dae88c16464f5a2cd7 Mon Sep 17 00:00:00 2001
From: Mingyu Kim <mingyu.kim@intel.com>
Date: Fri, 18 Feb 2022 10:27:54 +0900
Subject: [PATCH] [GPU] Mixed precision fix for mask rcnn (#10467)

* Select proper layout for fp16-int8 mixed precision network
* Set proper layout in layout propagation for mixed precision
---
 .../intel_gpu/src/graph/convolution.cpp       | 21 ++++++-
 .../remove_redundant_reorders.cpp             | 20 +++++++
 .../graph/graph_optimizer/reorder_inputs.cpp  | 55 +++++++++++--------
 3 files changed, 70 insertions(+), 26 deletions(-)

diff --git a/src/plugins/intel_gpu/src/graph/convolution.cpp b/src/plugins/intel_gpu/src/graph/convolution.cpp
index 9105e987eb9..208b1c45725 100644
--- a/src/plugins/intel_gpu/src/graph/convolution.cpp
+++ b/src/plugins/intel_gpu/src/graph/convolution.cpp
@@ -165,6 +165,22 @@ layout convolution_inst::calc_output_layout(convolution_node const& node) {
                       input_layout.data_padding};
     }
 
+    // Adjust output format for mixed precision case in onednn
+    auto out_fmt = input_layout.format;
+    if (node.get_preferred_impl_type() == impl_types::onednn) {
+        if (data_type_traits::is_i8_u8(output_type)) {
+            if (input_layout.format == format::b_fs_yx_fsv16)
+                out_fmt = format::b_fs_yx_fsv32;
+            else if (input_layout.format == format::bs_fs_yx_bsv32_fsv16)
+                out_fmt = format::bs_fs_yx_bsv32_fsv32;
+        } else if (data_type_traits::is_floating_point(output_type)) {
+            if (input_layout.format == format::b_fs_yx_fsv32)
+                out_fmt = format::b_fs_yx_fsv16;
+            else if (input_layout.format == format::bs_fs_yx_bsv32_fsv32)
+                out_fmt = format::bs_fs_yx_bsv32_fsv16;
+        }
+    }
+
     // get output feature map from weights. It should be the same as number of biases. Will be verifed in
     // convolution::create()
     auto group = desc->groups;
@@ -208,7 +224,7 @@ layout convolution_inst::calc_output_layout(convolution_node const& node) {
             return {output_type, format::b_fs_yx_32fp, output_size};
         }
 
-        return {output_type, input_layout.format, output_size};
+        return {output_type, out_fmt, output_size};
     }
 
     auto output_range = calc_sliding_window_output_range<swor_mode::all>(input_layout.size,
@@ -231,8 +247,7 @@ layout convolution_inst::calc_output_layout(convolution_node const& node) {
     if (output_type == data_types::bin) {
         return {output_type, format::b_fs_yx_32fp, output_size};
     }
-
-    return {output_type, input_layout.format, output_size};
+    return {output_type, out_fmt, output_size};
 }
 
 std::string convolution_inst::to_string(convolution_node const& node) {
diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp
index 00729427a12..eb6b8ca1ea6 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp
@@ -16,15 +16,21 @@
 #include "permute_inst.h"
 #include "depth_to_space_inst.h"
 #include "region_yolo_inst.h"
+#include "intel_gpu/runtime/debug_configuration.hpp"
 
 using namespace cldnn;
 
+#define LOG_NODE_REMOVAL(id) GPU_DEBUG_IF(debug_config->verbose >= 2) {                                                         \
+                GPU_DEBUG_COUT << "[remove_redundant_reorders:" << __LINE__ << "] " << "Remove node: " << (id) << std::endl; }
+
+
 remove_redundant_reorders::remove_redundant_reorders(layout_optimizer& lo_ref, bool enable_reorder_fusing, bool update_implementations,
     bool remove_output_reorders)
     : base_pass("remove_redundant_reorders"), lo(lo_ref), enable_reorder_fusing(enable_reorder_fusing), update_implementations(update_implementations),
     remove_output_reorders(remove_output_reorders) {}
 
 void remove_redundant_reorders::run(program& p) {
+    GPU_DEBUG_GET_INSTANCE(debug_config);
     auto update_implementation = [&](program_node& node) {
         if (!update_implementations)
             return;
@@ -113,6 +119,7 @@ void remove_redundant_reorders::run(program& p) {
             }
 
             node.can_be_optimized(true);
+            LOG_NODE_REMOVAL(node.id());
             p.extract_and_remove(node);
 
             for (auto rl : recalc_list) {
@@ -168,6 +175,7 @@ void remove_redundant_reorders::run(program& p) {
             dep_prim->output_format = output_layout.format;
             dep_prim->output_data_type = output_layout.data_type;
 
+            LOG_NODE_REMOVAL(r_node.id());
             r_node.can_be_optimized(true);
             p.add_optimized_primitive_info(r_node.id());
             p.extract_and_remove(r_node);
@@ -246,6 +254,8 @@ void remove_redundant_reorders::run(program& p) {
             } else {
                 p.add_optimized_primitive_info(r_node.get_primitive()->id);
             }
+
+            LOG_NODE_REMOVAL(r_node.id());
             p.extract_and_remove(
                 r_node);  // try to remove if possible (with respect to r_node not being marked as output)
         }
@@ -292,6 +302,8 @@ void remove_redundant_reorders::run(program& p) {
             // pointing to, we should increment it again
             if (remove_reorder_node == *itr)
                 itr++;
+
+            LOG_NODE_REMOVAL(remove_reorder_node->id());
             p.replace_all_usages(*remove_reorder_node, *node);
             p.add_optimized_primitive_info(remove_reorder_node->id());
             p.remove_all_connections(*remove_reorder_node);
@@ -336,6 +348,8 @@ void remove_redundant_reorders::run(program& p) {
             if (input.type()->does_possible_implementation_exist(input)) {
                 node.can_be_optimized(true);
                 p.add_optimized_primitive_info(node.id());
+
+                LOG_NODE_REMOVAL(node.id());
                 p.extract_and_remove(node);
             } else {
                 input.set_output_layout(old_output_layout_of_input, false);
@@ -363,6 +377,8 @@ void remove_redundant_reorders::run(program& p) {
             continue;
 
         dep.merge_output_padding(node.get_output_layout().data_padding);
+
+        LOG_NODE_REMOVAL(node.id());
         p.replace_all_usages(node, dep);
         p.add_optimized_primitive_info(node.id());
         p.remove_all_connections(node);
@@ -394,6 +410,7 @@ void remove_redundant_reorders::run(program& p) {
             return false;
 
         dep.merge_output_padding(node->get_output_layout().data_padding);
+        LOG_NODE_REMOVAL(node->id());
         p.replace_all_usages(*node, dep);
         p.get_processing_order().erase(node);
         p.add_optimized_primitive_info(node->id());
@@ -455,6 +472,7 @@ void remove_redundant_reorders::run(program& p) {
             node->set_input_layout(local_desc.input_layout);
 
             // remove reorder node
+            LOG_NODE_REMOVAL(node->id());
             node->can_be_optimized(true);
             p.add_optimized_primitive_info(node->id());
             p.extract_and_remove(*node);
@@ -522,12 +540,14 @@ void remove_redundant_reorders::run(program& p) {
                               reshape_node.get_fused_activations_funcs().empty() && reshape_node.get_fused_primitives().empty();
 
         if (remove_dep) {
+            LOG_NODE_REMOVAL(reshape_input_node.id());
             reshape_input_node.can_be_optimized(true);
             p.add_optimized_primitive_info(reshape_input_node.id());
             p.extract_and_remove(reshape_input_node);
         }
 
         if (remove_current) {
+            LOG_NODE_REMOVAL(reshape_node.id());
             reshape_node.can_be_optimized(true);
             p.add_optimized_primitive_info(reshape_node.id());
             p.extract_and_remove(reshape_node);
diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp
index 779eab64e39..13421c69130 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp
@@ -20,6 +20,7 @@
 #include <list>
 #include <map>
 #include <set>
+#include <tuple>
 
 using namespace cldnn;
 
@@ -562,7 +563,7 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
         }
     };
 
-    const auto reorder_convolution = [&p, &lo, &rf](typed_program_node<convolution>& conv_node) {
+    const auto reorder_convolution = [&p, &lo, &rf, &debug_config](typed_program_node<convolution>& conv_node) {
         {
             // reorder weights convolution
             auto& weights = conv_node.weights();
@@ -602,35 +603,43 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
             conv_node.get_dependencies().front()->set_output_layout(new_layout, false);
         }
 
-        std::vector<format> wrong_format = {format::b_fs_yx_fsv16, format::bs_fs_yx_bsv32_fsv16};
-        std::vector<format> correct_format = {format::b_fs_yx_fsv32, format::bs_fs_yx_bsv32_fsv32};
-        for (int i = 0; i < wrong_format.size(); i++) {
-            // reorder for onednn mixed-precision conv
-            // If the layouts are like below, change input layout to fsv32.
-            // From:
-            //   (bsv32_fsv16.u8) --> conv --> (bsv32_fsv16.fp16)
-            // To:
-            //   (bsv32_fsv16.u8) --> reorder --> (bsv32_fsv32.u8) --> conv --> (bsv32_fsv16.fp16)
-            //
-            // Do not apply such change for b=1 first conv
-
+        // reorder for onednn mixed-precision conv
+        // If the layouts are like below, change the input layout: fsv16 -> fsv32 for i8/u8 input, fsv32 -> fsv16 for floating-point input.
+        // From:
+        //   (bsv32_fsv16.u8) --> conv --> (bsv32_fsv16.fp16)
+        // To:
+        //   (bsv32_fsv16.u8) --> reorder --> (bsv32_fsv32.u8) --> conv --> (bsv32_fsv16.fp16)
+        //
+        // Do not apply such change for b=1 first conv (input with batch == 1 and feature <= 4)
+        enum class __data_type {i8_u8, floating_point};
+        // Errata for mixed precision in onednn
+        // data_type, wrong_format, correct_format
+        std::vector<std::tuple<__data_type, format, format>> errata = {
+            {__data_type::i8_u8, format::b_fs_yx_fsv16, format::b_fs_yx_fsv32},
+            {__data_type::i8_u8, format::bs_fs_yx_bsv32_fsv16, format::bs_fs_yx_bsv32_fsv32},
+            {__data_type::floating_point, format::b_fs_yx_fsv32, format::b_fs_yx_fsv16},
+            {__data_type::floating_point, format::bs_fs_yx_bsv32_fsv32, format::bs_fs_yx_bsv32_fsv16}};
+        for (auto &e : errata) {
             auto prev_node = conv_node.get_dependencies().front();
-            auto old_layout = prev_node->get_output_layout();
+            auto prev_layout = prev_node->get_output_layout();
             auto conv_layout = conv_node.get_output_layout();
+            auto is_target_dt_in_errata = (std::get<0>(e) == __data_type::i8_u8 && data_type_traits::is_i8_u8(prev_layout.data_type)) ||
+                                          (std::get<0>(e) == __data_type::floating_point && data_type_traits::is_floating_point(prev_layout.data_type));
+            auto wrong_format = std::get<1>(e);
+            auto correct_format = std::get<2>(e);
             if (lo.get_optimization_attributes().use_onednn_impls
-                    && conv_layout.format == wrong_format[i]
-                    && data_type_traits::is_i8_u8(old_layout.data_type)
-                    && (old_layout.format == wrong_format[i])
-                    && !(old_layout.size.batch[0] == 1 && old_layout.size.feature[0] <= 4)) {
-                auto new_layout = old_layout;
-                new_layout.format = correct_format[i];
+                    && is_target_dt_in_errata
+                    && conv_layout.format == wrong_format
+                    && prev_layout.format == wrong_format
+                    && !(prev_layout.size.batch[0] == 1 && prev_layout.size.feature[0] <= 4)) {
+                auto new_layout = prev_layout;
+                new_layout.format = correct_format;
                 auto new_input = rf.get_reorder(prev_node->id(),
-                                                old_layout,
+                                                prev_layout,
                                                 new_layout);
 
-                if (new_input.first) {
+                if (new_input.first)
                     p.add_intermediate(new_input.first, conv_node, 0, !new_input.second);
-                }
 
                 // Prevent layout propagation as we are using mixed precision for conv
                 conv_node.get_dependencies().front()->set_output_layout(new_layout, false);