[GPU] Resolve fp16 overflow of mul (#19173)

+ Fixed black output image caused by NaN output.
+ Resolved fp16 overflow of the gemm primitive that feeds softmax.
+ Added a fused clamp activation post-op so the gemm output cannot reach inf, which was the source of the NaN output.
+ Added new pass clamp_fp16_output.
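
For context, a minimal standalone sketch (not part of this patch; it emulates the fp16 overflow in fp32, and the values are illustrative) of how an out-of-range gemm output turns the softmax result into NaN, and why clamping to the finite fp16 range avoids it:

// Illustrative only: fp16 overflows to inf above 65504; softmax then hits inf - inf = NaN.
#include <cmath>
#include <cstdio>

int main() {
    const float fp16_max = 65504.0f;   // largest finite fp16 value
    // A gemm logit larger than fp16_max is stored as +inf in an fp16 tensor
    float overflowed = INFINITY;
    // Softmax normalization exp(x - max(x)) becomes exp(inf - inf) = NaN
    std::printf("without clamp: %f\n", std::exp(overflowed - overflowed));
    // With a clamp to [-fp16_max, fp16_max] the value stays finite and softmax is well defined
    float clamped = std::fmin(std::fmax(70000.0f, -fp16_max), fp16_max);
    std::printf("with clamp:    %f\n", std::exp(clamped - clamped));
    return 0;
}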

Signed-off-by: Min, Byungil <byungil.min@intel.com>
Min, Byungil 2023-08-17 18:09:10 +09:00 committed by GitHub
parent 49bbcb4cf6
commit 1b9de79d0d
3 changed files with 53 additions and 0 deletions

@@ -0,0 +1,41 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "pass_manager.h"
#include "program_node.h"
#include "gemm_inst.h"
#include "reshape_inst.h"
#include "softmax_inst.h"
using namespace cldnn;
void clamp_fp16_output::run(program& p) {
    for (auto& node : p.get_processing_order()) {
        // Add clamp activation to avoid an inf result which causes NaN output
        if (node->is_type<gemm>() && !node->is_output() && node->get_output_layout().data_type == data_types::f16) {
            auto user = node->get_users().front();
            // Reshape could be added in CreateMatMulOp : check a user node of the Reshape
            if (user->is_type<reshape>())
                user = user->get_users().front();
            if (user->is_type<softmax>()) {
                // Clamp to the finite fp16 range so the gemm output cannot overflow to inf
                float out_lo = data_type_traits::min<float>(data_types::f16);
                float out_hi = data_type_traits::max<float>(data_types::f16);
                auto activ_id = node->id() + "_overflow_clip";
                auto activ = std::make_shared<activation>(activ_id, input_info(node->id()),
                                                          activation_func::clamp, activation_additional_params{out_lo, out_hi});
                program_node& act_node = p.get_or_create(activ);
                // Attach the clamp as a fused post-op of the gemm node (no extra dependencies)
                fused_primitive_desc local_desc(activ);
                local_desc.input_layout = node->get_output_layout();
                local_desc.f_param = act_node.get_fuse_params();
                local_desc.outer_dep_start_idx = -1;  // No external dep
                local_desc.total_num_deps = 0;
                local_desc.output_layout = node->get_output_layout();
                node->add_fused_primitive(local_desc);
            }
        }
    }
}
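
Note on the approach, as read from the pass above: the clamp is not inserted as a standalone activation node in the graph; it is attached to the gemm through a fused_primitive_desc with no extra dependencies (outer_dep_start_idx = -1, total_num_deps = 0). The intent appears to be that the clamp to the finite fp16 range is applied as a post-op of the gemm itself, so the softmax user effectively consumes softmax(clamp(gemm(...))) and never sees inf.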

@@ -104,6 +104,14 @@ private:
    void run(program& p) override;
};

class clamp_fp16_output : public base_pass {
public:
    clamp_fp16_output() : base_pass("clamp_fp16_output") {}

private:
    void run(program& p) override;
};

class mark_shape_of_subgraphs : public base_pass {
    // This optimization pass aggregates nodes into shape_of subgraphs for further optimizations.
    // There are a few key requirements to decide if a node belongs to a shape_of subgraph or not:

@@ -593,6 +593,10 @@ void program::pre_optimize_graph(bool is_internal) {
    // check if there exist some layout incompatibilities and add a reorder node if required
    apply_opt_pass<add_required_reorders>();

    // Modify the fused post-ops to resolve fp16 output overflow by adding a clamp activation.
    // Currently only the 'gemm followed by softmax' pattern is clamped.
    apply_opt_pass<clamp_fp16_output>();

    // add optimization attributes for onednn primitives
    apply_opt_pass<add_onednn_optimization_attributes>();
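
Placement note (my reading of the pass order, not stated in the patch): clamp_fp16_output runs after add_required_reorders and before add_onednn_optimization_attributes, so the clamp is already recorded as a fused post-op on the gemm node by the time oneDNN optimization attributes are derived for it.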