From 1b9de79d0d117cd8cc6f2505b417da83a345dd1a Mon Sep 17 00:00:00 2001
From: "Min, Byungil" <byungil.min@intel.com>
Date: Thu, 17 Aug 2023 18:09:10 +0900
Subject: [PATCH] [GPU] Resolve fp16 overflow of mul (#19173)

+ Fixed black output image by NaN output.
+ Resolved fp16 overflow of gemm primitive before softmax
+ Added fused post ops of clamp activation not to get inf which caused NaN output
+ Added new pass clamp_fp16_output

Signed-off-by: Min, Byungil <byungil.min@intel.com>
---
 .../graph_optimizer/clamp_fp16_output.cpp     | 41 +++++++++++++++++++
 .../src/graph/include/pass_manager.h          |  8 ++++
 src/plugins/intel_gpu/src/graph/program.cpp   |  4 ++
 3 files changed, 53 insertions(+)
 create mode 100644 src/plugins/intel_gpu/src/graph/graph_optimizer/clamp_fp16_output.cpp

diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/clamp_fp16_output.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/clamp_fp16_output.cpp
new file mode 100644
index 00000000000..f74d33cca73
--- /dev/null
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/clamp_fp16_output.cpp
@@ -0,0 +1,41 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "pass_manager.h"
+#include "program_node.h"
+
+#include "gemm_inst.h"
+#include "reshape_inst.h"
+#include "softmax_inst.h"
+
+using namespace cldnn;
+
+void clamp_fp16_output::run(program& p) {
+    for (auto& node : p.get_processing_order()) {
+        // Add clamp activation to avoid inf result which causes NaN output
+        if (node->is_type<gemm>() && !node->is_output() && node->get_output_layout().data_type == data_types::f16) {
+            auto user = node->get_users().front();
+            // Reshape could be added in CreateMatMulOp : check a user node of the Reshape
+            if (user->is_type<reshape>())
+                user = user->get_users().front();
+
+            if (user->is_type<softmax>()) {
+                float out_lo = data_type_traits::min(data_types::f16);
+                float out_hi = data_type_traits::max(data_types::f16);
+                auto activ_id = node->id() + "_overflow_clip";
+                auto activ = std::make_shared<activation>(activ_id, input_info(node->id()),
+                                                          activation_func::clamp, activation_additional_params{out_lo, out_hi});
+                program_node& act_node = p.get_or_create(activ);
+
+                fused_primitive_desc local_desc(activ);
+                local_desc.input_layout = node->get_output_layout();
+                local_desc.f_param = act_node.get_fuse_params();
+                local_desc.outer_dep_start_idx = -1; // No external dep
+                local_desc.total_num_deps = 0;
+                local_desc.output_layout = node->get_output_layout();
+                node->add_fused_primitive(local_desc);
+            }
+        }
+    }
+}
diff --git a/src/plugins/intel_gpu/src/graph/include/pass_manager.h b/src/plugins/intel_gpu/src/graph/include/pass_manager.h
index de9ec170096..e507014f233 100644
--- a/src/plugins/intel_gpu/src/graph/include/pass_manager.h
+++ b/src/plugins/intel_gpu/src/graph/include/pass_manager.h
@@ -104,6 +104,14 @@ private:
     void run(program& p) override;
 };
 
+class clamp_fp16_output : public base_pass {
+public:
+    clamp_fp16_output() : base_pass("clamp_fp16_output") {}
+
+private:
+    void run(program& p) override;
+};
+
 class mark_shape_of_subgraphs : public base_pass {
     // This optimization pass aggregates nodes into shape_of subgraphs for further optimizations.
     // There are few key requirements to decide if node belongs to shape_of subgraph or not:
diff --git a/src/plugins/intel_gpu/src/graph/program.cpp b/src/plugins/intel_gpu/src/graph/program.cpp
index 6f25eb6760c..4e7bc297cb1 100644
--- a/src/plugins/intel_gpu/src/graph/program.cpp
+++ b/src/plugins/intel_gpu/src/graph/program.cpp
@@ -593,6 +593,10 @@ void program::pre_optimize_graph(bool is_internal) {
     // check if there exists some layout incompatibilities and add an reorder node if required
     apply_opt_pass<add_required_reorders>();
 
+    // Modify fused post operation to resolve overflow of fp16 output by adding clamp activation
+    // Currently, 'gemm-softmax' case is applied for clamping
+    apply_opt_pass<clamp_fp16_output>();
+
     // add optimization attributes for onednn primitives
     apply_opt_pass<add_onednn_optimization_attributes>();
 