[IE CLDNN] Remove unused fused deps for FQ (#712)

Remove unused fused FQ kernel arguments to avoid extra setArg() calls, which significantly reduces host-side overhead.
This commit is contained in:
Vladimir Paramuzov
2020-06-04 10:30:46 +03:00
committed by GitHub
parent 546377dc8e
commit 28ffbf0857
3 changed files with 78 additions and 23 deletions

View File

@@ -1,4 +1,4 @@
// Copyright (c) 2019 Intel Corporation
// Copyright (c) 2019-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -117,7 +117,25 @@ struct quantize_fuse_params : fuse_params {
, in_scale(in_scale)
, in_shift(in_shift)
, out_scale(out_scale)
, out_shift(out_shift) { }
, out_shift(out_shift) {
size_t index = 0;
if (has_clamp) {
in_range_lo_idx = index++;
in_range_hi_idx = index++;
}
if (!per_tensor_input_scale) {
in_scale_idx = index++;
}
if (!per_tensor_input_shift && has_pre_shift) {
in_shift_idx = index++;
}
if (!per_tensor_output_scale && has_post_scale) {
out_scale_idx = index++;
}
if (!per_tensor_output_shift && has_post_shift) {
out_shift_idx = index++;
}
}
bool scale_shift_opt;
bool has_post_scale;
@@ -137,6 +155,13 @@ struct quantize_fuse_params : fuse_params {
float in_shift;
float out_scale;
float out_shift;
size_t in_range_lo_idx;
size_t in_range_hi_idx;
size_t in_scale_idx;
size_t in_shift_idx;
size_t out_scale_idx;
size_t out_shift_idx;
};
} // namespace kernel_selector

View File

@@ -1,5 +1,5 @@
/*
// Copyright (c) 2019 Intel Corporation
// Copyright (c) 2019-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -1155,26 +1155,32 @@ JitConstants FusedOpsCodeGenerator::MakeOpJitConstants(const FusedOpsConfigurati
// We can't convert inputs to output data type, because it might be equal to UINT8 or INT8, so we convert the data
// to the zero tensor's (input_lo) type
std::string tmp_var = in_var;
std::string tmp_type;
std::string in_converted = in_var;
if (in_type != desc.tensors[0].GetDType()) {
tmp_type = GetType(desc.tensors[0].GetDType(), vec_size);
tmp_var = out_var + "_tmp";
Datatype tmp_type = desc.tensors.empty() ? in_type : desc.tensors[0].GetDType();
std::string tmp_type_str = GetType(tmp_type, vec_size);
std::string tmp_var = out_var + "_tmp";
if (in_type != tmp_type) {
in_converted = ConvertToType(in_var, desc.tensors[0].GetDType(), vec_size);
}
auto post_scale = p->per_tensor_output_scale ? Broadcast(std::to_string(p->out_scale), desc.tensors[0].GetDType(), vec_size) : GetInputVarName(6);
auto post_shift = p->per_tensor_output_shift ? Broadcast(std::to_string(p->out_shift), desc.tensors[0].GetDType(), vec_size) : GetInputVarName(7);
auto pre_scale = p->per_tensor_input_scale ? Broadcast(std::to_string(p->in_scale), desc.tensors[0].GetDType(), vec_size) : GetInputVarName(4);
auto pre_shift = p->per_tensor_input_shift ? Broadcast(std::to_string(p->in_shift), desc.tensors[0].GetDType(), vec_size) : GetInputVarName(5);
auto in_lo = p->per_tensor_input_range ? Broadcast(std::to_string(p->in_lo), desc.tensors[0].GetDType(), vec_size) : GetInputVarName(0);
auto in_hi = p->per_tensor_input_range ? Broadcast(std::to_string(p->in_hi), desc.tensors[0].GetDType(), vec_size) : GetInputVarName(1);
auto post_scale = p->per_tensor_output_scale ? Broadcast(std::to_string(p->out_scale), tmp_type, vec_size)
: GetInputVarName(p->out_scale_idx);
auto post_shift = p->per_tensor_output_shift ? Broadcast(std::to_string(p->out_shift), tmp_type, vec_size)
: GetInputVarName(p->out_shift_idx);
auto pre_scale = p->per_tensor_input_scale ? Broadcast(std::to_string(p->in_scale), tmp_type, vec_size)
: GetInputVarName(p->in_scale_idx);
auto pre_shift = p->per_tensor_input_shift ? Broadcast(std::to_string(p->in_shift), tmp_type, vec_size)
: GetInputVarName(p->in_shift_idx);
auto in_lo = p->per_tensor_input_range ? Broadcast(std::to_string(p->in_lo), tmp_type, vec_size)
: GetInputVarName(p->in_range_lo_idx);
auto in_hi = p->per_tensor_input_range ? Broadcast(std::to_string(p->in_hi), tmp_type, vec_size)
: GetInputVarName(p->in_range_hi_idx);
if (p->has_clamp) {
op_decls += "\\\n\t" + tmp_type + " " + tmp_var + " = min(max(" + in_lo + ", " + in_converted + "), " + in_hi + ");";
op_decls += "\\\n\t" + tmp_type_str + " " + tmp_var + " = min(max(" + in_lo + ", " + in_converted + "), " + in_hi + ");";
} else {
op_decls += "\\\n\t" + tmp_type + " " + tmp_var + " = " + in_converted + ";";
op_decls += "\\\n\t" + tmp_type_str + " " + tmp_var + " = " + in_converted + ";";
}
op_decls += "\\\n\t" + tmp_var + " = " + tmp_var + "*" + pre_scale + ";";
if (p->has_pre_shift)
@@ -1401,17 +1407,17 @@ std::vector<size_t> FusedOpsCodeGenerator::GetRequiredInputs() const {
if (p) {
std::vector<size_t> res = {};
if (!p->per_tensor_input_range && p->has_clamp) {
res.push_back(0);
res.push_back(1);
res.push_back(p->in_range_lo_idx);
res.push_back(p->in_range_hi_idx);
}
if (!p->per_tensor_input_scale)
res.push_back(4);
res.push_back(p->in_scale_idx);
if (p->has_pre_shift && !p->per_tensor_input_shift)
res.push_back(5);
res.push_back(p->in_shift_idx);
if (p->has_post_scale && !p->per_tensor_output_scale)
res.push_back(6);
res.push_back(p->out_scale_idx);
if (p->has_post_shift && !p->per_tensor_output_shift)
res.push_back(7);
res.push_back(p->out_shift_idx);
return res;
}

View File

@@ -1,5 +1,5 @@
/*
// Copyright (c) 2016-2019 Intel Corporation
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -36,6 +36,7 @@
#include "binary_convolution_inst.h"
#include "resample_inst.h"
#include "reshape_inst.h"
#include "quantize_inst.h"
#include "activation_inst.h"
#include "scale_inst.h"
#include "depth_to_space_inst.h"
@@ -906,6 +907,29 @@ void program_impl::fuse_nodes(program_node &fused_node, program_node &peer_node)
auto& dep = peer_node.get_dependency(i);
if (dep.id() == fused_node.id())
continue;
if (peer_node.is_type<quantize>()) {
quantize_node& q_node = peer_node.as<quantize>();
if (q_node.get_scale_shift_opt()) {
bool can_drop_input = false;
// Drop input range if clamp is not needed
can_drop_input |= (i == 1 || i == 2) && !q_node.get_need_clamp();
// Drop output range - it's not used in scale-shift-opt quantize kernel
can_drop_input |= i == 3 || i == 4;
// Drop tensor with input scale when we have per-tensor parameter
can_drop_input |= i == 5 && q_node.get_per_tensor_input_scale();
// Drop tensor with input shift when we have per-tensor parameter or it's not needed at all
can_drop_input |= i == 6 && (!q_node.get_need_pre_shift() || q_node.get_per_tensor_input_shift());
// Drop tensor with output scale when we have per-tensor parameter or it's not needed at all
can_drop_input |= i == 7 && (!q_node.get_need_post_scale() || q_node.get_per_tensor_output_scale());
// Drop tensor with output shift when we have per-tensor parameter or it's not needed at all
can_drop_input |= i == 8 && (!q_node.get_need_post_shift() || q_node.get_per_tensor_output_shift());
if (can_drop_input)
continue;
}
}
fused_node.dependencies.push_back(&dep);
local_desc.deps.push_back(dep.id());
dep.users.push_back(&fused_node);