[IE CLDNN] Remove unused fused deps for FQ (#712)

Remove unused fused FQ kernel arguments to avoid extra setArg() calls, which significantly reduces host-side overhead.
This commit is contained in:
Vladimir Paramuzov
2020-06-04 10:30:46 +03:00
committed by GitHub
parent 546377dc8e
commit 28ffbf0857
3 changed files with 78 additions and 23 deletions

View File

@@ -1,4 +1,4 @@
// Copyright (c) 2019 Intel Corporation
// Copyright (c) 2019-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -117,7 +117,25 @@ struct quantize_fuse_params : fuse_params {
, in_scale(in_scale)
, in_shift(in_shift)
, out_scale(out_scale)
, out_shift(out_shift) { }
, out_shift(out_shift) {
size_t index = 0;
if (has_clamp) {
in_range_lo_idx = index++;
in_range_hi_idx = index++;
}
if (!per_tensor_input_scale) {
in_scale_idx = index++;
}
if (!per_tensor_input_shift && has_pre_shift) {
in_shift_idx = index++;
}
if (!per_tensor_output_scale && has_post_scale) {
out_scale_idx = index++;
}
if (!per_tensor_output_shift && has_post_shift) {
out_shift_idx = index++;
}
}
bool scale_shift_opt;
bool has_post_scale;
@@ -137,6 +155,13 @@ struct quantize_fuse_params : fuse_params {
float in_shift;
float out_scale;
float out_shift;
size_t in_range_lo_idx;
size_t in_range_hi_idx;
size_t in_scale_idx;
size_t in_shift_idx;
size_t out_scale_idx;
size_t out_shift_idx;
};
} // namespace kernel_selector

View File

@@ -1,5 +1,5 @@
/*
// Copyright (c) 2019 Intel Corporation
// Copyright (c) 2019-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -1155,26 +1155,32 @@ JitConstants FusedOpsCodeGenerator::MakeOpJitConstants(const FusedOpsConfigurati
// We can't convert inputs to output data type, because it might be equal to UINT8 or INT8, so we convert the data
// to the zero tensor's (input_lo) type
std::string tmp_var = in_var;
std::string tmp_type;
std::string in_converted = in_var;
if (in_type != desc.tensors[0].GetDType()) {
tmp_type = GetType(desc.tensors[0].GetDType(), vec_size);
tmp_var = out_var + "_tmp";
Datatype tmp_type = desc.tensors.empty() ? in_type : desc.tensors[0].GetDType();
std::string tmp_type_str = GetType(tmp_type, vec_size);
std::string tmp_var = out_var + "_tmp";
if (in_type != tmp_type) {
in_converted = ConvertToType(in_var, desc.tensors[0].GetDType(), vec_size);
}
auto post_scale = p->per_tensor_output_scale ? Broadcast(std::to_string(p->out_scale), desc.tensors[0].GetDType(), vec_size) : GetInputVarName(6);
auto post_shift = p->per_tensor_output_shift ? Broadcast(std::to_string(p->out_shift), desc.tensors[0].GetDType(), vec_size) : GetInputVarName(7);
auto pre_scale = p->per_tensor_input_scale ? Broadcast(std::to_string(p->in_scale), desc.tensors[0].GetDType(), vec_size) : GetInputVarName(4);
auto pre_shift = p->per_tensor_input_shift ? Broadcast(std::to_string(p->in_shift), desc.tensors[0].GetDType(), vec_size) : GetInputVarName(5);
auto in_lo = p->per_tensor_input_range ? Broadcast(std::to_string(p->in_lo), desc.tensors[0].GetDType(), vec_size) : GetInputVarName(0);
auto in_hi = p->per_tensor_input_range ? Broadcast(std::to_string(p->in_hi), desc.tensors[0].GetDType(), vec_size) : GetInputVarName(1);
auto post_scale = p->per_tensor_output_scale ? Broadcast(std::to_string(p->out_scale), tmp_type, vec_size)
: GetInputVarName(p->out_scale_idx);
auto post_shift = p->per_tensor_output_shift ? Broadcast(std::to_string(p->out_shift), tmp_type, vec_size)
: GetInputVarName(p->out_shift_idx);
auto pre_scale = p->per_tensor_input_scale ? Broadcast(std::to_string(p->in_scale), tmp_type, vec_size)
: GetInputVarName(p->in_scale_idx);
auto pre_shift = p->per_tensor_input_shift ? Broadcast(std::to_string(p->in_shift), tmp_type, vec_size)
: GetInputVarName(p->in_shift_idx);
auto in_lo = p->per_tensor_input_range ? Broadcast(std::to_string(p->in_lo), tmp_type, vec_size)
: GetInputVarName(p->in_range_lo_idx);
auto in_hi = p->per_tensor_input_range ? Broadcast(std::to_string(p->in_hi), tmp_type, vec_size)
: GetInputVarName(p->in_range_hi_idx);
if (p->has_clamp) {
op_decls += "\\\n\t" + tmp_type + " " + tmp_var + " = min(max(" + in_lo + ", " + in_converted + "), " + in_hi + ");";
op_decls += "\\\n\t" + tmp_type_str + " " + tmp_var + " = min(max(" + in_lo + ", " + in_converted + "), " + in_hi + ");";
} else {
op_decls += "\\\n\t" + tmp_type + " " + tmp_var + " = " + in_converted + ";";
op_decls += "\\\n\t" + tmp_type_str + " " + tmp_var + " = " + in_converted + ";";
}
op_decls += "\\\n\t" + tmp_var + " = " + tmp_var + "*" + pre_scale + ";";
if (p->has_pre_shift)
@@ -1401,17 +1407,17 @@ std::vector<size_t> FusedOpsCodeGenerator::GetRequiredInputs() const {
if (p) {
std::vector<size_t> res = {};
if (!p->per_tensor_input_range && p->has_clamp) {
res.push_back(0);
res.push_back(1);
res.push_back(p->in_range_lo_idx);
res.push_back(p->in_range_hi_idx);
}
if (!p->per_tensor_input_scale)
res.push_back(4);
res.push_back(p->in_scale_idx);
if (p->has_pre_shift && !p->per_tensor_input_shift)
res.push_back(5);
res.push_back(p->in_shift_idx);
if (p->has_post_scale && !p->per_tensor_output_scale)
res.push_back(6);
res.push_back(p->out_scale_idx);
if (p->has_post_shift && !p->per_tensor_output_shift)
res.push_back(7);
res.push_back(p->out_shift_idx);
return res;
}

View File

@@ -1,5 +1,5 @@
/*
// Copyright (c) 2016-2019 Intel Corporation
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -36,6 +36,7 @@
#include "binary_convolution_inst.h"
#include "resample_inst.h"
#include "reshape_inst.h"
#include "quantize_inst.h"
#include "activation_inst.h"
#include "scale_inst.h"
#include "depth_to_space_inst.h"
@@ -906,6 +907,29 @@ void program_impl::fuse_nodes(program_node &fused_node, program_node &peer_node)
auto& dep = peer_node.get_dependency(i);
if (dep.id() == fused_node.id())
continue;
if (peer_node.is_type<quantize>()) {
quantize_node& q_node = peer_node.as<quantize>();
if (q_node.get_scale_shift_opt()) {
bool can_drop_input = false;
// Drop input range if clamp is not needed
can_drop_input |= (i == 1 || i == 2) && !q_node.get_need_clamp();
// Drop output range - it's not used in scale-shift-opt quantize kernel
can_drop_input |= i == 3 || i == 4;
// Drop tensor with input scale when we have per-tensor parameter
can_drop_input |= i == 5 && q_node.get_per_tensor_input_scale();
// Drop tensor with input shift when we have per-tensor parameter or it's not needed at all
can_drop_input |= i == 6 && (!q_node.get_need_pre_shift() || q_node.get_per_tensor_input_shift());
// Drop tensor with output scale when we have per-tensor parameter or it's not needed at all
can_drop_input |= i == 7 && (!q_node.get_need_post_scale() || q_node.get_per_tensor_output_scale());
// Drop tensor with output shift when we have per-tensor parameter or it's not needed at all
can_drop_input |= i == 8 && (!q_node.get_need_post_shift() || q_node.get_per_tensor_output_shift());
if (can_drop_input)
continue;
}
}
fused_node.dependencies.push_back(&dep);
local_desc.deps.push_back(dep.id());
dep.users.push_back(&fused_node);