[IE CLDNN] Remove unused fused deps for FQ (#712)
Remove unused fused FQ kernel arguments to avoid extra setArg() calls, which significantly reduces host overhead.
This commit is contained in:
committed by
GitHub
parent
546377dc8e
commit
28ffbf0857
@@ -1,4 +1,4 @@
|
||||
// Copyright (c) 2019 Intel Corporation
|
||||
// Copyright (c) 2019-2020 Intel Corporation
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
@@ -117,7 +117,25 @@ struct quantize_fuse_params : fuse_params {
|
||||
, in_scale(in_scale)
|
||||
, in_shift(in_shift)
|
||||
, out_scale(out_scale)
|
||||
, out_shift(out_shift) { }
|
||||
, out_shift(out_shift) {
|
||||
size_t index = 0;
|
||||
if (has_clamp) {
|
||||
in_range_lo_idx = index++;
|
||||
in_range_hi_idx = index++;
|
||||
}
|
||||
if (!per_tensor_input_scale) {
|
||||
in_scale_idx = index++;
|
||||
}
|
||||
if (!per_tensor_input_shift && has_pre_shift) {
|
||||
in_shift_idx = index++;
|
||||
}
|
||||
if (!per_tensor_output_scale && has_post_scale) {
|
||||
out_scale_idx = index++;
|
||||
}
|
||||
if (!per_tensor_output_shift && has_post_shift) {
|
||||
out_shift_idx = index++;
|
||||
}
|
||||
}
|
||||
|
||||
bool scale_shift_opt;
|
||||
bool has_post_scale;
|
||||
@@ -137,6 +155,13 @@ struct quantize_fuse_params : fuse_params {
|
||||
float in_shift;
|
||||
float out_scale;
|
||||
float out_shift;
|
||||
|
||||
size_t in_range_lo_idx;
|
||||
size_t in_range_hi_idx;
|
||||
size_t in_scale_idx;
|
||||
size_t in_shift_idx;
|
||||
size_t out_scale_idx;
|
||||
size_t out_shift_idx;
|
||||
};
|
||||
|
||||
} // namespace kernel_selector
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
// Copyright (c) 2019 Intel Corporation
|
||||
// Copyright (c) 2019-2020 Intel Corporation
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
@@ -1155,26 +1155,32 @@ JitConstants FusedOpsCodeGenerator::MakeOpJitConstants(const FusedOpsConfigurati
|
||||
|
||||
// We can't convert inputs to output data type, because it might be equal to UINT8 or INT8, so we convert the data
|
||||
// to the zero tensor's (input_lo) type
|
||||
std::string tmp_var = in_var;
|
||||
std::string tmp_type;
|
||||
std::string in_converted = in_var;
|
||||
if (in_type != desc.tensors[0].GetDType()) {
|
||||
tmp_type = GetType(desc.tensors[0].GetDType(), vec_size);
|
||||
tmp_var = out_var + "_tmp";
|
||||
Datatype tmp_type = desc.tensors.empty() ? in_type : desc.tensors[0].GetDType();
|
||||
std::string tmp_type_str = GetType(tmp_type, vec_size);
|
||||
std::string tmp_var = out_var + "_tmp";
|
||||
|
||||
if (in_type != tmp_type) {
|
||||
in_converted = ConvertToType(in_var, desc.tensors[0].GetDType(), vec_size);
|
||||
}
|
||||
|
||||
auto post_scale = p->per_tensor_output_scale ? Broadcast(std::to_string(p->out_scale), desc.tensors[0].GetDType(), vec_size) : GetInputVarName(6);
|
||||
auto post_shift = p->per_tensor_output_shift ? Broadcast(std::to_string(p->out_shift), desc.tensors[0].GetDType(), vec_size) : GetInputVarName(7);
|
||||
auto pre_scale = p->per_tensor_input_scale ? Broadcast(std::to_string(p->in_scale), desc.tensors[0].GetDType(), vec_size) : GetInputVarName(4);
|
||||
auto pre_shift = p->per_tensor_input_shift ? Broadcast(std::to_string(p->in_shift), desc.tensors[0].GetDType(), vec_size) : GetInputVarName(5);
|
||||
auto in_lo = p->per_tensor_input_range ? Broadcast(std::to_string(p->in_lo), desc.tensors[0].GetDType(), vec_size) : GetInputVarName(0);
|
||||
auto in_hi = p->per_tensor_input_range ? Broadcast(std::to_string(p->in_hi), desc.tensors[0].GetDType(), vec_size) : GetInputVarName(1);
|
||||
auto post_scale = p->per_tensor_output_scale ? Broadcast(std::to_string(p->out_scale), tmp_type, vec_size)
|
||||
: GetInputVarName(p->out_scale_idx);
|
||||
auto post_shift = p->per_tensor_output_shift ? Broadcast(std::to_string(p->out_shift), tmp_type, vec_size)
|
||||
: GetInputVarName(p->out_shift_idx);
|
||||
auto pre_scale = p->per_tensor_input_scale ? Broadcast(std::to_string(p->in_scale), tmp_type, vec_size)
|
||||
: GetInputVarName(p->in_scale_idx);
|
||||
auto pre_shift = p->per_tensor_input_shift ? Broadcast(std::to_string(p->in_shift), tmp_type, vec_size)
|
||||
: GetInputVarName(p->in_shift_idx);
|
||||
auto in_lo = p->per_tensor_input_range ? Broadcast(std::to_string(p->in_lo), tmp_type, vec_size)
|
||||
: GetInputVarName(p->in_range_lo_idx);
|
||||
auto in_hi = p->per_tensor_input_range ? Broadcast(std::to_string(p->in_hi), tmp_type, vec_size)
|
||||
: GetInputVarName(p->in_range_hi_idx);
|
||||
|
||||
if (p->has_clamp) {
|
||||
op_decls += "\\\n\t" + tmp_type + " " + tmp_var + " = min(max(" + in_lo + ", " + in_converted + "), " + in_hi + ");";
|
||||
op_decls += "\\\n\t" + tmp_type_str + " " + tmp_var + " = min(max(" + in_lo + ", " + in_converted + "), " + in_hi + ");";
|
||||
} else {
|
||||
op_decls += "\\\n\t" + tmp_type + " " + tmp_var + " = " + in_converted + ";";
|
||||
op_decls += "\\\n\t" + tmp_type_str + " " + tmp_var + " = " + in_converted + ";";
|
||||
}
|
||||
op_decls += "\\\n\t" + tmp_var + " = " + tmp_var + "*" + pre_scale + ";";
|
||||
if (p->has_pre_shift)
|
||||
@@ -1401,17 +1407,17 @@ std::vector<size_t> FusedOpsCodeGenerator::GetRequiredInputs() const {
|
||||
if (p) {
|
||||
std::vector<size_t> res = {};
|
||||
if (!p->per_tensor_input_range && p->has_clamp) {
|
||||
res.push_back(0);
|
||||
res.push_back(1);
|
||||
res.push_back(p->in_range_lo_idx);
|
||||
res.push_back(p->in_range_hi_idx);
|
||||
}
|
||||
if (!p->per_tensor_input_scale)
|
||||
res.push_back(4);
|
||||
res.push_back(p->in_scale_idx);
|
||||
if (p->has_pre_shift && !p->per_tensor_input_shift)
|
||||
res.push_back(5);
|
||||
res.push_back(p->in_shift_idx);
|
||||
if (p->has_post_scale && !p->per_tensor_output_scale)
|
||||
res.push_back(6);
|
||||
res.push_back(p->out_scale_idx);
|
||||
if (p->has_post_shift && !p->per_tensor_output_shift)
|
||||
res.push_back(7);
|
||||
res.push_back(p->out_shift_idx);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
// Copyright (c) 2016-2019 Intel Corporation
|
||||
// Copyright (c) 2016-2020 Intel Corporation
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
@@ -36,6 +36,7 @@
|
||||
#include "binary_convolution_inst.h"
|
||||
#include "resample_inst.h"
|
||||
#include "reshape_inst.h"
|
||||
#include "quantize_inst.h"
|
||||
#include "activation_inst.h"
|
||||
#include "scale_inst.h"
|
||||
#include "depth_to_space_inst.h"
|
||||
@@ -906,6 +907,29 @@ void program_impl::fuse_nodes(program_node &fused_node, program_node &peer_node)
|
||||
auto& dep = peer_node.get_dependency(i);
|
||||
if (dep.id() == fused_node.id())
|
||||
continue;
|
||||
|
||||
if (peer_node.is_type<quantize>()) {
|
||||
quantize_node& q_node = peer_node.as<quantize>();
|
||||
if (q_node.get_scale_shift_opt()) {
|
||||
bool can_drop_input = false;
|
||||
|
||||
// Drop input range if clamp is not needed
|
||||
can_drop_input |= (i == 1 || i == 2) && !q_node.get_need_clamp();
|
||||
// Drop output range - it's not used in scale-shift-opt quantize kernel
|
||||
can_drop_input |= i == 3 || i == 4;
|
||||
// Drop tensor with input scale when we have per-tensor parameter
|
||||
can_drop_input |= i == 5 && q_node.get_per_tensor_input_scale();
|
||||
// Drop tensor with input shift when we have per-tensor parameter or it's not needed at all
|
||||
can_drop_input |= i == 6 && (!q_node.get_need_pre_shift() || q_node.get_per_tensor_input_shift());
|
||||
// Drop tensor with output scale when we have per-tensor parameter or it's not needed at all
|
||||
can_drop_input |= i == 7 && (!q_node.get_need_post_scale() || q_node.get_per_tensor_output_scale());
|
||||
// Drop tensor with output shift when we have per-tensor parameter or it's not needed at all
|
||||
can_drop_input |= i == 8 && (!q_node.get_need_post_shift() || q_node.get_per_tensor_output_shift());
|
||||
|
||||
if (can_drop_input)
|
||||
continue;
|
||||
}
|
||||
}
|
||||
fused_node.dependencies.push_back(&dep);
|
||||
local_desc.deps.push_back(dep.id());
|
||||
dep.users.push_back(&fused_node);
|
||||
|
||||
Reference in New Issue
Block a user