[IE CLDNN] Extended eltwise fusing (#5181)
* [cldnn] Add initial fused conv eltw POC - Add cldnn unit test - Add fused dependency list to the fused_primitive_desc - fuse_nodes update for saving fusing history and depenecies - Modify Jitter to create jit constants using fused dependencies - Add cldnn unit-test cases for multiple serial and parallel eltwise fuse pattern - Modify Jitter and add default values in sum input Signed-off-by: Ahn, Paul Y <paul.y.ahn@intel.com> Co-authored-by: Andrew Kwangwoong Park <andrew.kwangwoong.park@intel.com> * [cldnn] Update fused_conv_eltwise cldnn unit test - Add execute and compare function - Add cldnn unit-test case for multiple parallel eltwise and additional eltwise - Add cldnn unit-test case for combination of multiple parallel eltw - Add cldnn unit-test cases for serial and diverged quantize and eltwise Signed-off-by: Andrew Kwangwoong Park <andrew.kwangwoong.park@intel.com> * [cldnn] Modify checking fusibility of eltwise fusing - Add new checking fusibility rule in prepare_primitive_fusing - Move cldnn eltwise fusing test to fusing_gpu_test.cpp - Modify method to get input var name in jitter Signed-off-by: Ahn, Paul Y <paul.y.ahn@intel.com> * [cldnn] Fix fusing item type and activation fusibility checking condition - Extract input_data_supports_fusings from fuse_activaion_f - Fix checking supported mode bug Co-authored-by: Andrew Kwangwoong Park <andrew.kwangwoong.park@intel.com>
This commit is contained in:
committed by
GitHub
parent
b47d11e31e
commit
29a8be523d
@@ -1476,7 +1476,7 @@ JitConstants FusedOpsCodeGenerator::MakeLoadJitConstants(const FusedOpsConfigura
|
||||
|
||||
JitConstants FusedOpsCodeGenerator::MakeOpJitConstants(const FusedOpsConfiguration& conf,
|
||||
const std::string in_var, const Datatype in_type,
|
||||
std::string& out_var, Datatype& out_type) const {
|
||||
std::string& out_var) const {
|
||||
JitConstants jit = {};
|
||||
|
||||
std::string op_decls = "";
|
||||
@@ -1484,9 +1484,11 @@ JitConstants FusedOpsCodeGenerator::MakeOpJitConstants(const FusedOpsConfigurati
|
||||
auto idx = conf.bfzyx_idx_order;
|
||||
std::string shuffle_var = conf.shuffle_var_name;
|
||||
bool is_shuffled = false;
|
||||
auto& fused_op_ids = desc.fused_op_ids;
|
||||
std::vector<std::string> input_vars;
|
||||
|
||||
out_var = GetOutputVarName(in_var);
|
||||
out_type = desc.output_tensor.GetDType();
|
||||
out_var = GetOutputVarName(in_var, desc.op_id);
|
||||
const auto& out_type = desc.output_tensor.GetDType();
|
||||
|
||||
if (conf.load_type == FusedOpsConfiguration::LoadType::FEATURE_SHUFFLE &&
|
||||
(desc.GetType() == KernelType::SCALE || desc.GetType() == KernelType::QUANTIZE)) {
|
||||
@@ -1503,15 +1505,19 @@ JitConstants FusedOpsCodeGenerator::MakeOpJitConstants(const FusedOpsConfigurati
|
||||
}
|
||||
|
||||
auto get_acc_t = [&]() -> Datatype {
|
||||
std::vector<Datatype> tensor_types = {desc.output_tensor.GetDType()};
|
||||
std::vector<Datatype> input_types = {desc.output_tensor.GetDType()};
|
||||
for (auto& in : desc.tensors) {
|
||||
tensor_types.push_back(in.GetDType());
|
||||
input_types.push_back(in.GetDType());
|
||||
}
|
||||
|
||||
for (auto& in : fused_op_ids) {
|
||||
input_types.push_back(in.second);
|
||||
}
|
||||
|
||||
std::vector<Datatype> types_prioritized = { Datatype::F32, Datatype::F16 };
|
||||
|
||||
for (auto& type : types_prioritized) {
|
||||
if (std::any_of(tensor_types.begin(), tensor_types.end(), [=](const Datatype& t) -> bool { return t == type; })) {
|
||||
if (std::any_of(input_types.begin(), input_types.end(), [=](const Datatype& t) -> bool { return t == type; })) {
|
||||
return type;
|
||||
}
|
||||
}
|
||||
@@ -1520,25 +1526,42 @@ JitConstants FusedOpsCodeGenerator::MakeOpJitConstants(const FusedOpsConfigurati
|
||||
};
|
||||
|
||||
auto get_input = [&](size_t index) -> std::string {
|
||||
auto in_name = index == 0 ? in_var : GetInputVarName(index - 1, is_shuffled, shuffle_var);
|
||||
auto tensor_type = index == 0 ? in_type : desc.tensors[index - 1].GetDType();
|
||||
auto input_name = in_var;
|
||||
auto input_type = in_type;
|
||||
if (index > 0) {
|
||||
size_t input_idx = index - 1;
|
||||
size_t tensors_len = desc.tensors.size();
|
||||
input_name = (input_idx < tensors_len)? GetInputVarName(input_idx, is_shuffled, shuffle_var)
|
||||
: GetOutputVarName(in_var, fused_op_ids[input_idx - tensors_len].first);
|
||||
input_type = (input_idx < tensors_len)? desc.tensors[input_idx].GetDType() : fused_op_ids[input_idx - tensors_len].second;
|
||||
}
|
||||
auto acc_t = get_acc_t();
|
||||
|
||||
if (tensor_type != acc_t)
|
||||
return ConvertToType(in_name, acc_t, vec_size);
|
||||
if (input_type != acc_t)
|
||||
return ConvertToType(input_name, acc_t, vec_size);
|
||||
else
|
||||
return in_name;
|
||||
return input_name;
|
||||
};
|
||||
|
||||
// Generate input variable list: dst + tensor inputs + fused ops input
|
||||
// If the input_vars_length are larger than max_num_input_vars, do not add dst to input variable list.
|
||||
// because dst is not used, when Fused op has both tensor and fused input.
|
||||
size_t input_vars_length = 1 + desc.tensors.size() + fused_op_ids.size(); // dst + tensor inputs + fused ops input
|
||||
size_t max_num_input_vars = (desc.tensors.size() > 1)? 3 : 2;
|
||||
size_t start_idx = (input_vars_length > max_num_input_vars) ? 1 : 0;
|
||||
for (size_t i = start_idx; i < input_vars_length; i++) {
|
||||
input_vars.push_back(get_input(i));
|
||||
}
|
||||
|
||||
switch (desc.GetType()) {
|
||||
case KernelType::SCALE: {
|
||||
auto tmp_var = out_var + "_tmp";
|
||||
if (desc.tensors.size() > 1) {
|
||||
op_decls += "\\\n\t" + GetType(get_acc_t(), vec_size) + " " + tmp_var + " = "
|
||||
+ get_input(0) + " * " + get_input(1) + " + " + get_input(2) + ";";
|
||||
+ input_vars[0] + " * " + input_vars[1] + " + " + input_vars[2] + ";";
|
||||
} else {
|
||||
op_decls += "\\\n\t" + GetType(get_acc_t(), vec_size) + " " + tmp_var + " = "
|
||||
+ get_input(0) + " * " + get_input(1) + ";";
|
||||
+ input_vars[0] + " * " + input_vars[1] + ";";
|
||||
}
|
||||
op_decls += "\\\n\t" + GetOutputType(vec_size) + " " + out_var + " = " + ConvertToOutputType(tmp_var, vec_size) + ";";
|
||||
break;
|
||||
@@ -1561,7 +1584,7 @@ JitConstants FusedOpsCodeGenerator::MakeOpJitConstants(const FusedOpsConfigurati
|
||||
}
|
||||
|
||||
auto tmp_var = out_var + "_tmp";
|
||||
op_decls += "\\\n\t" + GetType(get_acc_t(), vec_size) + " " + tmp_var + " = " + get_input(0) + op + get_input(1) + ";";
|
||||
op_decls += "\\\n\t" + GetType(get_acc_t(), vec_size) + " " + tmp_var + " = " + input_vars[0] + op + input_vars[1] + ";";
|
||||
op_decls += "\\\n\t" + GetOutputType(vec_size) + " " + out_var + " = " + ConvertToOutputType(tmp_var, vec_size) + ";";
|
||||
break;
|
||||
}
|
||||
@@ -1570,13 +1593,14 @@ JitConstants FusedOpsCodeGenerator::MakeOpJitConstants(const FusedOpsConfigurati
|
||||
if (!p)
|
||||
throw std::runtime_error("[clDNN] Quantize fuse params can't be nullptr");
|
||||
|
||||
std::string in_converted = in_var;
|
||||
std::string in_converted = (fused_op_ids.empty()) ? in_var : GetOutputVarName(in_var, fused_op_ids[0].first);
|
||||
Datatype input_type = (fused_op_ids.empty()) ? in_type : fused_op_ids[0].second;
|
||||
Datatype tmp_type = Datatype::F32;
|
||||
std::string tmp_type_str = GetType(tmp_type, vec_size);
|
||||
std::string tmp_var = out_var + "_tmp";
|
||||
|
||||
if (in_type != tmp_type) {
|
||||
in_converted = ConvertToType(in_var, tmp_type, vec_size);
|
||||
if (input_type != tmp_type) {
|
||||
in_converted = ConvertToType(in_converted, tmp_type, vec_size);
|
||||
}
|
||||
|
||||
auto post_scale = p->per_tensor_output_scale ? Broadcast(std::to_string(p->out_scale), tmp_type, vec_size)
|
||||
@@ -1618,7 +1642,9 @@ JitConstants FusedOpsCodeGenerator::MakeOpJitConstants(const FusedOpsConfigurati
|
||||
case KernelType::ACTIVATION: {
|
||||
auto p = desc.GetOpParams<activation_fuse_params>();
|
||||
base_activation_params activation_p = p->param;
|
||||
op_decls += "\\\n\t" + GetOutputType(vec_size) + " " + out_var + " = " + ConvertToOutputType(in_var, vec_size) + ";";
|
||||
|
||||
std::string new_in_var = (fused_op_ids.empty()) ? in_var : GetOutputVarName(in_var, fused_op_ids[0].first);
|
||||
op_decls += "\\\n\t" + GetOutputType(vec_size) + " " + out_var + " = " + ConvertToOutputType(new_in_var, vec_size) + ";";
|
||||
if (activation_p.function != ActivationFunction::NONE) {
|
||||
auto suffix = "_FUSED_OP"+std::to_string(desc.op_id) + conf.suffix;
|
||||
std::string nl_m = std::to_string(activation_p.m);
|
||||
@@ -1784,12 +1810,12 @@ std::string FusedOpsCodeGenerator::GetInputVarName(size_t input_id, bool is_shuf
|
||||
return GetTypeStr() + std::to_string(desc.op_id) + "_data" + std::to_string(input_id);
|
||||
}
|
||||
|
||||
std::string FusedOpsCodeGenerator::GetOutputVarName(std::string input_var) const {
|
||||
std::string FusedOpsCodeGenerator::GetOutputVarName(std::string input_var, size_t op_id) const {
|
||||
std::replace(input_var.begin(), input_var.end(), '[', '_');
|
||||
std::replace(input_var.begin(), input_var.end(), ']', '_');
|
||||
std::replace(input_var.begin(), input_var.end(), ' ', '_');
|
||||
std::replace(input_var.begin(), input_var.end(), '.', '_');
|
||||
return input_var + "_out";
|
||||
return input_var + "_out_" + std::to_string(op_id);
|
||||
}
|
||||
|
||||
std::string FusedOpsCodeGenerator::GetType(Datatype dt, size_t vec_size) const {
|
||||
|
||||
@@ -340,7 +340,7 @@ public:
|
||||
JitConstants MakeLoadJitConstants(const FusedOpsConfiguration& conf, const DataTensor prim_output) const;
|
||||
JitConstants MakeOpJitConstants(const FusedOpsConfiguration& conf,
|
||||
const std::string in_var, const Datatype in_type,
|
||||
std::string& out_var, Datatype& out_type) const;
|
||||
std::string& out_var) const;
|
||||
|
||||
bool CanPreloadData(const FusedOpsConfiguration& conf) const;
|
||||
|
||||
@@ -353,7 +353,7 @@ public:
|
||||
std::string GetIdx(size_t input_id, idx_desc idx, bool should_be_safe) const;
|
||||
std::string GetInputPtrName(size_t input_id) const;
|
||||
std::string GetInputVarName(size_t input_id, bool is_shuffled = false, std::string shuffle_var = "") const;
|
||||
std::string GetOutputVarName(std::string input_var_name) const;
|
||||
std::string GetOutputVarName(std::string input_var_name, size_t op_id) const;
|
||||
std::string ConvertToOutputType(std::string var, size_t vec_size = 1) const;
|
||||
std::string ConvertToType(std::string var, Datatype dt, size_t vec_size = 1) const;
|
||||
std::string CastToType(std::string var, Datatype dt, size_t vec_size = 1) const;
|
||||
|
||||
@@ -115,17 +115,14 @@ JitConstants KernelBase::MakeFusedOpsJitConstants(const kernel_selector::base_pa
|
||||
std::string fused_ops_preload;
|
||||
std::string fused_ops_calc;
|
||||
std::string in_name = c.input_var_name;
|
||||
std::string out_name = "";
|
||||
Datatype in_type = c.input_dt;
|
||||
bool can_all_use_preload = true;
|
||||
|
||||
for (size_t i = 0; i < params.fused_ops.size(); i++) {
|
||||
auto fused_dep_codegen = FusedOpsCodeGenerator(params.fused_ops[i]);
|
||||
std::string out_var;
|
||||
Datatype out_type;
|
||||
jit.Merge(fused_dep_codegen.MakeLoadJitConstants(c, params.output));
|
||||
jit.Merge(fused_dep_codegen.MakeOpJitConstants(c, in_name, in_type, out_var, out_type));
|
||||
in_name = out_var;
|
||||
in_type = out_type;
|
||||
jit.Merge(fused_dep_codegen.MakeOpJitConstants(c, in_name, in_type, out_name));
|
||||
|
||||
bool can_use_preload = fused_dep_codegen.CanPreloadData(c);
|
||||
can_all_use_preload &= can_use_preload;
|
||||
@@ -145,7 +142,7 @@ JitConstants KernelBase::MakeFusedOpsJitConstants(const kernel_selector::base_pa
|
||||
jit.AddConstant(MakeJitConstant("FUSED_OPS" + c.suffix, fused_ops));
|
||||
jit.AddConstant(MakeJitConstant("FUSED_OPS_PRELOAD" + c.suffix, fused_ops_preload));
|
||||
jit.AddConstant(MakeJitConstant("FUSED_OPS_CALC" + c.suffix, fused_ops_calc));
|
||||
jit.AddConstant(MakeJitConstant("FUSED_OPS_RESULT" + c.suffix, in_name));
|
||||
jit.AddConstant(MakeJitConstant("FUSED_OPS_RESULT" + c.suffix, out_name));
|
||||
|
||||
bool can_any_use_preload = !fused_ops_preload.empty();
|
||||
jit.AddConstant(MakeJitConstant("FUSED_OPS_CAN_USE_PRELOAD" + c.suffix,
|
||||
|
||||
@@ -558,6 +558,7 @@ struct fused_operation_desc {
|
||||
MultiDataTensor tensors;
|
||||
DataTensor output_tensor;
|
||||
size_t op_id;
|
||||
std::vector<std::pair<size_t, Datatype>> fused_op_ids;
|
||||
|
||||
// Helper functions for operation generation
|
||||
KernelType GetType() const { return op_params->GetType(); }
|
||||
|
||||
@@ -45,10 +45,12 @@
|
||||
#include "extract_image_patches_inst.h"
|
||||
#include "reduce_inst.h"
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <list>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <deque>
|
||||
#include "error_handler.h"
|
||||
|
||||
void prepare_primitive_fusing::run(program_impl& p) {
|
||||
@@ -164,12 +166,13 @@ void prepare_primitive_fusing::fuse_reorders(program_impl &p) {
|
||||
|
||||
void prepare_primitive_fusing::fuse_activations(program_impl &p) {
|
||||
bool is_debug = p.get_options().get<build_option_type::debug>()->enabled();
|
||||
std::map<primitive_id, std::vector<primitive_id>> fusing_history;
|
||||
auto itr = p.get_processing_order().begin();
|
||||
while (itr != p.get_processing_order().end()) {
|
||||
auto node_itr = itr++;
|
||||
auto& node = (*node_itr);
|
||||
|
||||
program_helpers::do_for_types<activation>(*node, [&p, &is_debug](activation_node& node) {
|
||||
program_helpers::do_for_types<activation>(*node, [&p, &is_debug, &fusing_history](activation_node& node) {
|
||||
auto& input = node.input();
|
||||
auto id = node.id();
|
||||
// Restrictions:
|
||||
@@ -226,7 +229,7 @@ void prepare_primitive_fusing::fuse_activations(program_impl &p) {
|
||||
} else {
|
||||
// If node already has any fused node using new mechanism,
|
||||
// we can just use the same way and handle any amount of activations
|
||||
p.fuse_nodes(input, node);
|
||||
p.fuse_nodes(input, node, &fusing_history);
|
||||
}
|
||||
|
||||
p.add_optimized_primitive_info(id, {input.id()});
|
||||
@@ -350,6 +353,7 @@ void prepare_primitive_fusing::fuse_bias(program_impl &p) {
|
||||
|
||||
void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) {
|
||||
bool recalc_processing_order = false;
|
||||
std::map<primitive_id, std::vector<primitive_id>> fusing_history;
|
||||
|
||||
auto itr = p.get_processing_order().begin();
|
||||
while (itr != p.get_processing_order().end()) {
|
||||
@@ -497,9 +501,63 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) {
|
||||
return true;
|
||||
};
|
||||
|
||||
auto get_users_from_fusing_history = [&](primitive_id id) {
|
||||
std::vector<primitive_id> users;
|
||||
for (auto deps_data : fusing_history) {
|
||||
auto key = deps_data.first;
|
||||
auto deps_vec = deps_data.second;
|
||||
auto iter = std::find(deps_vec.begin(), deps_vec.end(), id);
|
||||
if (iter != deps_vec.end()) {
|
||||
users.push_back(key);
|
||||
}
|
||||
}
|
||||
return users;
|
||||
};
|
||||
|
||||
auto input_data_supports_fusings = [&](cldnn::program_node& input_data, primitive_id current_node_id) -> bool {
|
||||
if (input_data.get_users().size() != 1) {
|
||||
// If input_data has fused primitives,
|
||||
// find original dependency of current_node using fusing_history
|
||||
// and check the number of users of it.
|
||||
// If the node has multiple users it's not fusible.
|
||||
if (input_data.has_fused_primitives()) {
|
||||
size_t num_original_dependencies = 0;
|
||||
auto iter = fusing_history.find(current_node_id);
|
||||
if (iter != fusing_history.end()) {
|
||||
// Find current_node's original dependency list
|
||||
for (auto& prim_id : iter->second) {
|
||||
// find input_data's fused_prims in the prim_deps_ids
|
||||
auto& fused_descs = input_data.get_fused_primitives();
|
||||
auto origin_input_iter = std::find_if(fused_descs.begin(), fused_descs.end(),
|
||||
[&](cldnn::fused_primitive_desc& desc) {
|
||||
return (desc.node->id() == prim_id);
|
||||
});
|
||||
if (origin_input_iter != fused_descs.end()) {
|
||||
auto users = get_users_from_fusing_history(origin_input_iter->node->id());
|
||||
if (users.size() != 1) {
|
||||
return false;
|
||||
}
|
||||
num_original_dependencies++;
|
||||
}
|
||||
}
|
||||
}
|
||||
// If num_original_dependencies is zero, input_data is original parent
|
||||
if (num_original_dependencies == 0) {
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
};
|
||||
|
||||
auto fuse_activation_f = [&](activation_node& activation_node) {
|
||||
auto& input_data = activation_node.get_dependency(0);
|
||||
if (input_data.get_users().size() != 1 || activation_node.get_dependencies().size() >= 3)
|
||||
if (activation_node.get_dependencies().size() >= 3)
|
||||
return;
|
||||
|
||||
if (!input_data_supports_fusings(input_data, activation_node.id()))
|
||||
return;
|
||||
|
||||
bool should_fuse = input_data.is_type<binary_convolution>();
|
||||
@@ -558,7 +616,7 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) {
|
||||
if (!should_fuse)
|
||||
return;
|
||||
|
||||
p.fuse_nodes(input_data, activation_node);
|
||||
p.fuse_nodes(input_data, activation_node, &fusing_history);
|
||||
};
|
||||
|
||||
auto fuse_scale_f = [&](scale_node& scale_node) {
|
||||
@@ -623,7 +681,7 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) {
|
||||
if (!should_fuse)
|
||||
return;
|
||||
|
||||
p.fuse_nodes(input_data, scale_node);
|
||||
p.fuse_nodes(input_data, scale_node, &fusing_history);
|
||||
};
|
||||
|
||||
auto fuse_quantize_f = [&](quantize_node& quantize_node) {
|
||||
@@ -717,7 +775,7 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) {
|
||||
if (!should_fuse)
|
||||
return;
|
||||
|
||||
p.fuse_nodes(input_data, quantize_node);
|
||||
p.fuse_nodes(input_data, quantize_node, &fusing_history);
|
||||
};
|
||||
|
||||
auto fuse_eltwise_f = [&](eltwise_node& node) {
|
||||
@@ -811,8 +869,83 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) {
|
||||
if (parent2->is_type<convolution>() && !conv_supports_fusings(parent2->as<convolution>()))
|
||||
return;
|
||||
|
||||
// This fusing can be extended to support peer node in any layout
|
||||
bool merge_allowed = fused_node->get_users().size() == 1;
|
||||
bool merge_allowed = true;
|
||||
// If fused node is not convolution and fused node has multiple users,
|
||||
// follow the legacy checking rule
|
||||
if (fused_node->is_type<convolution>() && fused_node->get_users().size() > 1) {
|
||||
// Allowed new pattern: Elt1, Act, Elt2, Elt3, Elt4 are fused to Conv1
|
||||
// * Conv1 -> Eltw1(Add) -> Act(Clamp) -> Eltw2(Mul) -> Eltw3(Mul) -> Eltw4(Add) -> Conv2
|
||||
// * \–----------------------------------->/ \---------> Eltw5(Div)
|
||||
//
|
||||
// Extended eltwise fusiblity checking rules
|
||||
//
|
||||
// 1. All fusing nodes should be eltwise or activation node
|
||||
// 2. All intermediate fusing nodes except last fusing node(i.e. Elt4) should have only eltwise or activation node as user.
|
||||
// 3. Currently eltwise and activations are allowed to be fused from multiple branches,
|
||||
// but technically other fusable operations can be allowed too in the future.
|
||||
// 4. When node_queue has only one node, the while loop is ended and this node is fused to fused node(Conv1)
|
||||
// node_queue having one node means all user nodes from fused node(Conv1) converge at that node.
|
||||
// 5. if node_queue has multiple nodes even if the level of current_node is max_levels, it cannot be fused.
|
||||
std::deque<std::pair<cldnn::program_node*, size_t>> node_queue; //std::pair<cldnn::program_node*, layer level>
|
||||
std::vector<cldnn::program_node*> node_history;
|
||||
node_queue.push_back(std::make_pair(fused_node, 0));
|
||||
|
||||
const uint8_t max_levels = 5;
|
||||
do {
|
||||
// Pop the current node from node_queue
|
||||
// Add the current node to the node_history to verfiy the trace of checking
|
||||
auto current_node = node_queue.front();
|
||||
node_queue.pop_front();
|
||||
if (std::find(node_history.begin(), node_history.end(), current_node.first) == node_history.end()) {
|
||||
node_history.push_back(current_node.first);
|
||||
}
|
||||
|
||||
if (current_node.second > max_levels) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Push node to node_queue
|
||||
// If the node is already existed in node_queue, do not add it to the node_queue.
|
||||
auto push_node_queue = [&](cldnn::program_node* in_node, size_t level) {
|
||||
auto iter = std::find_if(node_queue.begin(), node_queue.end(), [&](std::pair<cldnn::program_node*, size_t> element) {
|
||||
return (in_node->id() == element.first->id());
|
||||
});
|
||||
if (iter == node_queue.end()) {
|
||||
node_queue.push_back(std::make_pair(in_node, level));
|
||||
}
|
||||
};
|
||||
|
||||
// If the any user node is not eltwise(mul / add mode) and activation,
|
||||
// the current node will be considered as last node and put it back into the node_queue
|
||||
auto curr_users = current_node.first->get_users();
|
||||
auto invalid_user_iter = std::find_if(curr_users.begin(), curr_users.end(), [&](cldnn::program_node* user) {
|
||||
return (user->is_output() ||
|
||||
(!(user->is_type<eltwise>() && user->get_primitive()->input.size() == 2 &&
|
||||
(std::find(supported_modes.begin(), supported_modes.end(),
|
||||
(user->as<eltwise>()).get_primitive()->mode) != supported_modes.end())) &&
|
||||
!(user->is_type<activation>() && user->get_primitive()->input.size() == 1)));
|
||||
});
|
||||
|
||||
if (invalid_user_iter != curr_users.end()) {
|
||||
// If fused_node(i.e. Conv1) have invalid user node(that is not activation and eltwise ndoe), it cannot be fused
|
||||
if (fused_node->id() == current_node.first->id()) {
|
||||
return;
|
||||
}
|
||||
push_node_queue(current_node.first, (current_node.second+1));
|
||||
continue;
|
||||
}
|
||||
|
||||
// Add user node in current node to the queue
|
||||
// But, do not add the node that passed once, it is checked using node_history
|
||||
for (auto& user : curr_users) {
|
||||
auto iter = std::find(node_history.begin(), node_history.end(), user);
|
||||
if (iter == node_history.end())
|
||||
push_node_queue(user, current_node.second+1);
|
||||
}
|
||||
} while (node_queue.size() > 1);
|
||||
} else {
|
||||
merge_allowed = fused_node->get_users().size() == 1;
|
||||
}
|
||||
|
||||
for (auto& parent : fused_node->get_dependencies())
|
||||
if (parent->id() == peer_node->id())
|
||||
@@ -831,7 +964,7 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) {
|
||||
recalc_processing_order = true;
|
||||
}
|
||||
|
||||
p.fuse_nodes(*fused_node, node);
|
||||
p.fuse_nodes(*fused_node, node, &fusing_history);
|
||||
};
|
||||
|
||||
program_helpers::do_for_types<activation, scale, quantize, eltwise>(*node,
|
||||
@@ -861,6 +994,22 @@ void prepare_primitive_fusing::optimize_fused_ops(program_impl& p) {
|
||||
// 2. fuse conv bias to quantize shift
|
||||
auto& fused_prims = node->get_fused_primitives();
|
||||
|
||||
auto remove_deps_of_node = [&](cldnn::fused_primitive_desc& desc) {
|
||||
for (auto& prim : fused_prims) {
|
||||
if (desc.node->id() == prim.node->id()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto rm_iter = std::find_if(prim.fused_deps.begin(), prim.fused_deps.end(), [&](primitive_id& dep_id){
|
||||
return (desc.node->id() == dep_id);
|
||||
});
|
||||
if (rm_iter != prim.fused_deps.end()) {
|
||||
prim.fused_deps.erase(rm_iter);
|
||||
prim.fused_deps.insert(prim.fused_deps.end(), desc.fused_deps.begin(), desc.fused_deps.end());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Drop relu if the next fused op is quantize with u8 output and no in_shift
|
||||
auto fp_itr = fused_prims.begin();
|
||||
while (fp_itr != fused_prims.end()) {
|
||||
@@ -883,6 +1032,7 @@ void prepare_primitive_fusing::optimize_fused_ops(program_impl& p) {
|
||||
!quantize_node.get_need_pre_shift();
|
||||
|
||||
if (can_skip) {
|
||||
remove_deps_of_node(fp);
|
||||
fp_itr = fused_prims.erase(curr_itr);
|
||||
}
|
||||
}
|
||||
@@ -891,6 +1041,7 @@ void prepare_primitive_fusing::optimize_fused_ops(program_impl& p) {
|
||||
}
|
||||
|
||||
void prepare_conv_eltw_fusing::fuse_conv_depth_to_space(program_impl& p, program_node* node) {
|
||||
std::map<primitive_id, std::vector<primitive_id>> fusing_history;
|
||||
// make sure this convolution have only 1 user and it's depth_to_space
|
||||
// make sure convolution is not an output
|
||||
if (node->get_users().size() != 1 || node->is_output())
|
||||
@@ -919,7 +1070,7 @@ void prepare_conv_eltw_fusing::fuse_conv_depth_to_space(program_impl& p, program
|
||||
return;
|
||||
}
|
||||
|
||||
p.fuse_nodes(*conv_node, *d_t_s_node);
|
||||
p.fuse_nodes(*conv_node, *d_t_s_node, &fusing_history);
|
||||
}
|
||||
|
||||
void prepare_conv_eltw_fusing::fuse_conv_eltwise(program_impl& p, program_node* node) {
|
||||
|
||||
@@ -159,6 +159,7 @@ inline params_t get_default_params(const arg_t& arg, uint32_t split = 1) {
|
||||
params.layerID = arg.id();
|
||||
|
||||
convert_fused_activation_func_params(arg, params.activations);
|
||||
std::map<primitive_id, std::pair<size_t, kernel_selector::Datatype>> prim_op_id_map;
|
||||
size_t op_id = 0;
|
||||
for (auto& fused_prim : arg.get_fused_primitives()) {
|
||||
kernel_selector::fused_operation_desc desc;
|
||||
@@ -171,6 +172,13 @@ inline params_t get_default_params(const arg_t& arg, uint32_t split = 1) {
|
||||
desc.dep_size = fused_prim.deps.size();
|
||||
desc.op_id = op_id++;
|
||||
desc.output_tensor = convert_data_tensor(fused_prim.output_layout);
|
||||
prim_op_id_map[fused_prim.node->id()] = std::make_pair(desc.op_id, desc.output_tensor.GetDType());
|
||||
for (auto& dep : fused_prim.fused_deps) {
|
||||
auto iter = prim_op_id_map.find(dep);
|
||||
if (iter != prim_op_id_map.end()) {
|
||||
desc.fused_op_ids.push_back(iter->second);
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t i = desc.dep_idx_start; i < desc.dep_idx_start + desc.dep_size; i++) {
|
||||
desc.tensors.push_back(convert_data_tensor(arg.get_dependency(i).get_output_layout()));
|
||||
|
||||
@@ -186,7 +186,7 @@ public:
|
||||
bool extract_and_remove(program_node& node);
|
||||
|
||||
// Fuses two nodes into fused_node and removes peer_node from graph
|
||||
void fuse_nodes(program_node& fused_node, program_node& peer_node);
|
||||
void fuse_nodes(program_node& fused_node, program_node& peer_node, std::map<primitive_id, std::vector<primitive_id>>* fusing_history);
|
||||
|
||||
// returns if 'node' has been removed
|
||||
bool remove_if_dangling(program_node& node);
|
||||
|
||||
@@ -40,6 +40,7 @@ struct fused_primitive_desc {
|
||||
std::shared_ptr<program_node> node;
|
||||
size_t dep_start_idx;
|
||||
std::vector<primitive_id> deps;
|
||||
std::vector<primitive_id> fused_deps;
|
||||
activation_func activation;
|
||||
activation_additional_params activation_params;
|
||||
layout output_layout = layout(data_types::f32, format::bfyx, tensor());
|
||||
|
||||
@@ -895,7 +895,7 @@ bool program_impl::extract_and_remove(program_node& node) {
|
||||
return true;
|
||||
}
|
||||
|
||||
void program_impl::fuse_nodes(program_node &fused_node, program_node &peer_node) {
|
||||
void program_impl::fuse_nodes(program_node &fused_node, program_node &peer_node, std::map<primitive_id, std::vector<primitive_id>>* fusing_history) {
|
||||
auto peer_layout = peer_node.get_output_layout();
|
||||
fused_primitive_desc local_desc;
|
||||
local_desc.node = get_node_ptr(peer_node.id());
|
||||
@@ -913,6 +913,13 @@ void program_impl::fuse_nodes(program_node &fused_node, program_node &peer_node)
|
||||
cldnn::padding needed_padding = padding::max(peer_layout.data_padding,
|
||||
fused_node.get_output_layout().data_padding);
|
||||
|
||||
auto history_iter = fusing_history->find(peer_node.id());
|
||||
if (history_iter != fusing_history->end()) {
|
||||
for (auto& id : history_iter->second) {
|
||||
local_desc.fused_deps.push_back(id);
|
||||
}
|
||||
}
|
||||
|
||||
// Add new dependencies to the fused_node
|
||||
for (size_t i = 0; i < peer_node.get_dependencies().size(); i++) {
|
||||
auto& dep = peer_node.get_dependency(i);
|
||||
@@ -952,6 +959,10 @@ void program_impl::fuse_nodes(program_node &fused_node, program_node &peer_node)
|
||||
}
|
||||
add_optimized_primitive_info(peer_node.id(), { fused_node.id() });
|
||||
|
||||
for (auto& user : peer_node.users) {
|
||||
(*fusing_history)[user->id()].push_back(peer_node.id());
|
||||
}
|
||||
|
||||
// Remove all edges connected with peer node
|
||||
while (peer_node.get_dependencies().size() > 0) {
|
||||
auto& dep = peer_node.get_dependency(peer_node.get_dependencies().size() - 1);
|
||||
|
||||
@@ -323,6 +323,8 @@ public:
|
||||
return layout{ p.data_type, p.default_format, tensor{1, p.in_shape.feature[0], 1, 1} };
|
||||
}
|
||||
|
||||
|
||||
|
||||
layout get_single_element_layout(T& p) {
|
||||
return layout{ p.default_type, p.default_format, tensor{1, 1, 1, 1} };
|
||||
}
|
||||
@@ -831,6 +833,212 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_fp32_prelu_eltwise,
|
||||
bc_test_params{CASE_CONV_FP16_4, 2, 4},
|
||||
}), );
|
||||
|
||||
|
||||
class conv_fp32_multi_eltwise_2 : public ConvFusingTest {};
|
||||
TEST_P(conv_fp32_multi_eltwise_2, basic) {
|
||||
auto p = GetParam();
|
||||
create_topologies(input_layout("input", get_input_layout(p)),
|
||||
data("eltwise_data", get_mem(get_output_layout(p))),
|
||||
data("bias", get_mem(get_bias_layout(p))),
|
||||
data("weights", get_mem(get_weights_layout(p))),
|
||||
convolution("conv_prim", "input", { "weights" }, {"bias"}, p.groups, p.stride, p.pad, p.dilation),
|
||||
eltwise("eltwise1", "conv_prim", "eltwise_data", eltwise_mode::sum),
|
||||
eltwise("eltwise2", "eltwise1", "conv_prim", eltwise_mode::prod),
|
||||
reorder("reorder_bfyx", "eltwise2", p.default_format, data_types::f32)
|
||||
);
|
||||
implementation_desc conv_impl = { format::b_fs_yx_fsv16, ""};
|
||||
bo_fused.set_option(build_option::force_implementations({ {"conv_prim", conv_impl} }));
|
||||
|
||||
tolerance = 1e-5f;
|
||||
execute(p);
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_fp32_multi_eltwise_2,
|
||||
::testing::ValuesIn(std::vector<bc_test_params>{
|
||||
bc_test_params{CASE_CONV_FP32_2, 2, 4},
|
||||
bc_test_params{CASE_CONV_FP32_3, 2, 4},
|
||||
bc_test_params{CASE_CONV_FP32_4, 2, 4},
|
||||
|
||||
bc_test_params{CASE_CONV_FP16_2, 2, 4},
|
||||
bc_test_params{CASE_CONV_FP16_3, 2, 4},
|
||||
bc_test_params{CASE_CONV_FP16_4, 2, 4},
|
||||
}), );
|
||||
|
||||
|
||||
class conv_fp32_multi_eltwise_2_clamp : public ConvFusingTest {};
|
||||
|
||||
TEST_P(conv_fp32_multi_eltwise_2_clamp, basic) {
|
||||
auto p = GetParam();
|
||||
|
||||
create_topologies(input_layout("input", get_input_layout(p)),
|
||||
data("eltwise1_data", get_mem(get_output_layout(p))),
|
||||
data("bias", get_mem(get_bias_layout(p))),
|
||||
data("weights", get_mem(get_weights_layout(p))),
|
||||
convolution("conv_prim", "input", { "weights" }, {"bias"}, p.groups, p.stride, p.pad, p.dilation),
|
||||
eltwise("eltwise1", "conv_prim", "eltwise1_data", eltwise_mode::sum),
|
||||
activation("activation", "eltwise1", activation_func::clamp, {0.5f, 2.5f}),
|
||||
eltwise("eltwise2", "activation", "conv_prim", eltwise_mode::prod),
|
||||
reorder("reorder_bfyx", "eltwise2", p.default_format, data_types::f32)
|
||||
);
|
||||
implementation_desc conv_impl = { format::b_fs_yx_fsv16, ""};
|
||||
bo_fused.set_option(build_option::force_implementations({ {"conv_prim", conv_impl} }));
|
||||
|
||||
tolerance = 1e-5f;
|
||||
execute(p);
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_fp32_multi_eltwise_2_clamp,
|
||||
::testing::ValuesIn(std::vector<bc_test_params>{
|
||||
bc_test_params{CASE_CONV_FP32_2, 2, 5},
|
||||
bc_test_params{CASE_CONV_FP32_3, 2, 5},
|
||||
bc_test_params{CASE_CONV_FP32_4, 2, 5},
|
||||
|
||||
bc_test_params{CASE_CONV_FP16_2, 2, 5},
|
||||
bc_test_params{CASE_CONV_FP16_3, 2, 5},
|
||||
bc_test_params{CASE_CONV_FP16_4, 2, 5},
|
||||
}), );
|
||||
|
||||
|
||||
class conv_fp32_multi_eltwise_4_clamp : public ConvFusingTest {};
|
||||
|
||||
TEST_P(conv_fp32_multi_eltwise_4_clamp, basic) {
|
||||
auto p = GetParam();
|
||||
|
||||
create_topologies(input_layout("input", get_input_layout(p)),
|
||||
data("eltwise1_data", get_mem(get_output_layout(p))),
|
||||
data("eltwise2_data", get_mem(get_output_layout(p))),
|
||||
data("eltwise4_data", get_mem(get_output_layout(p))),
|
||||
data("bias", get_mem(get_bias_layout(p))),
|
||||
data("weights", get_mem(get_weights_layout(p))),
|
||||
convolution("conv_prim", "input", { "weights" }, {"bias"}, p.groups, p.stride, p.pad, p.dilation),
|
||||
eltwise("eltwise1_add", "conv_prim", "eltwise1_data", eltwise_mode::sum),
|
||||
activation("activation", "eltwise1_add", activation_func::clamp, {0.5f, 2.5f}),
|
||||
eltwise("eltwise2_mul", "activation", "conv_prim", eltwise_mode::prod),
|
||||
eltwise("eltwise3_div", "eltwise2_mul", "eltwise2_data", eltwise_mode::prod),
|
||||
eltwise("eltwise4_add", "eltwise3_div", "eltwise4_data", eltwise_mode::sum),
|
||||
reorder("reorder_bfyx", "eltwise4_add", p.default_format, data_types::f32)
|
||||
);
|
||||
implementation_desc conv_impl = { format::b_fs_yx_fsv16, ""};
|
||||
bo_fused.set_option(build_option::force_implementations({ {"conv_prim", conv_impl} }));
|
||||
|
||||
tolerance = 1e-5f;
|
||||
execute(p);
|
||||
}
|
||||
|
||||
// Parameter sets for conv_fp32_multi_eltwise_4_clamp; counts are presumably
// {case, fused, not_fused} — verify against bc_test_params' declaration.
INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_fp32_multi_eltwise_4_clamp,
                        ::testing::ValuesIn(std::vector<bc_test_params>{
                            bc_test_params{CASE_CONV_FP32_2, 2, 7},
                            bc_test_params{CASE_CONV_FP32_3, 2, 7},
                            bc_test_params{CASE_CONV_FP32_4, 2, 7},

                            bc_test_params{CASE_CONV_FP16_2, 2, 7},
                            bc_test_params{CASE_CONV_FP16_3, 2, 7},
                            bc_test_params{CASE_CONV_FP16_4, 2, 7},
                        }), );
|
||||
|
||||
|
||||
class conv_fp32_multi_eltwise_3_fusing : public ConvFusingTest {};
|
||||
TEST_P(conv_fp32_multi_eltwise_3_fusing, basic) {
|
||||
auto p = GetParam();
|
||||
create_topologies(input_layout("input", get_input_layout(p)),
|
||||
data("eltwise_data1", get_mem(get_output_layout(p))),
|
||||
data("eltwise_data2", get_mem(get_output_layout(p))),
|
||||
data("bias", get_mem(get_bias_layout(p))),
|
||||
data("weights", get_mem(get_weights_layout(p))),
|
||||
convolution("conv_prim", "input", { "weights" }, {"bias"}, p.groups, p.stride, p.pad, p.dilation),
|
||||
eltwise("eltwise1", "conv_prim", "eltwise_data1", eltwise_mode::sum),
|
||||
eltwise("eltwise2", "conv_prim", "eltwise_data2", eltwise_mode::sum),
|
||||
eltwise("eltwise3", "eltwise1", "eltwise2", eltwise_mode::prod),
|
||||
reorder("reorder_bfyx", "eltwise3", p.default_format, data_types::f32)
|
||||
);
|
||||
implementation_desc conv_impl = { format::b_fs_yx_fsv16, ""};
|
||||
bo_fused.set_option(build_option::force_implementations({ {"conv_prim", conv_impl} }));
|
||||
|
||||
tolerance = 1e-5f;
|
||||
execute(p);
|
||||
}
|
||||
|
||||
// Parameter sets for conv_fp32_multi_eltwise_3_fusing; counts are presumably
// {case, fused, not_fused} — verify against bc_test_params' declaration.
INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_fp32_multi_eltwise_3_fusing,
                        ::testing::ValuesIn(std::vector<bc_test_params>{
                            bc_test_params{CASE_CONV_FP32_2, 2, 5},
                            bc_test_params{CASE_CONV_FP32_3, 2, 5},
                            bc_test_params{CASE_CONV_FP32_4, 2, 5},

                            bc_test_params{CASE_CONV_FP16_2, 2, 5},
                            bc_test_params{CASE_CONV_FP16_3, 2, 5},
                            bc_test_params{CASE_CONV_FP16_4, 2, 5},
                        }), );
|
||||
|
||||
|
||||
|
||||
class conv_fp32_multi_eltwise_quantization : public ConvFusingTest {};
|
||||
TEST_P(conv_fp32_multi_eltwise_quantization, basic) {
|
||||
auto p = GetParam();
|
||||
create_topologies(input_layout("input", get_input_layout(p)),
|
||||
data("weights", get_mem(get_weights_layout(p))),
|
||||
data("bias", get_mem(get_bias_layout(p))),
|
||||
data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
|
||||
data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)),
|
||||
data("out_lo", get_mem(get_single_element_layout(p), -127)),
|
||||
data("out_hi", get_mem(get_single_element_layout(p), 127)),
|
||||
data("eltwise_data1", get_mem(get_output_layout(p))),
|
||||
convolution("conv_prim", "input", {"weights"}, {"bias"}, p.groups, p.stride, p.pad, p.dilation),
|
||||
quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8),
|
||||
eltwise("eltwise1", "conv_prim", "eltwise_data1", eltwise_mode::sum),
|
||||
eltwise("eltwise2", "eltwise1", "quantize", eltwise_mode::prod),
|
||||
reorder("reorder_bfyx", "eltwise2", p.default_format, data_types::f32)
|
||||
);
|
||||
tolerance = 1.f;
|
||||
execute(p);
|
||||
}
|
||||
|
||||
// Parameter sets for conv_fp32_multi_eltwise_quantization.
// NOTE(review): CASE_CONV_FP32_3 is absent while the FP16 list is complete —
// looks deliberate, but confirm it is not an accidental omission.
INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_fp32_multi_eltwise_quantization,
                        ::testing::ValuesIn(std::vector<bc_test_params>{
                            bc_test_params{CASE_CONV_FP32_2, 4, 5},
                            bc_test_params{CASE_CONV_FP32_4, 4, 5},

                            bc_test_params{CASE_CONV_FP16_2, 4, 5},
                            bc_test_params{CASE_CONV_FP16_3, 4, 5},
                            bc_test_params{CASE_CONV_FP16_4, 4, 5},
                        }), );
|
||||
|
||||
|
||||
class conv_fp32_multi_eltwise_concat : public ConvFusingTest {};
|
||||
TEST_P(conv_fp32_multi_eltwise_concat, basic) {
|
||||
auto p = GetParam();
|
||||
create_topologies(input_layout("input", get_input_layout(p)),
|
||||
data("eltwise_data1", get_mem(get_output_layout(p))),
|
||||
data("eltwise_data2", get_mem(get_output_layout(p))),
|
||||
data("bias", get_mem(get_bias_layout(p))),
|
||||
data("weights", get_mem(get_weights_layout(p))),
|
||||
convolution("conv_prim", "input", { "weights" }, {"bias"}, p.groups, p.stride, p.pad, p.dilation),
|
||||
eltwise("eltwise1", "conv_prim", "eltwise_data1", eltwise_mode::sum),
|
||||
eltwise("eltwise2", "conv_prim", "eltwise_data2", eltwise_mode::sum),
|
||||
concatenation("concat",
|
||||
{"eltwise1", "eltwise2"},
|
||||
concatenation::concatenation_axis::along_f,
|
||||
data_types::i8,
|
||||
padding{{0, 0, 0, 0}, 0}),
|
||||
reorder("reorder_bfyx", "concat", p.default_format, data_types::f32)
|
||||
);
|
||||
implementation_desc conv_impl = { format::b_fs_yx_fsv16, ""};
|
||||
bo_fused.set_option(build_option::force_implementations({ {"conv_prim", conv_impl} }));
|
||||
|
||||
tolerance = 1e-5f;
|
||||
execute(p);
|
||||
}
|
||||
|
||||
// Parameter sets for conv_fp32_multi_eltwise_concat; equal counts (5, 5)
// reflect that the concat join prevents fusing the eltwise branches —
// presumably; verify against the fusing pass behavior.
INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_fp32_multi_eltwise_concat,
                        ::testing::ValuesIn(std::vector<bc_test_params>{
                            bc_test_params{CASE_CONV_FP32_2, 5, 5},
                            bc_test_params{CASE_CONV_FP32_3, 5, 5},
                            bc_test_params{CASE_CONV_FP32_4, 5, 5},

                            bc_test_params{CASE_CONV_FP16_2, 5, 5},
                            bc_test_params{CASE_CONV_FP16_3, 5, 5},
                            bc_test_params{CASE_CONV_FP16_4, 5, 5},
                        }), );
|
||||
|
||||
// Fixture for eltwise-fusing tests run with the 3D blocked b_fs_zyx_fsv16 layout.
class conv_fp32_eltwise_b_fs_zyx_fsv16 : public ConvFusingTest {};
|
||||
|
||||
TEST_P(conv_fp32_eltwise_b_fs_zyx_fsv16, vector_ops) {
|
||||
|
||||
Reference in New Issue
Block a user