[IE CLDNN] Extended eltwise fusing (#5181)

* [cldnn] Add initial fused conv eltw POC

- Add cldnn unit test
- Add fused dependency list to the fused_primitive_desc
- Update fuse_nodes to save fusing history and dependencies
- Modify Jitter to create jit constants using fused dependencies
- Add cldnn unit-test cases for multiple serial and parallel eltwise fuse pattern
- Modify Jitter and add default values for the sum input

Signed-off-by: Ahn, Paul Y <paul.y.ahn@intel.com>

Co-authored-by: Andrew Kwangwoong Park <andrew.kwangwoong.park@intel.com>

* [cldnn] Update fused_conv_eltwise cldnn unit test

- Add execute and compare function
- Add cldnn unit-test case for multiple parallel eltwise and additional eltwise
- Add cldnn unit-test case for combination of multiple parallel eltw
- Add cldnn unit-test cases for serial and diverged quantize and eltwise

Signed-off-by: Andrew Kwangwoong Park <andrew.kwangwoong.park@intel.com>

* [cldnn] Modify checking fusibility of eltwise fusing

- Add new checking fusibility rule in prepare_primitive_fusing
- Move cldnn eltwise fusing test to fusing_gpu_test.cpp
- Modify the method to get the input var name in jitter

Signed-off-by: Ahn, Paul Y <paul.y.ahn@intel.com>

* [cldnn] Fix fusing item type and activation fusibility checking condition

- Extract input_data_supports_fusings from fuse_activation_f
- Fix supported-mode checking bug

Co-authored-by: Andrew Kwangwoong Park <andrew.kwangwoong.park@intel.com>
Paul Youngsoo Ahn authored on 2021-05-04 15:57:06 +09:00, committed by GitHub
parent b47d11e31e, commit 29a8be523d
10 changed files with 443 additions and 40 deletions


@@ -1476,7 +1476,7 @@ JitConstants FusedOpsCodeGenerator::MakeLoadJitConstants(const FusedOpsConfigura
JitConstants FusedOpsCodeGenerator::MakeOpJitConstants(const FusedOpsConfiguration& conf,
const std::string in_var, const Datatype in_type,
- std::string& out_var, Datatype& out_type) const {
+ std::string& out_var) const {
JitConstants jit = {};
std::string op_decls = "";
@@ -1484,9 +1484,11 @@ JitConstants FusedOpsCodeGenerator::MakeOpJitConstants(const FusedOpsConfigurati
auto idx = conf.bfzyx_idx_order;
std::string shuffle_var = conf.shuffle_var_name;
bool is_shuffled = false;
+ auto& fused_op_ids = desc.fused_op_ids;
+ std::vector<std::string> input_vars;
- out_var = GetOutputVarName(in_var);
- out_type = desc.output_tensor.GetDType();
+ out_var = GetOutputVarName(in_var, desc.op_id);
+ const auto& out_type = desc.output_tensor.GetDType();
if (conf.load_type == FusedOpsConfiguration::LoadType::FEATURE_SHUFFLE &&
(desc.GetType() == KernelType::SCALE || desc.GetType() == KernelType::QUANTIZE)) {
@@ -1503,15 +1505,19 @@ JitConstants FusedOpsCodeGenerator::MakeOpJitConstants(const FusedOpsConfigurati
}
auto get_acc_t = [&]() -> Datatype {
- std::vector<Datatype> tensor_types = {desc.output_tensor.GetDType()};
+ std::vector<Datatype> input_types = {desc.output_tensor.GetDType()};
for (auto& in : desc.tensors) {
- tensor_types.push_back(in.GetDType());
+ input_types.push_back(in.GetDType());
}
+ for (auto& in : fused_op_ids) {
+ input_types.push_back(in.second);
+ }
std::vector<Datatype> types_prioritized = { Datatype::F32, Datatype::F16 };
for (auto& type : types_prioritized) {
- if (std::any_of(tensor_types.begin(), tensor_types.end(), [=](const Datatype& t) -> bool { return t == type; })) {
+ if (std::any_of(input_types.begin(), input_types.end(), [=](const Datatype& t) -> bool { return t == type; })) {
return type;
}
}
@@ -1520,25 +1526,42 @@ JitConstants FusedOpsCodeGenerator::MakeOpJitConstants(const FusedOpsConfigurati
};
auto get_input = [&](size_t index) -> std::string {
- auto in_name = index == 0 ? in_var : GetInputVarName(index - 1, is_shuffled, shuffle_var);
- auto tensor_type = index == 0 ? in_type : desc.tensors[index - 1].GetDType();
+ auto input_name = in_var;
+ auto input_type = in_type;
+ if (index > 0) {
+ size_t input_idx = index - 1;
+ size_t tensors_len = desc.tensors.size();
+ input_name = (input_idx < tensors_len) ? GetInputVarName(input_idx, is_shuffled, shuffle_var)
+ : GetOutputVarName(in_var, fused_op_ids[input_idx - tensors_len].first);
+ input_type = (input_idx < tensors_len) ? desc.tensors[input_idx].GetDType() : fused_op_ids[input_idx - tensors_len].second;
+ }
auto acc_t = get_acc_t();
- if (tensor_type != acc_t)
- return ConvertToType(in_name, acc_t, vec_size);
+ if (input_type != acc_t)
+ return ConvertToType(input_name, acc_t, vec_size);
else
- return in_name;
+ return input_name;
};
+ // Generate the input variable list: dst + tensor inputs + fused-op inputs.
+ // If input_vars_length is larger than max_num_input_vars, do not add dst to the
+ // input variable list, because dst is unused when a fused op has both tensor and fused inputs.
+ size_t input_vars_length = 1 + desc.tensors.size() + fused_op_ids.size(); // dst + tensor inputs + fused ops input
+ size_t max_num_input_vars = (desc.tensors.size() > 1) ? 3 : 2;
+ size_t start_idx = (input_vars_length > max_num_input_vars) ? 1 : 0;
+ for (size_t i = start_idx; i < input_vars_length; i++) {
+ input_vars.push_back(get_input(i));
+ }
switch (desc.GetType()) {
case KernelType::SCALE: {
auto tmp_var = out_var + "_tmp";
if (desc.tensors.size() > 1) {
op_decls += "\\\n\t" + GetType(get_acc_t(), vec_size) + " " + tmp_var + " = "
- + get_input(0) + " * " + get_input(1) + " + " + get_input(2) + ";";
+ + input_vars[0] + " * " + input_vars[1] + " + " + input_vars[2] + ";";
} else {
op_decls += "\\\n\t" + GetType(get_acc_t(), vec_size) + " " + tmp_var + " = "
- + get_input(0) + " * " + get_input(1) + ";";
+ + input_vars[0] + " * " + input_vars[1] + ";";
}
op_decls += "\\\n\t" + GetOutputType(vec_size) + " " + out_var + " = " + ConvertToOutputType(tmp_var, vec_size) + ";";
break;
@@ -1561,7 +1584,7 @@ JitConstants FusedOpsCodeGenerator::MakeOpJitConstants(const FusedOpsConfigurati
}
auto tmp_var = out_var + "_tmp";
- op_decls += "\\\n\t" + GetType(get_acc_t(), vec_size) + " " + tmp_var + " = " + get_input(0) + op + get_input(1) + ";";
+ op_decls += "\\\n\t" + GetType(get_acc_t(), vec_size) + " " + tmp_var + " = " + input_vars[0] + op + input_vars[1] + ";";
op_decls += "\\\n\t" + GetOutputType(vec_size) + " " + out_var + " = " + ConvertToOutputType(tmp_var, vec_size) + ";";
break;
}
@@ -1570,13 +1593,14 @@ JitConstants FusedOpsCodeGenerator::MakeOpJitConstants(const FusedOpsConfigurati
if (!p)
throw std::runtime_error("[clDNN] Quantize fuse params can't be nullptr");
- std::string in_converted = in_var;
+ std::string in_converted = (fused_op_ids.empty()) ? in_var : GetOutputVarName(in_var, fused_op_ids[0].first);
+ Datatype input_type = (fused_op_ids.empty()) ? in_type : fused_op_ids[0].second;
Datatype tmp_type = Datatype::F32;
std::string tmp_type_str = GetType(tmp_type, vec_size);
std::string tmp_var = out_var + "_tmp";
- if (in_type != tmp_type) {
- in_converted = ConvertToType(in_var, tmp_type, vec_size);
+ if (input_type != tmp_type) {
+ in_converted = ConvertToType(in_converted, tmp_type, vec_size);
}
auto post_scale = p->per_tensor_output_scale ? Broadcast(std::to_string(p->out_scale), tmp_type, vec_size)
@@ -1618,7 +1642,9 @@ JitConstants FusedOpsCodeGenerator::MakeOpJitConstants(const FusedOpsConfigurati
case KernelType::ACTIVATION: {
auto p = desc.GetOpParams<activation_fuse_params>();
base_activation_params activation_p = p->param;
- op_decls += "\\\n\t" + GetOutputType(vec_size) + " " + out_var + " = " + ConvertToOutputType(in_var, vec_size) + ";";
+ std::string new_in_var = (fused_op_ids.empty()) ? in_var : GetOutputVarName(in_var, fused_op_ids[0].first);
+ op_decls += "\\\n\t" + GetOutputType(vec_size) + " " + out_var + " = " + ConvertToOutputType(new_in_var, vec_size) + ";";
if (activation_p.function != ActivationFunction::NONE) {
auto suffix = "_FUSED_OP"+std::to_string(desc.op_id) + conf.suffix;
std::string nl_m = std::to_string(activation_p.m);
@@ -1784,12 +1810,12 @@ std::string FusedOpsCodeGenerator::GetInputVarName(size_t input_id, bool is_shuf
return GetTypeStr() + std::to_string(desc.op_id) + "_data" + std::to_string(input_id);
}
- std::string FusedOpsCodeGenerator::GetOutputVarName(std::string input_var) const {
+ std::string FusedOpsCodeGenerator::GetOutputVarName(std::string input_var, size_t op_id) const {
std::replace(input_var.begin(), input_var.end(), '[', '_');
std::replace(input_var.begin(), input_var.end(), ']', '_');
std::replace(input_var.begin(), input_var.end(), ' ', '_');
std::replace(input_var.begin(), input_var.end(), '.', '_');
- return input_var + "_out";
+ return input_var + "_out_" + std::to_string(op_id);
}
std::string FusedOpsCodeGenerator::GetType(Datatype dt, size_t vec_size) const {
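
To see why the op_id suffix matters: with several eltwise ops fused in series, every op's intermediate result is derived from the same input variable name, so a per-op suffix is what keeps the generated identifiers unique. A minimal standalone C++ sketch of the renaming rule (mirroring the replace/append logic above; the input name is hypothetical):

#include <algorithm>
#include <initializer_list>
#include <iostream>
#include <string>

// Sanitize characters that are illegal in OpenCL identifiers, then append
// a per-op suffix so serially fused ops get distinct result names.
std::string get_output_var_name(std::string input_var, size_t op_id) {
    for (char c : {'[', ']', ' ', '.'})
        std::replace(input_var.begin(), input_var.end(), c, '_');
    return input_var + "_out_" + std::to_string(op_id);
}

int main() {
    std::cout << get_output_var_name("conv.res[0]", 0) << "\n"; // conv_res_0__out_0
    std::cout << get_output_var_name("conv.res[0]", 1) << "\n"; // conv_res_0__out_1
}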


@@ -340,7 +340,7 @@ public:
JitConstants MakeLoadJitConstants(const FusedOpsConfiguration& conf, const DataTensor prim_output) const;
JitConstants MakeOpJitConstants(const FusedOpsConfiguration& conf,
const std::string in_var, const Datatype in_type,
- std::string& out_var, Datatype& out_type) const;
+ std::string& out_var) const;
bool CanPreloadData(const FusedOpsConfiguration& conf) const;
@@ -353,7 +353,7 @@ public:
std::string GetIdx(size_t input_id, idx_desc idx, bool should_be_safe) const;
std::string GetInputPtrName(size_t input_id) const;
std::string GetInputVarName(size_t input_id, bool is_shuffled = false, std::string shuffle_var = "") const;
- std::string GetOutputVarName(std::string input_var_name) const;
+ std::string GetOutputVarName(std::string input_var_name, size_t op_id) const;
std::string ConvertToOutputType(std::string var, size_t vec_size = 1) const;
std::string ConvertToType(std::string var, Datatype dt, size_t vec_size = 1) const;
std::string CastToType(std::string var, Datatype dt, size_t vec_size = 1) const;


@@ -115,17 +115,14 @@ JitConstants KernelBase::MakeFusedOpsJitConstants(const kernel_selector::base_pa
std::string fused_ops_preload;
std::string fused_ops_calc;
std::string in_name = c.input_var_name;
+ std::string out_name = "";
Datatype in_type = c.input_dt;
bool can_all_use_preload = true;
for (size_t i = 0; i < params.fused_ops.size(); i++) {
auto fused_dep_codegen = FusedOpsCodeGenerator(params.fused_ops[i]);
- std::string out_var;
- Datatype out_type;
jit.Merge(fused_dep_codegen.MakeLoadJitConstants(c, params.output));
- jit.Merge(fused_dep_codegen.MakeOpJitConstants(c, in_name, in_type, out_var, out_type));
- in_name = out_var;
- in_type = out_type;
+ jit.Merge(fused_dep_codegen.MakeOpJitConstants(c, in_name, in_type, out_name));
bool can_use_preload = fused_dep_codegen.CanPreloadData(c);
can_all_use_preload &= can_use_preload;
@@ -145,7 +142,7 @@ JitConstants KernelBase::MakeFusedOpsJitConstants(const kernel_selector::base_pa
jit.AddConstant(MakeJitConstant("FUSED_OPS" + c.suffix, fused_ops));
jit.AddConstant(MakeJitConstant("FUSED_OPS_PRELOAD" + c.suffix, fused_ops_preload));
jit.AddConstant(MakeJitConstant("FUSED_OPS_CALC" + c.suffix, fused_ops_calc));
jit.AddConstant(MakeJitConstant("FUSED_OPS_RESULT" + c.suffix, in_name));
jit.AddConstant(MakeJitConstant("FUSED_OPS_RESULT" + c.suffix, out_name));
bool can_any_use_preload = !fused_ops_preload.empty();
jit.AddConstant(MakeJitConstant("FUSED_OPS_CAN_USE_PRELOAD" + c.suffix,


@@ -558,6 +558,7 @@ struct fused_operation_desc {
MultiDataTensor tensors;
DataTensor output_tensor;
size_t op_id;
std::vector<std::pair<size_t, Datatype>> fused_op_ids;
// Helper functions for operation generation
KernelType GetType() const { return op_params->GetType(); }


@@ -45,10 +45,12 @@
#include "extract_image_patches_inst.h"
#include "reduce_inst.h"
#include <vector>
#include <map>
#include <list>
#include <memory>
#include <string>
#include <utility>
#include <deque>
#include "error_handler.h"
void prepare_primitive_fusing::run(program_impl& p) {
@@ -164,12 +166,13 @@ void prepare_primitive_fusing::fuse_reorders(program_impl &p) {
void prepare_primitive_fusing::fuse_activations(program_impl &p) {
bool is_debug = p.get_options().get<build_option_type::debug>()->enabled();
std::map<primitive_id, std::vector<primitive_id>> fusing_history;
auto itr = p.get_processing_order().begin();
while (itr != p.get_processing_order().end()) {
auto node_itr = itr++;
auto& node = (*node_itr);
- program_helpers::do_for_types<activation>(*node, [&p, &is_debug](activation_node& node) {
+ program_helpers::do_for_types<activation>(*node, [&p, &is_debug, &fusing_history](activation_node& node) {
auto& input = node.input();
auto id = node.id();
// Restrictions:
@@ -226,7 +229,7 @@ void prepare_primitive_fusing::fuse_activations(program_impl &p) {
} else {
// If node already has any fused node using new mechanism,
// we can just use the same way and handle any amount of activations
- p.fuse_nodes(input, node);
+ p.fuse_nodes(input, node, &fusing_history);
}
p.add_optimized_primitive_info(id, {input.id()});
@@ -350,6 +353,7 @@ void prepare_primitive_fusing::fuse_bias(program_impl &p) {
void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) {
bool recalc_processing_order = false;
std::map<primitive_id, std::vector<primitive_id>> fusing_history;
auto itr = p.get_processing_order().begin();
while (itr != p.get_processing_order().end()) {
@@ -497,9 +501,63 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) {
return true;
};
auto get_users_from_fusing_history = [&](primitive_id id) {
std::vector<primitive_id> users;
for (auto deps_data : fusing_history) {
auto key = deps_data.first;
auto deps_vec = deps_data.second;
auto iter = std::find(deps_vec.begin(), deps_vec.end(), id);
if (iter != deps_vec.end()) {
users.push_back(key);
}
}
return users;
};
auto input_data_supports_fusings = [&](cldnn::program_node& input_data, primitive_id current_node_id) -> bool {
if (input_data.get_users().size() != 1) {
// If input_data has fused primitives,
// find the original dependency of current_node using fusing_history
// and check its number of users.
// If that node has multiple users, it is not fusible.
if (input_data.has_fused_primitives()) {
size_t num_original_dependencies = 0;
auto iter = fusing_history.find(current_node_id);
if (iter != fusing_history.end()) {
// Find current_node's original dependency list
for (auto& prim_id : iter->second) {
// Look for input_data's fused primitives among the original dependency ids
auto& fused_descs = input_data.get_fused_primitives();
auto origin_input_iter = std::find_if(fused_descs.begin(), fused_descs.end(),
[&](cldnn::fused_primitive_desc& desc) {
return (desc.node->id() == prim_id);
});
if (origin_input_iter != fused_descs.end()) {
auto users = get_users_from_fusing_history(origin_input_iter->node->id());
if (users.size() != 1) {
return false;
}
num_original_dependencies++;
}
}
}
// If num_original_dependencies is zero, input_data is the original parent
if (num_original_dependencies == 0) {
return false;
}
} else {
return false;
}
}
return true;
};
auto fuse_activation_f = [&](activation_node& activation_node) {
auto& input_data = activation_node.get_dependency(0);
- if (input_data.get_users().size() != 1 || activation_node.get_dependencies().size() >= 3)
+ if (activation_node.get_dependencies().size() >= 3)
return;
if (!input_data_supports_fusings(input_data, activation_node.id()))
return;
bool should_fuse = input_data.is_type<binary_convolution>();
@@ -558,7 +616,7 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) {
if (!should_fuse)
return;
- p.fuse_nodes(input_data, activation_node);
+ p.fuse_nodes(input_data, activation_node, &fusing_history);
};
auto fuse_scale_f = [&](scale_node& scale_node) {
@@ -623,7 +681,7 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) {
if (!should_fuse)
return;
- p.fuse_nodes(input_data, scale_node);
+ p.fuse_nodes(input_data, scale_node, &fusing_history);
};
auto fuse_quantize_f = [&](quantize_node& quantize_node) {
@@ -717,7 +775,7 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) {
if (!should_fuse)
return;
- p.fuse_nodes(input_data, quantize_node);
+ p.fuse_nodes(input_data, quantize_node, &fusing_history);
};
auto fuse_eltwise_f = [&](eltwise_node& node) {
@@ -811,8 +869,83 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) {
if (parent2->is_type<convolution>() && !conv_supports_fusings(parent2->as<convolution>()))
return;
// This fusing can be extended to support peer node in any layout
- bool merge_allowed = fused_node->get_users().size() == 1;
+ bool merge_allowed = true;
// The extended check below applies only when the fused node is a convolution with multiple users;
// any other fused node with multiple users follows the legacy single-user rule.
if (fused_node->is_type<convolution>() && fused_node->get_users().size() > 1) {
// Allowed new pattern: Eltw1, Act, Eltw2, Eltw3, Eltw4 are fused to Conv1
// * Conv1 -> Eltw1(Add) -> Act(Clamp) -> Eltw2(Mul) -> Eltw3(Mul) -> Eltw4(Add) -> Conv2
// * \----------------------------------->/ \---------> Eltw5(Div)
//
// Extended eltwise fusibility checking rules
//
// 1. All fusing nodes should be eltwise or activation nodes.
// 2. All intermediate fusing nodes except the last one (i.e. Eltw4) should have only eltwise or activation nodes as users.
// 3. Currently eltwise and activation are allowed to be fused from multiple branches,
//    but technically other fusable operations could be allowed too in the future.
// 4. When node_queue holds only one node, the while loop ends and that node is fused to the fused node (Conv1):
//    a single remaining node means all user paths from the fused node (Conv1) converge at it.
// 5. If node_queue still holds multiple nodes when a node's level reaches max_levels, the pattern cannot be fused.
std::deque<std::pair<cldnn::program_node*, size_t>> node_queue; // std::pair<cldnn::program_node*, layer level>
std::vector<cldnn::program_node*> node_history;
node_queue.push_back(std::make_pair(fused_node, 0));
const uint8_t max_levels = 5;
do {
// Pop the current node from node_queue
// and add it to node_history to record the trace of checking
auto current_node = node_queue.front();
node_queue.pop_front();
if (std::find(node_history.begin(), node_history.end(), current_node.first) == node_history.end()) {
node_history.push_back(current_node.first);
}
if (current_node.second > max_levels) {
return;
}
// Push a node to node_queue;
// if the node already exists in node_queue, do not add it again
auto push_node_queue = [&](cldnn::program_node* in_node, size_t level) {
auto iter = std::find_if(node_queue.begin(), node_queue.end(), [&](std::pair<cldnn::program_node*, size_t> element) {
return (in_node->id() == element.first->id());
});
if (iter == node_queue.end()) {
node_queue.push_back(std::make_pair(in_node, level));
}
};
// If any user node is neither eltwise (mul/add mode) nor activation,
// the current node is considered the last node and is pushed back into node_queue
auto curr_users = current_node.first->get_users();
auto invalid_user_iter = std::find_if(curr_users.begin(), curr_users.end(), [&](cldnn::program_node* user) {
return (user->is_output() ||
(!(user->is_type<eltwise>() && user->get_primitive()->input.size() == 2 &&
(std::find(supported_modes.begin(), supported_modes.end(),
(user->as<eltwise>()).get_primitive()->mode) != supported_modes.end())) &&
!(user->is_type<activation>() && user->get_primitive()->input.size() == 1)));
});
if (invalid_user_iter != curr_users.end()) {
// If fused_node (i.e. Conv1) has an invalid user node (one that is neither activation nor eltwise), it cannot be fused
if (fused_node->id() == current_node.first->id()) {
return;
}
push_node_queue(current_node.first, (current_node.second+1));
continue;
}
// Add the current node's user nodes to the queue,
// skipping any node already visited (tracked via node_history)
for (auto& user : curr_users) {
auto iter = std::find(node_history.begin(), node_history.end(), user);
if (iter == node_history.end())
push_node_queue(user, current_node.second+1);
}
} while (node_queue.size() > 1);
} else {
merge_allowed = fused_node->get_users().size() == 1;
}
for (auto& parent : fused_node->get_dependencies())
if (parent->id() == peer_node->id())
@@ -831,7 +964,7 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) {
recalc_processing_order = true;
}
- p.fuse_nodes(*fused_node, node);
+ p.fuse_nodes(*fused_node, node, &fusing_history);
};
program_helpers::do_for_types<activation, scale, quantize, eltwise>(*node,
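
The queue-based walk above is easier to follow in isolation. Below is a self-contained toy sketch of the convergence check (hypothetical Node type; the real pass additionally validates eltwise modes, input counts, and output flags):

#include <algorithm>
#include <cstddef>
#include <deque>
#include <utility>
#include <vector>

struct Node {                      // stand-in for cldnn::program_node
    std::vector<Node*> users;
    bool fusable = true;           // eltwise(add/mul, 2 inputs) or unary activation
};

// Returns true when every user branch of root converges to a single node
// within max_levels levels, mirroring the node_queue loop above.
bool branches_converge(Node* root, std::size_t max_levels = 5) {
    std::deque<std::pair<Node*, std::size_t>> queue;
    std::vector<Node*> history;
    queue.push_back({root, 0});
    auto push_once = [&](Node* n, std::size_t lvl) {
        for (auto& e : queue)
            if (e.first == n) return;
        queue.push_back({n, lvl});
    };
    do {
        auto current = queue.front();
        queue.pop_front();
        if (std::find(history.begin(), history.end(), current.first) == history.end())
            history.push_back(current.first);
        if (current.second > max_levels)
            return false;
        bool has_invalid_user = false;
        for (Node* u : current.first->users)
            has_invalid_user |= !u->fusable;
        if (has_invalid_user) {
            if (current.first == root) return false;      // root itself has a bad user
            push_once(current.first, current.second + 1); // candidate last node
            continue;
        }
        for (Node* u : current.first->users)
            if (std::find(history.begin(), history.end(), u) == history.end())
                push_once(u, current.second + 1);
    } while (queue.size() > 1);
    return true;
}

int main() {
    Node conv, eltw1, eltw2;
    conv.users = {&eltw1, &eltw2};  // Conv -> Eltw1 -> Eltw2 and Conv -> Eltw2
    eltw1.users = {&eltw2};
    return branches_converge(&conv) ? 0 : 1; // branches reconverge at Eltw2 -> fusable
}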
@@ -861,6 +994,22 @@ void prepare_primitive_fusing::optimize_fused_ops(program_impl& p) {
// 2. fuse conv bias to quantize shift
auto& fused_prims = node->get_fused_primitives();
auto remove_deps_of_node = [&](cldnn::fused_primitive_desc& desc) {
for (auto& prim : fused_prims) {
if (desc.node->id() == prim.node->id()) {
continue;
}
auto rm_iter = std::find_if(prim.fused_deps.begin(), prim.fused_deps.end(), [&](primitive_id& dep_id){
return (desc.node->id() == dep_id);
});
if (rm_iter != prim.fused_deps.end()) {
prim.fused_deps.erase(rm_iter);
prim.fused_deps.insert(prim.fused_deps.end(), desc.fused_deps.begin(), desc.fused_deps.end());
}
}
};
// Drop relu if the next fused op is quantize with u8 output and no in_shift
auto fp_itr = fused_prims.begin();
while (fp_itr != fused_prims.end()) {
@@ -883,6 +1032,7 @@ void prepare_primitive_fusing::optimize_fused_ops(program_impl& p) {
!quantize_node.get_need_pre_shift();
if (can_skip) {
remove_deps_of_node(fp);
fp_itr = fused_prims.erase(curr_itr);
}
}
@@ -891,6 +1041,7 @@ void prepare_primitive_fusing::optimize_fused_ops(program_impl& p) {
}
void prepare_conv_eltw_fusing::fuse_conv_depth_to_space(program_impl& p, program_node* node) {
std::map<primitive_id, std::vector<primitive_id>> fusing_history;
// make sure this convolution have only 1 user and it's depth_to_space
// make sure convolution is not an output
if (node->get_users().size() != 1 || node->is_output())
@@ -919,7 +1070,7 @@ void prepare_conv_eltw_fusing::fuse_conv_depth_to_space(program_impl& p, program
return;
}
- p.fuse_nodes(*conv_node, *d_t_s_node);
+ p.fuse_nodes(*conv_node, *d_t_s_node, &fusing_history);
}
void prepare_conv_eltw_fusing::fuse_conv_eltwise(program_impl& p, program_node* node) {


@@ -159,6 +159,7 @@ inline params_t get_default_params(const arg_t& arg, uint32_t split = 1) {
params.layerID = arg.id();
convert_fused_activation_func_params(arg, params.activations);
std::map<primitive_id, std::pair<size_t, kernel_selector::Datatype>> prim_op_id_map;
size_t op_id = 0;
for (auto& fused_prim : arg.get_fused_primitives()) {
kernel_selector::fused_operation_desc desc;
@@ -171,6 +172,13 @@ inline params_t get_default_params(const arg_t& arg, uint32_t split = 1) {
desc.dep_size = fused_prim.deps.size();
desc.op_id = op_id++;
desc.output_tensor = convert_data_tensor(fused_prim.output_layout);
prim_op_id_map[fused_prim.node->id()] = std::make_pair(desc.op_id, desc.output_tensor.GetDType());
for (auto& dep : fused_prim.fused_deps) {
auto iter = prim_op_id_map.find(dep);
if (iter != prim_op_id_map.end()) {
desc.fused_op_ids.push_back(iter->second);
}
}
for (size_t i = desc.dep_idx_start; i < desc.dep_idx_start + desc.dep_size; i++) {
desc.tensors.push_back(convert_data_tensor(arg.get_dependency(i).get_output_layout()));
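
The bookkeeping here is order-dependent: fused primitives are converted in processing order, so by the time a primitive's fused_deps are resolved, every producer it references already has an entry in prim_op_id_map. A toy sketch of that resolution (stand-in types and assumed primitive ids, not the real cldnn structures):

#include <cstddef>
#include <map>
#include <string>
#include <utility>
#include <vector>

enum class Datatype { F16, F32 };

struct FusedPrim {                        // stand-in for cldnn::fused_primitive_desc
    std::string id;
    Datatype out_dtype;
    std::vector<std::string> fused_deps;  // ids of fused prims this op reads from
};

int main() {
    std::vector<FusedPrim> prims = {
        {"eltwise1", Datatype::F16, {}},
        {"eltwise2", Datatype::F16, {"eltwise1"}},
    };
    std::map<std::string, std::pair<std::size_t, Datatype>> prim_op_id_map;
    std::size_t op_id = 0;
    for (auto& p : prims) {
        prim_op_id_map[p.id] = {op_id++, p.out_dtype};
        std::vector<std::pair<std::size_t, Datatype>> fused_op_ids;
        for (auto& dep : p.fused_deps) {
            auto it = prim_op_id_map.find(dep);
            if (it != prim_op_id_map.end())
                fused_op_ids.push_back(it->second);
        }
        // eltwise2 ends up with fused_op_ids == {{0, Datatype::F16}}
    }
    return 0;
}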


@@ -186,7 +186,7 @@ public:
bool extract_and_remove(program_node& node);
// Fuses two nodes into fused_node and removes peer_node from graph
- void fuse_nodes(program_node& fused_node, program_node& peer_node);
+ void fuse_nodes(program_node& fused_node, program_node& peer_node, std::map<primitive_id, std::vector<primitive_id>>* fusing_history);
// returns if 'node' has been removed
bool remove_if_dangling(program_node& node);


@@ -40,6 +40,7 @@ struct fused_primitive_desc {
std::shared_ptr<program_node> node;
size_t dep_start_idx;
std::vector<primitive_id> deps;
std::vector<primitive_id> fused_deps;
activation_func activation;
activation_additional_params activation_params;
layout output_layout = layout(data_types::f32, format::bfyx, tensor());


@@ -895,7 +895,7 @@ bool program_impl::extract_and_remove(program_node& node) {
return true;
}
- void program_impl::fuse_nodes(program_node &fused_node, program_node &peer_node) {
+ void program_impl::fuse_nodes(program_node &fused_node, program_node &peer_node, std::map<primitive_id, std::vector<primitive_id>>* fusing_history) {
auto peer_layout = peer_node.get_output_layout();
fused_primitive_desc local_desc;
local_desc.node = get_node_ptr(peer_node.id());
@@ -913,6 +913,13 @@ void program_impl::fuse_nodes(program_node &fused_node, program_node &peer_node)
cldnn::padding needed_padding = padding::max(peer_layout.data_padding,
fused_node.get_output_layout().data_padding);
auto history_iter = fusing_history->find(peer_node.id());
if (history_iter != fusing_history->end()) {
for (auto& id : history_iter->second) {
local_desc.fused_deps.push_back(id);
}
}
// Add new dependencies to the fused_node
for (size_t i = 0; i < peer_node.get_dependencies().size(); i++) {
auto& dep = peer_node.get_dependency(i);
@@ -952,6 +959,10 @@ void program_impl::fuse_nodes(program_node &fused_node, program_node &peer_node)
}
add_optimized_primitive_info(peer_node.id(), { fused_node.id() });
for (auto& user : peer_node.users) {
(*fusing_history)[user->id()].push_back(peer_node.id());
}
// Remove all edges connected with peer node
while (peer_node.get_dependencies().size() > 0) {
auto& dep = peer_node.get_dependency(peer_node.get_dependencies().size() - 1);
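
Concretely, for a chain Conv1 -> Eltw1 -> Eltw2, the two steps above interact as follows (a hand-traced, runnable sketch with assumed primitive ids):

#include <iostream>
#include <map>
#include <string>
#include <vector>

using primitive_id = std::string;

int main() {
    std::map<primitive_id, std::vector<primitive_id>> fusing_history;

    // fuse_nodes(Conv1, Eltw1): Eltw2 is Eltw1's user, so the lost edge is recorded.
    fusing_history["eltwise2"].push_back("eltwise1");

    // fuse_nodes(Conv1, Eltw2): the history lookup fills Eltw2's fused_deps,
    // which get_default_params later converts into fused_op_ids for the kernels.
    std::vector<primitive_id> fused_deps;
    auto it = fusing_history.find("eltwise2");
    if (it != fusing_history.end())
        fused_deps = it->second;          // == {"eltwise1"}
    std::cout << fused_deps[0] << "\n";   // prints "eltwise1"
}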


@@ -323,6 +323,8 @@ public:
return layout{ p.data_type, p.default_format, tensor{1, p.in_shape.feature[0], 1, 1} };
}
layout get_single_element_layout(T& p) {
return layout{ p.default_type, p.default_format, tensor{1, 1, 1, 1} };
}
@@ -831,6 +833,212 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_fp32_prelu_eltwise,
bc_test_params{CASE_CONV_FP16_4, 2, 4},
}), );
class conv_fp32_multi_eltwise_2 : public ConvFusingTest {};
TEST_P(conv_fp32_multi_eltwise_2, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("eltwise_data", get_mem(get_output_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("weights", get_mem(get_weights_layout(p))),
convolution("conv_prim", "input", { "weights" }, {"bias"}, p.groups, p.stride, p.pad, p.dilation),
eltwise("eltwise1", "conv_prim", "eltwise_data", eltwise_mode::sum),
eltwise("eltwise2", "eltwise1", "conv_prim", eltwise_mode::prod),
reorder("reorder_bfyx", "eltwise2", p.default_format, data_types::f32)
);
implementation_desc conv_impl = { format::b_fs_yx_fsv16, ""};
bo_fused.set_option(build_option::force_implementations({ {"conv_prim", conv_impl} }));
tolerance = 1e-5f;
execute(p);
}
INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_fp32_multi_eltwise_2,
::testing::ValuesIn(std::vector<bc_test_params>{
bc_test_params{CASE_CONV_FP32_2, 2, 4},
bc_test_params{CASE_CONV_FP32_3, 2, 4},
bc_test_params{CASE_CONV_FP32_4, 2, 4},
bc_test_params{CASE_CONV_FP16_2, 2, 4},
bc_test_params{CASE_CONV_FP16_3, 2, 4},
bc_test_params{CASE_CONV_FP16_4, 2, 4},
}), );
class conv_fp32_multi_eltwise_2_clamp : public ConvFusingTest {};
TEST_P(conv_fp32_multi_eltwise_2_clamp, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("eltwise1_data", get_mem(get_output_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("weights", get_mem(get_weights_layout(p))),
convolution("conv_prim", "input", { "weights" }, {"bias"}, p.groups, p.stride, p.pad, p.dilation),
eltwise("eltwise1", "conv_prim", "eltwise1_data", eltwise_mode::sum),
activation("activation", "eltwise1", activation_func::clamp, {0.5f, 2.5f}),
eltwise("eltwise2", "activation", "conv_prim", eltwise_mode::prod),
reorder("reorder_bfyx", "eltwise2", p.default_format, data_types::f32)
);
implementation_desc conv_impl = { format::b_fs_yx_fsv16, ""};
bo_fused.set_option(build_option::force_implementations({ {"conv_prim", conv_impl} }));
tolerance = 1e-5f;
execute(p);
}
INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_fp32_multi_eltwise_2_clamp,
::testing::ValuesIn(std::vector<bc_test_params>{
bc_test_params{CASE_CONV_FP32_2, 2, 5},
bc_test_params{CASE_CONV_FP32_3, 2, 5},
bc_test_params{CASE_CONV_FP32_4, 2, 5},
bc_test_params{CASE_CONV_FP16_2, 2, 5},
bc_test_params{CASE_CONV_FP16_3, 2, 5},
bc_test_params{CASE_CONV_FP16_4, 2, 5},
}), );
class conv_fp32_multi_eltwise_4_clamp : public ConvFusingTest {};
TEST_P(conv_fp32_multi_eltwise_4_clamp, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("eltwise1_data", get_mem(get_output_layout(p))),
data("eltwise2_data", get_mem(get_output_layout(p))),
data("eltwise4_data", get_mem(get_output_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("weights", get_mem(get_weights_layout(p))),
convolution("conv_prim", "input", { "weights" }, {"bias"}, p.groups, p.stride, p.pad, p.dilation),
eltwise("eltwise1_add", "conv_prim", "eltwise1_data", eltwise_mode::sum),
activation("activation", "eltwise1_add", activation_func::clamp, {0.5f, 2.5f}),
eltwise("eltwise2_mul", "activation", "conv_prim", eltwise_mode::prod),
eltwise("eltwise3_div", "eltwise2_mul", "eltwise2_data", eltwise_mode::prod),
eltwise("eltwise4_add", "eltwise3_div", "eltwise4_data", eltwise_mode::sum),
reorder("reorder_bfyx", "eltwise4_add", p.default_format, data_types::f32)
);
implementation_desc conv_impl = { format::b_fs_yx_fsv16, ""};
bo_fused.set_option(build_option::force_implementations({ {"conv_prim", conv_impl} }));
tolerance = 1e-5f;
execute(p);
}
INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_fp32_multi_eltwise_4_clamp,
::testing::ValuesIn(std::vector<bc_test_params>{
bc_test_params{CASE_CONV_FP32_2, 2, 7},
bc_test_params{CASE_CONV_FP32_3, 2, 7},
bc_test_params{CASE_CONV_FP32_4, 2, 7},
bc_test_params{CASE_CONV_FP16_2, 2, 7},
bc_test_params{CASE_CONV_FP16_3, 2, 7},
bc_test_params{CASE_CONV_FP16_4, 2, 7},
}), );
class conv_fp32_multi_eltwise_3_fusing : public ConvFusingTest {};
TEST_P(conv_fp32_multi_eltwise_3_fusing, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("eltwise_data1", get_mem(get_output_layout(p))),
data("eltwise_data2", get_mem(get_output_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("weights", get_mem(get_weights_layout(p))),
convolution("conv_prim", "input", { "weights" }, {"bias"}, p.groups, p.stride, p.pad, p.dilation),
eltwise("eltwise1", "conv_prim", "eltwise_data1", eltwise_mode::sum),
eltwise("eltwise2", "conv_prim", "eltwise_data2", eltwise_mode::sum),
eltwise("eltwise3", "eltwise1", "eltwise2", eltwise_mode::prod),
reorder("reorder_bfyx", "eltwise3", p.default_format, data_types::f32)
);
implementation_desc conv_impl = { format::b_fs_yx_fsv16, ""};
bo_fused.set_option(build_option::force_implementations({ {"conv_prim", conv_impl} }));
tolerance = 1e-5f;
execute(p);
}
INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_fp32_multi_eltwise_3_fusing,
::testing::ValuesIn(std::vector<bc_test_params>{
bc_test_params{CASE_CONV_FP32_2, 2, 5},
bc_test_params{CASE_CONV_FP32_3, 2, 5},
bc_test_params{CASE_CONV_FP32_4, 2, 5},
bc_test_params{CASE_CONV_FP16_2, 2, 5},
bc_test_params{CASE_CONV_FP16_3, 2, 5},
bc_test_params{CASE_CONV_FP16_4, 2, 5},
}), );
class conv_fp32_multi_eltwise_quantization : public ConvFusingTest {};
TEST_P(conv_fp32_multi_eltwise_quantization, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)),
data("out_lo", get_mem(get_single_element_layout(p), -127)),
data("out_hi", get_mem(get_single_element_layout(p), 127)),
data("eltwise_data1", get_mem(get_output_layout(p))),
convolution("conv_prim", "input", {"weights"}, {"bias"}, p.groups, p.stride, p.pad, p.dilation),
quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8),
eltwise("eltwise1", "conv_prim", "eltwise_data1", eltwise_mode::sum),
eltwise("eltwise2", "eltwise1", "quantize", eltwise_mode::prod),
reorder("reorder_bfyx", "eltwise2", p.default_format, data_types::f32)
);
tolerance = 1.f;
execute(p);
}
INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_fp32_multi_eltwise_quantization,
::testing::ValuesIn(std::vector<bc_test_params>{
bc_test_params{CASE_CONV_FP32_2, 4, 5},
bc_test_params{CASE_CONV_FP32_4, 4, 5},
bc_test_params{CASE_CONV_FP16_2, 4, 5},
bc_test_params{CASE_CONV_FP16_3, 4, 5},
bc_test_params{CASE_CONV_FP16_4, 4, 5},
}), );
class conv_fp32_multi_eltwise_concat : public ConvFusingTest {};
TEST_P(conv_fp32_multi_eltwise_concat, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("eltwise_data1", get_mem(get_output_layout(p))),
data("eltwise_data2", get_mem(get_output_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("weights", get_mem(get_weights_layout(p))),
convolution("conv_prim", "input", { "weights" }, {"bias"}, p.groups, p.stride, p.pad, p.dilation),
eltwise("eltwise1", "conv_prim", "eltwise_data1", eltwise_mode::sum),
eltwise("eltwise2", "conv_prim", "eltwise_data2", eltwise_mode::sum),
concatenation("concat",
{"eltwise1", "eltwise2"},
concatenation::concatenation_axis::along_f,
data_types::i8,
padding{{0, 0, 0, 0}, 0}),
reorder("reorder_bfyx", "concat", p.default_format, data_types::f32)
);
implementation_desc conv_impl = { format::b_fs_yx_fsv16, ""};
bo_fused.set_option(build_option::force_implementations({ {"conv_prim", conv_impl} }));
tolerance = 1e-5f;
execute(p);
}
INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_fp32_multi_eltwise_concat,
::testing::ValuesIn(std::vector<bc_test_params>{
bc_test_params{CASE_CONV_FP32_2, 5, 5},
bc_test_params{CASE_CONV_FP32_3, 5, 5},
bc_test_params{CASE_CONV_FP32_4, 5, 5},
bc_test_params{CASE_CONV_FP16_2, 5, 5},
bc_test_params{CASE_CONV_FP16_3, 5, 5},
bc_test_params{CASE_CONV_FP16_4, 5, 5},
}), );
class conv_fp32_eltwise_b_fs_zyx_fsv16 : public ConvFusingTest {};
TEST_P(conv_fp32_eltwise_b_fs_zyx_fsv16, vector_ops) {