[IE CLDNN] Extended eltwise fusing (#5181)

* [cldnn] Add initial fused conv eltw POC

- Add cldnn unit test
- Add fused dependency list to the fused_primitive_desc
- Update fuse_nodes to save fusing history and dependencies
- Modify Jitter to create jit constants using fused dependencies
- Add cldnn unit-test cases for multiple serial and parallel eltwise fuse pattern
- Modify Jitter and add default values for the sum input

Signed-off-by: Ahn, Paul Y <paul.y.ahn@intel.com>

Co-authored-by: Andrew Kwangwoong Park <andrew.kwangwoong.park@intel.com>

* [cldnn] Update fused_conv_eltwise cldnn unit test

- Add execute and compare function
- Add cldnn unit-test case for multiple parallel eltwise and additional eltwise
- Add cldnn unit-test case for combination of multiple parallel eltw
- Add cldnn unit-test cases for serial and diverged quantize and eltwise

Signed-off-by: Andrew Kwangwoong Park <andrew.kwangwoong.park@intel.com>

* [cldnn] Modify checking fusibility of eltwise fusing

- Add new checking fusibility rule in prepare_primitive_fusing
- Move cldnn eltwise fusing test to fusing_gpu_test.cpp
- Modify the method to get the input var name in jitter

Signed-off-by: Ahn, Paul Y <paul.y.ahn@intel.com>

* [cldnn] Fix fusing item type and activation fusibility checking condition

- Extract input_data_supports_fusings from fuse_activation_f
- Fix supported-mode checking bug

Co-authored-by: Andrew Kwangwoong Park <andrew.kwangwoong.park@intel.com>
Paul Youngsoo Ahn authored on 2021-05-04 15:57:06 +09:00, committed by GitHub
parent b47d11e31e, commit 29a8be523d
10 changed files with 443 additions and 40 deletions


@@ -1476,7 +1476,7 @@ JitConstants FusedOpsCodeGenerator::MakeLoadJitConstants(const FusedOpsConfigura
JitConstants FusedOpsCodeGenerator::MakeOpJitConstants(const FusedOpsConfiguration& conf,
const std::string in_var, const Datatype in_type,
- std::string& out_var, Datatype& out_type) const {
+ std::string& out_var) const {
JitConstants jit = {};
std::string op_decls = "";
@@ -1484,9 +1484,11 @@ JitConstants FusedOpsCodeGenerator::MakeOpJitConstants(const FusedOpsConfigurati
auto idx = conf.bfzyx_idx_order;
std::string shuffle_var = conf.shuffle_var_name;
bool is_shuffled = false;
+ auto& fused_op_ids = desc.fused_op_ids;
+ std::vector<std::string> input_vars;
- out_var = GetOutputVarName(in_var);
- out_type = desc.output_tensor.GetDType();
+ out_var = GetOutputVarName(in_var, desc.op_id);
+ const auto& out_type = desc.output_tensor.GetDType();
if (conf.load_type == FusedOpsConfiguration::LoadType::FEATURE_SHUFFLE &&
(desc.GetType() == KernelType::SCALE || desc.GetType() == KernelType::QUANTIZE)) {
@@ -1503,15 +1505,19 @@ JitConstants FusedOpsCodeGenerator::MakeOpJitConstants(const FusedOpsConfigurati
}
auto get_acc_t = [&]() -> Datatype {
- std::vector<Datatype> tensor_types = {desc.output_tensor.GetDType()};
+ std::vector<Datatype> input_types = {desc.output_tensor.GetDType()};
for (auto& in : desc.tensors) {
- tensor_types.push_back(in.GetDType());
+ input_types.push_back(in.GetDType());
}
+ for (auto& in : fused_op_ids) {
+ input_types.push_back(in.second);
+ }
std::vector<Datatype> types_prioritized = { Datatype::F32, Datatype::F16 };
for (auto& type : types_prioritized) {
- if (std::any_of(tensor_types.begin(), tensor_types.end(), [=](const Datatype& t) -> bool { return t == type; })) {
+ if (std::any_of(input_types.begin(), input_types.end(), [=](const Datatype& t) -> bool { return t == type; })) {
return type;
}
}
@@ -1520,25 +1526,42 @@ JitConstants FusedOpsCodeGenerator::MakeOpJitConstants(const FusedOpsConfigurati
};
auto get_input = [&](size_t index) -> std::string {
- auto in_name = index == 0 ? in_var : GetInputVarName(index - 1, is_shuffled, shuffle_var);
- auto tensor_type = index == 0 ? in_type : desc.tensors[index - 1].GetDType();
+ auto input_name = in_var;
+ auto input_type = in_type;
+ if (index > 0) {
+ size_t input_idx = index - 1;
+ size_t tensors_len = desc.tensors.size();
+ input_name = (input_idx < tensors_len) ? GetInputVarName(input_idx, is_shuffled, shuffle_var)
+ : GetOutputVarName(in_var, fused_op_ids[input_idx - tensors_len].first);
+ input_type = (input_idx < tensors_len) ? desc.tensors[input_idx].GetDType() : fused_op_ids[input_idx - tensors_len].second;
+ }
auto acc_t = get_acc_t();
- if (tensor_type != acc_t)
- return ConvertToType(in_name, acc_t, vec_size);
+ if (input_type != acc_t)
+ return ConvertToType(input_name, acc_t, vec_size);
else
- return in_name;
+ return input_name;
};
+ // Generate the input variable list: dst + tensor inputs + fused-op inputs.
+ // If input_vars_length is larger than max_num_input_vars, do not add dst to the
+ // input variable list, because dst is unused when a fused op has both tensor and fused inputs.
+ size_t input_vars_length = 1 + desc.tensors.size() + fused_op_ids.size(); // dst + tensor inputs + fused ops input
+ size_t max_num_input_vars = (desc.tensors.size() > 1) ? 3 : 2;
+ size_t start_idx = (input_vars_length > max_num_input_vars) ? 1 : 0;
+ for (size_t i = start_idx; i < input_vars_length; i++) {
+ input_vars.push_back(get_input(i));
+ }
switch (desc.GetType()) {
case KernelType::SCALE: {
auto tmp_var = out_var + "_tmp";
if (desc.tensors.size() > 1) {
op_decls += "\\\n\t" + GetType(get_acc_t(), vec_size) + " " + tmp_var + " = "
- + get_input(0) + " * " + get_input(1) + " + " + get_input(2) + ";";
+ + input_vars[0] + " * " + input_vars[1] + " + " + input_vars[2] + ";";
} else {
op_decls += "\\\n\t" + GetType(get_acc_t(), vec_size) + " " + tmp_var + " = "
- + get_input(0) + " * " + get_input(1) + ";";
+ + input_vars[0] + " * " + input_vars[1] + ";";
}
op_decls += "\\\n\t" + GetOutputType(vec_size) + " " + out_var + " = " + ConvertToOutputType(tmp_var, vec_size) + ";";
break;
@@ -1561,7 +1584,7 @@ JitConstants FusedOpsCodeGenerator::MakeOpJitConstants(const FusedOpsConfigurati
}
auto tmp_var = out_var + "_tmp";
- op_decls += "\\\n\t" + GetType(get_acc_t(), vec_size) + " " + tmp_var + " = " + get_input(0) + op + get_input(1) + ";";
+ op_decls += "\\\n\t" + GetType(get_acc_t(), vec_size) + " " + tmp_var + " = " + input_vars[0] + op + input_vars[1] + ";";
op_decls += "\\\n\t" + GetOutputType(vec_size) + " " + out_var + " = " + ConvertToOutputType(tmp_var, vec_size) + ";";
break;
}
@@ -1570,13 +1593,14 @@ JitConstants FusedOpsCodeGenerator::MakeOpJitConstants(const FusedOpsConfigurati
if (!p)
throw std::runtime_error("[clDNN] Quantize fuse params can't be nullptr");
- std::string in_converted = in_var;
+ std::string in_converted = (fused_op_ids.empty()) ? in_var : GetOutputVarName(in_var, fused_op_ids[0].first);
+ Datatype input_type = (fused_op_ids.empty()) ? in_type : fused_op_ids[0].second;
Datatype tmp_type = Datatype::F32;
std::string tmp_type_str = GetType(tmp_type, vec_size);
std::string tmp_var = out_var + "_tmp";
- if (in_type != tmp_type) {
- in_converted = ConvertToType(in_var, tmp_type, vec_size);
+ if (input_type != tmp_type) {
+ in_converted = ConvertToType(in_converted, tmp_type, vec_size);
}
auto post_scale = p->per_tensor_output_scale ? Broadcast(std::to_string(p->out_scale), tmp_type, vec_size)
@@ -1618,7 +1642,9 @@ JitConstants FusedOpsCodeGenerator::MakeOpJitConstants(const FusedOpsConfigurati
case KernelType::ACTIVATION: {
auto p = desc.GetOpParams<activation_fuse_params>();
base_activation_params activation_p = p->param;
- op_decls += "\\\n\t" + GetOutputType(vec_size) + " " + out_var + " = " + ConvertToOutputType(in_var, vec_size) + ";";
+ std::string new_in_var = (fused_op_ids.empty()) ? in_var : GetOutputVarName(in_var, fused_op_ids[0].first);
+ op_decls += "\\\n\t" + GetOutputType(vec_size) + " " + out_var + " = " + ConvertToOutputType(new_in_var, vec_size) + ";";
if (activation_p.function != ActivationFunction::NONE) {
auto suffix = "_FUSED_OP"+std::to_string(desc.op_id) + conf.suffix;
std::string nl_m = std::to_string(activation_p.m);
@@ -1784,12 +1810,12 @@ std::string FusedOpsCodeGenerator::GetInputVarName(size_t input_id, bool is_shuf
return GetTypeStr() + std::to_string(desc.op_id) + "_data" + std::to_string(input_id);
}
- std::string FusedOpsCodeGenerator::GetOutputVarName(std::string input_var) const {
+ std::string FusedOpsCodeGenerator::GetOutputVarName(std::string input_var, size_t op_id) const {
std::replace(input_var.begin(), input_var.end(), '[', '_');
std::replace(input_var.begin(), input_var.end(), ']', '_');
std::replace(input_var.begin(), input_var.end(), ' ', '_');
std::replace(input_var.begin(), input_var.end(), '.', '_');
- return input_var + "_out";
+ return input_var + "_out_" + std::to_string(op_id);
}
std::string FusedOpsCodeGenerator::GetType(Datatype dt, size_t vec_size) const {
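
To see why the op_id suffix matters: with several eltwise ops fused in series, every op's intermediate result is derived from the same input variable name, so a per-op suffix is what keeps the generated identifiers unique. A minimal standalone C++ sketch of the renaming rule (mirroring the replace/append logic above; the input name is hypothetical):

#include <algorithm>
#include <initializer_list>
#include <iostream>
#include <string>

// Sanitize characters that are illegal in OpenCL identifiers, then append
// a per-op suffix so serially fused ops get distinct result names.
std::string get_output_var_name(std::string input_var, size_t op_id) {
    for (char c : {'[', ']', ' ', '.'})
        std::replace(input_var.begin(), input_var.end(), c, '_');
    return input_var + "_out_" + std::to_string(op_id);
}

int main() {
    std::cout << get_output_var_name("conv.res[0]", 0) << "\n"; // conv_res_0__out_0
    std::cout << get_output_var_name("conv.res[0]", 1) << "\n"; // conv_res_0__out_1
}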


@@ -340,7 +340,7 @@ public:
JitConstants MakeLoadJitConstants(const FusedOpsConfiguration& conf, const DataTensor prim_output) const;
JitConstants MakeOpJitConstants(const FusedOpsConfiguration& conf,
const std::string in_var, const Datatype in_type,
- std::string& out_var, Datatype& out_type) const;
+ std::string& out_var) const;
bool CanPreloadData(const FusedOpsConfiguration& conf) const;
@@ -353,7 +353,7 @@ public:
std::string GetIdx(size_t input_id, idx_desc idx, bool should_be_safe) const;
std::string GetInputPtrName(size_t input_id) const;
std::string GetInputVarName(size_t input_id, bool is_shuffled = false, std::string shuffle_var = "") const;
- std::string GetOutputVarName(std::string input_var_name) const;
+ std::string GetOutputVarName(std::string input_var_name, size_t op_id) const;
std::string ConvertToOutputType(std::string var, size_t vec_size = 1) const;
std::string ConvertToType(std::string var, Datatype dt, size_t vec_size = 1) const;
std::string CastToType(std::string var, Datatype dt, size_t vec_size = 1) const;


@@ -115,17 +115,14 @@ JitConstants KernelBase::MakeFusedOpsJitConstants(const kernel_selector::base_pa
std::string fused_ops_preload;
std::string fused_ops_calc;
std::string in_name = c.input_var_name;
+ std::string out_name = "";
Datatype in_type = c.input_dt;
bool can_all_use_preload = true;
for (size_t i = 0; i < params.fused_ops.size(); i++) {
auto fused_dep_codegen = FusedOpsCodeGenerator(params.fused_ops[i]);
- std::string out_var;
- Datatype out_type;
jit.Merge(fused_dep_codegen.MakeLoadJitConstants(c, params.output));
- jit.Merge(fused_dep_codegen.MakeOpJitConstants(c, in_name, in_type, out_var, out_type));
- in_name = out_var;
- in_type = out_type;
+ jit.Merge(fused_dep_codegen.MakeOpJitConstants(c, in_name, in_type, out_name));
bool can_use_preload = fused_dep_codegen.CanPreloadData(c);
can_all_use_preload &= can_use_preload;
@@ -145,7 +142,7 @@ JitConstants KernelBase::MakeFusedOpsJitConstants(const kernel_selector::base_pa
jit.AddConstant(MakeJitConstant("FUSED_OPS" + c.suffix, fused_ops));
jit.AddConstant(MakeJitConstant("FUSED_OPS_PRELOAD" + c.suffix, fused_ops_preload));
jit.AddConstant(MakeJitConstant("FUSED_OPS_CALC" + c.suffix, fused_ops_calc));
jit.AddConstant(MakeJitConstant("FUSED_OPS_RESULT" + c.suffix, in_name));
jit.AddConstant(MakeJitConstant("FUSED_OPS_RESULT" + c.suffix, out_name));
bool can_any_use_preload = !fused_ops_preload.empty();
jit.AddConstant(MakeJitConstant("FUSED_OPS_CAN_USE_PRELOAD" + c.suffix,


@@ -558,6 +558,7 @@ struct fused_operation_desc {
MultiDataTensor tensors;
DataTensor output_tensor;
size_t op_id;
std::vector<std::pair<size_t, Datatype>> fused_op_ids;
// Helper functions for operation generation
KernelType GetType() const { return op_params->GetType(); }


@@ -45,10 +45,12 @@
#include "extract_image_patches_inst.h"
#include "reduce_inst.h"
#include <vector>
#include <map>
#include <list>
#include <memory>
#include <string>
#include <utility>
#include <deque>
#include "error_handler.h"
void prepare_primitive_fusing::run(program_impl& p) {
@@ -164,12 +166,13 @@ void prepare_primitive_fusing::fuse_reorders(program_impl &p) {
void prepare_primitive_fusing::fuse_activations(program_impl &p) {
bool is_debug = p.get_options().get<build_option_type::debug>()->enabled();
std::map<primitive_id, std::vector<primitive_id>> fusing_history;
auto itr = p.get_processing_order().begin();
while (itr != p.get_processing_order().end()) {
auto node_itr = itr++;
auto& node = (*node_itr);
- program_helpers::do_for_types<activation>(*node, [&p, &is_debug](activation_node& node) {
+ program_helpers::do_for_types<activation>(*node, [&p, &is_debug, &fusing_history](activation_node& node) {
auto& input = node.input();
auto id = node.id();
// Restrictions:
@@ -226,7 +229,7 @@ void prepare_primitive_fusing::fuse_activations(program_impl &p) {
} else {
// If node already has any fused node using new mechanism,
// we can just use the same way and handle any amount of activations
- p.fuse_nodes(input, node);
+ p.fuse_nodes(input, node, &fusing_history);
}
p.add_optimized_primitive_info(id, {input.id()});
@@ -350,6 +353,7 @@ void prepare_primitive_fusing::fuse_bias(program_impl &p) {
void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) {
bool recalc_processing_order = false;
std::map<primitive_id, std::vector<primitive_id>> fusing_history;
auto itr = p.get_processing_order().begin();
while (itr != p.get_processing_order().end()) {
@@ -497,9 +501,63 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) {
return true;
};
auto get_users_from_fusing_history = [&](primitive_id id) {
std::vector<primitive_id> users;
for (auto deps_data : fusing_history) {
auto key = deps_data.first;
auto deps_vec = deps_data.second;
auto iter = std::find(deps_vec.begin(), deps_vec.end(), id);
if (iter != deps_vec.end()) {
users.push_back(key);
}
}
return users;
};
auto input_data_supports_fusings = [&](cldnn::program_node& input_data, primitive_id current_node_id) -> bool {
if (input_data.get_users().size() != 1) {
// If input_data has fused primitives,
// find the original dependency of current_node using fusing_history
// and check its number of users.
// If that node has multiple users, it is not fusible.
if (input_data.has_fused_primitives()) {
size_t num_original_dependencies = 0;
auto iter = fusing_history.find(current_node_id);
if (iter != fusing_history.end()) {
// Find current_node's original dependency list
for (auto& prim_id : iter->second) {
// Look for input_data's fused primitives among the original dependency ids
auto& fused_descs = input_data.get_fused_primitives();
auto origin_input_iter = std::find_if(fused_descs.begin(), fused_descs.end(),
[&](cldnn::fused_primitive_desc& desc) {
return (desc.node->id() == prim_id);
});
if (origin_input_iter != fused_descs.end()) {
auto users = get_users_from_fusing_history(origin_input_iter->node->id());
if (users.size() != 1) {
return false;
}
num_original_dependencies++;
}
}
}
// If num_original_dependencies is zero, input_data is the original parent
if (num_original_dependencies == 0) {
return false;
}
} else {
return false;
}
}
return true;
};
auto fuse_activation_f = [&](activation_node& activation_node) {
auto& input_data = activation_node.get_dependency(0);
- if (input_data.get_users().size() != 1 || activation_node.get_dependencies().size() >= 3)
+ if (activation_node.get_dependencies().size() >= 3)
return;
if (!input_data_supports_fusings(input_data, activation_node.id()))
return;
bool should_fuse = input_data.is_type<binary_convolution>();
@@ -558,7 +616,7 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) {
if (!should_fuse)
return;
- p.fuse_nodes(input_data, activation_node);
+ p.fuse_nodes(input_data, activation_node, &fusing_history);
};
auto fuse_scale_f = [&](scale_node& scale_node) {
@@ -623,7 +681,7 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) {
if (!should_fuse)
return;
- p.fuse_nodes(input_data, scale_node);
+ p.fuse_nodes(input_data, scale_node, &fusing_history);
};
auto fuse_quantize_f = [&](quantize_node& quantize_node) {
@@ -717,7 +775,7 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) {
if (!should_fuse)
return;
- p.fuse_nodes(input_data, quantize_node);
+ p.fuse_nodes(input_data, quantize_node, &fusing_history);
};
auto fuse_eltwise_f = [&](eltwise_node& node) {
@@ -811,8 +869,83 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) {
if (parent2->is_type<convolution>() && !conv_supports_fusings(parent2->as<convolution>()))
return;
// This fusing can be extended to support peer node in any layout
- bool merge_allowed = fused_node->get_users().size() == 1;
+ bool merge_allowed = true;
// The extended check below applies only when the fused node is a convolution with multiple users;
// any other fused node with multiple users follows the legacy single-user rule.
if (fused_node->is_type<convolution>() && fused_node->get_users().size() > 1) {
// Allowed new pattern: Eltw1, Act, Eltw2, Eltw3, Eltw4 are fused to Conv1
// * Conv1 -> Eltw1(Add) -> Act(Clamp) -> Eltw2(Mul) -> Eltw3(Mul) -> Eltw4(Add) -> Conv2
// * \----------------------------------->/ \---------> Eltw5(Div)
//
// Extended eltwise fusibility checking rules
//
// 1. All fusing nodes should be eltwise or activation nodes.
// 2. All intermediate fusing nodes except the last one (i.e. Eltw4) should have only eltwise or activation nodes as users.
// 3. Currently eltwise and activation are allowed to be fused from multiple branches,
//    but technically other fusable operations could be allowed too in the future.
// 4. When node_queue holds only one node, the while loop ends and that node is fused to the fused node (Conv1):
//    a single remaining node means all user paths from the fused node (Conv1) converge at it.
// 5. If node_queue still holds multiple nodes when a node's level reaches max_levels, the pattern cannot be fused.
std::deque<std::pair<cldnn::program_node*, size_t>> node_queue; // std::pair<cldnn::program_node*, layer level>
std::vector<cldnn::program_node*> node_history;
node_queue.push_back(std::make_pair(fused_node, 0));
const uint8_t max_levels = 5;
do {
// Pop the current node from node_queue
// and add it to node_history to record the trace of checking
auto current_node = node_queue.front();
node_queue.pop_front();
if (std::find(node_history.begin(), node_history.end(), current_node.first) == node_history.end()) {
node_history.push_back(current_node.first);
}
if (current_node.second > max_levels) {
return;
}
// Push a node to node_queue;
// if the node already exists in node_queue, do not add it again
auto push_node_queue = [&](cldnn::program_node* in_node, size_t level) {
auto iter = std::find_if(node_queue.begin(), node_queue.end(), [&](std::pair<cldnn::program_node*, size_t> element) {
return (in_node->id() == element.first->id());
});
if (iter == node_queue.end()) {
node_queue.push_back(std::make_pair(in_node, level));
}
};
// If any user node is neither eltwise (mul/add mode) nor activation,
// the current node is considered the last node and is pushed back into node_queue
auto curr_users = current_node.first->get_users();
auto invalid_user_iter = std::find_if(curr_users.begin(), curr_users.end(), [&](cldnn::program_node* user) {
return (user->is_output() ||
(!(user->is_type<eltwise>() && user->get_primitive()->input.size() == 2 &&
(std::find(supported_modes.begin(), supported_modes.end(),
(user->as<eltwise>()).get_primitive()->mode) != supported_modes.end())) &&
!(user->is_type<activation>() && user->get_primitive()->input.size() == 1)));
});
if (invalid_user_iter != curr_users.end()) {
// If fused_node (i.e. Conv1) has an invalid user node (one that is neither activation nor eltwise), it cannot be fused
if (fused_node->id() == current_node.first->id()) {
return;
}
push_node_queue(current_node.first, (current_node.second+1));
continue;
}
// Add the current node's user nodes to the queue,
// skipping any node already visited (tracked via node_history)
for (auto& user : curr_users) {
auto iter = std::find(node_history.begin(), node_history.end(), user);
if (iter == node_history.end())
push_node_queue(user, current_node.second+1);
}
} while (node_queue.size() > 1);
} else {
merge_allowed = fused_node->get_users().size() == 1;
}
for (auto& parent : fused_node->get_dependencies())
if (parent->id() == peer_node->id())
@@ -831,7 +964,7 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) {
recalc_processing_order = true;
}
- p.fuse_nodes(*fused_node, node);
+ p.fuse_nodes(*fused_node, node, &fusing_history);
};
program_helpers::do_for_types<activation, scale, quantize, eltwise>(*node,
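
The queue-based walk above is easier to follow in isolation. Below is a self-contained toy sketch of the convergence check (hypothetical Node type; the real pass additionally validates eltwise modes, input counts, and output flags):

#include <algorithm>
#include <cstddef>
#include <deque>
#include <utility>
#include <vector>

struct Node {                      // stand-in for cldnn::program_node
    std::vector<Node*> users;
    bool fusable = true;           // eltwise(add/mul, 2 inputs) or unary activation
};

// Returns true when every user branch of root converges to a single node
// within max_levels levels, mirroring the node_queue loop above.
bool branches_converge(Node* root, std::size_t max_levels = 5) {
    std::deque<std::pair<Node*, std::size_t>> queue;
    std::vector<Node*> history;
    queue.push_back({root, 0});
    auto push_once = [&](Node* n, std::size_t lvl) {
        for (auto& e : queue)
            if (e.first == n) return;
        queue.push_back({n, lvl});
    };
    do {
        auto current = queue.front();
        queue.pop_front();
        if (std::find(history.begin(), history.end(), current.first) == history.end())
            history.push_back(current.first);
        if (current.second > max_levels)
            return false;
        bool has_invalid_user = false;
        for (Node* u : current.first->users)
            has_invalid_user |= !u->fusable;
        if (has_invalid_user) {
            if (current.first == root) return false;      // root itself has a bad user
            push_once(current.first, current.second + 1); // candidate last node
            continue;
        }
        for (Node* u : current.first->users)
            if (std::find(history.begin(), history.end(), u) == history.end())
                push_once(u, current.second + 1);
    } while (queue.size() > 1);
    return true;
}

int main() {
    Node conv, eltw1, eltw2;
    conv.users = {&eltw1, &eltw2};  // Conv -> Eltw1 -> Eltw2 and Conv -> Eltw2
    eltw1.users = {&eltw2};
    return branches_converge(&conv) ? 0 : 1; // branches reconverge at Eltw2 -> fusable
}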
@@ -861,6 +994,22 @@ void prepare_primitive_fusing::optimize_fused_ops(program_impl& p) {
// 2. fuse conv bias to quantize shift
auto& fused_prims = node->get_fused_primitives();
auto remove_deps_of_node = [&](cldnn::fused_primitive_desc& desc) {
for (auto& prim : fused_prims) {
if (desc.node->id() == prim.node->id()) {
continue;
}
auto rm_iter = std::find_if(prim.fused_deps.begin(), prim.fused_deps.end(), [&](primitive_id& dep_id){
return (desc.node->id() == dep_id);
});
if (rm_iter != prim.fused_deps.end()) {
prim.fused_deps.erase(rm_iter);
prim.fused_deps.insert(prim.fused_deps.end(), desc.fused_deps.begin(), desc.fused_deps.end());
}
}
};
// Drop relu if the next fused op is quantize with u8 output and no in_shift
auto fp_itr = fused_prims.begin();
while (fp_itr != fused_prims.end()) {
@@ -883,6 +1032,7 @@ void prepare_primitive_fusing::optimize_fused_ops(program_impl& p) {
!quantize_node.get_need_pre_shift();
if (can_skip) {
remove_deps_of_node(fp);
fp_itr = fused_prims.erase(curr_itr);
}
}
@@ -891,6 +1041,7 @@ void prepare_primitive_fusing::optimize_fused_ops(program_impl& p) {
}
void prepare_conv_eltw_fusing::fuse_conv_depth_to_space(program_impl& p, program_node* node) {
std::map<primitive_id, std::vector<primitive_id>> fusing_history;
// make sure this convolution have only 1 user and it's depth_to_space
// make sure convolution is not an output
if (node->get_users().size() != 1 || node->is_output())
@@ -919,7 +1070,7 @@ void prepare_conv_eltw_fusing::fuse_conv_depth_to_space(program_impl& p, program
return;
}
- p.fuse_nodes(*conv_node, *d_t_s_node);
+ p.fuse_nodes(*conv_node, *d_t_s_node, &fusing_history);
}
void prepare_conv_eltw_fusing::fuse_conv_eltwise(program_impl& p, program_node* node) {


@@ -159,6 +159,7 @@ inline params_t get_default_params(const arg_t& arg, uint32_t split = 1) {
params.layerID = arg.id();
convert_fused_activation_func_params(arg, params.activations);
std::map<primitive_id, std::pair<size_t, kernel_selector::Datatype>> prim_op_id_map;
size_t op_id = 0;
for (auto& fused_prim : arg.get_fused_primitives()) {
kernel_selector::fused_operation_desc desc;
@@ -171,6 +172,13 @@ inline params_t get_default_params(const arg_t& arg, uint32_t split = 1) {
desc.dep_size = fused_prim.deps.size();
desc.op_id = op_id++;
desc.output_tensor = convert_data_tensor(fused_prim.output_layout);
prim_op_id_map[fused_prim.node->id()] = std::make_pair(desc.op_id, desc.output_tensor.GetDType());
for (auto& dep : fused_prim.fused_deps) {
auto iter = prim_op_id_map.find(dep);
if (iter != prim_op_id_map.end()) {
desc.fused_op_ids.push_back(iter->second);
}
}
for (size_t i = desc.dep_idx_start; i < desc.dep_idx_start + desc.dep_size; i++) {
desc.tensors.push_back(convert_data_tensor(arg.get_dependency(i).get_output_layout()));
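
The bookkeeping here is order-dependent: fused primitives are converted in processing order, so by the time a primitive's fused_deps are resolved, every producer it references already has an entry in prim_op_id_map. A toy sketch of that resolution (stand-in types and assumed primitive ids, not the real cldnn structures):

#include <cstddef>
#include <map>
#include <string>
#include <utility>
#include <vector>

enum class Datatype { F16, F32 };

struct FusedPrim {                        // stand-in for cldnn::fused_primitive_desc
    std::string id;
    Datatype out_dtype;
    std::vector<std::string> fused_deps;  // ids of fused prims this op reads from
};

int main() {
    std::vector<FusedPrim> prims = {
        {"eltwise1", Datatype::F16, {}},
        {"eltwise2", Datatype::F16, {"eltwise1"}},
    };
    std::map<std::string, std::pair<std::size_t, Datatype>> prim_op_id_map;
    std::size_t op_id = 0;
    for (auto& p : prims) {
        prim_op_id_map[p.id] = {op_id++, p.out_dtype};
        std::vector<std::pair<std::size_t, Datatype>> fused_op_ids;
        for (auto& dep : p.fused_deps) {
            auto it = prim_op_id_map.find(dep);
            if (it != prim_op_id_map.end())
                fused_op_ids.push_back(it->second);
        }
        // eltwise2 ends up with fused_op_ids == {{0, Datatype::F16}}
    }
    return 0;
}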


@@ -186,7 +186,7 @@ public:
bool extract_and_remove(program_node& node);
// Fuses two nodes into fused_node and removes peer_node from graph
- void fuse_nodes(program_node& fused_node, program_node& peer_node);
+ void fuse_nodes(program_node& fused_node, program_node& peer_node, std::map<primitive_id, std::vector<primitive_id>>* fusing_history);
// returns if 'node' has been removed
bool remove_if_dangling(program_node& node);


@@ -40,6 +40,7 @@ struct fused_primitive_desc {
std::shared_ptr<program_node> node;
size_t dep_start_idx;
std::vector<primitive_id> deps;
std::vector<primitive_id> fused_deps;
activation_func activation;
activation_additional_params activation_params;
layout output_layout = layout(data_types::f32, format::bfyx, tensor());


@@ -895,7 +895,7 @@ bool program_impl::extract_and_remove(program_node& node) {
return true;
}
- void program_impl::fuse_nodes(program_node &fused_node, program_node &peer_node) {
+ void program_impl::fuse_nodes(program_node &fused_node, program_node &peer_node, std::map<primitive_id, std::vector<primitive_id>>* fusing_history) {
auto peer_layout = peer_node.get_output_layout();
fused_primitive_desc local_desc;
local_desc.node = get_node_ptr(peer_node.id());
@@ -913,6 +913,13 @@ void program_impl::fuse_nodes(program_node &fused_node, program_node &peer_node)
cldnn::padding needed_padding = padding::max(peer_layout.data_padding,
fused_node.get_output_layout().data_padding);
auto history_iter = fusing_history->find(peer_node.id());
if (history_iter != fusing_history->end()) {
for (auto& id : history_iter->second) {
local_desc.fused_deps.push_back(id);
}
}
// Add new dependencies to the fused_node
for (size_t i = 0; i < peer_node.get_dependencies().size(); i++) {
auto& dep = peer_node.get_dependency(i);
@@ -952,6 +959,10 @@ void program_impl::fuse_nodes(program_node &fused_node, program_node &peer_node)
}
add_optimized_primitive_info(peer_node.id(), { fused_node.id() });
for (auto& user : peer_node.users) {
(*fusing_history)[user->id()].push_back(peer_node.id());
}
// Remove all edges connected with peer node
while (peer_node.get_dependencies().size() > 0) {
auto& dep = peer_node.get_dependency(peer_node.get_dependencies().size() - 1);
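
Concretely, for a chain Conv1 -> Eltw1 -> Eltw2, the two steps above interact as follows (a hand-traced, runnable sketch with assumed primitive ids):

#include <iostream>
#include <map>
#include <string>
#include <vector>

using primitive_id = std::string;

int main() {
    std::map<primitive_id, std::vector<primitive_id>> fusing_history;

    // fuse_nodes(Conv1, Eltw1): Eltw2 is Eltw1's user, so the lost edge is recorded.
    fusing_history["eltwise2"].push_back("eltwise1");

    // fuse_nodes(Conv1, Eltw2): the history lookup fills Eltw2's fused_deps,
    // which get_default_params later converts into fused_op_ids for the kernels.
    std::vector<primitive_id> fused_deps;
    auto it = fusing_history.find("eltwise2");
    if (it != fusing_history.end())
        fused_deps = it->second;          // == {"eltwise1"}
    std::cout << fused_deps[0] << "\n";   // prints "eltwise1"
}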


@@ -323,6 +323,8 @@ public:
return layout{ p.data_type, p.default_format, tensor{1, p.in_shape.feature[0], 1, 1} };
}
layout get_single_element_layout(T& p) {
return layout{ p.default_type, p.default_format, tensor{1, 1, 1, 1} };
}
@@ -831,6 +833,212 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_fp32_prelu_eltwise,
bc_test_params{CASE_CONV_FP16_4, 2, 4},
}), );
class conv_fp32_multi_eltwise_2 : public ConvFusingTest {};
TEST_P(conv_fp32_multi_eltwise_2, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("eltwise_data", get_mem(get_output_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("weights", get_mem(get_weights_layout(p))),
convolution("conv_prim", "input", { "weights" }, {"bias"}, p.groups, p.stride, p.pad, p.dilation),
eltwise("eltwise1", "conv_prim", "eltwise_data", eltwise_mode::sum),
eltwise("eltwise2", "eltwise1", "conv_prim", eltwise_mode::prod),
reorder("reorder_bfyx", "eltwise2", p.default_format, data_types::f32)
);
implementation_desc conv_impl = { format::b_fs_yx_fsv16, ""};
bo_fused.set_option(build_option::force_implementations({ {"conv_prim", conv_impl} }));
tolerance = 1e-5f;
execute(p);
}
INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_fp32_multi_eltwise_2,
::testing::ValuesIn(std::vector<bc_test_params>{
bc_test_params{CASE_CONV_FP32_2, 2, 4},
bc_test_params{CASE_CONV_FP32_3, 2, 4},
bc_test_params{CASE_CONV_FP32_4, 2, 4},
bc_test_params{CASE_CONV_FP16_2, 2, 4},
bc_test_params{CASE_CONV_FP16_3, 2, 4},
bc_test_params{CASE_CONV_FP16_4, 2, 4},
}), );
class conv_fp32_multi_eltwise_2_clamp : public ConvFusingTest {};
TEST_P(conv_fp32_multi_eltwise_2_clamp, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("eltwise1_data", get_mem(get_output_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("weights", get_mem(get_weights_layout(p))),
convolution("conv_prim", "input", { "weights" }, {"bias"}, p.groups, p.stride, p.pad, p.dilation),
eltwise("eltwise1", "conv_prim", "eltwise1_data", eltwise_mode::sum),
activation("activation", "eltwise1", activation_func::clamp, {0.5f, 2.5f}),
eltwise("eltwise2", "activation", "conv_prim", eltwise_mode::prod),
reorder("reorder_bfyx", "eltwise2", p.default_format, data_types::f32)
);
implementation_desc conv_impl = { format::b_fs_yx_fsv16, ""};
bo_fused.set_option(build_option::force_implementations({ {"conv_prim", conv_impl} }));
tolerance = 1e-5f;
execute(p);
}
INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_fp32_multi_eltwise_2_clamp,
::testing::ValuesIn(std::vector<bc_test_params>{
bc_test_params{CASE_CONV_FP32_2, 2, 5},
bc_test_params{CASE_CONV_FP32_3, 2, 5},
bc_test_params{CASE_CONV_FP32_4, 2, 5},
bc_test_params{CASE_CONV_FP16_2, 2, 5},
bc_test_params{CASE_CONV_FP16_3, 2, 5},
bc_test_params{CASE_CONV_FP16_4, 2, 5},
}), );
class conv_fp32_multi_eltwise_4_clamp : public ConvFusingTest {};
TEST_P(conv_fp32_multi_eltwise_4_clamp, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("eltwise1_data", get_mem(get_output_layout(p))),
data("eltwise2_data", get_mem(get_output_layout(p))),
data("eltwise4_data", get_mem(get_output_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("weights", get_mem(get_weights_layout(p))),
convolution("conv_prim", "input", { "weights" }, {"bias"}, p.groups, p.stride, p.pad, p.dilation),
eltwise("eltwise1_add", "conv_prim", "eltwise1_data", eltwise_mode::sum),
activation("activation", "eltwise1_add", activation_func::clamp, {0.5f, 2.5f}),
eltwise("eltwise2_mul", "activation", "conv_prim", eltwise_mode::prod),
eltwise("eltwise3_div", "eltwise2_mul", "eltwise2_data", eltwise_mode::prod),
eltwise("eltwise4_add", "eltwise3_div", "eltwise4_data", eltwise_mode::sum),
reorder("reorder_bfyx", "eltwise4_add", p.default_format, data_types::f32)
);
implementation_desc conv_impl = { format::b_fs_yx_fsv16, ""};
bo_fused.set_option(build_option::force_implementations({ {"conv_prim", conv_impl} }));
tolerance = 1e-5f;
execute(p);
}
INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_fp32_multi_eltwise_4_clamp,
::testing::ValuesIn(std::vector<bc_test_params>{
bc_test_params{CASE_CONV_FP32_2, 2, 7},
bc_test_params{CASE_CONV_FP32_3, 2, 7},
bc_test_params{CASE_CONV_FP32_4, 2, 7},
bc_test_params{CASE_CONV_FP16_2, 2, 7},
bc_test_params{CASE_CONV_FP16_3, 2, 7},
bc_test_params{CASE_CONV_FP16_4, 2, 7},
}), );
class conv_fp32_multi_eltwise_3_fusing : public ConvFusingTest {};
TEST_P(conv_fp32_multi_eltwise_3_fusing, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("eltwise_data1", get_mem(get_output_layout(p))),
data("eltwise_data2", get_mem(get_output_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("weights", get_mem(get_weights_layout(p))),
convolution("conv_prim", "input", { "weights" }, {"bias"}, p.groups, p.stride, p.pad, p.dilation),
eltwise("eltwise1", "conv_prim", "eltwise_data1", eltwise_mode::sum),
eltwise("eltwise2", "conv_prim", "eltwise_data2", eltwise_mode::sum),
eltwise("eltwise3", "eltwise1", "eltwise2", eltwise_mode::prod),
reorder("reorder_bfyx", "eltwise3", p.default_format, data_types::f32)
);
implementation_desc conv_impl = { format::b_fs_yx_fsv16, ""};
bo_fused.set_option(build_option::force_implementations({ {"conv_prim", conv_impl} }));
tolerance = 1e-5f;
execute(p);
}
INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_fp32_multi_eltwise_3_fusing,
::testing::ValuesIn(std::vector<bc_test_params>{
bc_test_params{CASE_CONV_FP32_2, 2, 5},
bc_test_params{CASE_CONV_FP32_3, 2, 5},
bc_test_params{CASE_CONV_FP32_4, 2, 5},
bc_test_params{CASE_CONV_FP16_2, 2, 5},
bc_test_params{CASE_CONV_FP16_3, 2, 5},
bc_test_params{CASE_CONV_FP16_4, 2, 5},
}), );
class conv_fp32_multi_eltwise_quantization : public ConvFusingTest {};
TEST_P(conv_fp32_multi_eltwise_quantization, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)),
data("out_lo", get_mem(get_single_element_layout(p), -127)),
data("out_hi", get_mem(get_single_element_layout(p), 127)),
data("eltwise_data1", get_mem(get_output_layout(p))),
convolution("conv_prim", "input", {"weights"}, {"bias"}, p.groups, p.stride, p.pad, p.dilation),
quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8),
eltwise("eltwise1", "conv_prim", "eltwise_data1", eltwise_mode::sum),
eltwise("eltwise2", "eltwise1", "quantize", eltwise_mode::prod),
reorder("reorder_bfyx", "eltwise2", p.default_format, data_types::f32)
);
tolerance = 1.f;
execute(p);
}
INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_fp32_multi_eltwise_quantization,
::testing::ValuesIn(std::vector<bc_test_params>{
bc_test_params{CASE_CONV_FP32_2, 4, 5},
bc_test_params{CASE_CONV_FP32_4, 4, 5},
bc_test_params{CASE_CONV_FP16_2, 4, 5},
bc_test_params{CASE_CONV_FP16_3, 4, 5},
bc_test_params{CASE_CONV_FP16_4, 4, 5},
}), );
class conv_fp32_multi_eltwise_concat : public ConvFusingTest {};
TEST_P(conv_fp32_multi_eltwise_concat, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("eltwise_data1", get_mem(get_output_layout(p))),
data("eltwise_data2", get_mem(get_output_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("weights", get_mem(get_weights_layout(p))),
convolution("conv_prim", "input", { "weights" }, {"bias"}, p.groups, p.stride, p.pad, p.dilation),
eltwise("eltwise1", "conv_prim", "eltwise_data1", eltwise_mode::sum),
eltwise("eltwise2", "conv_prim", "eltwise_data2", eltwise_mode::sum),
concatenation("concat",
{"eltwise1", "eltwise2"},
concatenation::concatenation_axis::along_f,
data_types::i8,
padding{{0, 0, 0, 0}, 0}),
reorder("reorder_bfyx", "concat", p.default_format, data_types::f32)
);
implementation_desc conv_impl = { format::b_fs_yx_fsv16, ""};
bo_fused.set_option(build_option::force_implementations({ {"conv_prim", conv_impl} }));
tolerance = 1e-5f;
execute(p);
}
INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_fp32_multi_eltwise_concat,
::testing::ValuesIn(std::vector<bc_test_params>{
bc_test_params{CASE_CONV_FP32_2, 5, 5},
bc_test_params{CASE_CONV_FP32_3, 5, 5},
bc_test_params{CASE_CONV_FP32_4, 5, 5},
bc_test_params{CASE_CONV_FP16_2, 5, 5},
bc_test_params{CASE_CONV_FP16_3, 5, 5},
bc_test_params{CASE_CONV_FP16_4, 5, 5},
}), );
class conv_fp32_eltwise_b_fs_zyx_fsv16 : public ConvFusingTest {};
TEST_P(conv_fp32_eltwise_b_fs_zyx_fsv16, vector_ops) {