[GPU] Code refactoring to choose between binary_add and sum (#10724)

+ Fix colorization-sig accuracy issue using oneDNN
	Fix a memory crash in the reuse_eltwise_sum_post case in oneDNN and memory_pool
	And print node in/out gpu_usm_mem addr at OV_GPU_Verbose >= 1
+ Check the size of the z spatial axis when checking for a full tensor.
+ Remove program_helpers's functions.

Co-authored-by: hyunback <hyunback.kim@intel.com>
This commit is contained in:
Jade Cho
2022-03-22 14:58:36 +09:00
committed by GitHub
parent e8288eb31d
commit a7df1531db
13 changed files with 356 additions and 249 deletions

View File

@@ -34,6 +34,8 @@ struct memory {
virtual void unlock(const stream& stream) = 0;
virtual event::ptr fill(stream& stream, unsigned char pattern) = 0;
virtual event::ptr fill(stream& stream) = 0;
// only supports gpu_usm
virtual void* buffer_ptr() const { return nullptr; }
size_t size() const { return _bytes_count; }
size_t count() const { return _layout.count(); }

View File

@@ -36,39 +36,26 @@ void basic_memory_dependencies::run(program& p) {
add_memory_dependency(it, node);
}
if (node->is_type<convolution>() && node->get_preferred_impl_type() == impl_types::onednn) {
auto& conv = node->as<convolution>();
bool can_reuse_eltwise_mem = false;
if (node->get_preferred_impl_type() == impl_types::onednn
&& (node->is_type<convolution>() || node->is_type<deconvolution>())) {
size_t eltw_dep = 0;
for (auto& fused_op : conv.get_fused_primitives()) {
for (auto& fused_op : node->get_fused_primitives()) {
if (fused_op.node->is_type<eltwise>() && fused_op.deps.size() == 1) {
auto eltw_in_layout = conv.get_dependency(fused_op.dep_start_idx).get_output_layout();
auto conv_out_layout = node->get_output_layout();
if (eltw_dep > 0) {
can_reuse_eltwise_mem = false;
break;
}
// If it is first sum, reuse the buffer
auto fusing_type = onednn_add_fusing_helpers::get_add_fusing_type(*node, fused_op);
if (fusing_type != add_fusing_type::sum || eltw_dep != 0)
continue;
if (eltw_in_layout.size == conv_out_layout.size &&
eltw_in_layout.format == conv_out_layout.format &&
eltw_in_layout.data_padding == conv_out_layout.data_padding &&
data_type_traits::size_of(eltw_in_layout.data_type) == data_type_traits::size_of(conv_out_layout.data_type)) {
eltw_dep = fused_op.dep_start_idx;
can_reuse_eltwise_mem = true;
eltw_dep = fused_op.dep_start_idx;
auto& eltw_node = node->get_dependency(eltw_dep);
eltw_node.can_share_buffer(false);
node->can_share_buffer(false);
for (auto& user : node->get_users()) {
add_memory_dependency(user, &eltw_node);
add_memory_dependency(user, node);
}
}
}
if (can_reuse_eltwise_mem) {
auto& eltw_node = conv.get_dependency(eltw_dep);
eltw_node.can_share_buffer(false);
conv.can_share_buffer(false);
for (auto& user : conv.get_users()) {
add_memory_dependency(user, &eltw_node);
add_memory_dependency(user, &conv);
}
}
}
// Note we iterate over processing order, it means if primitive has processing num greater than any of outputs,

View File

@@ -93,16 +93,11 @@ bool concat_in_place_optimization::match(concatenation_node& node) {
for (auto& input : node.get_dependencies()) {
if (input->get_preferred_impl_type() == impl_types::onednn) {
for (auto& fused_op : input->get_fused_primitives()) {
if (fused_op.node->is_type<eltwise>() && fused_op.deps.size() == 1) {
auto& eltw_in = input->get_dependency(fused_op.dep_start_idx);
auto eltw_in_layout = eltw_in.get_output_layout();
auto out_layout = input->get_output_layout();
if (!program_helpers::needs_onednn_sum_post_op(fused_op.node->as<eltwise>(), eltw_in_layout))
continue;
if (program_helpers::are_layouts_identical_for_onednn_sum_post_op(eltw_in_layout, out_layout))
return false;
}
auto add_type = onednn_add_fusing_helpers::get_add_fusing_type(*input, fused_op);
if (add_type == add_fusing_type::sum)
return false;
else
continue;
}
is_onednn_impl = true;
}

View File

@@ -649,44 +649,38 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
// When the conv node is of onednn impl type and eltwise sum with full tensor is fused,
// changes the input format of eltwise sum post-op to use binary add.
if (conv_node.get_preferred_impl_type() == impl_types::onednn) {
std::vector<size_t> eltw_sum_dep_indices;
for (size_t i = 1; i < conv_node.get_dependencies().size(); i++) {
auto& dep = conv_node.get_dependency(i);
for (auto& fused_op : conv_node.get_fused_primitives()) {
if (fused_op.node->is_type<eltwise>()
&& fused_op.node->as<eltwise>().get_primitive()->mode == eltwise_mode::sum
&& !program_helpers::needs_onednn_sum_post_op(fused_op.node->as<eltwise>(),
conv_node.get_dependency(fused_op.dep_start_idx).get_output_layout())
&& conv_node.get_dependency(fused_op.dep_start_idx).get_users().size() == 1
&& conv_node.get_dependency(fused_op.dep_start_idx).id() == dep.id()) {
eltw_sum_dep_indices.push_back(i);
}
}
}
onednn_add_fusing_helpers::for_eltwise(conv_node, eltwise_mode::sum,
[&](const program_node& p_node, const eltwise_node& e_node, const fused_primitive_desc& desc) {
auto fusing_type = onednn_add_fusing_helpers::get_add_fusing_type(p_node, desc);
if (fusing_type == add_fusing_type::binary_per_tensor) {
auto& dep_node = p_node.get_dependency(desc.dep_start_idx);
auto d_layout = dep_node.get_output_layout();
auto d_format = d_layout.format;
auto expected_format = format::any;
auto conv_layout = conv_node.get_output_layout();
for (auto& dep_id : eltw_sum_dep_indices) {
auto& prev_node = conv_node.get_dependency(dep_id);
auto old_layout = prev_node.get_output_layout();
auto expected_format = format::any;
if ((conv_layout.data_type == data_types::f16 || conv_layout.data_type == data_types::f32)
&& data_type_traits::is_i8_u8(old_layout.data_type)) {
if (conv_layout.format == format::b_fs_yx_fsv16)
expected_format = format::b_fs_yx_fsv32;
if (conv_layout.format == format::bs_fs_yx_bsv32_fsv16)
expected_format = format::bs_fs_yx_bsv32_fsv32;
}
if (data_type_traits::is_i8_u8(d_layout.data_type)) {
if (d_format == format::b_fs_yx_fsv16)
expected_format = format::b_fs_yx_fsv32;
else if (d_format == format::bs_fs_yx_bsv32_fsv16)
expected_format = format::bs_fs_yx_bsv32_fsv32;
} else if (data_type_traits::is_floating_point(d_layout.data_type)) {
if (d_format == format::b_fs_yx_fsv32)
expected_format = format::b_fs_yx_fsv16;
else if (d_format == format::bs_fs_yx_bsv32_fsv32)
expected_format = format::bs_fs_yx_bsv32_fsv16;
}
if (expected_format != format::any && old_layout.format != expected_format) {
auto new_layout = old_layout;
new_layout.format = expected_format;
auto new_input = rf.get_reorder(prev_node.id(), old_layout, new_layout);
if (new_input.first) {
p.add_intermediate(new_input.first, conv_node, dep_id, !new_input.second);
if (expected_format != format::any && d_layout.format != expected_format) {
auto new_layout = d_layout;
new_layout.format = expected_format;
auto new_input = rf.get_reorder(dep_node.id(), d_layout, new_layout);
if (new_input.first) {
p.add_intermediate(new_input.first, conv_node, desc.dep_start_idx, !new_input.second);
}
conv_node.get_dependency(desc.dep_start_idx).set_output_layout(new_layout, false);
}
}
conv_node.get_dependency(dep_id).set_output_layout(new_layout, false);
}
}
});
}
};

View File

@@ -11,10 +11,12 @@
#include "intel_gpu/graph/program.hpp"
#include "data_inst.h"
#include "eltwise_inst.h"
#include "convolution_inst.h"
#include <string>
#include <vector>
#include <utility>
#include <iostream>
namespace cldnn {
struct program_helpers {
@@ -125,12 +127,35 @@ struct program_helpers {
}
}
static layout get_weights_layout(typed_program_node<cldnn::data>& data_node, int32_t split);
static bool are_layouts_identical_for_onednn_sum_post_op(layout input_layout, layout output_layout);
static bool needs_onednn_sum_post_op(const eltwise_node& n, layout input_layout);
};
struct onednn_add_fusing_helpers {
enum class add_fusing_type {
sum,
binary_per_tensor,
binary_per_oc,
not_supported,
};
static bool is_full_tensor(const layout& layout);
static std::vector<fused_primitive_desc> get_fused_eltwise_primitives();
static void for_eltwise(const program_node& conv_node, eltwise_mode mode,
std::function<void(const program_node&, const eltwise_node&, const fused_primitive_desc&)> func);
static add_fusing_type get_add_fusing_type(const program_node& node, const fused_primitive_desc& desc);
};
using add_fusing_type = onednn_add_fusing_helpers::add_fusing_type;
// Streams a human-readable name for an add_fusing_type value.
// The enum is taken by value (it is trivially cheap to copy) so that
// const objects and temporaries can be streamed too; the previous
// non-const reference parameter rejected both.
static inline std::ostream& operator<< (std::ostream& os, add_fusing_type t) {
    switch (t) {
    case add_fusing_type::sum: os << "sum"; break;
    case add_fusing_type::binary_per_tensor: os << "binary_per_tensor"; break;
    case add_fusing_type::binary_per_oc: os << "binary_per_oc"; break;
    default: os << "not_supported"; break;
    }
    return os;
}
// Base class for performing pattern match style optimizations.
// Uses CRTP idiom, implementing class should be passed as template parameter `Impl`,
// and overload match and optimize methods.

View File

@@ -56,6 +56,30 @@ enum class onednn_post_op_type : uint32_t {
optimized_sum
};
// Streams a human-readable name for an onednn_post_op_type value.
// The enum is taken by value (cheap to copy) so const objects and
// temporaries can be streamed; a non-const reference would reject both.
static inline std::ostream& operator<< (std::ostream& os, onednn_post_op_type t) {
    switch (t) {
    case onednn_post_op_type::eltwise_act: os << "eltwise_act"; break;
    case onednn_post_op_type::eltwise_clip: os << "eltwise_clip"; break;
    case onednn_post_op_type::eltwise_linear: os << "eltwise_linear"; break;
    case onednn_post_op_type::eltwise_round: os << "eltwise_round"; break;
    case onednn_post_op_type::binary_mul: os << "binary_mul"; break;
    case onednn_post_op_type::binary_add: os << "binary_add"; break;
    case onednn_post_op_type::binary_max: os << "binary_max"; break;
    case onednn_post_op_type::binary_min: os << "binary_min"; break;
    case onednn_post_op_type::binary_relu: os << "binary_relu"; break;
    case onednn_post_op_type::scale: os << "scale"; break;
    case onednn_post_op_type::sum: os << "sum"; break;
    case onednn_post_op_type::optimized: os << "optimized"; break;
    case onednn_post_op_type::optimized_eltwise_act: os << "optimized_eltwise_act"; break;
    case onednn_post_op_type::optimized_eltwise_clip: os << "optimized_eltwise_clip"; break;
    case onednn_post_op_type::optimized_eltwise_linear: os << "optimized_eltwise_linear"; break;
    case onednn_post_op_type::optimized_eltwise_round: os << "optimized_eltwise_round"; break;
    case onednn_post_op_type::optimized_sum: os << "optimized_sum"; break;
    default: os << "invalid";
    }
    return os;
}
struct fused_primitive_desc_onednn {
onednn_post_op_type op_type; // onednn post-operation type
size_t mem_offset; // index of a memory buffer for current post-operation

View File

@@ -198,12 +198,11 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
// Not to fuse reorder if this removal changes input format of its next node which has reuse in fused_op
if (next.get_preferred_impl_type() == impl_types::onednn) {
for (auto& fused_op : next.get_fused_primitives()) {
if (fused_op.node->is_type<eltwise>() && fused_op.deps.size() == 1) {
if (fused_op.node->is_type<eltwise>()) {
auto eltw_in_layout = next.get_dependency(fused_op.dep_start_idx).get_output_layout();
auto out_layout = next.get_output_layout();
if (program_helpers::needs_onednn_sum_post_op(fused_op.node->as<eltwise>(), eltw_in_layout) &&
program_helpers::are_layouts_identical_for_onednn_sum_post_op(eltw_in_layout, out_layout) &&
prev.get_output_layout().format != out_layout.format)
auto add_type = onednn_add_fusing_helpers::get_add_fusing_type(next, fused_op);
if (add_type == add_fusing_type::sum && prev.get_output_layout().format != out_layout.format)
return false;
}
}
@@ -947,23 +946,6 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout,
bool use_onednn_impls = _optimization_attributes.use_onednn_impls && input_layout.data_type != data_types::f32;
bool i8_u8_input = input_layout.data_type == data_types::u8 || input_layout.data_type == data_types::i8;
if (use_onednn_impls && onednn_valid_post_ops) {
for (auto& fo : node.get_fused_primitives()) {
if (fo.node->is_type<eltwise>()) {
auto in_layout = node.get_dependency(fo.dep_start_idx).get_output_layout();
auto out_layout = node.get_output_layout();
auto in_dt = in_layout.data_type;
auto out_dt = out_layout.data_type;
if ((out_layout.count() == in_layout.count()) &&
(data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt)) && in_dt != out_dt &&
program_helpers::needs_onednn_sum_post_op(fo.node->as<eltwise>(), in_layout)) {
onednn_valid_post_ops = false;
break;
}
}
}
}
if (use_onednn_impls && onednn_valid_post_ops) {
std::function<bool(const program_node&)> has_any_convolutions_below;
has_any_convolutions_below = [&](const program_node& node) -> bool {
@@ -1373,23 +1355,6 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format
impl_candidate = impl_types::ocl;
}
// [WA] to avoid an onednn kernel issue of multiple sum post-ops
if (!node.get_fused_primitives().empty()) {
size_t sum_post_op_cnt = 0;
for (auto& fused_op : node.get_fused_primitives()) {
if (fused_op.node->is_type<eltwise>() && node.get_dependencies().size() > fused_op.dep_start_idx && fused_op.deps.size() == 1) {
auto& eltw_in = node.get_dependency(fused_op.dep_start_idx);
if (program_helpers::are_layouts_identical_for_onednn_sum_post_op(eltw_in.get_output_layout(), node.get_output_layout()) &&
program_helpers::needs_onednn_sum_post_op(fused_op.node->as<eltwise>(), eltw_in.get_output_layout())) {
if (sum_post_op_cnt > 0)
return impl_types::ocl;
sum_post_op_cnt += 1;
}
}
}
}
if (node.is_type<convolution>()) {
// oneDNN doesn't have good support for groups with fsv16 fmt
auto& conv = node.as<convolution>();
@@ -1418,29 +1383,8 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format
impl_candidate = impl_types::ocl;
}
size_t eltw_dep = 0;
for (auto& fo : node.get_fused_primitives()) {
if (fo.node->is_type<eltwise>()) {
auto in_layout = node.get_dependency(fo.dep_start_idx).get_output_layout();
auto out_layout = node.get_output_layout();
auto in_dt = in_layout.data_type;
auto out_dt = out_layout.data_type;
if (program_helpers::needs_onednn_sum_post_op(fo.node->as<eltwise>(), in_layout)) {
if ((out_layout.count() == in_layout.count()) &&
(data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt)) && in_dt != out_dt) {
impl_candidate = impl_types::ocl;
break;
}
if (in_layout.size == out_layout.size && in_layout.format == out_layout.format && in_layout.data_padding == out_layout.data_padding &&
data_type_traits::size_of(in_dt) == data_type_traits::size_of(out_dt)) {
if (eltw_dep > 0) {
impl_candidate = impl_types::ocl;
break;
}
eltw_dep = fo.dep_start_idx;
}
}
} else if (fo.node->is_type<activation>()) {
if (fo.node->is_type<activation>()) {
// Some activations aren't implemented in oneDNN
auto activation_prim = fo.node->as<activation>().get_primitive();
if (activation_prim->activation_function == activation_func::negative ||
@@ -1486,15 +1430,17 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format
auto out_layout = node.get_output_layout();
auto in_dt = in_layout.data_type;
auto out_dt = out_layout.data_type;
if ((out_layout.count() == in_layout.count()) &&
(data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt)) && in_dt != out_dt &&
program_helpers::needs_onednn_sum_post_op(fo.node->as<eltwise>(), in_layout)) {
// if it is not eltwise sum and input is full tensor
if ((out_layout.count() == in_layout.count()) && in_dt != out_dt
&& (data_type_traits::is_floating_point(in_dt) || data_type_traits::is_floating_point(out_dt))
&& onednn_add_fusing_helpers::is_full_tensor(in_layout)) {
impl_candidate = impl_types::ocl;
break;
}
if (fo.node->as<eltwise>().get_primitive()->mode == eltwise_mode::sum &&
program_helpers::needs_onednn_sum_post_op(fo.node->as<eltwise>(), in_layout)) {
// WA: onednn sum/binary_add post-op are not supported due to perf drop.
auto add_type = onednn_add_fusing_helpers::get_add_fusing_type(node, fo);
if (add_type == add_fusing_type::sum || add_type == add_fusing_type::binary_per_tensor || add_type == add_fusing_type::binary_per_oc) {
impl_candidate = impl_types::ocl;
break;
}

View File

@@ -536,54 +536,22 @@ void network::allocate_primitives() {
for (auto const& node : _program->get_processing_order()) {
if (node->get_preferred_impl_type() == impl_types::onednn) {
bool can_reuse_eltwise_mem = false;
size_t eltw_dep = 0;
for (auto& fused_op : node->get_fused_primitives()) {
if (fused_op.node->is_type<eltwise>() && fused_op.deps.size() == 1) {
auto& eltw_in = node->get_dependency(fused_op.dep_start_idx);
auto eltw_in_layout = eltw_in.get_output_layout();
auto out_layout = node->get_output_layout();
if (!program_helpers::needs_onednn_sum_post_op(fused_op.node->as<eltwise>(), eltw_in_layout))
// If it is first sum, reuse the buffer
auto fusing_type = onednn_add_fusing_helpers::get_add_fusing_type(*node, fused_op);
if (fusing_type != add_fusing_type::sum || eltw_dep != 0)
continue;
if (program_helpers::are_layouts_identical_for_onednn_sum_post_op(eltw_in_layout, out_layout)) {
if (eltw_dep > 0)
throw std::runtime_error("Unsupported multiple full size tensors.");
eltw_dep = fused_op.dep_start_idx;
can_reuse_eltwise_mem = true;
eltw_dep = fused_op.dep_start_idx;
auto& eltw_in = node->get_dependency(eltw_dep);
if (_primitives.find(eltw_in.id()) != _primitives.end() && _primitives.find(node->id()) != _primitives.end()) {
auto& eltw_inst = _primitives.at(eltw_in.id());
auto& prim_inst = _primitives.at(node->id());
auto& eltw_mem = eltw_inst->output_memory();
auto new_mem = eltw_mem.get_engine()->reinterpret_buffer(eltw_mem, node->get_output_layout());
prim_inst->set_output_memory(new_mem);
}
if (!can_reuse_eltwise_mem) {
if (_primitives.find(eltw_in.id()) != _primitives.end() && _primitives.find(node->id()) != _primitives.end()) {
auto& eltw_inst = _primitives.at(eltw_in.id());
auto& prim_inst = _primitives.at(node->id());
auto eltw_mem_type = eltw_inst->output_memory().get_allocation_type();
auto prim_mem_type = prim_inst->output_memory().get_allocation_type();
// Keep lockable memory type for `prim_inst` output if needed
if (eltw_mem_type != prim_mem_type && eltw_mem_type != allocation_type::cl_mem && eltw_mem_type != allocation_type::usm_host)
can_reuse_eltwise_mem = false;
}
}
if (program_helpers::needs_onednn_sum_post_op(fused_op.node->as<eltwise>(), eltw_in_layout) && !can_reuse_eltwise_mem) {
throw std::runtime_error("Buffer reuse is required for onednn sum post operation.");
}
}
}
if (can_reuse_eltwise_mem) {
auto& eltw_in = node->get_dependency(eltw_dep);
if (_primitives.find(eltw_in.id()) != _primitives.end() && _primitives.find(node->id()) != _primitives.end()) {
auto& eltw_inst = _primitives.at(eltw_in.id());
auto& prim_inst = _primitives.at(node->id());
auto& eltw_mem = eltw_inst->output_memory();
auto new_mem = eltw_mem.get_engine()->reinterpret_buffer(eltw_mem, node->get_output_layout());
prim_inst->set_output_memory(new_mem);
}
}
}
@@ -698,8 +666,21 @@ void network::execute_impl(const std::vector<event::ptr>& events) {
}
GPU_DEBUG_IF(debug_config->verbose >= 1) {
std::ostringstream in_addr;
// buffer_ptr() only supports usm_memory
for (size_t i = 0; i < get_primitive(inst->id())->dependencies().size(); i++) {
auto& in_mem = get_primitive(inst->id())->dep_memory(i);
in_addr << in_mem.buffer_ptr();
if (i < get_primitive(inst->id())->dependencies().size() - 1) {
in_addr << ", ";
}
}
auto& out_mem = get_primitive(inst->id())->output_memory();
GPU_DEBUG_COUT << "Execute " << inst->id() << ", memory type: "
<< inst->output_memory().get_allocation_type() << std::endl;
<< inst->output_memory().get_allocation_type() << ", in_usm("
<< in_addr.str() << "), out_usm("
<< out_mem.buffer_ptr() << ")" << std::endl;
}
// If a node has mutable input or it's an output, then the input/output buffers might be changed

View File

@@ -7,9 +7,11 @@
#include "program_helpers.h"
#include "intel_gpu/graph/program.hpp"
#include "data_inst.h"
#include "pooling_inst.h"
#include <algorithm>
#include <utility>
#include <vector>
#include <sstream>
namespace cldnn {
// helper function for merging the weights/biases buffers on cpu side for depthwise separable convolution optimization
@@ -181,28 +183,52 @@ std::pair<bool, bool> program_helpers::are_layouts_identical(layout const& l1, l
return {false, false};
}
// check if input and output layouts are identical to reuse memory in fused_ops of onednn
// Two layouts are "identical" for the purpose of reusing memory in a oneDNN
// sum post-op when their tensor sizes, formats, paddings, and element sizes
// all match. Element size (not exact data type) is what matters, because the
// buffer is reinterpreted rather than converted.
bool program_helpers::are_layouts_identical_for_onednn_sum_post_op(layout input_layout, layout output_layout) {
    return input_layout.size == output_layout.size
        && input_layout.format == output_layout.format
        && input_layout.data_padding == output_layout.data_padding
        && data_type_traits::size_of(input_layout.data_type) == data_type_traits::size_of(output_layout.data_type);
}
bool program_helpers::needs_onednn_sum_post_op(const eltwise_node& n, layout input_layout) {
auto output_layout = n.get_output_layout();
if (n.get_primitive()->mode == eltwise_mode::sum &&
(input_layout.size.spatial[0] > 1 || input_layout.size.spatial[1] > 1 || input_layout.size.batch[0] > 1)
&& output_layout.data_type == input_layout.data_type) {
// A layout is considered a "full tensor" when any spatial dimension
// (x, y, or — for 3-spatial-rank layouts — z) or the batch dimension
// is greater than one.
bool onednn_add_fusing_helpers::is_full_tensor(const layout& l) {
    const bool spatial_nontrivial = l.size.spatial[0] > 1
        || l.size.spatial[1] > 1
        || (l.get_spatial_rank() == 3 && l.size.spatial[2] > 1);
    return spatial_nontrivial || l.size.batch[0] > 1;
}
// Invokes func once for every fused eltwise primitive of node whose
// eltwise mode matches the requested mode. Non-eltwise fused primitives
// and eltwise primitives with a different mode are skipped.
void onednn_add_fusing_helpers::for_eltwise(
    const program_node& node, eltwise_mode mode,
    std::function<void(const program_node& p_node, const eltwise_node& e_node,
                       const fused_primitive_desc& desc)> func) {
    for (auto& fused_desc : node.get_fused_primitives()) {
        if (!fused_desc.node->is_type<eltwise>())
            continue;
        const auto& e_node = fused_desc.node->as<eltwise>();
        if (e_node.get_primitive()->mode == mode)
            func(node, e_node, fused_desc);
    }
}
// Classifies how a fused eltwise-sum on p_node should be mapped to a oneDNN
// post-op:
//  - sum:               the dependency's buffer can be reused in place
//  - binary_per_tensor: both are full tensors of equal size, but in-place
//                       reuse is unsafe (format/padding/element-size mismatch,
//                       multiple users, or a pooling node)
//  - binary_per_oc:     fallback — per-output-channel broadcast add
//  - not_supported:     desc is not an eltwise sum at all
// Fix vs. original: removed the unused locals (the eltwise node's own output
// layout was computed and never read).
add_fusing_type onednn_add_fusing_helpers::get_add_fusing_type(
    const program_node& p_node, const fused_primitive_desc& desc) {
    if (!desc.node->is_type<eltwise>() || desc.node->as<eltwise>().get_primitive()->mode != eltwise_mode::sum) {
        return add_fusing_type::not_supported;
    }
    auto& dep_node = p_node.get_dependency(desc.dep_start_idx);
    auto p_layout = p_node.get_output_layout();
    auto d_layout = dep_node.get_output_layout();
    if (is_full_tensor(p_layout) && is_full_tensor(d_layout)) {
        // In-place sum requires identical geometry, format, and padding,
        // matching element sizes (the buffer is reinterpreted, not converted),
        // a single user of the dependency (its buffer gets overwritten),
        // and a non-pooling node.
        if (data_type_traits::size_of(p_layout.data_type) == data_type_traits::size_of(d_layout.data_type)
            && p_layout.format == d_layout.format && p_layout.size == d_layout.size
            && p_layout.data_padding == d_layout.data_padding
            && dep_node.get_users().size() == 1
            && !p_node.is_type<pooling>()) {
            return add_fusing_type::sum;
        } else if (p_layout.size == d_layout.size) {
            return add_fusing_type::binary_per_tensor;
        }
    }
    return add_fusing_type::binary_per_oc;
}
} // namespace cldnn

View File

@@ -7,6 +7,7 @@
#include "primitive_inst.h"
#include "loop_inst.h"
#ifdef ENABLE_ONEDNN_FOR_GPU
#include "intel_gpu/runtime/debug_configuration.hpp"
#include "convolution_inst.h"
#include "quantize_inst.h"
#include "reorder_inst.h"
@@ -345,6 +346,8 @@ bool program_node::has_out_scales(const std::shared_ptr<dnnl::primitive_attr>& a
dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const std::shared_ptr<dnnl::primitive_attr>& attr,
bool& optimization_is_completed) {
GPU_DEBUG_GET_INSTANCE(debug_config);
// Create new dnnl::post_ops object which will be filled inside the optimization process
dnnl::post_ops optimized_p_ops;
@@ -393,6 +396,7 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const
float scale;
dnnl::memory::data_type data_type;
cur_p_ops.get_params_sum(idx, scale, data_type);
// Only conv supports data type specification in append_sum. Other primitives(deconv, fc) do not support it.
if (is_type<convolution>()) {
new_p_ops.append_sum(scale, data_type);
} else {
@@ -419,7 +423,8 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const
// Check that post-op type is any optimized
auto type_is_any_optimized = [](onednn_post_op_type type) -> bool {
return type == onednn_post_op_type::optimized || type == onednn_post_op_type::optimized_sum ||
return type == onednn_post_op_type::optimized ||
type == onednn_post_op_type::optimized_sum ||
type == onednn_post_op_type::optimized_eltwise_act ||
type == onednn_post_op_type::optimized_eltwise_linear ||
type == onednn_post_op_type::optimized_eltwise_clip ||
@@ -462,20 +467,45 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const
}
};
auto remove_optimized_prefix = [&](std::vector<fused_primitive_desc_onednn>& post_ops) {
// Check and update post-op map if we already optimized something
auto iter = post_ops.begin();
while (iter != post_ops.end()) {
if (type_is_optimized_sum(iter->op_type)) {
iter->op_type = onednn_post_op_type::sum;
++iter;
} else if (type_is_optimized_eltwise(iter->op_type)) {
iter->op_type = get_eltwise_type(iter->op_type);
++iter;
} else if (type_is_optimized(iter->op_type)) {
iter = post_ops.erase(iter);
} else {
++iter;
}
}
};
auto& cur_post_ops = get_fused_primitives_onednn();
size_t cur_post_op_idx = 1;
size_t prev_post_op_idx = 0;
bool optimization_done = false;
// Check and update post-op map if we already optimized something
for (size_t post_op_idx = 0; post_op_idx < cur_post_ops.size(); post_op_idx++) {
if (type_is_optimized_sum(cur_post_ops[post_op_idx].op_type))
cur_post_ops[post_op_idx].op_type = onednn_post_op_type::sum;
else if (type_is_optimized_eltwise(cur_post_ops[post_op_idx].op_type))
cur_post_ops[post_op_idx].op_type = get_eltwise_type(cur_post_ops[post_op_idx].op_type);
else if (type_is_optimized(cur_post_ops[post_op_idx].op_type))
cur_post_ops.erase(cur_post_ops.begin() + post_op_idx);
GPU_DEBUG_IF(debug_config->verbose >= 3) {
GPU_DEBUG_COUT << "================================================" << std::endl;
GPU_DEBUG_COUT << " " << id() << ", num of post_ops " << p_ops.len() << std::endl;
for (size_t i = 0; i < cur_post_ops.size(); i++)
GPU_DEBUG_COUT << " " << i << ": " << cur_post_ops[i].op_type << std::endl;
}
remove_optimized_prefix(cur_post_ops);
GPU_DEBUG_IF(debug_config->verbose >= 3) {
GPU_DEBUG_COUT << "remove optimized prefix ------------------------" << std::endl;
GPU_DEBUG_COUT << " " << id() << ", num of post_ops " << p_ops.len() << std::endl;
for (size_t i = 0; i < cur_post_ops.size(); i++)
GPU_DEBUG_COUT << " " << i << ": " << cur_post_ops[i].op_type << std::endl;
GPU_DEBUG_COUT << "----------------------------------->>>>>>>>>>>>>" << std::endl;
}
// Get post-ops size for current node
@@ -498,6 +528,9 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const
auto cur_type = cur_post_ops[cur_post_op_idx].op_type;
auto prev_type = cur_post_ops[prev_post_op_idx].op_type;
GPU_DEBUG_IF(debug_config->verbose >= 3)
GPU_DEBUG_COUT << "before prev_post_op_idx: " << prev_post_op_idx << ", cur_post_op_idx: " << cur_post_op_idx << std::endl;
// Ignore optimized operations for "previous" operation in our operation pair
while (type_is_any_optimized(prev_type) && prev_post_op_idx < post_ops_size - 1) {
prev_post_op_idx++;
@@ -513,9 +546,18 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const
cur_type = cur_post_ops[cur_post_op_idx].op_type;
}
GPU_DEBUG_IF(debug_config->verbose >= 3)
GPU_DEBUG_COUT << "after prev_post_op_idx: " << prev_post_op_idx << ", cur_post_op_idx: " << cur_post_op_idx << std::endl;
auto cur_idx = static_cast<int>(has_out_scales(attr) ? (cur_post_op_idx >= 1 ? cur_post_op_idx - 1 : 0) : cur_post_op_idx);
auto prev_idx = static_cast<int>(has_out_scales(attr) ? (prev_post_op_idx >= 1 ? prev_post_op_idx - 1 : 0) : prev_post_op_idx);
// if 2 indices are same, add the last post-op to dnnl::post_ops
if (prev_idx == post_ops_size - 1 && prev_idx == cur_idx && !type_is_any_optimized(prev_type)) {
add_post_op(prev_type, p_ops, optimized_p_ops, prev_idx);
break;
}
// If this is the last pair and it's optimized - add the last post-op and go out from the cycle
if (cur_post_op_idx == post_ops_size - 1 && (type_is_any_optimized(cur_type) || type_is_any_optimized(prev_type))) {
if (!type_is_any_optimized(prev_type)) {
@@ -542,6 +584,11 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const
bool cur_ops_pair_is_optimized = false;
GPU_DEBUG_IF(debug_config->verbose >= 3) {
GPU_DEBUG_COUT << "prev_idx: " << prev_idx << " " << prev_type
<< ", cur_idx: " << cur_idx << " " << cur_type << std::endl;
}
if (can_try_optimize) {
if (eltw_and_eltw) {
dnnl::algorithm cur_alg, prev_alg;
@@ -701,6 +748,7 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const
dnnl::post_ops eltw_p_op_prev, sum_p_op;
eltw_p_op_prev.append_eltwise(eltw_scale * next_alpha * next_scale, alg, alpha, beta);
// Only conv supports data type specification in append_sum. Other primitives(deconv, fc) do not support it.
if (is_type<convolution>()) {
sum_p_op.append_sum(sum_scale * next_alpha, data_type);
} else {
@@ -769,7 +817,18 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const
}
}
// if optimization_is_completed is true, try to optimize again.
optimization_is_completed = !optimization_is_completed;
if (optimization_is_completed) {
remove_optimized_prefix(cur_post_ops);
}
GPU_DEBUG_IF(debug_config->verbose >= 3) {
GPU_DEBUG_COUT << ">>>>>>>>>>>>>-----------------------------------" << std::endl;
for (size_t i = 0; i < cur_post_ops.size(); i++)
GPU_DEBUG_COUT << " " << i << ": " << cur_post_ops[i].op_type << std::endl;
GPU_DEBUG_COUT << "------------------------------------------------" << std::endl;
}
add_onednn_fused_primitives(cur_post_ops);
@@ -805,6 +864,7 @@ void program_node::init_onednn_primitive_attributes() {
memory_offset++;
};
int32_t num_sum_post_ops = 0;
for (size_t idx = 0; idx < cldnn_post_ops.size(); idx++) {
auto node = cldnn_post_ops[idx].node;
@@ -834,13 +894,11 @@ void program_node::init_onednn_primitive_attributes() {
auto in = get_dependency(dep_idx).get_output_layout();
if (e_node.get_primitive()->mode == eltwise_mode::sum) {
if (program_helpers::needs_onednn_sum_post_op(e_node, in)) {
if (is_type<convolution>()) {
post_ops.append_sum(1.0f, onednn::convert_data_type(in.data_type));
} else {
post_ops.append_sum(1.0f);
}
auto fusing_type = onednn_add_fusing_helpers::get_add_fusing_type(*this, cldnn_post_ops[idx]);
if (fusing_type == add_fusing_type::sum && num_sum_post_ops == 0) {
post_ops.append_sum(1.0f);
update_onednn_post_op_list(onednn_post_op_type::sum, dep_idx);
num_sum_post_ops++;
} else {
dnnl::memory::desc in_desc = onednn::layout_to_memory_desc(in);
post_ops.append_binary(dnnl::algorithm::binary_add, in_desc);

View File

@@ -107,6 +107,7 @@ struct gpu_usm : public lockable_gpu_mem, public memory {
void unlock(const stream& stream) override;
const cl::UsmMemory& get_buffer() const { return _buffer; }
cl::UsmMemory& get_buffer() { return _buffer; }
void* buffer_ptr() const override { return _buffer.get(); }
event::ptr fill(stream& stream, unsigned char pattern) override;
event::ptr fill(stream& stream) override;

View File

@@ -3410,68 +3410,130 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_input_range, ::test
convolution_test_params{ CASE_CONV_S8S8_15, 2, 3 },
}));
// Parameter pack for convolution + fused eltwise-sum tests.
// Describes the convolution geometry, the data types / formats of each
// participant (input, weights, eltwise operand, output), and the expected
// primitive counts with and without fusing.
struct convolution_eltw_sum_test_params {
    tensor in_shape;                        // convolution input shape
    tensor out_shape;                       // expected convolution output shape
    tensor kernel;                          // convolution kernel size
    ov::Strides stride;                     // convolution strides
    ov::CoordinateDiff pad;                 // convolution padding
    ov::Strides dilation;                   // convolution dilations
    uint32_t groups;                        // number of convolution groups
    data_types data_type;                   // input data type
    format input_format;                    // input layout format
    data_types weights_type;                // weights data type
    format weights_format;                  // weights layout format
    data_types eltw_type;                   // eltwise-sum operand data type
    format eltw_format;                     // eltwise-sum operand layout format
    data_types out_type;                    // output data type
    format out_format;                      // output layout format
    data_types default_type;                // data type for auxiliary data (e.g. per-channel constants)
    format default_format;                  // format for auxiliary data
    size_t expected_fused_primitives;       // primitive count expected after fusing
    size_t expected_not_fused_primitives;   // primitive count expected without fusing
};
// input:b_fs_yx_fsv32:u8 X weight:bfyx:i8 + eltwise_sum:b_fs_yx_fsv32:u8
// After optimization: eltwise_any + binary_add
// DNNL_VERBOSE log with optimization: attr-post-ops:eltwise_tanh+binary_add:u8:14:aBcd32b+eltwise_linear:1
// Common fixture for convolution + eltwise-sum fusing tests on the oneDNN path.
// Runs the fused and non-fused networks on the same random input, compares the
// outputs, and — on devices with immad support — verifies that the fused
// convolution was lowered to a oneDNN ("jit:ir") kernel.
class EltwiseSumFusingTestOneDNN : public BaseFusingTest<convolution_eltw_sum_test_params> {
public:
    void execute(convolution_eltw_sum_test_params& p) {
        // Quantized (u8) inputs get a bounded random range; float inputs use the default range.
        auto input_prim = p.data_type == data_types::u8 ? get_mem(get_input_layout(p), 0, 10) : get_mem(get_input_layout(p));
        network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused);
        network network_fused(this->engine, this->topology_fused, bo_fused);
        network_fused.set_input_data("input", input_prim);
        network_not_fused.set_input_data("input", input_prim);

        compare(network_not_fused, network_fused, p);

        auto pi_fused = network_fused.get_primitives_info();
        // const lambda parameter; renamed to avoid shadowing the outer `p`.
        auto info_fused = std::find_if(pi_fused.begin(), pi_fused.end(), [](const primitive_info& info) {
            return info.original_id == "conv_prim";
        });
        if (info_fused != pi_fused.end() && engine.get_device_info().supports_immad) {
            std::cout << "kernel: " << info_fused->kernel_id << std::endl;
            EXPECT_TRUE(info_fused->kernel_id.find("jit:ir") != std::string::npos);
        }
    }

    // Input layout with symmetric spatial padding taken from the test params.
    layout get_input_layout(convolution_eltw_sum_test_params& p) {
        auto pad = p.pad;
        std::vector<int> pad_ = { 0, 0, static_cast<int>(pad[0]), static_cast<int>(pad[1]) };
        return layout{ p.data_type, p.input_format, p.in_shape, padding{ pad_ } };
    }

    // Per-output-channel layout (1 x out_features x 1 x 1), e.g. for bias/quantize data.
    layout get_per_channel_layout(convolution_eltw_sum_test_params& p) {
        return layout{ p.default_type, p.default_format, tensor{ 1, p.out_shape.feature[0], 1, 1 } };
    }
};
// conv -> tanh -> quantize -> eltwise_sum with a full-tensor operand.
// Depending on the operand's type/format vs the conv output, oneDNN lowers the
// sum either to a `sum` post-op (matching layout/type) or to `binary_add`.
class onednn_binary_add_full_tensor : public EltwiseSumFusingTestOneDNN {};
TEST_P(onednn_binary_add_full_tensor, basic) {
    auto p = GetParam();
    create_topologies(
        input_layout("input", get_input_layout(p)),
        data("weights", get_mem(get_weights_layout(p))),
        data("bias", get_mem(get_bias_layout(p))),
        data("in_lo1", get_mem(get_single_element_layout(p), 0)),
        data("in_hi1", get_mem(get_single_element_layout(p), 100)),
        data("out_lo1", get_mem(get_single_element_layout(p), 0)),
        data("out_hi1", get_mem(get_single_element_layout(p), 100)),
        // Eltwise operand with its own type/format — drives sum vs binary_add selection.
        data("eltwise_data", get_mem(layout{ p.eltw_type, p.eltw_format, p.out_shape }, 0, 100)),
        convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation, false),
        activation("activation", "conv_prim", activation_func::hyperbolic_tan),
        quantize("quantize1", "activation", "in_lo1", "in_hi1", "out_lo1", "out_hi1", 256, p.out_type),
        eltwise("sum", { "quantize1", "eltwise_data" }, eltwise_mode::sum, p.out_type),
        reorder("reorder_bfyx", "sum", p.default_format, p.default_type)
    );

    tolerance = 1.f;
    execute(p);
}
// in_shape; out_shape; kernel; stride; pad; dilation; groups; data_type; input_format; weights_type; weights_format; default_type; default_format;
#define CASE_CONV_U8S8_FT_BINARY_ADD_1 { 1, 32, 4, 4 }, { 1, 16, 4, 4 }, tensor{ 1, 1, 3, 3 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, 1, data_types::u8, format::b_fs_yx_fsv32, data_types::i8, format::bfyx, data_types::f32, format::bfyx
// in_shape; out_shape; kernel; stride; pad; dilation; groups; data_type; input_format; weights_type; weights_format; eltw_type; eltw_format; out_type; out_format; default_type; default_format;
#define CASE_CONV_ELTW_SUM_BINARY_ADD_1 { 1, 32, 4, 4 }, { 1, 16, 4, 4 }, { 1, 1, 3, 3 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, 1, data_types::u8, format::b_fs_yx_fsv32, data_types::i8, format::bfyx, data_types::u8, format::b_fs_yx_fsv32, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_CONV_ELTW_SUM_SUM_1 { 1, 32, 4, 4 }, { 1, 16, 4, 4 }, { 1, 1, 3, 3 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, 1, data_types::u8, format::b_fs_yx_fsv32, data_types::i8, format::bfyx, data_types::u8, format::b_fs_yx_fsv32, data_types::u8, format::b_fs_yx_fsv32, data_types::f32, format::bfyx
INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_binary_add_full_tensor, ::testing::ValuesIn(std::vector<convolution_test_params>{
// cases with batch = 1
convolution_test_params{ CASE_CONV_U8S8_FT_BINARY_ADD_1, 2, 5 },
INSTANTIATE_TEST_SUITE_P(eltwise_sum_fusings_gpu, onednn_binary_add_full_tensor, ::testing::ValuesIn(std::vector<convolution_eltw_sum_test_params>{
convolution_eltw_sum_test_params{ CASE_CONV_ELTW_SUM_BINARY_ADD_1, 2, 5 },
convolution_eltw_sum_test_params{ CASE_CONV_ELTW_SUM_SUM_1, 2, 5 },
}));
// input:b_fs_yx_fsv16:f16 X weight:bfyx:f16 + eltwise_sum:b_fs_yx_fsv16:f16
// After optimization: eltwise_any + sum
// DNNL_VERBOSE log with optimization: attr-post-ops:eltwise_tanh+sum:1:0:f16
class post_ops_optimizations_onednn_sum_full_tensor : public WeightsPrimitiveFusingTestOneDNN {};
TEST_P(post_ops_optimizations_onednn_sum_full_tensor, basic) {
class onednn_multiple_binary_add_full_tensor : public EltwiseSumFusingTestOneDNN {};
TEST_P(onednn_multiple_binary_add_full_tensor, basic) {
auto p = GetParam();
create_topologies(
input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation),
data("in_lo1", get_mem(get_single_element_layout(p), 0)),
data("in_hi1", get_mem(get_single_element_layout(p), 100)),
data("out_lo1", get_mem(get_single_element_layout(p), 0)),
data("out_hi1", get_mem(get_single_element_layout(p), 100)),
data("eltwise_data", get_mem(layout{ p.eltw_type, p.eltw_format, p.out_shape }, 0, 100)),
data("eltwise_data1", get_mem(layout{ p.eltw_type, p.eltw_format, p.out_shape }, 0, 100)),
data("eltwise_data2", get_mem(layout{ p.eltw_type, format::bfyx, tensor{ 1, p.out_shape.feature[0], 1, 1 } }, 0, 100)),
convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation, false),
activation("activation", "conv_prim", activation_func::hyperbolic_tan),
data("eltwise_data", get_mem(get_output_layout(p), 0, 255)),
eltwise("sum", { "activation", "eltwise_data" }, eltwise_mode::sum),
reorder("reorder_bfyx", "sum", p.default_format, data_types::f32)
quantize("quantize1", "activation", "in_lo1", "in_hi1", "out_lo1", "out_hi1", 256, p.out_type),
eltwise("sum", { "quantize1", "eltwise_data" }, eltwise_mode::sum, p.out_type), // eltwise sum with full tensor
eltwise("sum1", { "sum", "eltwise_data1" }, eltwise_mode::sum, p.out_type), // eltwise sum with full tensor
eltwise("sum2", { "sum1", "eltwise_data2" }, eltwise_mode::sum, p.out_type), // eltwise sum with broadcasting
reorder("reorder_bfyx", "sum2", p.default_format, p.default_type)
);
tolerance = 1.f;
execute(p);
}
#define CASE_CONV_F16F16_FT_ELTW_SUM_1 { 1, 32, 4, 4 }, { 1, 16, 4, 4 }, tensor{ 1, 1, 3, 3 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, 1, data_types::f16, format::b_fs_yx_fsv16, data_types::f16, format::bfyx, data_types::f32, format::bfyx
INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_sum_full_tensor, ::testing::ValuesIn(std::vector<convolution_test_params>{
// cases with batch = 1
convolution_test_params{ CASE_CONV_F16F16_FT_ELTW_SUM_1, 2, 4 },
INSTANTIATE_TEST_SUITE_P(multiple_eltwise_sum_fusings_gpu, onednn_multiple_binary_add_full_tensor, ::testing::ValuesIn(std::vector<convolution_eltw_sum_test_params>{
convolution_eltw_sum_test_params{ CASE_CONV_ELTW_SUM_BINARY_ADD_1, 2, 7 },
convolution_eltw_sum_test_params{ CASE_CONV_ELTW_SUM_SUM_1, 2, 7 },
}));
#endif // ENABLE_ONEDNN_FOR_GPU

View File

@@ -227,4 +227,10 @@ public:
topology_fused.add(args...);
topology_non_fused.add(args...);
}
// Adds the same primitives to both the fused and the non-fused topology,
// keeping the two networks structurally comparable for the fusing tests.
template <class... Args>
void add_topologies(Args const&... args) {
    topology_fused.add(args...);
    topology_non_fused.add(args...);
}
};