[GPU] Update for layout query (#13346)
+ Support multiple input and output target formats
+ Implement generic logic for format selection and find_data_format
+ Add test cases for select_preferred_formats

Signed-off-by: Min, Byungil <byungil.min@intel.com>
Parent: 385d87edaf
Commit: 4188f1f181
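At the core of this commit, the single required_input0/required_output pair on program_node is replaced by per-index vectors of preferred formats, so a node can advertise one format per dependency and per user. Below is a minimal standalone sketch of that API shape; Fmt and Node are simplified stand-ins, not the real cldnn types:

#include <cstddef>
#include <vector>

// Hedged sketch of the per-index preferred-format API introduced by this commit.
enum class Fmt { any, bfyx, b_fs_yx_fsv16 };

class Node {
public:
    // One preferred format per dependency (input) and per user (output);
    // out-of-range queries fall back to Fmt::any, matching the diff's behavior.
    Fmt preferred_input_fmt(std::size_t idx = 0) const {
        return idx < in_fmts_.size() ? in_fmts_[idx] : Fmt::any;
    }
    Fmt preferred_output_fmt(std::size_t idx = 0) const {
        return idx < out_fmts_.size() ? out_fmts_[idx] : Fmt::any;
    }
    void init_preferred_fmt(std::size_t dep_size, std::size_t user_size) {
        in_fmts_.assign(dep_size, Fmt::any);
        out_fmts_.assign(user_size, Fmt::any);
    }
    void set_preferred_input_fmt(std::size_t idx, Fmt f) {
        if (idx >= in_fmts_.size()) in_fmts_.resize(idx + 1, Fmt::any);
        in_fmts_[idx] = f;
    }
private:
    std::vector<Fmt> in_fmts_, out_fmts_;
};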
@@ -174,8 +174,8 @@ layout convolution_inst::calc_output_layout(convolution_node const& node, kernel
     // Adjust output format for shallow conv and mixed precision cases in onednn
     auto out_fmt = input_layout.format;
-    if (node.get_preferred_impl_type() == impl_types::onednn && node.get_required_output() != format::any) {
-        out_fmt = node.get_required_output();
+    if (node.get_preferred_impl_type() == impl_types::onednn && node.get_preferred_output_fmt() != format::any) {
+        out_fmt = node.get_preferred_output_fmt();
     }

     // get output feature map from weights. It should be the same as number of biases. Will be verifed in
@@ -42,8 +42,8 @@ layout deconvolution_inst::calc_output_layout(deconvolution_node const& node, ke
     int32_t number_of_features = weights_layout.group() * weights_layout.ofm();

     format out_fmt = input_layout.format;
-    if (node.get_preferred_impl_type() == impl_types::onednn && node.get_required_output() != format::any) {
-        out_fmt = node.get_required_output();
+    if (node.get_preferred_impl_type() == impl_types::onednn && node.get_preferred_output_fmt() != format::any) {
+        out_fmt = node.get_preferred_output_fmt();
     }

     if (desc->with_output_size) {
@@ -336,8 +336,7 @@ void remove_redundant_reorders::run(program& p) {
         if (!same_data_type && !allowed_dt_conversion_fuse)
             continue;

-        auto next_node = node.get_users().empty() ? nullptr : node.get_users().front();
-        if (!lo.can_fuse_reorder_to_prev(input, next_node, input.get_output_layout().format, output_layout.format))
+        if (!lo.can_fuse_reorder_to_prev(input, node, input.get_output_layout().format, output_layout.format))
             continue;

         auto old_output_layout_of_input = input.get_output_layout();
@@ -372,11 +372,11 @@ void minimize_local_reorders(program& p, std::map<program_node*, format::type>&
     }
 }

-static format get_target_output_format(layout_optimizer& lo, const std::map<program_node*, format::type>& fmt_map, program_node *node) {
-    // 1. Check required_output
+static format get_target_output_format(layout_optimizer& lo, const std::map<program_node*, format::type>& fmt_map, program_node *node, size_t user_idx = 0) {
+    // 1. Check selected preferred_output_format
     if (lo.get_optimization_attributes().use_onednn_impls) {
-        // If onednn is not used, need to ignore get_required_layout result as it is from onednn
-        auto ret = node->get_required_output();
+        // If onednn is not used, need to ignore get_preferred_output_fmt result as it is from onednn
+        auto ret = node->get_preferred_output_fmt(user_idx);

         if (ret != format::any)
             return ret;
@@ -390,11 +390,11 @@ static format get_target_output_format(layout_optimizer& lo, const std::map<prog
     return node->get_output_layout().format;
 }

-static format get_target_input0_format(layout_optimizer& lo, const std::map<program_node*, format::type>& fmt_map, program_node *node) {
-    // 1. Check required_input
+static format get_target_input_format(layout_optimizer& lo, const std::map<program_node*, format::type>& fmt_map, program_node *node, size_t dep_idx = 0) {
+    // 1. Check selected preferred_input_format
     if (lo.get_optimization_attributes().use_onednn_impls) {
-        // If onednn is not used, need to ignore get_required_layout result as it is from onednn
-        auto ret = node->get_required_input0();
+        // If onednn is not used, need to ignore get_preferred_input_fmt result as it is from onednn
+        auto ret = node->get_preferred_input_fmt(dep_idx);
         if (ret != format::any)
             return ret;
     }
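The two helpers above resolve a target format with a fixed priority: a valid preferred format wins, then the fmt_map entry produced by the layout query, then the node's current output layout format. A hedged sketch of that resolution order, again with simplified stand-in types rather than the real program_node/layout_optimizer:

#include <cstddef>
#include <map>

// Sketch of the three-level priority used by get_target_output_format /
// get_target_input_format: preferred format --> fmt_map --> layout format.
enum class Fmt { any, bfyx, b_fs_yx_fsv16 };

struct NodeStub {
    Fmt preferred = Fmt::any;   // e.g. derived from onednn's memory descriptor
    Fmt layout_fmt = Fmt::bfyx; // the node's current (default) output layout
};

Fmt resolve_target_format(const std::map<const NodeStub*, Fmt>& fmt_map, const NodeStub* n) {
    if (n->preferred != Fmt::any)       // 1. per-index preferred format
        return n->preferred;
    auto it = fmt_map.find(n);
    if (it != fmt_map.end() && it->second != Fmt::any)
        return it->second;              // 2. format queried by the layout pass
    return n->layout_fmt;               // 3. fallback: current output layout
}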
@@ -431,32 +431,18 @@ void insert_reorders_in_dir(program& p, const std::map<program_node*, format::ty
         // We have three (potentially) conflicting information here for format
         // node->get_output_layout().format : It is not up-to-date at this moment. It is just the default format (bfyx)
         // fmt_map.at(node).format : It is queried with get_preferred_layout. However, it has only output format.
-        // node.get_required_input0/output : If it is valid(!= any), it is up-to-date. It has input format, too.
-        // So the priority is required_input0/output --> fmt_map --> output_layout().format
+        // node.get_preferred_output_fmt : If it is valid(!= any), it is up-to-date.
+        // So the priority is preferred_input/output_format --> fmt_map --> output_layout().format
         auto predecessor = travel_direction_wrapper<dir>::first(node, next);
         auto successor = travel_direction_wrapper<dir>::second(node, next);
         auto in_layout = predecessor->get_output_layout();
         auto out_layout = in_layout;
-        in_layout.format = get_target_output_format(lo, fmt_map, predecessor);
-        auto target_input0_format = get_target_input0_format(lo, fmt_map, successor);
+        auto index_to_pred = successor->get_dependency_index(*predecessor);
+        auto index_to_succ = predecessor->get_user_index(*successor);

-        for (auto& fused_prim : successor->get_fused_primitives()) {
-            // If it is input of fused node, use output layout instead of input layout
-            if (successor->get_dependencies().size() <= fused_prim.dep_start_idx)
-                continue;
-            auto& dependency = successor->get_dependency(fused_prim.dep_start_idx);
-            if (&dependency == predecessor) {
-                target_input0_format = get_target_output_format(lo, fmt_map, successor);
-                GPU_DEBUG_IF(debug_config->verbose >= 2) {
-                    GPU_DEBUG_COUT << __func__ << ":" << __LINE__ << ": Use output format of successor " << successor->id() << " : "
-                                   << fmt_to_str(target_input0_format) << std::endl;
-                }
-                break;
-            }
-        }
+        in_layout.format = get_target_output_format(lo, fmt_map, predecessor, index_to_succ);
+        out_layout.format = get_target_input_format(lo, fmt_map, successor, index_to_pred);

-        out_layout.format = target_input0_format;
         GPU_DEBUG_IF(debug_config->verbose >= 2) {
             GPU_DEBUG_COUT << __func__ << ":" << __LINE__ << ":" << dir_msg(dir) << " " << node->id() << " --> " << next->id() << " ## "
                            << fmt_to_str(in_layout.format) << " --> " << fmt_to_str(out_layout.format) << std::endl;
@@ -468,9 +454,9 @@ void insert_reorders_in_dir(program& p, const std::map<program_node*, format::ty
         auto reorder_pair = rf.get_reorder(travel_direction_wrapper<dir>::first(node, next)->id(),
                                            in_layout,
                                            out_layout);
-        auto reorder = reorder_pair.first;

-        if (reorder) {
+        auto reorder = reorder_pair.first;
+        if (reorder && (in_layout.format != format::any && out_layout.format != format::any)) {
             auto& reorder_node = p.get_or_create(reorder);
             GPU_DEBUG_IF(debug_config->verbose >= 2) {
                 GPU_DEBUG_COUT << __func__ << ":" << __LINE__ << ":" << dir_msg(dir) << " " << reorder_node.id()
@@ -687,10 +673,10 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
     // For supporting optimized onednn first conv, the input format from prev reorder to this conv is changed to a recommended format by onednn.
     auto& input = conv_node.input();
     auto input_layout = input.get_output_layout();

-    if (conv_node.impl_type == impl_types::onednn && input_layout.format != conv_node.get_required_input0()) {
+    if (conv_node.impl_type == impl_types::onednn && input_layout.format != conv_node.get_preferred_input_fmt()) {
         // Data input format does NOT match with an output format of previous node
         auto new_layout = input_layout;
-        new_layout.format = conv_node.get_required_input0();
+        new_layout.format = conv_node.get_preferred_input_fmt();
         auto new_input = rf.get_reorder(input.id(), input_layout, new_layout);
         if (new_input.first)
             p.add_intermediate(new_input.first, conv_node, 0, !new_input.second);
@@ -0,0 +1,56 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "pass_manager.h"
+#include "data_inst.h"
+#include "mutable_data_inst.h"
+#include "program_node.h"
+#include "intel_gpu/runtime/engine.hpp"
+#include "runtime/cldnn_itt.hpp"
+#include <iostream>
+
+#ifdef ENABLE_ONEDNN_FOR_GPU
+#include <oneapi/dnnl/dnnl.hpp>
+#include "intel_gpu/runtime/debug_configuration.hpp"
+#include "impls/onednn/utils.hpp"
+#include "impls/onednn/convolution_onednn.hpp"
+#include "impls/onednn/deconvolution_onednn.hpp"
+#endif
+
+using namespace cldnn;
+
+void select_preferred_formats::run(program& p) {
+    OV_ITT_SCOPED_TASK(itt::domains::CLDNN, "pass::select_preferred_formats");
+
+    auto& engine = p.get_engine();
+    const auto& device_info = engine.get_device_info();
+
+    if (!device_info.supports_immad)
+        return;
+
+#ifdef ENABLE_ONEDNN_FOR_GPU
+    GPU_DEBUG_GET_INSTANCE(debug_config);
+    for (auto n : p.get_processing_order()) {
+        // Onednn primitive descriptor creation may fail, for example, due to asymmetric weight.
+        try {
+            dnnl::primitive_desc prim_desc;
+            if (n->is_type<convolution>()) {
+                auto desc = onednn::get_convolution_descriptor(*n->get_kernel_impl_params(), dnnl::memory::format_tag::any);
+                prim_desc = dnnl::primitive_desc(&desc->data, nullptr, engine.get_onednn_engine(), nullptr);
+            } else if (n->is_type<deconvolution>()) {
+                auto desc = onednn::get_deconvolution_descriptor(*n->get_kernel_impl_params(), dnnl::memory::format_tag::any);
+                prim_desc = dnnl::primitive_desc(&desc->data, nullptr, engine.get_onednn_engine(), nullptr);
+            }
+
+            _lo.select_preferred_formats_for_onednn(*n, prim_desc);
+        } catch(std::exception &exception) {
+            GPU_DEBUG_IF(debug_config->verbose >= 1) {
+                std::cout << "WARNING(select_preferred_formats): " << exception.what() << std::endl;
+            }
+        }
+    }
+#endif  // ENABLE_ONEDNN_FOR_GPU
+}
@@ -1,82 +0,0 @@
-// Copyright (C) 2018-2022 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-#include "pass_manager.h"
-#include "data_inst.h"
-#include "mutable_data_inst.h"
-#include "program_node.h"
-#include "intel_gpu/runtime/engine.hpp"
-#include "runtime/cldnn_itt.hpp"
-#include <iostream>
-#include "to_string_utils.h"
-#include "intel_gpu/runtime/debug_configuration.hpp"
-#ifdef ENABLE_ONEDNN_FOR_GPU
-#include <oneapi/dnnl/dnnl.hpp>
-#include "impls/onednn/utils.hpp"
-#include "impls/onednn/convolution_onednn.hpp"
-#include "impls/onednn/deconvolution_onednn.hpp"
-#endif
-
-using namespace cldnn;
-
-#ifdef ENABLE_ONEDNN_FOR_GPU
-static dnnl::primitive_desc get_convolution_prim_desc(cldnn::engine& engine, program_node& n) {
-    auto desc = onednn::get_convolution_descriptor(*n.get_kernel_impl_params(), dnnl::memory::format_tag::any);
-    // Note: did not handle attribute properly. especially for zero-point
-    dnnl::primitive_desc prim_desc{&desc->data, nullptr, engine.get_onednn_engine(), nullptr};
-    return prim_desc;
-}
-
-static dnnl::primitive_desc get_deconvolution_prim_desc(cldnn::engine& engine, program_node& n) {
-    auto desc = onednn::get_deconvolution_descriptor(*n.get_kernel_impl_params(), dnnl::memory::format_tag::any);
-    dnnl::primitive_desc prim_desc{&desc->data, nullptr, engine.get_onednn_engine(), nullptr};
-    return prim_desc;
-}
-#endif
-
-void set_required_layouts::run(program& p) {
-    OV_ITT_SCOPED_TASK(itt::domains::CLDNN, "CLDNN::pass::SetRequiredLayouts");
-
-    auto& engine = p.get_engine();
-    const auto& device_info = engine.get_device_info();
-
-    if (!device_info.supports_immad)
-        return;
-
-#ifdef ENABLE_ONEDNN_FOR_GPU
-    GPU_DEBUG_GET_INSTANCE(debug_config);
-    for (auto n : p.get_processing_order()) {
-        if (!(n->is_type<convolution>() || n->is_type<deconvolution>())
-            || !layout_optimizer::are_data_types_suitable_for_onednn(*n)) {
-            // only care for onednn convolutions
-            continue;
-        }
-
-        // Onednn primitive descriptor creation may fail, for example, due to asymmetric weight.
-        try {
-            dnnl::primitive_desc prim_desc;
-            if (n->is_type<convolution>()) {
-                prim_desc = get_convolution_prim_desc(engine, *n);
-            } else if (n->is_type<deconvolution>()) {
-                prim_desc = get_deconvolution_prim_desc(engine, *n);
-            }
-
-            auto src_fmt = onednn::find_data_format(prim_desc.src_desc());
-            auto dst_fmt = onednn::find_data_format(prim_desc.dst_desc());
-            GPU_DEBUG_GET_INSTANCE(debug_config);
-            GPU_DEBUG_IF(debug_config->verbose >= 2) {
-                std::cout << "set_required_layouts:" << n->id() << ": " << fmt_to_str(src_fmt) << " --> " << fmt_to_str(dst_fmt) << std::endl;
-            }
-            n->set_required_input0(src_fmt);
-            n->set_required_output(dst_fmt);
-        } catch(std::exception &exception) {
-            GPU_DEBUG_IF(debug_config->verbose >= 1) {
-                std::cout << "WARNING(set_required_layouts): " << exception.what() << std::endl;
-            }
-        }
-    }
-#endif
-}
@@ -5,6 +5,7 @@
 #include "utils.hpp"
 #include "onednn_formats_map.hpp"
 #include <oneapi/dnnl/dnnl_debug.h>
+#include <numeric>
 #include <oneapi/dnnl/dnnl_ocl.hpp>

 #include "to_string_utils.h"
@@ -147,7 +148,6 @@ dnnl::memory::format_tag convert_data_format(cldnn::format fmt) {
     return ret->first;
 }

-
 std::string convert_data_format_string(cldnn::format fmt) {
     switch (fmt) {
         case cldnn::format::b_fs_yx_fsv2: return "aBcd2b";
@@ -370,6 +370,23 @@ dnnl::memory::format_tag get_format_by_desc(dnnl::memory::desc desc) {
     return dnnl::memory::format_tag::undef;
 }

+static std::vector<size_t> get_order(dnnl::memory::desc desc) {
+    auto blk = desc.data.format_desc.blocking;
+    auto strides = blk.strides;
+    std::vector<size_t> order(desc.data.ndims);
+
+    std::iota(order.begin(), order.end(), 0);
+    std::sort(order.begin(), order.end(),
+              [&strides] (size_t ind_l, size_t ind_r) {
+                  return (strides[ind_l] > strides[ind_r]);
+              });
+    return order;
+}
+
+static bool compare_strides(std::vector<size_t> a, std::vector<size_t> b) {
+    return std::equal(a.begin(), a.end(), b.begin());
+}
+
 cldnn::format find_data_format(dnnl::memory::desc desc) {
     auto onednn_desc = get_format_by_desc(desc);
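get_order() recovers the logical dimension order of a blocked memory descriptor by ranking dimension indices by descending stride, so the outermost dimension (largest stride) comes first. A small self-contained illustration using a plain stride vector in place of dnnl::memory::desc:

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <numeric>
#include <vector>

// Standalone illustration of the stride-order computation in get_order().
std::vector<std::size_t> order_from_strides(const std::vector<std::size_t>& strides) {
    std::vector<std::size_t> order(strides.size());
    std::iota(order.begin(), order.end(), 0);
    std::sort(order.begin(), order.end(),
              [&](std::size_t l, std::size_t r) { return strides[l] > strides[r]; });
    return order;
}

int main() {
    // A 4D bfyx-like tensor of shape 1x32x64x64: strides {131072, 4096, 64, 1}
    // already rank b > f > y > x, so the recovered order is {0, 1, 2, 3}.
    for (std::size_t d : order_from_strides({131072, 4096, 64, 1}))
        std::cout << d << ' ';
    std::cout << '\n';  // prints: 0 1 2 3
}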
@@ -377,26 +394,26 @@ cldnn::format find_data_format(dnnl::memory::desc desc) {
         return convert_data_format(onednn_desc);
     } else {
         auto blk = desc.data.format_desc.blocking;
-        if (desc.data.ndims == 5 && blk.inner_nblks == 1
-            && blk.inner_blks[0] == 2
-            && blk.inner_idxs[0] == 1) {
-            return cldnn::format::b_fs_zyx_fsv2;
-        }
-        if (desc.data.ndims == 4 && blk.inner_nblks == 1
-            && blk.inner_blks[0] == 2
-            && blk.inner_idxs[0] == 1) {
-            return cldnn::format::b_fs_yx_fsv2;
-        }
-        if (desc.data.ndims == 4 && blk.inner_nblks == 2
-            && blk.inner_blks[0] == 16 && blk.inner_blks[1] == 32
-            && blk.inner_idxs[0] == 0 && blk.inner_idxs[1] == 1) {
-            return cldnn::format::bs_fs_yx_bsv16_fsv32;
-        }
-        if (desc.data.ndims == 5 && blk.inner_nblks == 2
-            && blk.inner_blks[0] == 16 && blk.inner_blks[1] == 32
-            && blk.inner_idxs[0] == 0 && blk.inner_idxs[1] == 1) {
-            return cldnn::format::bs_fs_zyx_bsv16_fsv32;
+        auto order = get_order(desc);
+        for (int32_t fmt_idx = format::bfyx ; fmt_idx < format::format_num ; fmt_idx++) {
+            auto candidate_trait = format::traits(static_cast<format::type>(fmt_idx));
+            if (desc.data.ndims == static_cast<int>(candidate_trait._order.size())
+                && blk.inner_nblks == static_cast<int>(candidate_trait.block_sizes.size())
+                && compare_strides(order, candidate_trait._order)) {
+                bool is_match = true;
+                for (size_t idx = 0 ; idx < candidate_trait.block_sizes.size() ; idx++) {
+                    if (blk.inner_blks[idx] != static_cast<int>(candidate_trait.block_sizes[idx].second)
+                        || blk.inner_idxs[idx] != static_cast<int>(candidate_trait.block_sizes[idx].first)) {
+                        is_match = false;
+                        break;
+                    }
+                }
+
+                if (is_match)
+                    return static_cast<format::type>(fmt_idx);
+            }
         }

         std::stringstream msg;
         msg << "Unsupported onednn dnnl::memory::desc find_data_format. "
             << "ndims: " << desc.data.ndims
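Instead of hard-coding each blocked layout, the rewritten find_data_format() walks the format table and matches the stride-derived dimension order plus the inner block sizes and blocked-dimension indices against each candidate's traits. A reduced sketch of that matching loop; FormatTrait and BlockedDesc are hypothetical stand-ins for format::traits() and the oneDNN blocking descriptor:

#include <cstddef>
#include <utility>
#include <vector>

// Hypothetical trait record: order is the dimension order, block_sizes holds
// {blocked_dim_index, block_size} pairs, mirroring inner_idxs/inner_blks.
struct FormatTrait {
    int id;
    std::vector<std::size_t> order;
    std::vector<std::pair<std::size_t, std::size_t>> block_sizes;
};

struct BlockedDesc {
    std::vector<std::size_t> order;                           // from strides, see get_order()
    std::vector<std::pair<std::size_t, std::size_t>> blocks;  // {inner_idxs[i], inner_blks[i]}
};

int match_format(const BlockedDesc& d, const std::vector<FormatTrait>& table) {
    for (const auto& t : table) {
        // Matching the order vector also fixes ndims; blocks must agree in
        // count, blocked dimension, and block size, as in the diff above.
        if (d.order == t.order && d.blocks == t.block_sizes)
            return t.id;
    }
    return -1;  // no match: the real code reports an unsupported descriptor
}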
@@ -471,19 +488,7 @@ cldnn::format find_format(dnnl::memory::desc desc, bool is_grouped) {
         return convert_format(onednn_desc, is_grouped);
     } else {
         auto blk = desc.data.format_desc.blocking;
-
-        auto strides = blk.strides;
-        std::vector<size_t> order(desc.data.ndims);
-        std::iota(order.begin(), order.end(), 0);
-        std::sort(order.begin(), order.end(),
-                  [&strides] (size_t ind_l, size_t ind_r) {
-                      return strides[ind_l] > strides[ind_r];
-                  });
-
-        auto compare_strides = [](std::vector<size_t> &a, std::vector<size_t> b) -> bool {
-            return std::equal(a.begin(), a.end(), b.begin());
-        };
-
+        auto order = get_order(desc);
         if (is_grouped) {
             if (desc.data.ndims == 5 && blk.inner_nblks == 3
                 && blk.inner_blks[0] == 8 && blk.inner_blks[1] == 8 && blk.inner_blks[2] == 2
@@ -196,7 +196,7 @@ public:
     // Returns whether reorder between "prev" with format fmt_prev and "next" with format fmt_next
     // can be fused into next.
     bool can_fuse_reorder(program_node& prev, program_node& next, format fmt_prev, format fmt_next);
-    bool can_fuse_reorder_to_prev(program_node& prev, program_node* next, format fmt_prev, format fmt_next);
+    bool can_fuse_reorder_to_prev(program_node& prev, reorder_node& target_node, format fmt_prev, format fmt_next);

     void set_optimization_attribute(optimization_attributes_type attribute, int32_t val);
     optimization_attributes get_optimization_attributes() { return _optimization_attributes; }
@@ -210,5 +210,9 @@ public:
     size_t get_total_conv_count();

     bool should_select_b_fs_yx_fsv16_layout(convolution_node const& node, layout const& output_or_weights_layout);
+
+#ifdef ENABLE_ONEDNN_FOR_GPU
+    void select_preferred_formats_for_onednn(program_node& node, dnnl::primitive_desc prim_desc);
+#endif
 };
 }  // namespace cldnn
@@ -310,12 +310,14 @@ private:
     reorder_factory& _rf;
 };

-class set_required_layouts : public base_pass {
+class select_preferred_formats : public base_pass {
 public:
-    set_required_layouts() : base_pass("set_required_layouts") {}
+    explicit select_preferred_formats(layout_optimizer& lo_ref) :
+        base_pass("select_preferred_formats"), _lo(lo_ref) {}

 private:
     void run(program& p) override;
+    layout_optimizer& _lo;
 };

 class trim_to_outputs : public base_pass {
@@ -198,6 +198,9 @@ public:
     void remove_dependency(size_t idx);
     void remove_dependency(program_node& node);

+    size_t get_dependency_index(program_node& node) const;
+    size_t get_user_index(program_node& node) const;
+
     std::set<primitive_id> get_memory_dependencies() const;
     void add_memory_dependency(primitive_id);
     void add_memory_dependency(std::vector<primitive_id>);
@@ -419,10 +422,18 @@ public:
         cur_id = 0;
     }

-    format::type get_required_input0() const { return required_input0; }
-    format::type get_required_output() const { return required_output; }
-    void set_required_input0(format::type type) { required_input0 = type; }
-    void set_required_output(format::type type) { required_output = type; }
+    std::vector<format::type> get_preferred_input_fmts() const { return preferred_input_fmts; }
+    std::vector<format::type> get_preferred_output_fmts() const { return preferred_output_fmts; }
+    format::type get_preferred_input_fmt(size_t idx = 0) const {
+        return (idx < preferred_input_fmts.size()) ? preferred_input_fmts.at(idx) : format::any;
+    }
+    format::type get_preferred_output_fmt(size_t idx = 0) const {
+        return (idx < preferred_output_fmts.size()) ? preferred_output_fmts.at(idx) : format::any;
+    }
+
+    void init_preferred_fmt(size_t dep_size, size_t user_size);
+    void set_preferred_input_fmt(size_t idx, format::type type);
+    void set_preferred_output_fmt(size_t idx, format::type type);

 protected:
@@ -437,8 +448,8 @@ protected:
     bool valid_output_layout = false;
     layout output_layout = layout(data_types::f32, format::bfyx, tensor());

-    format::type required_input0;
-    format::type required_output;
+    std::vector<format::type> preferred_input_fmts;
+    std::vector<format::type> preferred_output_fmts;

     std::vector<program_node*> dependencies;
     std::vector<std::pair<program_node*, int>> dependencies_new;
@@ -15,6 +15,7 @@
 #include <sstream>

 #include "gemm_inst.h"
+#include "deconvolution_inst.h"
 #include "eltwise_inst.h"
 #include "pooling_inst.h"
 #include "reduce_inst.h"
@@ -25,10 +26,16 @@
 #include "depth_to_space_inst.h"
 #include "region_yolo_inst.h"
 #include "prior_box_inst.h"
+#include "to_string_utils.h"
 #include <vector>
 #include <memory>
 #include <utility>

+#ifdef ENABLE_ONEDNN_FOR_GPU
+#include <oneapi/dnnl/dnnl.hpp>
+#include "impls/onednn/utils.hpp"
+#endif
+
 using namespace cldnn;

 static size_t get_post_ops_count(const program_node& node) {
@@ -230,8 +237,9 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
             return true;

         // Do not remove reorder if it is necessary to fulfill required_input
-        auto reorder_layout = next.get_dependency(0).get_output_layout();
-        if (reorder_layout.format == next.get_required_input0()
+        auto& reorder_node = next.get_dependency(0);
+        auto reorder_layout = reorder_node.get_output_layout();
+        if (reorder_layout.format == next.get_preferred_input_fmt(next.get_dependency_index(reorder_node))
             && !reorder_layout.data_padding)
             return false;
@@ -332,7 +340,7 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,

         // Remove Reorder for Convolution if mixed layout.
         auto& node = prev.get_users().front();
-        if (prev.get_output_layout().format == next.get_required_input0() &&
+        if (prev.get_output_layout().format == next.get_preferred_input_fmt() &&
             node->get_output_layout().data_padding == prev.get_output_layout().data_padding)
             return true;
@@ -377,17 +385,18 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
     return false;
 }

-bool layout_optimizer::can_fuse_reorder_to_prev(program_node& prev, program_node* next, format fmt_prev, format fmt_next) {
-    if (prev.is_dynamic() || (next && next->is_dynamic()))
+bool layout_optimizer::can_fuse_reorder_to_prev(program_node& prev, reorder_node& node, format fmt_prev, format fmt_next) {
+    if (prev.is_dynamic() || (!node.get_users().empty() && node.get_users().front()->is_dynamic()))
         return false;

     // Ref kernels are the main for depth_to_space, region_yolo and detection_output. It can do anything. Should not see next.
     if (prev.is_type<depth_to_space>() || prev.is_type<region_yolo>() || prev.is_type<detection_output>())
         return true;

-    if (next == nullptr)
+    if (node.get_users().empty())
         return false;

+    auto next = node.get_users().front();
     auto dt_prev = prev.get_output_layout().data_type;
     auto dt_next = next->get_output_layout().data_type;
     auto use_onednn_impls = _optimization_attributes.use_onednn_impls;
@@ -436,8 +445,8 @@ bool layout_optimizer::can_fuse_reorder_to_prev(program_node& prev, program_node

     // Remove Reorder after convolution if possible.
     if (use_onednn_impls) {
-        auto reorder_layout = next->get_dependency(0).get_output_layout();
-        if (reorder_layout.format == prev.get_required_output() &&
+        auto reorder_layout = node.get_output_layout();
+        if (reorder_layout.format == prev.get_preferred_output_fmt() &&
             reorder_layout.data_padding == prev.get_output_layout().data_padding)
             return true;
@@ -1029,7 +1038,7 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout,
     bool i8_u8_input = input_layout.data_type == data_types::u8 || input_layout.data_type == data_types::i8;

     if (use_onednn_impls && onednn_valid_post_ops) {
-        expected_format = node.get_required_output();
+        expected_format = node.get_preferred_output_fmt();
     } else {
         /* *************************** Native impls format selection part ************************** */
         if (use_onednn_impls && i8_u8_input) {
@@ -1123,7 +1132,7 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout,

     if (use_onednn_impls && is_node_for_onednn(node)) {
         // XXX: need to take the situation into consideration where it is called from prepare_primitive_fusing
-        expected_format = node.get_required_output();
+        expected_format = node.get_preferred_output_fmt();
     } else if (_optimization_attributes.b_fs_zyx_fsv16_network &&
                deconvolution_b_fs_zyx_fsv16_opt(current_layout, output_or_weights_layout, prim)) {
         if ((current_layout.data_type == data_types::f32 && expected_tensor.batch[0] % 16 == 0) ||
@@ -1633,8 +1642,9 @@ format layout_optimizer::get_preferred_format(program_node& node) {
     };

     if (use_onednn_impls) {
-        if (node.get_users().front()->get_required_input0() != format::any) {
-            expected = node.get_users().front()->get_required_input0();
+        auto& user = node.get_users().front();
+        if (user->get_preferred_input_fmt(user->get_dependency_index(node)) != format::any) {
+            expected = user->get_preferred_input_fmt(user->get_dependency_index(node));
         } else {
             expected = format::any;
         }
@@ -1738,6 +1748,53 @@ format layout_optimizer::get_preferred_format(program_node& node) {
     return expected;
 }

+#ifdef ENABLE_ONEDNN_FOR_GPU
+void layout_optimizer::select_preferred_formats_for_onednn(program_node& node, dnnl::primitive_desc prim_desc) {
+    GPU_DEBUG_GET_INSTANCE(debug_config);
+    if (node.is_input() || !are_data_types_suitable_for_onednn(node)) {
+        return;
+    }
+
+    node.init_preferred_fmt(node.get_dependencies().size(), node.get_users().size());
+    if (node.is_type<convolution>() || node.is_type<deconvolution>()) {
+        for (size_t idx = 0 ; idx < node.get_dependencies().size() ; idx++) {
+            if (node.get_dependency(idx).is_constant())
+                continue;
+
+            // Conv or deconv gets a preferred format for its data input based on source memory description
+            // But an input format for fused post-ops should be same with an output format of conv/deconv
+            size_t prim_input;
+            if (node.is_type<convolution>())
+                prim_input = node.get_dependency_index(node.as<convolution>().input());
+            if (node.is_type<deconvolution>())
+                prim_input = node.get_dependency_index(node.as<deconvolution>().input());
+
+            // Note: did not handle attribute properly. especially for zero-point
+            cldnn::format src_fmt = format::any;
+            if (idx == prim_input)
+                src_fmt = onednn::find_data_format(prim_desc.src_desc());
+            else  // Dep for fused post ops
+                src_fmt = onednn::find_data_format(prim_desc.dst_desc());
+
+            node.set_preferred_input_fmt(idx, src_fmt);
+
+            auto dst_fmt = onednn::find_data_format(prim_desc.dst_desc());
+            if (node.get_preferred_output_fmt() == format::any) {
+                for (size_t usr = 0 ; usr < node.get_users().size() ; usr++)
+                    node.set_preferred_output_fmt(usr, dst_fmt);
+            }
+
+            GPU_DEBUG_IF(debug_config->verbose >= 2) {
+                std::cout << "select_preferred_formats:" << node.id() << ": " << fmt_to_str(src_fmt) << " --> " << fmt_to_str(dst_fmt)
+                          << " For index : " << idx << std::endl;
+            }
+        }
+    }
+
+    return;
+}
+#endif  // ENABLE_ONEDNN_FOR_GPU
+
 bool layout_optimizer::all_users_simple_format_until_output(program_node& origin_node, program_node& cur_node, int32_t cur_depth, int32_t max_depth) {
     if (cur_node.is_output()) return true;
     if (cur_depth > max_depth) return false;
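In select_preferred_formats_for_onednn above, the conv/deconv data input takes the format of the oneDNN source memory descriptor, while every other non-constant dependency (inputs feeding fused post-ops) takes the destination format so it matches the primitive's output. A condensed sketch of that assignment rule, with the oneDNN pieces stubbed out; the names below are illustrative stand-ins, not the real cldnn API:

#include <cstddef>
#include <vector>

enum class Fmt { any, b_fs_yx_fsv16 };

// src_format/dst_format play the role of
// onednn::find_data_format(prim_desc.src_desc() / dst_desc()).
struct DepInfo { bool is_constant; bool is_primary_data_input; };

std::vector<Fmt> assign_input_fmts(const std::vector<DepInfo>& deps,
                                   Fmt src_format, Fmt dst_format) {
    std::vector<Fmt> result(deps.size(), Fmt::any);
    for (std::size_t i = 0; i < deps.size(); ++i) {
        if (deps[i].is_constant)
            continue;  // constant weights/bias keep Fmt::any
        // Data input follows the source descriptor; fused post-op inputs
        // must match the primitive's output format.
        result[i] = deps[i].is_primary_data_input ? src_format : dst_format;
    }
    return result;
}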
@@ -543,7 +543,7 @@ void program::pre_optimize_graph(bool is_internal) {

     apply_opt_pass<prepare_primitive_fusing>(lo);

-    apply_opt_pass<set_required_layouts>();
+    apply_opt_pass<select_preferred_formats>(lo);

     apply_opt_pass<reorder_inputs>(lo, rf);
     // Ideally this should be done before fusing to simplify logic and make the pass more powerful,
@@ -31,7 +31,7 @@ using namespace cldnn;
 thread_local size_t program_node::cur_id = 0;

 program_node::program_node(std::shared_ptr<primitive> prim, program& prog)
-    : desc(prim), myprog(prog), required_input0(format::any), required_output(format::any), org_id(prim ? (prim->id) : 0) {
+    : desc(prim), myprog(prog), preferred_input_fmts({}), preferred_output_fmts({}), org_id(prim ? (prim->id) : 0) {
     if (prim) {
         output_layout.data_padding = prim->output_padding;
         num_outputs = prim->num_outputs;
@@ -211,6 +211,26 @@ void program_node::remove_dependency(program_node& node) {
             remove_dependency(i);
 }

+size_t program_node::get_user_index(program_node& node) const {
+    size_t idx = 0;
+    for (auto& user : users) {
+        if (user == &node)
+            return idx;
+        else
+            idx++;
+    }
+
+    OPENVINO_ASSERT(false, "Search invalid user node" + node.id() + " node");
+}
+
+size_t program_node::get_dependency_index(program_node& node) const {
+    for (size_t i = 0; i < dependencies.size(); ++i)
+        if (dependencies[i] == &node)
+            return i;
+
+    OPENVINO_ASSERT(false, "Search invalid dependency node" + node.id() + " node");
+}
+
 bool program_node::is_detached(bool whole_branch) {
     if (!users.empty())
         return false;
@@ -353,7 +373,7 @@ std::map<size_t, memory::ptr> program_node::get_const_memory_deps() const {
 void program_node::invalidate_users() const {
     for (auto& user : users) {
         if (user->valid_output_layout) {
-            if (user->get_required_output() != format::any)
+            if (user->get_preferred_output_fmt() != format::any)
                 continue;
             user->valid_output_layout = false;
             user->invalidate_users();
@@ -407,6 +427,25 @@ bool program_node::need_lockable_memory() const {
     return need_lockable_mem;
 }

+void program_node::init_preferred_fmt(size_t dep_size, size_t user_size) {
+    preferred_input_fmts.resize(dep_size, format::any);
+    preferred_output_fmts.resize(user_size, format::any);
+}
+
+void program_node::set_preferred_input_fmt(size_t idx, format::type type) {
+    if (idx >= preferred_input_fmts.size())
+        preferred_input_fmts.resize(idx+1, format::any);
+
+    preferred_input_fmts.at(idx) = type;
+}
+
+void program_node::set_preferred_output_fmt(size_t idx, format::type type) {
+    if (idx >= preferred_output_fmts.size())
+        preferred_output_fmts.resize(idx+1, format::any);
+
+    preferred_output_fmts.at(idx) = type;
+}
+
 /* ----------------------------------------- */
 /* Onednn fused operations integration logic */
 /* ----------------------------------------- */
@@ -1254,4 +1293,5 @@ void program_node::init_onednn_primitive_attributes() {
     add_onednn_attrs(attrs);
 }

+
 #endif // ENABLE_ONEDNN_FOR_GPU
@@ -121,7 +121,8 @@ static void print_help_messages() {
                                   " Supported on only on linux.");
     message_list.emplace_back("OV_GPU_SerialCompile", "Serialize creating primitives and compiling kernels");
     message_list.emplace_back("OV_GPU_ForceImplType", "Force implementation type of a target primitive or layer. [primitive or layout_name]:[impl_type]"
-                                  "For primitives, fc:onednn, fc:ocl, do:cpu, do:ocl, reduce:ocl and reduce:onednn are supported");
+                                  " For primitives, fc:onednn, fc:ocl, do:cpu, do:ocl, reduce:onednn, reduce:ocl, concat:onednn,"
+                                  " and concat:ocl are supported");
     message_list.emplace_back("OV_GPU_MaxKernelsPerBatch", "Maximum number of kernels in a batch during compiling kernels");

     auto max_name_length_item = std::max_element(message_list.begin(), message_list.end(),
@@ -0,0 +1,65 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "test_utils.h"
+
+#include "intel_gpu/runtime/engine.hpp"
+
+#include "intel_gpu/primitives/convolution.hpp"
+#include "intel_gpu/graph/program.hpp"
+#include "data_inst.h"
+#include "convolution_inst.h"
+#include "intel_gpu/graph/network.hpp"
+#include "pass_manager.h"
+#include "to_string_utils.h"
+
+#include "program_wrapper.h"
+
+#include <memory>
+
+using namespace cldnn;
+using namespace ::tests;
+using namespace testing;
+
+TEST(test_select_preferred_formats, setting_target_conv_format) {
+    auto& engine = get_test_engine();
+    auto input = engine.allocate_memory({ data_types::f16, format::bfyx, { 1, 32, 64, 64 } });
+    auto weights = engine.allocate_memory({ data_types::f16, format::bfyx, { 1, 32, 64, 64 } });
+
+    topology topology;
+    topology.add(data("weights", weights));
+    topology.add(input_layout("input", input->get_layout()));
+    topology.add(reorder("reorder", "input", format::b_fs_yx_fsv16, data_types::f16)),
+    topology.add(convolution("conv1", "reorder", { "weights" }));
+
+    build_options build;
+    build.set_option(build_option::allow_new_shape_infer(true));
+    implementation_desc impl = { format::b_fs_yx_fsv16, std::string(""), impl_types::onednn };
+    build.set_option(build_option::force_implementations({ {"conv1", impl} }));
+
+    layout_optimizer lo(true);
+    auto prog = program::build_program(engine, topology, build, false, true);
+
+    program_wrapper::apply_opt_pass<select_preferred_formats>(*prog, lo);
+
+    ASSERT_NE(prog, nullptr);
+
+    auto itr = prog->get_processing_order().begin();
+    while (itr != prog->get_processing_order().end()) {
+        auto node_ptr = *itr++;
+        if (!node_ptr->is_type<convolution>())
+            continue;
+
+        auto& node = node_ptr->as<convolution>();
+        auto input_fmt = node.get_preferred_input_fmt(0);
+        auto output_fmt = node.get_preferred_output_fmt(0);
+        if (engine.get_device_info().supports_immad) {
+            ASSERT_EQ(input_fmt, format::b_fs_yx_fsv16);
+            ASSERT_EQ(output_fmt, format::b_fs_yx_fsv16);
+        } else {
+            ASSERT_EQ(input_fmt, format::any);
+            ASSERT_EQ(output_fmt, format::any);
+        }
+    }
+}