[GPU] Update for layout query (#13346)
+ Support multiple input and output target formats
+ Implement generic logic for format selection and find_data_format
+ Add test cases for select_preferred_formats

Signed-off-by: Min, Byungil <byungil.min@intel.com>
Parent: 385d87edaf
Commit: 4188f1f181
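At the core of this commit, the single required_input0/required_output pair on program_node is replaced by per-index vectors of preferred formats, so a node can advertise one format per dependency and per user. Below is a minimal standalone sketch of that API shape; Fmt and Node are simplified stand-ins, not the real cldnn types:

#include <cstddef>
#include <vector>

// Hedged sketch of the per-index preferred-format API introduced by this commit.
enum class Fmt { any, bfyx, b_fs_yx_fsv16 };

class Node {
public:
    // One preferred format per dependency (input) and per user (output);
    // out-of-range queries fall back to Fmt::any, matching the diff's behavior.
    Fmt preferred_input_fmt(std::size_t idx = 0) const {
        return idx < in_fmts_.size() ? in_fmts_[idx] : Fmt::any;
    }
    Fmt preferred_output_fmt(std::size_t idx = 0) const {
        return idx < out_fmts_.size() ? out_fmts_[idx] : Fmt::any;
    }
    void init_preferred_fmt(std::size_t dep_size, std::size_t user_size) {
        in_fmts_.assign(dep_size, Fmt::any);
        out_fmts_.assign(user_size, Fmt::any);
    }
    void set_preferred_input_fmt(std::size_t idx, Fmt f) {
        if (idx >= in_fmts_.size()) in_fmts_.resize(idx + 1, Fmt::any);
        in_fmts_[idx] = f;
    }
private:
    std::vector<Fmt> in_fmts_, out_fmts_;
};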
@@ -174,8 +174,8 @@ layout convolution_inst::calc_output_layout(convolution_node const& node, kernel
     // Adjust output format for shallow conv and mixed precision cases in onednn
     auto out_fmt = input_layout.format;
-    if (node.get_preferred_impl_type() == impl_types::onednn && node.get_required_output() != format::any) {
-        out_fmt = node.get_required_output();
+    if (node.get_preferred_impl_type() == impl_types::onednn && node.get_preferred_output_fmt() != format::any) {
+        out_fmt = node.get_preferred_output_fmt();
     }

     // get output feature map from weights. It should be the same as number of biases. Will be verifed in
@@ -42,8 +42,8 @@ layout deconvolution_inst::calc_output_layout(deconvolution_node const& node, ke
     int32_t number_of_features = weights_layout.group() * weights_layout.ofm();

     format out_fmt = input_layout.format;
-    if (node.get_preferred_impl_type() == impl_types::onednn && node.get_required_output() != format::any) {
-        out_fmt = node.get_required_output();
+    if (node.get_preferred_impl_type() == impl_types::onednn && node.get_preferred_output_fmt() != format::any) {
+        out_fmt = node.get_preferred_output_fmt();
     }

     if (desc->with_output_size) {
@@ -336,8 +336,7 @@ void remove_redundant_reorders::run(program& p) {
         if (!same_data_type && !allowed_dt_conversion_fuse)
             continue;

-        auto next_node = node.get_users().empty() ? nullptr : node.get_users().front();
-        if (!lo.can_fuse_reorder_to_prev(input, next_node, input.get_output_layout().format, output_layout.format))
+        if (!lo.can_fuse_reorder_to_prev(input, node, input.get_output_layout().format, output_layout.format))
             continue;

         auto old_output_layout_of_input = input.get_output_layout();
@@ -372,11 +372,11 @@ void minimize_local_reorders(program& p, std::map<program_node*, format::type>&
     }
 }

-static format get_target_output_format(layout_optimizer& lo, const std::map<program_node*, format::type>& fmt_map, program_node *node) {
-    // 1. Check required_output
+static format get_target_output_format(layout_optimizer& lo, const std::map<program_node*, format::type>& fmt_map, program_node *node, size_t user_idx = 0) {
+    // 1. Check selected preferred_output_format
     if (lo.get_optimization_attributes().use_onednn_impls) {
-        // If onednn is not used, need to ignore get_required_layout result as it is from onednn
-        auto ret = node->get_required_output();
+        // If onednn is not used, need to ignore get_preferred_output_fmt result as it is from onednn
+        auto ret = node->get_preferred_output_fmt(user_idx);

         if (ret != format::any)
             return ret;
@@ -390,11 +390,11 @@ static format get_target_output_format(layout_optimizer& lo, const std::map<prog
     return node->get_output_layout().format;
 }

-static format get_target_input0_format(layout_optimizer& lo, const std::map<program_node*, format::type>& fmt_map, program_node *node) {
-    // 1. Check required_input
+static format get_target_input_format(layout_optimizer& lo, const std::map<program_node*, format::type>& fmt_map, program_node *node, size_t dep_idx = 0) {
+    // 1. Check selected preferred_input_format
     if (lo.get_optimization_attributes().use_onednn_impls) {
-        // If onednn is not used, need to ignore get_required_layout result as it is from onednn
-        auto ret = node->get_required_input0();
+        // If onednn is not used, need to ignore get_preferred_input_fmt result as it is from onednn
+        auto ret = node->get_preferred_input_fmt(dep_idx);
         if (ret != format::any)
             return ret;
     }
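The two helpers above resolve a target format with a fixed priority: a valid preferred format wins, then the fmt_map entry produced by the layout query, then the node's current output layout format. A hedged sketch of that resolution order, again with simplified stand-in types rather than the real program_node/layout_optimizer:

#include <cstddef>
#include <map>

// Sketch of the three-level priority used by get_target_output_format /
// get_target_input_format: preferred format --> fmt_map --> layout format.
enum class Fmt { any, bfyx, b_fs_yx_fsv16 };

struct NodeStub {
    Fmt preferred = Fmt::any;   // e.g. derived from onednn's memory descriptor
    Fmt layout_fmt = Fmt::bfyx; // the node's current (default) output layout
};

Fmt resolve_target_format(const std::map<const NodeStub*, Fmt>& fmt_map, const NodeStub* n) {
    if (n->preferred != Fmt::any)       // 1. per-index preferred format
        return n->preferred;
    auto it = fmt_map.find(n);
    if (it != fmt_map.end() && it->second != Fmt::any)
        return it->second;              // 2. format queried by the layout pass
    return n->layout_fmt;               // 3. fallback: current output layout
}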
@@ -431,32 +431,18 @@ void insert_reorders_in_dir(program& p, const std::map<program_node*, format::ty
         // We have three (potentially) conflicting information here for format
         // node->get_output_layout().format : It is not up-to-date at this moment. It is just the default format (bfyx)
         // fmt_map.at(node).format : It is queried with get_preferred_layout. However, it has only output format.
-        // node.get_required_input0/output : If it is valid(!= any), it is up-to-date. It has input format, too.
-        // So the priority is required_input0/output --> fmt_map --> output_layout().format
+        // node.get_preferred_output_fmt : If it is valid(!= any), it is up-to-date.
+        // So the priority is preferred_input/output_format --> fmt_map --> output_layout().format
         auto predecessor = travel_direction_wrapper<dir>::first(node, next);
         auto successor = travel_direction_wrapper<dir>::second(node, next);
         auto in_layout = predecessor->get_output_layout();
         auto out_layout = in_layout;
-        in_layout.format = get_target_output_format(lo, fmt_map, predecessor);
-        auto target_input0_format = get_target_input0_format(lo, fmt_map, successor);
+        auto index_to_pred = successor->get_dependency_index(*predecessor);
+        auto index_to_succ = predecessor->get_user_index(*successor);

-        for (auto& fused_prim : successor->get_fused_primitives()) {
-            // If it is input of fused node, use output layout instead of input layout
-            if (successor->get_dependencies().size() <= fused_prim.dep_start_idx)
-                continue;
-            auto& dependency = successor->get_dependency(fused_prim.dep_start_idx);
-            if (&dependency == predecessor) {
-                target_input0_format = get_target_output_format(lo, fmt_map, successor);
-                GPU_DEBUG_IF(debug_config->verbose >= 2) {
-                    GPU_DEBUG_COUT << __func__ << ":" << __LINE__ << ": Use output format of successor " << successor->id() << " : "
-                                   << fmt_to_str(target_input0_format) << std::endl;
-                }
-                break;
-            }
-        }
+        in_layout.format = get_target_output_format(lo, fmt_map, predecessor, index_to_succ);
+        out_layout.format = get_target_input_format(lo, fmt_map, successor, index_to_pred);

-        out_layout.format = target_input0_format;
         GPU_DEBUG_IF(debug_config->verbose >= 2) {
             GPU_DEBUG_COUT << __func__ << ":" << __LINE__ << ":" << dir_msg(dir) << " " << node->id() << " --> " << next->id() << " ## "
                            << fmt_to_str(in_layout.format) << " --> " << fmt_to_str(out_layout.format) << std::endl;
@@ -468,9 +454,9 @@ void insert_reorders_in_dir(program& p, const std::map<program_node*, format::ty
         auto reorder_pair = rf.get_reorder(travel_direction_wrapper<dir>::first(node, next)->id(),
                                            in_layout,
                                            out_layout);
-        auto reorder = reorder_pair.first;

-        if (reorder) {
+        auto reorder = reorder_pair.first;
+        if (reorder && (in_layout.format != format::any && out_layout.format != format::any)) {
             auto& reorder_node = p.get_or_create(reorder);
             GPU_DEBUG_IF(debug_config->verbose >= 2) {
                 GPU_DEBUG_COUT << __func__ << ":" << __LINE__ << ":" << dir_msg(dir) << " " << reorder_node.id()
@@ -687,10 +673,10 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
     // For supporting optimized onednn first conv, the input format from prev reorder to this conv is changed to a recommended format by onednn.
     auto& input = conv_node.input();
     auto input_layout = input.get_output_layout();

-    if (conv_node.impl_type == impl_types::onednn && input_layout.format != conv_node.get_required_input0()) {
+    if (conv_node.impl_type == impl_types::onednn && input_layout.format != conv_node.get_preferred_input_fmt()) {
         // Data input format does NOT match with an output format of previous node
         auto new_layout = input_layout;
-        new_layout.format = conv_node.get_required_input0();
+        new_layout.format = conv_node.get_preferred_input_fmt();
         auto new_input = rf.get_reorder(input.id(), input_layout, new_layout);
         if (new_input.first)
             p.add_intermediate(new_input.first, conv_node, 0, !new_input.second);
@@ -0,0 +1,56 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "pass_manager.h"
+#include "data_inst.h"
+#include "mutable_data_inst.h"
+#include "program_node.h"
+#include "intel_gpu/runtime/engine.hpp"
+#include "runtime/cldnn_itt.hpp"
+#include <iostream>
+
+#ifdef ENABLE_ONEDNN_FOR_GPU
+#include <oneapi/dnnl/dnnl.hpp>
+#include "intel_gpu/runtime/debug_configuration.hpp"
+#include "impls/onednn/utils.hpp"
+#include "impls/onednn/convolution_onednn.hpp"
+#include "impls/onednn/deconvolution_onednn.hpp"
+#endif
+
+using namespace cldnn;
+
+void select_preferred_formats::run(program& p) {
+    OV_ITT_SCOPED_TASK(itt::domains::CLDNN, "pass::select_preferred_formats");
+
+    auto& engine = p.get_engine();
+    const auto& device_info = engine.get_device_info();
+
+    if (!device_info.supports_immad)
+        return;
+
+#ifdef ENABLE_ONEDNN_FOR_GPU
+    GPU_DEBUG_GET_INSTANCE(debug_config);
+    for (auto n : p.get_processing_order()) {
+        // Onednn primitive descriptor creation may fail, for example, due to asymmetric weight.
+        try {
+            dnnl::primitive_desc prim_desc;
+            if (n->is_type<convolution>()) {
+                auto desc = onednn::get_convolution_descriptor(*n->get_kernel_impl_params(), dnnl::memory::format_tag::any);
+                prim_desc = dnnl::primitive_desc(&desc->data, nullptr, engine.get_onednn_engine(), nullptr);
+            } else if (n->is_type<deconvolution>()) {
+                auto desc = onednn::get_deconvolution_descriptor(*n->get_kernel_impl_params(), dnnl::memory::format_tag::any);
+                prim_desc = dnnl::primitive_desc(&desc->data, nullptr, engine.get_onednn_engine(), nullptr);
+            }
+
+            _lo.select_preferred_formats_for_onednn(*n, prim_desc);
+        } catch(std::exception &exception) {
+            GPU_DEBUG_IF(debug_config->verbose >= 1) {
+                std::cout << "WARNING(select_preferred_formats): " << exception.what() << std::endl;
+            }
+        }
+    }
+#endif  // ENABLE_ONEDNN_FOR_GPU
+}
@@ -1,82 +0,0 @@
-// Copyright (C) 2018-2022 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-#include "pass_manager.h"
-#include "data_inst.h"
-#include "mutable_data_inst.h"
-#include "program_node.h"
-#include "intel_gpu/runtime/engine.hpp"
-#include "runtime/cldnn_itt.hpp"
-#include <iostream>
-#include "to_string_utils.h"
-#include "intel_gpu/runtime/debug_configuration.hpp"
-#ifdef ENABLE_ONEDNN_FOR_GPU
-#include <oneapi/dnnl/dnnl.hpp>
-#include "impls/onednn/utils.hpp"
-#include "impls/onednn/convolution_onednn.hpp"
-#include "impls/onednn/deconvolution_onednn.hpp"
-#endif
-
-using namespace cldnn;
-
-#ifdef ENABLE_ONEDNN_FOR_GPU
-static dnnl::primitive_desc get_convolution_prim_desc(cldnn::engine& engine, program_node& n) {
-    auto desc = onednn::get_convolution_descriptor(*n.get_kernel_impl_params(), dnnl::memory::format_tag::any);
-    // Note: did not handle attribute properly. especially for zero-point
-    dnnl::primitive_desc prim_desc{&desc->data, nullptr, engine.get_onednn_engine(), nullptr};
-    return prim_desc;
-}
-
-static dnnl::primitive_desc get_deconvolution_prim_desc(cldnn::engine& engine, program_node& n) {
-    auto desc = onednn::get_deconvolution_descriptor(*n.get_kernel_impl_params(), dnnl::memory::format_tag::any);
-    dnnl::primitive_desc prim_desc{&desc->data, nullptr, engine.get_onednn_engine(), nullptr};
-    return prim_desc;
-}
-#endif
-
-void set_required_layouts::run(program& p) {
-    OV_ITT_SCOPED_TASK(itt::domains::CLDNN, "CLDNN::pass::SetRequiredLayouts");
-
-    auto& engine = p.get_engine();
-    const auto& device_info = engine.get_device_info();
-
-    if (!device_info.supports_immad)
-        return;
-
-#ifdef ENABLE_ONEDNN_FOR_GPU
-    GPU_DEBUG_GET_INSTANCE(debug_config);
-    for (auto n : p.get_processing_order()) {
-        if (!(n->is_type<convolution>() || n->is_type<deconvolution>())
-            || !layout_optimizer::are_data_types_suitable_for_onednn(*n)) {
-            // only care for onednn convolutions
-            continue;
-        }
-
-        // Onednn primitive descriptor creation may fail, for example, due to asymmetric weight.
-        try {
-            dnnl::primitive_desc prim_desc;
-            if (n->is_type<convolution>()) {
-                prim_desc = get_convolution_prim_desc(engine, *n);
-            } else if (n->is_type<deconvolution>()) {
-                prim_desc = get_deconvolution_prim_desc(engine, *n);
-            }
-
-            auto src_fmt = onednn::find_data_format(prim_desc.src_desc());
-            auto dst_fmt = onednn::find_data_format(prim_desc.dst_desc());
-            GPU_DEBUG_GET_INSTANCE(debug_config);
-            GPU_DEBUG_IF(debug_config->verbose >= 2) {
-                std::cout << "set_required_layouts:" << n->id() << ": " << fmt_to_str(src_fmt) << " --> " << fmt_to_str(dst_fmt) << std::endl;
-            }
-            n->set_required_input0(src_fmt);
-            n->set_required_output(dst_fmt);
-        } catch(std::exception &exception) {
-            GPU_DEBUG_IF(debug_config->verbose >= 1) {
-                std::cout << "WARNING(set_required_layouts): " << exception.what() << std::endl;
-            }
-        }
-    }
-#endif
-}
@@ -5,6 +5,7 @@
 #include "utils.hpp"
 #include "onednn_formats_map.hpp"
 #include <oneapi/dnnl/dnnl_debug.h>
+#include <numeric>
 #include <oneapi/dnnl/dnnl_ocl.hpp>

 #include "to_string_utils.h"
@@ -147,7 +148,6 @@ dnnl::memory::format_tag convert_data_format(cldnn::format fmt) {
     return ret->first;
 }

-
 std::string convert_data_format_string(cldnn::format fmt) {
     switch (fmt) {
         case cldnn::format::b_fs_yx_fsv2: return "aBcd2b";
@@ -370,6 +370,23 @@ dnnl::memory::format_tag get_format_by_desc(dnnl::memory::desc desc) {
     return dnnl::memory::format_tag::undef;
 }

+static std::vector<size_t> get_order(dnnl::memory::desc desc) {
+    auto blk = desc.data.format_desc.blocking;
+    auto strides = blk.strides;
+    std::vector<size_t> order(desc.data.ndims);
+
+    std::iota(order.begin(), order.end(), 0);
+    std::sort(order.begin(), order.end(),
+              [&strides] (size_t ind_l, size_t ind_r) {
+                  return (strides[ind_l] > strides[ind_r]);
+              });
+    return order;
+}
+
+static bool compare_strides(std::vector<size_t> a, std::vector<size_t> b) {
+    return std::equal(a.begin(), a.end(), b.begin());
+}
+
 cldnn::format find_data_format(dnnl::memory::desc desc) {
     auto onednn_desc = get_format_by_desc(desc);
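get_order() recovers the logical dimension order of a blocked memory descriptor by ranking dimension indices by descending stride, so the outermost dimension (largest stride) comes first. A small self-contained illustration using a plain stride vector in place of dnnl::memory::desc:

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <numeric>
#include <vector>

// Standalone illustration of the stride-order computation in get_order().
std::vector<std::size_t> order_from_strides(const std::vector<std::size_t>& strides) {
    std::vector<std::size_t> order(strides.size());
    std::iota(order.begin(), order.end(), 0);
    std::sort(order.begin(), order.end(),
              [&](std::size_t l, std::size_t r) { return strides[l] > strides[r]; });
    return order;
}

int main() {
    // A 4D bfyx-like tensor of shape 1x32x64x64: strides {131072, 4096, 64, 1}
    // already rank b > f > y > x, so the recovered order is {0, 1, 2, 3}.
    for (std::size_t d : order_from_strides({131072, 4096, 64, 1}))
        std::cout << d << ' ';
    std::cout << '\n';  // prints: 0 1 2 3
}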
@@ -377,26 +394,26 @@ cldnn::format find_data_format(dnnl::memory::desc desc) {
         return convert_data_format(onednn_desc);
     } else {
         auto blk = desc.data.format_desc.blocking;
-        if (desc.data.ndims == 5 && blk.inner_nblks == 1
-            && blk.inner_blks[0] == 2
-            && blk.inner_idxs[0] == 1) {
-            return cldnn::format::b_fs_zyx_fsv2;
-        }
-        if (desc.data.ndims == 4 && blk.inner_nblks == 1
-            && blk.inner_blks[0] == 2
-            && blk.inner_idxs[0] == 1) {
-            return cldnn::format::b_fs_yx_fsv2;
-        }
-        if (desc.data.ndims == 4 && blk.inner_nblks == 2
-            && blk.inner_blks[0] == 16 && blk.inner_blks[1] == 32
-            && blk.inner_idxs[0] == 0 && blk.inner_idxs[1] == 1) {
-            return cldnn::format::bs_fs_yx_bsv16_fsv32;
-        }
-        if (desc.data.ndims == 5 && blk.inner_nblks == 2
-            && blk.inner_blks[0] == 16 && blk.inner_blks[1] == 32
-            && blk.inner_idxs[0] == 0 && blk.inner_idxs[1] == 1) {
-            return cldnn::format::bs_fs_zyx_bsv16_fsv32;
+        auto order = get_order(desc);
+        for (int32_t fmt_idx = format::bfyx ; fmt_idx < format::format_num ; fmt_idx++) {
+            auto candidate_trait = format::traits(static_cast<format::type>(fmt_idx));
+            if (desc.data.ndims == static_cast<int>(candidate_trait._order.size())
+                && blk.inner_nblks == static_cast<int>(candidate_trait.block_sizes.size())
+                && compare_strides(order, candidate_trait._order)) {
+                bool is_match = true;
+                for (size_t idx = 0 ; idx < candidate_trait.block_sizes.size() ; idx++) {
+                    if (blk.inner_blks[idx] != static_cast<int>(candidate_trait.block_sizes[idx].second)
+                        || blk.inner_idxs[idx] != static_cast<int>(candidate_trait.block_sizes[idx].first)) {
+                        is_match = false;
+                        break;
+                    }
+                }
+
+                if (is_match)
+                    return static_cast<format::type>(fmt_idx);
+            }
         }

         std::stringstream msg;
         msg << "Unsupported onednn dnnl::memory::desc find_data_format. "
             << "ndims: " << desc.data.ndims
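Instead of hard-coding each blocked layout, the rewritten find_data_format() walks the format table and matches the stride-derived dimension order plus the inner block sizes and blocked-dimension indices against each candidate's traits. A reduced sketch of that matching loop; FormatTrait and BlockedDesc are hypothetical stand-ins for format::traits() and the oneDNN blocking descriptor:

#include <cstddef>
#include <utility>
#include <vector>

// Hypothetical trait record: order is the dimension order, block_sizes holds
// {blocked_dim_index, block_size} pairs, mirroring inner_idxs/inner_blks.
struct FormatTrait {
    int id;
    std::vector<std::size_t> order;
    std::vector<std::pair<std::size_t, std::size_t>> block_sizes;
};

struct BlockedDesc {
    std::vector<std::size_t> order;                           // from strides, see get_order()
    std::vector<std::pair<std::size_t, std::size_t>> blocks;  // {inner_idxs[i], inner_blks[i]}
};

int match_format(const BlockedDesc& d, const std::vector<FormatTrait>& table) {
    for (const auto& t : table) {
        // Matching the order vector also fixes ndims; blocks must agree in
        // count, blocked dimension, and block size, as in the diff above.
        if (d.order == t.order && d.blocks == t.block_sizes)
            return t.id;
    }
    return -1;  // no match: the real code reports an unsupported descriptor
}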
@@ -471,19 +488,7 @@ cldnn::format find_format(dnnl::memory::desc desc, bool is_grouped) {
         return convert_format(onednn_desc, is_grouped);
     } else {
         auto blk = desc.data.format_desc.blocking;
-
-        auto strides = blk.strides;
-        std::vector<size_t> order(desc.data.ndims);
-        std::iota(order.begin(), order.end(), 0);
-        std::sort(order.begin(), order.end(),
-                  [&strides] (size_t ind_l, size_t ind_r) {
-                      return strides[ind_l] > strides[ind_r];
-                  });
-
-        auto compare_strides = [](std::vector<size_t> &a, std::vector<size_t> b) -> bool {
-            return std::equal(a.begin(), a.end(), b.begin());
-        };
-
+        auto order = get_order(desc);
         if (is_grouped) {
             if (desc.data.ndims == 5 && blk.inner_nblks == 3
                 && blk.inner_blks[0] == 8 && blk.inner_blks[1] == 8 && blk.inner_blks[2] == 2
@@ -196,7 +196,7 @@ public:
     // Returns whether reorder between "prev" with format fmt_prev and "next" with format fmt_next
     // can be fused into next.
     bool can_fuse_reorder(program_node& prev, program_node& next, format fmt_prev, format fmt_next);
-    bool can_fuse_reorder_to_prev(program_node& prev, program_node* next, format fmt_prev, format fmt_next);
+    bool can_fuse_reorder_to_prev(program_node& prev, reorder_node& target_node, format fmt_prev, format fmt_next);

     void set_optimization_attribute(optimization_attributes_type attribute, int32_t val);
     optimization_attributes get_optimization_attributes() { return _optimization_attributes; }
@@ -210,5 +210,9 @@ public:
     size_t get_total_conv_count();

     bool should_select_b_fs_yx_fsv16_layout(convolution_node const& node, layout const& output_or_weights_layout);
+
+#ifdef ENABLE_ONEDNN_FOR_GPU
+    void select_preferred_formats_for_onednn(program_node& node, dnnl::primitive_desc prim_desc);
+#endif
 };
 }  // namespace cldnn
@@ -310,12 +310,14 @@ private:
     reorder_factory& _rf;
 };

-class set_required_layouts : public base_pass {
+class select_preferred_formats : public base_pass {
 public:
-    set_required_layouts() : base_pass("set_required_layouts") {}
+    explicit select_preferred_formats(layout_optimizer& lo_ref) :
+        base_pass("select_preferred_formats"), _lo(lo_ref) {}

 private:
     void run(program& p) override;
+    layout_optimizer& _lo;
 };

 class trim_to_outputs : public base_pass {
@@ -198,6 +198,9 @@ public:
     void remove_dependency(size_t idx);
     void remove_dependency(program_node& node);

+    size_t get_dependency_index(program_node& node) const;
+    size_t get_user_index(program_node& node) const;
+
     std::set<primitive_id> get_memory_dependencies() const;
     void add_memory_dependency(primitive_id);
     void add_memory_dependency(std::vector<primitive_id>);
@@ -419,10 +422,18 @@ public:
         cur_id = 0;
     }

-    format::type get_required_input0() const { return required_input0; }
-    format::type get_required_output() const { return required_output; }
-    void set_required_input0(format::type type) { required_input0 = type; }
-    void set_required_output(format::type type) { required_output = type; }
+    std::vector<format::type> get_preferred_input_fmts() const { return preferred_input_fmts; }
+    std::vector<format::type> get_preferred_output_fmts() const { return preferred_output_fmts; }
+    format::type get_preferred_input_fmt(size_t idx = 0) const {
+        return (idx < preferred_input_fmts.size()) ? preferred_input_fmts.at(idx) : format::any;
+    }
+    format::type get_preferred_output_fmt(size_t idx = 0) const {
+        return (idx < preferred_output_fmts.size()) ? preferred_output_fmts.at(idx) : format::any;
+    }
+
+    void init_preferred_fmt(size_t dep_size, size_t user_size);
+    void set_preferred_input_fmt(size_t idx, format::type type);
+    void set_preferred_output_fmt(size_t idx, format::type type);

 protected:
@@ -437,8 +448,8 @@ protected:
     bool valid_output_layout = false;
     layout output_layout = layout(data_types::f32, format::bfyx, tensor());

-    format::type required_input0;
-    format::type required_output;
+    std::vector<format::type> preferred_input_fmts;
+    std::vector<format::type> preferred_output_fmts;

     std::vector<program_node*> dependencies;
     std::vector<std::pair<program_node*, int>> dependencies_new;
@@ -15,6 +15,7 @@
 #include <sstream>

 #include "gemm_inst.h"
+#include "deconvolution_inst.h"
 #include "eltwise_inst.h"
 #include "pooling_inst.h"
 #include "reduce_inst.h"
@@ -25,10 +26,16 @@
 #include "depth_to_space_inst.h"
 #include "region_yolo_inst.h"
 #include "prior_box_inst.h"
+#include "to_string_utils.h"
 #include <vector>
 #include <memory>
 #include <utility>

+#ifdef ENABLE_ONEDNN_FOR_GPU
+#include <oneapi/dnnl/dnnl.hpp>
+#include "impls/onednn/utils.hpp"
+#endif
+
 using namespace cldnn;

 static size_t get_post_ops_count(const program_node& node) {
@@ -230,8 +237,9 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
             return true;

         // Do not remove reorder if it is necessary to fulfill required_input
-        auto reorder_layout = next.get_dependency(0).get_output_layout();
-        if (reorder_layout.format == next.get_required_input0()
+        auto& reorder_node = next.get_dependency(0);
+        auto reorder_layout = reorder_node.get_output_layout();
+        if (reorder_layout.format == next.get_preferred_input_fmt(next.get_dependency_index(reorder_node))
             && !reorder_layout.data_padding)
             return false;
@@ -332,7 +340,7 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,

         // Remove Reorder for Convolution if mixed layout.
         auto& node = prev.get_users().front();
-        if (prev.get_output_layout().format == next.get_required_input0() &&
+        if (prev.get_output_layout().format == next.get_preferred_input_fmt() &&
             node->get_output_layout().data_padding == prev.get_output_layout().data_padding)
             return true;
@@ -377,17 +385,18 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
     return false;
 }

-bool layout_optimizer::can_fuse_reorder_to_prev(program_node& prev, program_node* next, format fmt_prev, format fmt_next) {
-    if (prev.is_dynamic() || (next && next->is_dynamic()))
+bool layout_optimizer::can_fuse_reorder_to_prev(program_node& prev, reorder_node& node, format fmt_prev, format fmt_next) {
+    if (prev.is_dynamic() || (!node.get_users().empty() && node.get_users().front()->is_dynamic()))
         return false;

     // Ref kernels are the main for depth_to_space, region_yolo and detection_output. It can do anything. Should not see next.
     if (prev.is_type<depth_to_space>() || prev.is_type<region_yolo>() || prev.is_type<detection_output>())
         return true;

-    if (next == nullptr)
+    if (node.get_users().empty())
         return false;

+    auto next = node.get_users().front();
     auto dt_prev = prev.get_output_layout().data_type;
     auto dt_next = next->get_output_layout().data_type;
     auto use_onednn_impls = _optimization_attributes.use_onednn_impls;
@@ -436,8 +445,8 @@ bool layout_optimizer::can_fuse_reorder_to_prev(program_node& prev, program_node

     // Remove Reorder after convolution if possible.
     if (use_onednn_impls) {
-        auto reorder_layout = next->get_dependency(0).get_output_layout();
-        if (reorder_layout.format == prev.get_required_output() &&
+        auto reorder_layout = node.get_output_layout();
+        if (reorder_layout.format == prev.get_preferred_output_fmt() &&
             reorder_layout.data_padding == prev.get_output_layout().data_padding)
             return true;
@@ -1029,7 +1038,7 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout,
     bool i8_u8_input = input_layout.data_type == data_types::u8 || input_layout.data_type == data_types::i8;

     if (use_onednn_impls && onednn_valid_post_ops) {
-        expected_format = node.get_required_output();
+        expected_format = node.get_preferred_output_fmt();
     } else {
         /* *************************** Native impls format selection part ************************** */
         if (use_onednn_impls && i8_u8_input) {
@@ -1123,7 +1132,7 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout,

     if (use_onednn_impls && is_node_for_onednn(node)) {
         // XXX: need to take the situation into consideration where it is called from prepare_primitive_fusing
-        expected_format = node.get_required_output();
+        expected_format = node.get_preferred_output_fmt();
     } else if (_optimization_attributes.b_fs_zyx_fsv16_network &&
                deconvolution_b_fs_zyx_fsv16_opt(current_layout, output_or_weights_layout, prim)) {
         if ((current_layout.data_type == data_types::f32 && expected_tensor.batch[0] % 16 == 0) ||
@@ -1633,8 +1642,9 @@ format layout_optimizer::get_preferred_format(program_node& node) {
     };

     if (use_onednn_impls) {
-        if (node.get_users().front()->get_required_input0() != format::any) {
-            expected = node.get_users().front()->get_required_input0();
+        auto& user = node.get_users().front();
+        if (user->get_preferred_input_fmt(user->get_dependency_index(node)) != format::any) {
+            expected = user->get_preferred_input_fmt(user->get_dependency_index(node));
         } else {
             expected = format::any;
         }
@@ -1738,6 +1748,53 @@ format layout_optimizer::get_preferred_format(program_node& node) {
     return expected;
 }

+#ifdef ENABLE_ONEDNN_FOR_GPU
+void layout_optimizer::select_preferred_formats_for_onednn(program_node& node, dnnl::primitive_desc prim_desc) {
+    GPU_DEBUG_GET_INSTANCE(debug_config);
+    if (node.is_input() || !are_data_types_suitable_for_onednn(node)) {
+        return;
+    }
+
+    node.init_preferred_fmt(node.get_dependencies().size(), node.get_users().size());
+    if (node.is_type<convolution>() || node.is_type<deconvolution>()) {
+        for (size_t idx = 0 ; idx < node.get_dependencies().size() ; idx++) {
+            if (node.get_dependency(idx).is_constant())
+                continue;
+
+            // Conv or deconv gets a preferred format for its data input based on source memory description
+            // But an input format for fused post-ops should be same with an output format of conv/deconv
+            size_t prim_input;
+            if (node.is_type<convolution>())
+                prim_input = node.get_dependency_index(node.as<convolution>().input());
+            if (node.is_type<deconvolution>())
+                prim_input = node.get_dependency_index(node.as<deconvolution>().input());
+
+            // Note: did not handle attribute properly. especially for zero-point
+            cldnn::format src_fmt = format::any;
+            if (idx == prim_input)
+                src_fmt = onednn::find_data_format(prim_desc.src_desc());
+            else  // Dep for fused post ops
+                src_fmt = onednn::find_data_format(prim_desc.dst_desc());
+
+            node.set_preferred_input_fmt(idx, src_fmt);
+
+            auto dst_fmt = onednn::find_data_format(prim_desc.dst_desc());
+            if (node.get_preferred_output_fmt() == format::any) {
+                for (size_t usr = 0 ; usr < node.get_users().size() ; usr++)
+                    node.set_preferred_output_fmt(usr, dst_fmt);
+            }
+
+            GPU_DEBUG_IF(debug_config->verbose >= 2) {
+                std::cout << "select_preferred_formats:" << node.id() << ": " << fmt_to_str(src_fmt) << " --> " << fmt_to_str(dst_fmt)
+                          << " For index : " << idx << std::endl;
+            }
+        }
+    }
+
+    return;
+}
+#endif  // ENABLE_ONEDNN_FOR_GPU
+
 bool layout_optimizer::all_users_simple_format_until_output(program_node& origin_node, program_node& cur_node, int32_t cur_depth, int32_t max_depth) {
     if (cur_node.is_output()) return true;
     if (cur_depth > max_depth) return false;
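In select_preferred_formats_for_onednn above, the conv/deconv data input takes the format of the oneDNN source memory descriptor, while every other non-constant dependency (inputs feeding fused post-ops) takes the destination format so it matches the primitive's output. A condensed sketch of that assignment rule, with the oneDNN pieces stubbed out; the names below are illustrative stand-ins, not the real cldnn API:

#include <cstddef>
#include <vector>

enum class Fmt { any, b_fs_yx_fsv16 };

// src_format/dst_format play the role of
// onednn::find_data_format(prim_desc.src_desc() / dst_desc()).
struct DepInfo { bool is_constant; bool is_primary_data_input; };

std::vector<Fmt> assign_input_fmts(const std::vector<DepInfo>& deps,
                                   Fmt src_format, Fmt dst_format) {
    std::vector<Fmt> result(deps.size(), Fmt::any);
    for (std::size_t i = 0; i < deps.size(); ++i) {
        if (deps[i].is_constant)
            continue;  // constant weights/bias keep Fmt::any
        // Data input follows the source descriptor; fused post-op inputs
        // must match the primitive's output format.
        result[i] = deps[i].is_primary_data_input ? src_format : dst_format;
    }
    return result;
}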
@@ -543,7 +543,7 @@ void program::pre_optimize_graph(bool is_internal) {

     apply_opt_pass<prepare_primitive_fusing>(lo);

-    apply_opt_pass<set_required_layouts>();
+    apply_opt_pass<select_preferred_formats>(lo);

     apply_opt_pass<reorder_inputs>(lo, rf);
     // Ideally this should be done before fusing to simplify logic and make the pass more powerful,
@@ -31,7 +31,7 @@ using namespace cldnn;
 thread_local size_t program_node::cur_id = 0;

 program_node::program_node(std::shared_ptr<primitive> prim, program& prog)
-    : desc(prim), myprog(prog), required_input0(format::any), required_output(format::any), org_id(prim ? (prim->id) : 0) {
+    : desc(prim), myprog(prog), preferred_input_fmts({}), preferred_output_fmts({}), org_id(prim ? (prim->id) : 0) {
     if (prim) {
         output_layout.data_padding = prim->output_padding;
         num_outputs = prim->num_outputs;
@@ -211,6 +211,26 @@ void program_node::remove_dependency(program_node& node) {
             remove_dependency(i);
 }

+size_t program_node::get_user_index(program_node& node) const {
+    size_t idx = 0;
+    for (auto& user : users) {
+        if (user == &node)
+            return idx;
+        else
+            idx++;
+    }
+
+    OPENVINO_ASSERT(false, "Search invalid user node" + node.id() + " node");
+}
+
+size_t program_node::get_dependency_index(program_node& node) const {
+    for (size_t i = 0; i < dependencies.size(); ++i)
+        if (dependencies[i] == &node)
+            return i;
+
+    OPENVINO_ASSERT(false, "Search invalid dependency node" + node.id() + " node");
+}
+
 bool program_node::is_detached(bool whole_branch) {
     if (!users.empty())
         return false;
@@ -353,7 +373,7 @@ std::map<size_t, memory::ptr> program_node::get_const_memory_deps() const {
 void program_node::invalidate_users() const {
     for (auto& user : users) {
         if (user->valid_output_layout) {
-            if (user->get_required_output() != format::any)
+            if (user->get_preferred_output_fmt() != format::any)
                 continue;
             user->valid_output_layout = false;
             user->invalidate_users();
@@ -407,6 +427,25 @@ bool program_node::need_lockable_memory() const {
     return need_lockable_mem;
 }

+void program_node::init_preferred_fmt(size_t dep_size, size_t user_size) {
+    preferred_input_fmts.resize(dep_size, format::any);
+    preferred_output_fmts.resize(user_size, format::any);
+}
+
+void program_node::set_preferred_input_fmt(size_t idx, format::type type) {
+    if (idx >= preferred_input_fmts.size())
+        preferred_input_fmts.resize(idx+1, format::any);
+
+    preferred_input_fmts.at(idx) = type;
+}
+
+void program_node::set_preferred_output_fmt(size_t idx, format::type type) {
+    if (idx >= preferred_output_fmts.size())
+        preferred_output_fmts.resize(idx+1, format::any);
+
+    preferred_output_fmts.at(idx) = type;
+}
+
 /* ----------------------------------------- */
 /* Onednn fused operations integration logic */
 /* ----------------------------------------- */
@@ -1254,4 +1293,5 @@ void program_node::init_onednn_primitive_attributes() {
     add_onednn_attrs(attrs);
 }

+
 #endif // ENABLE_ONEDNN_FOR_GPU
@@ -121,7 +121,8 @@ static void print_help_messages() {
                                   " Supported on only on linux.");
     message_list.emplace_back("OV_GPU_SerialCompile", "Serialize creating primitives and compiling kernels");
     message_list.emplace_back("OV_GPU_ForceImplType", "Force implementation type of a target primitive or layer. [primitive or layout_name]:[impl_type]"
-                                  "For primitives, fc:onednn, fc:ocl, do:cpu, do:ocl, reduce:ocl and reduce:onednn are supported");
+                                  " For primitives, fc:onednn, fc:ocl, do:cpu, do:ocl, reduce:onednn, reduce:ocl, concat:onednn,"
+                                  " and concat:ocl are supported");
     message_list.emplace_back("OV_GPU_MaxKernelsPerBatch", "Maximum number of kernels in a batch during compiling kernels");

     auto max_name_length_item = std::max_element(message_list.begin(), message_list.end(),
@@ -0,0 +1,65 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "test_utils.h"
+
+#include "intel_gpu/runtime/engine.hpp"
+
+#include "intel_gpu/primitives/convolution.hpp"
+#include "intel_gpu/graph/program.hpp"
+#include "data_inst.h"
+#include "convolution_inst.h"
+#include "intel_gpu/graph/network.hpp"
+#include "pass_manager.h"
+#include "to_string_utils.h"
+
+#include "program_wrapper.h"
+
+#include <memory>
+
+using namespace cldnn;
+using namespace ::tests;
+using namespace testing;
+
+TEST(test_select_preferred_formats, setting_target_conv_format) {
+    auto& engine = get_test_engine();
+    auto input = engine.allocate_memory({ data_types::f16, format::bfyx, { 1, 32, 64, 64 } });
+    auto weights = engine.allocate_memory({ data_types::f16, format::bfyx, { 1, 32, 64, 64 } });
+
+    topology topology;
+    topology.add(data("weights", weights));
+    topology.add(input_layout("input", input->get_layout()));
+    topology.add(reorder("reorder", "input", format::b_fs_yx_fsv16, data_types::f16)),
+    topology.add(convolution("conv1", "reorder", { "weights" }));
+
+    build_options build;
+    build.set_option(build_option::allow_new_shape_infer(true));
+    implementation_desc impl = { format::b_fs_yx_fsv16, std::string(""), impl_types::onednn };
+    build.set_option(build_option::force_implementations({ {"conv1", impl} }));
+
+    layout_optimizer lo(true);
+    auto prog = program::build_program(engine, topology, build, false, true);
+
+    program_wrapper::apply_opt_pass<select_preferred_formats>(*prog, lo);
+
+    ASSERT_NE(prog, nullptr);
+
+    auto itr = prog->get_processing_order().begin();
+    while (itr != prog->get_processing_order().end()) {
+        auto node_ptr = *itr++;
+        if (!node_ptr->is_type<convolution>())
+            continue;
+
+        auto& node = node_ptr->as<convolution>();
+        auto input_fmt = node.get_preferred_input_fmt(0);
+        auto output_fmt = node.get_preferred_output_fmt(0);
+        if (engine.get_device_info().supports_immad) {
+            ASSERT_EQ(input_fmt, format::b_fs_yx_fsv16);
+            ASSERT_EQ(output_fmt, format::b_fs_yx_fsv16);
+        } else {
+            ASSERT_EQ(input_fmt, format::any);
+            ASSERT_EQ(output_fmt, format::any);
+        }
+    }
+}