diff --git a/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp b/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp index aef93029ce1..61e817c854c 100644 --- a/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp +++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp @@ -187,12 +187,20 @@ void prepare_primitive_fusing::fuse_reorders(program &p) { void prepare_primitive_fusing::fuse_activations(program &p) { bool is_debug = p.get_options().get()->enabled(); std::map> fusing_history; + bool use_onednn_impls = false; + +#ifdef ENABLE_ONEDNN_FOR_GPU + auto& engine = p.get_engine(); + if (engine.get_device_info().supports_immad && engine.configuration().queue_type == queue_types::in_order) + use_onednn_impls = true; +#endif + auto itr = p.get_processing_order().begin(); while (itr != p.get_processing_order().end()) { auto node_itr = itr++; auto& node = (*node_itr); - program_helpers::do_for_types(*node, [&p, &is_debug, &fusing_history](activation_node& node) { + program_helpers::do_for_types(*node, [&p, &is_debug, &fusing_history, &use_onednn_impls](activation_node& node) { auto& input = node.input(); auto id = node.id(); // Restrictions: @@ -239,6 +247,9 @@ void prepare_primitive_fusing::fuse_activations(program &p) { return; } + if (input.is_type() && use_onednn_impls) + return; + if (input.get_fused_primitives().empty()) { input.add_fused_activation(node.get_primitive()->activation_function, node.get_primitive()->additional_params); for (size_t i = 0; i < node.get_fused_activations_funcs().size(); i++) { diff --git a/inference-engine/thirdparty/clDNN/src/graph_optimizer/remove_redundant_reorders.cpp b/inference-engine/thirdparty/clDNN/src/graph_optimizer/remove_redundant_reorders.cpp index 2737572aa0a..07a9f63a9a0 100644 --- a/inference-engine/thirdparty/clDNN/src/graph_optimizer/remove_redundant_reorders.cpp +++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/remove_redundant_reorders.cpp @@ -56,7 +56,12 @@ void remove_redundant_reorders::run(program& p) { // Avoid different data types between input and output auto same_data_type = input.get_output_layout().data_type == output_layout.data_type; - if (!same_data_type) + auto i8_u8_input = input.get_output_layout().data_type == data_types::i8 || + input.get_output_layout().data_type == data_types::u8; + auto quantize_user = node.get_users().front()->is_type() && + node.get_users().size() == 1; + + if (!same_data_type && !(i8_u8_input && quantize_user)) continue; // Avoid optimization of nv12 reorder @@ -323,7 +328,7 @@ void remove_redundant_reorders::run(program& p) { auto& dep = node_ptr->get_dependency(0); if (!usr->is_type() || (dep.get_output_layout().format != format::b_fs_yx_fsv16 && - dep.get_output_layout().format != format::fs_b_yx_fsv32 && + (lo.get_optimization_attributes().use_onednn_impls || dep.get_output_layout().format != format::fs_b_yx_fsv32) && dep.get_output_layout().format != format::bfyx)) continue; @@ -348,7 +353,7 @@ void remove_redundant_reorders::run(program& p) { auto& dep = node->get_dependency(0); if (!(usr->is_type()) || - usr->get_output_layout().data_type != dep.get_output_layout().data_type || + node->get_output_layout().data_type != dep.get_output_layout().data_type || dep.get_output_layout().format != format::bfyx) return false; if (usr->as().get_preferred_impl_type() != impl_types::onednn && diff --git a/inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp b/inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp index 2bf19225e76..34f3c1498cf 100644 --- a/inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp +++ b/inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp @@ -178,6 +178,8 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next, auto prev_output_layout = prev.get_output_layout(); auto next_output_layout = next.get_output_layout(); auto prev_dt = prev.get_output_layout().data_type; + auto next_dt = next.get_output_layout().data_type; + auto use_onednn_impls = _optimization_attributes.use_onednn_impls; auto is_input_idx = [&](size_t idx) -> bool { if (&next.get_dependency(idx) == &prev) @@ -191,7 +193,7 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next, return true; if (next.is_type() && - ((prev_simple && next_simple) || + (((prev_simple && next_simple) && (prev_dt == next_dt)) || ((fmt_prev == format::b_fs_yx_fsv4 && fmt_next == format::bfyx) && (prev_dt == data_types::u8 || prev_dt == data_types::i8)))) return true; @@ -223,9 +225,12 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next, // Additional check: fmt_prev == fmt_next is added only when onednn is enabled. if (next.is_type() && - fmt_prev == format::bfyx && (!get_optimization_attributes().use_onednn_impls || fmt_prev == fmt_next) && + (!use_onednn_impls || fmt_prev == fmt_next) && + (fmt_prev == format::bfyx || fmt_prev == format::bs_fs_yx_bsv4_fsv2) && ((fmt_next == format::fs_b_yx_fsv32 && next.as().get_primitive()->groups == 1) || (fmt_next == format::b_fs_yx_fsv32 && (prev_output_layout.size.feature[0] == 3 || prev_output_layout.size.feature[0] == 4)) || + (fmt_next == format::bs_fs_yx_bsv32_fsv32 && (prev_output_layout.size.feature[0] == 3 || prev_output_layout.size.feature[0] == 4)) || + (fmt_next == format::bs_fs_yx_bsv32_fsv16 && (prev_output_layout.size.feature[0] == 3 || prev_output_layout.size.feature[0] == 4)) || (fmt_next == format::bs_fs_yx_bsv16_fsv16 && next_output_layout.size.feature[0] % 16 == 0 && prev_output_layout.size.feature[0] == 3) || (fmt_next == format::bs_fs_yx_bsv16_fsv16 && next_output_layout.size.feature[0] >= 16 && prev_output_layout.size.feature[0] == 3 && (next_output_layout.data_type != data_types::i8 && next_output_layout.data_type != data_types::u8)))) @@ -239,8 +244,10 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next, return true; if (next.is_type() && - fmt_prev == format::b_fs_yx_fsv4 && + (fmt_prev == format::b_fs_yx_fsv4 || fmt_prev == format::bs_fs_yx_bsv4_fsv4) && ((fmt_next == format::b_fs_yx_fsv32 && (prev_output_layout.size.feature[0] == 3 || prev_output_layout.size.feature[0] == 4)) || + (fmt_next == format::bs_fs_yx_bsv32_fsv32 && (prev_output_layout.size.feature[0] == 3 || prev_output_layout.size.feature[0] == 4)) || + (fmt_next == format::bs_fs_yx_bsv4_fsv4 && (prev_output_layout.size.feature[0] == 3 || prev_output_layout.size.feature[0] == 4)) || (fmt_next == format::b_fs_yx_fsv16 && next_output_layout.size.feature[0] >= 16 && (prev_output_layout.size.feature[0] == 3 || (prev_output_layout.size.feature[0] == 4 && (prev_dt == data_types::u8 || prev_dt == data_types::i8)))))) return true; @@ -263,12 +270,51 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next, next_output_layout.size.feature[0] >= 16 && prev_output_layout.size.feature[0] == 3)) return true; + if (use_onednn_impls) { + if (next.is_type() && (fmt_prev == format::bfyx) && (fmt_next == format::bs_fs_yx_bsv4_fsv2) && + prev.is_input() && (prev_dt == data_types::u8 || prev_dt == data_types::i8)) + return true; + + if (next.is_type() && (prev.is_type() || prev.is_type()) && + (fmt_prev == format::bfyx || fmt_prev == format::bs_fs_yx_bsv4_fsv2) && + ((fmt_next == format::bs_fs_yx_bsv32_fsv32 && (prev_output_layout.size.feature[0] == 3 || prev_output_layout.size.feature[0] == 4)) || + (fmt_next == format::bs_fs_yx_bsv32_fsv16 && (prev_output_layout.size.feature[0] == 3 || prev_output_layout.size.feature[0] == 4)))) + return true; + + if (next.is_type()) + return true; + + if (next.is_type()) { + auto is_rotating_except_batch = [](const std::vector& order) { + // Target transform: Rotate feature dim to back to be taken as inner-most axis + // ex) 0(b), 4(f), 1(z), 2(y), 3(x) + // ex) 0(b), 3(f), 1(y), 2(x) + if ((int32_t) order[1] != order.size() - 1) return false; + if ((int32_t) order[0] != 0) return false; + for (int32_t i = 2; i < (int32_t) order.size(); ++i) { + if ((int32_t)order[i] != (i - 1)) return false; + } + return true; + }; + + auto& permute_order = next.as().get_primitive()->permute_order; + if ((fmt_prev == format::b_fs_yx_fsv4 || fmt_prev == format::b_fs_yx_fsv32 || fmt_prev == format::b_fs_zyx_fsv32 || + fmt_prev == format::b_fs_yx_fsv16 || fmt_prev == format::b_fs_zyx_fsv16 || fmt_prev == format::bs_fs_yx_bsv16_fsv16) + && permute_order[1] == 2 + && (!is_rotating_except_batch(permute_order))) { + return false; + } + return true; + } + } + return false; } bool layout_optimizer::can_fuse_reorder_to_prev(program_node& prev, program_node& next, format fmt_prev, format fmt_next) { auto dt_prev = prev.get_output_layout().data_type; auto dt_next = next.get_output_layout().data_type; + auto use_onednn_impls = _optimization_attributes.use_onednn_impls; if (prev.is_type()) return true; @@ -283,7 +329,7 @@ bool layout_optimizer::can_fuse_reorder_to_prev(program_node& prev, program_node return true; if (prev.is_type() && - (fmt_next == format::b_fs_yx_fsv4 || fmt_next == format::b_fs_yx_fsv32 || fmt_next == format::b_fs_zyx_fsv32 || + (fmt_next == format::b_fs_yx_fsv4 || fmt_next == format::b_fs_zyx_fsv32 || (fmt_next == format::b_fs_yx_fsv32 && !use_onednn_impls) || fmt_next == format::b_fs_yx_fsv16 || fmt_next == format::b_fs_zyx_fsv16 || fmt_next == format::bs_fs_yx_bsv16_fsv16)) return true; @@ -310,6 +356,11 @@ bool layout_optimizer::can_fuse_reorder_to_prev(program_node& prev, program_node return true; } + if (use_onednn_impls) { + if (prev.is_type() && fmt_next == format::bs_fs_yx_bsv32_fsv16 && fmt_prev == format::bs_fs_yx_bsv4_fsv2) + return true; + } + return false; } @@ -1270,17 +1321,60 @@ format layout_optimizer::get_preferred_format(program_node& node) { node.as(), layout{ data_types::f32, format::bfyx, tensor{} }).format; } else if (node.is_type()) { + bool use_onednn_impls = _optimization_attributes.use_onednn_impls; auto layout = node.get_output_layout(); if (layout.format.spatial_num() == 2 && (layout.data_type == data_types::i8 || layout.data_type == data_types::u8) && layout.size.batch[0] % 16 == 0) { - if (layout.size.feature[0] > 8) { + if (use_onednn_impls && layout.size.batch[0] % 32 == 0) { + if (node.get_users().size() == 1 && node.get_users().front()->is_type()) { + auto& conv = node.get_users().front()->as(); + auto ws = conv.get_dependency(1).get_output_layout().size; + if (data_type_traits::is_floating_point(conv.get_output_layout().data_type) || ws.spatial[0] != 7 || conv.get_primitive()->groups > 1) + expected = format::bfyx; + else + expected = format::bs_fs_yx_bsv4_fsv4; + + auto conv_output_layout = conv.get_output_layout(); + auto weights_layout = conv.weights(0).get_output_layout(); + format expected_conv_fmt = get_expected_layout(conv_output_layout, conv, weights_layout).format; + if (expected == format::bfyx && expected_conv_fmt == format::bs_fs_yx_bsv32_fsv32 && + layout.size.feature[0] % 32 == 0) { + expected = expected_conv_fmt; + } + } else { + expected = format::bfyx; + } + } else if (layout.size.feature[0] > 8) { expected = format::b_fs_yx_fsv16; } else { - expected = format::b_fs_yx_fsv4; + if (use_onednn_impls && layout.size.batch[0] % 16 == 0 && node.get_users().front()->is_type()) { + auto& conv = node.get_users().front()->as(); + auto ws = conv.get_dependency(1).get_output_layout().size; + if (data_type_traits::is_floating_point(conv.get_output_layout().data_type) || ws.spatial[0] != 7 || conv.get_primitive()->groups > 1) + expected = format::bfyx; + else + expected = format::bs_fs_yx_bsv4_fsv4; + } else { + expected = format::b_fs_yx_fsv4; + } } } else if (layout.format.spatial_num() == 3 && (layout.data_type == data_types::i8 || layout.data_type == data_types::u8)) { expected = format::b_fs_zyx_fsv16; + } else if (use_onednn_impls) { + if (node.get_users().size() == 1 && node.get_users().front()->is_type() && + (layout.data_type == data_types::i8 || layout.data_type == data_types::u8)) { + auto& conv = node.get_users().front()->as(); + auto conv_output_layout = conv.get_output_layout(); + auto weights_layout = conv.weights(0).get_output_layout(); + format expected_conv_fmt = get_expected_layout(conv_output_layout, conv, weights_layout).format; + if (layout.format.spatial_num() == 2 && layout.size.feature[0] % 32 == 0 && expected_conv_fmt == format::b_fs_yx_fsv32) + expected = expected_conv_fmt; + else + expected = format::bfyx; + } else { + expected = format::bfyx; + } } } else if (node.is_type() || node.is_type()) { expected = node.get_output_layout().format;