[GPU] Insert reorder for eltwise const dependency (#14013)

* [GPU] Insert reorder for eltwise const dependency

* [GPU] Fixes for const reorder logic. Updated one code generator config
Vladimir Paramuzov 2022-11-22 09:10:00 +04:00 committed by GitHub
parent 6b4b01aecf
commit dfb31f485b
3 changed files with 80 additions and 5 deletions

@@ -44,6 +44,7 @@ void add_required_reorders::add_reorder(program& p, program_node* node, program_
}
void add_required_reorders::run(program& p) {
bool optimize_data = p.get_options().get<build_option_type::optimize_data>()->enabled();
auto usr_itr = p.get_processing_order().begin();
while (usr_itr != p.get_processing_order().end()) {
auto& usr = *usr_itr++;
@@ -51,6 +52,46 @@ void add_required_reorders::run(program& p) {
continue; // only nodes with dependencies
if (usr->is_type<data>())
continue;
if (optimize_data) {
auto fused_ops = usr->get_fused_primitives();
auto out_layout = usr->get_output_layout();
// If there is a fused reorder at the end, we use the input layout of that reorder
// as the target layout for fused ops, because the code generator in many kernels expects that layout rather than the final output layout
// However, the condition below may need adjustment in the future if the codegen of some primitives behaves differently
if (!fused_ops.empty() && fused_ops.back().is_type<reorder>()) {
out_layout = fused_ops.back().input_layout;
}
for (auto& fused_op : fused_ops) {
// Some kernels use blocked aligned sub-group reads for a vector of elements from a dependency tensor
// In that case the jitter checks that the layout of the fused op's input tensor matches the output layout or that broadcast is possible
// The code below inserts an additional reorder node for a const eltwise dependency so that the jitter can handle such a fusion
if (!fused_op.is_type<eltwise>() && !(fused_op.is_type<activation>() && fused_op.total_num_deps == 2))
continue;
auto dep_id = fused_op.dep_start_idx;
if (dep_id >= usr->get_dependencies().size())
continue;
auto& dep = usr->get_dependency(dep_id);
if (!dep.is_type<data>())
continue;
auto dep_layout = dep.get_output_layout();
bool valid_broadcast_case = out_layout.is_static() && dep_layout.is_static() &&
(static_cast<size_t>(out_layout.feature()) == dep_layout.count() || dep_layout.count() == 1);
bool requires_reorder = out_layout.format != dep_layout.format && !valid_broadcast_case;
if (requires_reorder) {
auto new_reorder = std::make_shared<reorder>(dep.id() + "_reorder_" + usr->id(), dep.id(), out_layout.format, dep_layout.data_type);
auto& new_reorder_node = p.get_or_create(new_reorder);
p.add_intermediate(new_reorder_node, *usr, dep);
new_reorder_node.recalc_output_layout(false);
}
}
}
if (usr->type()->does_an_implementation_exist(*usr)) {
if (usr->get_preferred_impl_type() != impl_types::onednn) {
continue;
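
Note on the check above: the extra reorder is skipped only when the constant dependency can be broadcast regardless of layout, i.e. it is a scalar or a per-feature vector whose element count equals the output feature size. A minimal standalone sketch of that predicate (hypothetical helper name and simplified parameters, not the actual pass code):

#include <cstddef>

// A constant eltwise dependency needs a reorder when the formats differ
// and the constant is neither a scalar nor a per-feature vector that
// matches the output feature count (the only layout-agnostic cases).
bool const_dep_requires_reorder(bool formats_match,
                                std::size_t out_feature_count,
                                std::size_t const_element_count) {
    const bool valid_broadcast = const_element_count == 1 ||
                                 const_element_count == out_feature_count;
    return !formats_match && !valid_broadcast;
}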

@@ -267,7 +267,16 @@ JitConstants DeconvolutionKernel_b_fs_zyx_fsv16::GetJitConstants(const deconvolu
BoundaryCheck::ENABLED,
IndexType::TENSOR_COORD,
Tensor::DataChannelName::BATCH };
FusedOpsConfiguration conf_ci = { "_BLOCK_CI", idx_order_block_ci, "blockC00[i]", fused_dt, 1, LoadType::LT_ALIGNED_READ };
auto load_type = LoadType::LT_ALIGNED_READ;
for (auto& fused_op : params.fused_ops) {
if (!fused_op.output_tensor.SameDims(params.outputs[0]) &&
(fused_op.output_tensor.X().v > 1 || fused_op.output_tensor.Y().v > 1 || fused_op.output_tensor.Z().v > 1)) {
load_type = LoadType::LT_UNALIGNED;
idx_order_block_ci[1] = "(g * IC + gic * IC_BLOCK + local_id)";
}
}
FusedOpsConfiguration conf_ci = { "_BLOCK_CI", idx_order_block_ci, "blockC00[i]", fused_dt, 1, load_type };
jit.Merge(MakeFusedOpsJitConstants(params, { conf_c00, conf_c01, conf_ci }));
}
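
Note on the change above: the "_BLOCK_CI" fused-ops configuration no longer hard-codes LT_ALIGNED_READ; if any fused operand has a spatial extent of its own and does not match the output tensor, the kernel falls back to an unaligned load (with a per-work-item channel index), since a blocked sub-group read would otherwise address the wrong elements. A simplified sketch of that decision (hypothetical types mirroring the loop above, not the kernel-selector API):

#include <cstddef>
#include <vector>

enum class LoadType { AlignedRead, Unaligned };

struct FusedOperandDims {
    std::size_t x, y, z;    // spatial extents of the fused operand
    bool same_as_output;    // true if operand dims match the primitive output
};

// Any operand that differs from the output and is not spatially flat
// forces the unaligned load path; otherwise aligned reads stay valid.
LoadType choose_fused_load_type(const std::vector<FusedOperandDims>& operands) {
    for (const auto& d : operands) {
        if (!d.same_as_output && (d.x > 1 || d.y > 1 || d.z > 1))
            return LoadType::Unaligned;
    }
    return LoadType::AlignedRead;
}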

@@ -9,12 +9,8 @@
#include <intel_gpu/primitives/quantize.hpp>
#include <intel_gpu/primitives/eltwise.hpp>
#include <intel_gpu/primitives/fully_connected.hpp>
#include <intel_gpu/primitives/gemm.hpp>
#include <intel_gpu/primitives/binary_convolution.hpp>
#include <intel_gpu/primitives/data.hpp>
#include <intel_gpu/primitives/resample.hpp>
#include <intel_gpu/primitives/crop.hpp>
#include <intel_gpu/primitives/mvn.hpp>
#include <intel_gpu/primitives/permute.hpp>
#include <intel_gpu/primitives/concatenation.hpp>
@@ -590,6 +586,35 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_wrong_bias, ::testing::ValuesIn(
convolution_test_params{ CASE_CONV_FP32_15, 3, 3 },
}));
class conv_fp32_add_per_element_planar_const : public ConvFusingTest {};
TEST_P(conv_fp32_add_per_element_planar_const, basic) {
auto p = GetParam();
implementation_desc conv_impl = { format::b_fs_yx_fsv16, "convolution_gpu_bfyx_f16" };
implementation_desc permute_impl = { format::b_fs_yx_fsv16, "" };
bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl },
{ "permute", permute_impl } }));
auto out_layout = get_output_layout(p);
out_layout.format = format::bfyx;
create_topologies(
input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("data", get_mem(out_layout)),
convolution("conv_prim", "input", { "weights" }, std::vector<primitive_id>{}, p.groups, p.stride, p.pad, p.dilation),
eltwise("add", { "conv_prim", "data" }, eltwise_mode::sum),
permute("permute", "add", {3, 2, 1, 0}),
reorder("reorder_bfyx", "permute", p.default_format, data_types::f32)
);
tolerance = default_tolerance(p.default_type);
execute(p);
}
INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_add_per_element_planar_const, ::testing::ValuesIn(std::vector<convolution_test_params>{
convolution_test_params{ CASE_CONV_FP32_3, 3, 4 },
}));
class conv_fp32_prelu_eltwise : public ConvFusingTest {};
TEST_P(conv_fp32_prelu_eltwise, basic_sum) {
auto p = GetParam();