[GPU] Insert reorder for eltwise const dependency (#14013)
* [GPU] Insert reorder for eltwise const dependency
* [GPU] Fixes for const reorder logic. Updated one code generator config
parent 6b4b01aecf
commit dfb31f485b
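For context, a minimal, self-contained sketch of the decision the updated add_required_reorders pass makes in the hunks below: a reorder is inserted for a constant fused-op dependency only when its format differs from the target layout and the constant is not a per-feature or scalar broadcast. SimpleLayout and the standalone requires_reorder function are hypothetical stand-ins, not the actual cldnn::layout API.

// Sketch only: models the requires_reorder check from add_required_reorders::run below.
// SimpleLayout is a hypothetical stand-in for cldnn::layout.
#include <cstddef>
#include <iostream>
#include <string>

struct SimpleLayout {
    std::string format;   // e.g. "bfyx" (planar) or "b_fs_yx_fsv16" (blocked)
    size_t feature = 1;   // size of the feature (channel) dimension
    size_t count = 1;     // total number of elements in the tensor
};

// Mirrors: requires_reorder = out_layout.format != dep_layout.format && !valid_broadcast_case
bool requires_reorder(const SimpleLayout& out_layout, const SimpleLayout& dep_layout) {
    // Broadcast is fine when the constant holds one value per output feature, or a single scalar.
    bool valid_broadcast_case = (out_layout.feature == dep_layout.count) || (dep_layout.count == 1);
    return out_layout.format != dep_layout.format && !valid_broadcast_case;
}

int main() {
    SimpleLayout conv_out{"b_fs_yx_fsv16", 32, 32 * 56 * 56};
    SimpleLayout per_channel_const{"bfyx", 32, 32};             // per-feature constant: broadcastable
    SimpleLayout per_element_const{"bfyx", 32, 32 * 56 * 56};   // full planar tensor: needs a reorder

    std::cout << requires_reorder(conv_out, per_channel_const) << '\n';  // prints 0
    std::cout << requires_reorder(conv_out, per_element_const) << '\n';  // prints 1
}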
@@ -44,6 +44,7 @@ void add_required_reorders::add_reorder(program& p, program_node* node, program_
}

void add_required_reorders::run(program& p) {
    bool optimize_data = p.get_options().get<build_option_type::optimize_data>()->enabled();
    auto usr_itr = p.get_processing_order().begin();
    while (usr_itr != p.get_processing_order().end()) {
        auto& usr = *usr_itr++;
@@ -51,6 +52,46 @@ void add_required_reorders::run(program& p) {
            continue; // only nodes with dependencies
        if (usr->is_type<data>())
            continue;

        if (optimize_data) {
            auto fused_ops = usr->get_fused_primitives();
            auto out_layout = usr->get_output_layout();
            // If there is a fused reorder at the end, then we use input layout of reorder
            // as target one for fused ops, as code generator in many kernels is expecting that, not final output layout
            // However, the condition below may need some adjustment in the future, if codegen of some primitives behave differently
            if (!fused_ops.empty() && fused_ops.back().is_type<reorder>()) {
                out_layout = fused_ops.back().input_layout;
            }
            for (auto& fused_op : fused_ops) {
                // Some kernels use blocked aligned subgroup reads for a vector of elements from dependency tensor
                // In that case jitter checks that layout of input tensor from fused op is same as output layout or broadcast is possible
                // The code below is intended to insert additional reorder node for const eltwise dependency to ensure jitter can process such fusion
                if (!fused_op.is_type<eltwise>() && !(fused_op.is_type<activation>() && fused_op.total_num_deps == 2))
                    continue;

                auto dep_id = fused_op.dep_start_idx;
                if (dep_id >= usr->get_dependencies().size())
                    continue;

                auto& dep = usr->get_dependency(dep_id);
                if (!dep.is_type<data>())
                    continue;

                auto dep_layout = dep.get_output_layout();

                bool valid_broadcast_case = out_layout.is_static() && dep_layout.is_static() &&
                                            (static_cast<size_t>(out_layout.feature()) == dep_layout.count() || dep_layout.count() == 1);

                bool requires_reorder = out_layout.format != dep_layout.format && !valid_broadcast_case;
                if (requires_reorder) {
                    auto new_reorder = std::make_shared<reorder>(dep.id() + "_reorder_" + usr->id(), dep.id(), out_layout.format, dep_layout.data_type);
                    auto& new_reorder_node = p.get_or_create(new_reorder);
                    p.add_intermediate(new_reorder_node, *usr, dep);
                    new_reorder_node.recalc_output_layout(false);
                }
            }
        }

        if (usr->type()->does_an_implementation_exist(*usr)) {
            if (usr->get_preferred_impl_type() != impl_types::onednn) {
                continue;
@@ -267,7 +267,16 @@ JitConstants DeconvolutionKernel_b_fs_zyx_fsv16::GetJitConstants(const deconvolu
                                                      BoundaryCheck::ENABLED,
                                                      IndexType::TENSOR_COORD,
                                                      Tensor::DataChannelName::BATCH };
        FusedOpsConfiguration conf_ci = { "_BLOCK_CI", idx_order_block_ci, "blockC00[i]", fused_dt, 1, LoadType::LT_ALIGNED_READ };

        auto load_type = LoadType::LT_ALIGNED_READ;
        for (auto& fused_op : params.fused_ops) {
            if (!fused_op.output_tensor.SameDims(params.outputs[0]) &&
                (fused_op.output_tensor.X().v > 1 || fused_op.output_tensor.Y().v > 1 || fused_op.output_tensor.Z().v > 1)) {
                load_type = LoadType::LT_UNALIGNED;
                idx_order_block_ci[1] = "(g * IC + gic * IC_BLOCK + local_id)";
            }
        }
        FusedOpsConfiguration conf_ci = { "_BLOCK_CI", idx_order_block_ci, "blockC00[i]", fused_dt, 1, load_type };

        jit.Merge(MakeFusedOpsJitConstants(params, { conf_c00, conf_c01, conf_ci }));
    }
@@ -9,12 +9,8 @@
#include <intel_gpu/primitives/quantize.hpp>
#include <intel_gpu/primitives/eltwise.hpp>
#include <intel_gpu/primitives/fully_connected.hpp>
#include <intel_gpu/primitives/gemm.hpp>
#include <intel_gpu/primitives/binary_convolution.hpp>
#include <intel_gpu/primitives/data.hpp>
#include <intel_gpu/primitives/resample.hpp>
#include <intel_gpu/primitives/crop.hpp>
#include <intel_gpu/primitives/mvn.hpp>
#include <intel_gpu/primitives/permute.hpp>
#include <intel_gpu/primitives/concatenation.hpp>
@@ -590,6 +586,35 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_wrong_bias, ::testing::ValuesIn(
    convolution_test_params{ CASE_CONV_FP32_15, 3, 3 },
}));

class conv_fp32_add_per_element_planar_const : public ConvFusingTest {};
TEST_P(conv_fp32_add_per_element_planar_const, basic) {
    auto p = GetParam();

    implementation_desc conv_impl = { format::b_fs_yx_fsv16, "convolution_gpu_bfyx_f16" };
    implementation_desc permute_impl = { format::b_fs_yx_fsv16, "" };
    bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl },
                                                              { "permute", permute_impl } }));

    auto out_layout = get_output_layout(p);
    out_layout.format = format::bfyx;
    create_topologies(
        input_layout("input", get_input_layout(p)),
        data("weights", get_mem(get_weights_layout(p))),
        data("data", get_mem(out_layout)),
        convolution("conv_prim", "input", { "weights" }, std::vector<primitive_id>{}, p.groups, p.stride, p.pad, p.dilation),
        eltwise("add", { "conv_prim", "data" }, eltwise_mode::sum),
        permute("permute", "add", {3, 2, 1, 0}),
        reorder("reorder_bfyx", "permute", p.default_format, data_types::f32)
    );

    tolerance = default_tolerance(p.default_type);
    execute(p);
}

INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_add_per_element_planar_const, ::testing::ValuesIn(std::vector<convolution_test_params>{
    convolution_test_params{ CASE_CONV_FP32_3, 3, 4 },
}));

class conv_fp32_prelu_eltwise : public ConvFusingTest {};
TEST_P(conv_fp32_prelu_eltwise, basic_sum) {
    auto p = GetParam();