[GPU] Extended SupportedFusedOps for Concatenation (#20096)

Commit: 99de7818be
Parent: ba6a676484
Author: Roman Lyamin
Date:   2023-10-09 14:28:37 +04:00 (committed by GitHub)
7 changed files with 230 additions and 19 deletions


@@ -6,7 +6,13 @@
 #define GET_INDEX(prefix, ORDER) CAT(prefix, _GET_INDEX)(ORDER)
 
-KERNEL (concatenation_gpu_ref)(__global INPUT0_TYPE* input, __global OUTPUT_TYPE* output, uint output_offset_in_concat_axis)
+KERNEL(concatenation_gpu_ref)(__global INPUT0_TYPE* input,
+                              __global OUTPUT_TYPE* output,
+                              uint output_offset_in_concat_axis
+#if HAS_FUSED_OPS_DECLS
+    , FUSED_OPS_DECLS
+#endif
+)
 {
     const uint d1 = (uint)get_global_id(0); // Y
     const uint d2 = (uint)get_global_id(1); // F
@@ -20,6 +26,14 @@ KERNEL (concatenation_gpu_ref)(__global INPUT0_TYPE* input, __global OUTPUT_TYPE
     {
         uint input_offset = GET_INDEX(INPUT0, INPUT_DIMS_ORDER);
         uint output_offset = GET_INDEX(OUTPUT, OUTPUT_DIMS_ORDER);
-        output[output_offset] = ACTIVATION(TO_OUTPUT_TYPE(input[input_offset]), ACTIVATION_PARAMS);
+
+        INPUT0_TYPE result = input[input_offset];
+#if HAS_FUSED_OPS
+        FUSED_OPS;
+        output[output_offset] = TO_OUTPUT_TYPE(FUSED_OPS_RESULT);
+#else
+        output[output_offset] = TO_OUTPUT_TYPE(ACTIVATION(result, ACTIVATION_PARAMS));
+#endif
     }
 }
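
For orientation: HAS_FUSED_OPS, FUSED_OPS and FUSED_OPS_RESULT are jit constants generated on the host from the fused-op descriptors, so the store above becomes a pluggable post-op chain applied to `result` instead of the fixed ACTIVATION call. Below is a minimal, standalone C++ analogy of that control flow, with a lambda standing in for the generated fused-ops code (illustrative only, not the actual macro expansion):

#include <algorithm>
#include <iostream>
#include <vector>

int main() {
    std::vector<float> input  = {-1.2f, 0.3f, 2.7f, 0.05f};
    std::vector<float> output(input.size());

    // Stand-in for the jit-generated FUSED_OPS chain, e.g. a clamp activation
    // fused after the concatenation (hypothetical fusion, for illustration).
    auto fused_ops = [](float result) { return std::max(-0.5f, std::min(0.5f, result)); };

    for (size_t i = 0; i < input.size(); ++i) {
        float result = input[i];        // mirrors: INPUT0_TYPE result = input[input_offset];
        output[i] = fused_ops(result);  // mirrors: output[output_offset] = TO_OUTPUT_TYPE(FUSED_OPS_RESULT);
    }

    for (float v : output) std::cout << v << ' ';   // prints: -0.5 0.3 0.5 0.05
    std::cout << '\n';
    return 0;
}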


@@ -8,7 +8,11 @@ KERNEL (concatenation_gpu_ref)(
     OPTIONAL_SHAPE_INFO_ARG
     __global INPUT0_TYPE* input,
     __global OUTPUT_TYPE* output,
-    uint output_offset_in_concat_axis)
+    uint output_offset_in_concat_axis
+#if HAS_FUSED_OPS_DECLS
+    , FUSED_OPS_DECLS
+#endif
+)
 {
     const uint x = (uint)get_global_id(0) % INPUT0_SIZE_X;
     const uint y = (uint)get_global_id(0) / INPUT0_SIZE_X;
@@ -43,5 +47,12 @@ KERNEL (concatenation_gpu_ref)(
     uint input_offset = FUNC_CALL(get_input_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, y, x);
     uint output_offset = FUNC_CALL(get_output_index)(OPTIONAL_SHAPE_INFO_TENSOR out_b, out_f, out_w, out_z, out_y, out_x);
-    output[output_offset] = TO_OUTPUT_TYPE(ACTIVATION(input[input_offset], ACTIVATION_PARAMS));
+
+    INPUT0_TYPE result = input[input_offset];
+#if HAS_FUSED_OPS
+    FUSED_OPS;
+    output[output_offset] = TO_OUTPUT_TYPE(FUSED_OPS_RESULT);
+#else
+    output[output_offset] = TO_OUTPUT_TYPE(ACTIVATION(result, ACTIVATION_PARAMS));
+#endif
 }


@@ -83,13 +83,16 @@ JitConstants ConcatenationKernelRef::GetJitConstants(const concatenation_params&
     std::string input_dims_order = "";
     std::string output_dims_order = "";
-    for (size_t i = 0; i < dims_id.size(); i++) {
-        input_dims_order += dims_id[i] + (i == dims_id.size() - 1 ? "" : ",");
-        if (axis_order[i] == axis)
-            output_dims_order += "(" + dims_id[i] + " + output_offset_in_concat_axis)" +
-                                 (i == dims_id.size() - 1 ? "" : ",");
-        else
-            output_dims_order += dims_id[i] + (i == dims_id.size() - 1 ? "" : ",");
+    for (size_t i = 0; i < dims_id.size(); ++i) {
+        std::string separator = i == dims_id.size() - 1 ? "" : ",";
+        input_dims_order += dims_id[i] + separator;
+        if (axis_order[i] == axis) {
+            output_dims_order += "(" + dims_id[i] + " + output_offset_in_concat_axis)" + separator;
+        } else {
+            output_dims_order += dims_id[i] + separator;
+        }
     }
     cldnnJit.AddConstant(MakeJitConstant("INPUT_DIMS_ORDER", input_dims_order));
@@ -97,6 +100,14 @@ JitConstants ConcatenationKernelRef::GetJitConstants(const concatenation_params&
     cldnnJit.AddConstant(MakeJitConstant("INPUT_DIM_0", DataTensor::Channelndex(input_format, Tensor::DataChannelName::X)));
+
+    if (!params.fused_ops.empty()) {
+        auto idx_order = dims_id;
+        size_t axis_idx = std::distance(axis_order.begin(), std::find(axis_order.begin(), axis_order.end(), axis));
+        idx_order[axis_idx] = "(" + idx_order[axis_idx] + " + output_offset_in_concat_axis)";
+
+        auto conf = FusedOpsConfiguration("", idx_order, "result", params.inputs[0].GetDType());
+        cldnnJit.Merge(MakeFusedOpsJitConstants(params, { conf }));
+    }
     return cldnnJit;
 }
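
To make the idx_order handling above concrete: the fused-op load coordinates reuse the kernel's per-dimension identifiers, but the entry for the concatenation axis is shifted by output_offset_in_concat_axis, so the fused ops of each input read from that input's slice of the output tensor. A small standalone C++ sketch of the same string manipulation, with placeholder dims_id/axis_order values (illustrative, not the plugin's actual tables):

#include <algorithm>
#include <iostream>
#include <iterator>
#include <string>
#include <vector>

int main() {
    // Placeholder inputs: four dimension identifiers and an axis_order that maps
    // them to tensor axes; axis 1 plays the role of the feature (concat) axis.
    std::vector<std::string> dims_id    = { "d4", "d3", "d2", "d1" };
    std::vector<size_t>      axis_order = { 0, 1, 2, 3 };
    size_t axis = 1;

    auto idx_order = dims_id;
    size_t axis_idx = static_cast<size_t>(
        std::distance(axis_order.begin(), std::find(axis_order.begin(), axis_order.end(), axis)));
    idx_order[axis_idx] = "(" + idx_order[axis_idx] + " + output_offset_in_concat_axis)";

    for (const auto& d : idx_order)
        std::cout << d << ' ';
    std::cout << '\n';  // prints: d4 (d3 + output_offset_in_concat_axis) d2 d1
    return 0;
}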


@@ -21,7 +21,10 @@ protected:
     JitConstants GetJitConstants(const concatenation_params& params) const override;
 
     std::vector<FusedOpType> GetSupportedFusedOps() const override {
         return {
-            FusedOpType::REORDER
+            FusedOpType::REORDER,
+            FusedOpType::ACTIVATION,
+            FusedOpType::ELTWISE,
+            FusedOpType::QUANTIZE
         };
     }
 };


@@ -99,6 +99,26 @@ ConcatenationKernelBase::DispatchData ConcatenationKernel_simple_Ref::SetDefault
     return dispatchData;
 }
 
+JitConstants ConcatenationKernel_simple_Ref::GetJitConstants(const concatenation_params& params) const {
+    auto jit = ConcatenationKernelBase::GetJitConstants(params);
+
+    if (!params.fused_ops.empty()) {
+        const auto& output = params.outputs[0];
+        std::vector<std::string> idx_order;
+        if (output.Dimentions() == 6) {
+            idx_order = { "out_b", "out_f", "out_w", "out_z", "out_y", "out_x" };
+        } else if (output.Dimentions() == 5) {
+            idx_order = { "out_b", "out_f", "out_z", "out_y", "out_x" };
+        } else {
+            idx_order = { "out_b", "out_f", "out_y", "out_x" };
+        }
+
+        auto conf = FusedOpsConfiguration("", idx_order, "result", params.inputs[0].GetDType());
+        jit.Merge(MakeFusedOpsJitConstants(params, { conf }));
+    }
+    return jit;
+}
+
 KernelsData ConcatenationKernel_simple_Ref::GetKernelsData(const Params& params, const optional_params& optParams) const {
     KernelsData kd = GetCommonKernelsData(params, optParams);
     return kd;


@@ -15,6 +15,7 @@ public:
     KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
     KernelsPriority GetKernelsPriority(const Params& params, const optional_params& options) const override;
+    JitConstants GetJitConstants(const concatenation_params& params) const override;
     DispatchData SetDefault(const concatenation_params& params) const override;
     bool Validate(const Params& p, const optional_params& o) const override;
@@ -22,7 +23,10 @@ protected:
     ParamsKey GetSupportedKey() const override;
     std::vector<FusedOpType> GetSupportedFusedOps() const override {
         return {
-            FusedOpType::REORDER
+            FusedOpType::REORDER,
+            FusedOpType::ACTIVATION,
+            FusedOpType::ELTWISE,
+            FusedOpType::QUANTIZE
         };
     }
 };


@@ -15,10 +15,8 @@
 using namespace cldnn;
 using namespace ::tests;
 
-#ifdef ENABLE_ONEDNN_FOR_GPU
-namespace {
 struct concat_test_params {
-    tensor in_shape;
+    ov::PartialShape in_shape;
     data_types data_type;
     format input_format;
     data_types default_type;
@@ -28,6 +26,8 @@ struct concat_test_params {
     std::string kernel_name;
 };
 
+#ifdef ENABLE_ONEDNN_FOR_GPU
+namespace {
 class ConcatOneDNNFusingTest : public ::BaseFusingTest<concat_test_params> {
 public:
     void execute(concat_test_params& p) {
@@ -79,11 +79,11 @@ public:
     }
 
     layout get_input_layout(concat_test_params& p) {
-        return layout{ p.data_type, p.input_format, p.in_shape };
+        return layout{ p.in_shape, p.data_type, p.input_format };
     }
 
     layout get_per_channel_layout(concat_test_params& p) {
-        return layout{ p.default_type, p.default_format, tensor{ 1, p.in_shape.feature[0], 1, 1 } };
+        return layout{ { 1, p.in_shape[1] }, p.default_type, p.default_format };
     }
 };
 }  // namespace
@@ -116,7 +116,7 @@ TEST_P(concat_onednn_activation, along_f) {
 class concat_onednn_eltwise : public ConcatOneDNNFusingTest {};
 TEST_P(concat_onednn_eltwise, along_f) {
     auto p = GetParam();
-    layout data_layout(p.default_type, p.default_format, tensor{ 1, p.in_shape.feature[0]*2, 1, 1 });
+    layout data_layout({ 1, p.in_shape[1] * 2 }, p.default_type, p.default_format);
 
     create_topologies(
         input_layout("input0", get_input_layout(p)),
@@ -144,3 +144,151 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, concat_onednn_eltwise, ::testing::ValuesIn
     concat_test_params{ CASE_CONCAT_F16_1, 4, 4, "" },
 }));
 #endif
+
+namespace {
+class ConcatFusingTest : public ::BaseFusingTest<concat_test_params> {
+public:
+    void execute(concat_test_params& p) {
+        auto input0_prim = get_mem(get_input_layout(p));
+        auto input1_prim = get_mem(get_input_layout(p));
+
+        network network_not_fused(this->engine, this->topology_non_fused, cfg_not_fused);
+        network network_fused(this->engine, this->topology_fused, cfg_fused);
+
+        network_not_fused.set_input_data("input0", input0_prim);
+        network_not_fused.set_input_data("input1", input1_prim);
+        network_fused.set_input_data("input0", input0_prim);
+        network_fused.set_input_data("input1", input1_prim);
+
+        ASSERT_FALSE(network_not_fused.get_primitives_info().empty());
+        ASSERT_FALSE(network_fused.get_primitives_info().empty());
+
+        auto find_and_check = [&](primitive_info& p) -> bool {
+            return p.original_id == "concat" || p.original_id == "reorder_bfyx";
+        };
+
+        auto pi_fused = network_fused.get_primitives_info();
+        auto pi_not_fused = network_not_fused.get_primitives_info();
+        auto info_fused = std::find_if(pi_fused.begin(), pi_fused.end(), find_and_check);
+        auto info_not_fused = std::find_if(pi_not_fused.begin(), pi_not_fused.end(), find_and_check);
+
+        ASSERT_TRUE(info_fused != pi_fused.end());
+        ASSERT_TRUE(info_not_fused != pi_not_fused.end());
+
+        compare(network_not_fused, network_fused, p);
+    }
+
+    layout get_input_layout(concat_test_params& p) {
+        return layout{ p.in_shape, p.data_type, p.input_format };
+    }
+
+    layout get_per_channel_layout(concat_test_params& p) {
+        return layout{ { 1, p.in_shape[1] }, p.default_type, p.default_format };
+    }
+};
+}  // namespace
+
+/* ----------------------------------------------------------------------------------------------------- */
+/* --------------------------------------- Concat cases ------------------------------------------------ */
+/* ----------------------------------------------------------------------------------------------------- */
+#define CASE_CONCAT_F32_1 { 1, 8, 4, 4 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx
+#define CASE_CONCAT_F16_1 { 1, 8, 4, 4 }, data_types::f16, format::bfyx, data_types::f16, format::bfyx
+
+class concat_activation : public ConcatFusingTest {};
+TEST_P(concat_activation, along_f) {
+    auto p = GetParam();
+    if (engine.get_device_info().supports_immad)
+        p.expected_fused_primitives++;
+    create_topologies(
+        input_layout("input0", get_input_layout(p)),
+        input_layout("input1", get_input_layout(p)),
+        concatenation("concat", { input_info("input0"), input_info("input1") }, 1, p.data_type),
+        activation("act1", input_info("concat"), activation_func::round_half_to_even),
+        activation("act2", input_info("act1"), activation_func::clamp, { -0.5f, 0.5f }),
+        reorder("reorder_bfyx", input_info("act2"), cldnn::format::bfyx, p.default_type)
+    );
+
+    tolerance = default_tolerance(p.data_type);
+    execute(p);
+}
+
+class concat_eltwise_with_broadcast : public ConcatFusingTest {};
+TEST_P(concat_eltwise_with_broadcast, along_f) {
+    auto p = GetParam();
+    layout data_layout({ 1, p.in_shape[1] * 2 }, p.default_type, p.default_format);
+    create_topologies(
+        input_layout("input0", get_input_layout(p)),
+        input_layout("input1", get_input_layout(p)),
+        data("scale_data", get_mem(data_layout, 1.0f / tensor{ 1, 1, 4, 4 }.count())),
+        concatenation("concat", { input_info("input0"), input_info("input1") }, 1, p.data_type),
+        eltwise("scale", { input_info("concat"), input_info("scale_data") }, eltwise_mode::prod, p.default_type),
+        reorder("reorder_bfyx", input_info("scale"), cldnn::format::bfyx, p.default_type)
+    );
+
+    tolerance = default_tolerance(p.data_type);
+    execute(p);
+}
+
+class concat_eltwise_wo_broadcast : public ConcatFusingTest {};
+TEST_P(concat_eltwise_wo_broadcast, along_f) {
+    auto p = GetParam();
+    ov::PartialShape concatenated_shape = p.in_shape;
+    concatenated_shape[1] *= 2;
+    layout data_layout(concatenated_shape, p.default_type, p.default_format);
+    create_topologies(
+        input_layout("input0", get_input_layout(p)),
+        input_layout("input1", get_input_layout(p)),
+        data("scale_data", get_mem(data_layout, 1.0f / tensor{ 1, 1, 4, 4 }.count())),
+        concatenation("concat", { input_info("input0"), input_info("input1") }, 1, p.data_type),
+        eltwise("scale", { input_info("concat"), input_info("scale_data") }, eltwise_mode::prod, p.default_type),
+        reorder("reorder_bfyx", input_info("scale"), cldnn::format::bfyx, p.default_type)
+    );
+
+    tolerance = default_tolerance(p.data_type);
+    execute(p);
+}
+
+class concat_quantize : public ConcatFusingTest {};
+TEST_P(concat_quantize, along_f) {
+    auto p = GetParam();
+    create_topologies(
+        input_layout("input0", get_input_layout(p)),
+        input_layout("input1", get_input_layout(p)),
+        data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
+        data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)),
+        data("out_lo", get_mem(get_single_element_layout(p), 0)),
+        data("out_hi", get_mem(get_single_element_layout(p), 255)),
+        concatenation("concat", { input_info("input0"), input_info("input1") }, 1, p.data_type),
+        quantize("quantize", input_info("concat"), input_info("in_lo"), input_info("in_hi"),
+                 input_info("out_lo"), input_info("out_hi"), 256, data_types::u8),
+        reorder("reorder_bfyx", input_info("quantize"), cldnn::format::bfyx, p.default_type)
+    );
+
+    tolerance = 1.f;
+    execute(p);
+}
+
+INSTANTIATE_TEST_SUITE_P(fusings_gpu, concat_activation, ::testing::ValuesIn(std::vector<concat_test_params>{
+    concat_test_params{ CASE_CONCAT_F32_1, 3, 5, "" },
+    concat_test_params{ CASE_CONCAT_F16_1, 3, 5, "" },
+}));
+
+INSTANTIATE_TEST_SUITE_P(fusings_gpu, concat_eltwise_with_broadcast, ::testing::ValuesIn(std::vector<concat_test_params>{
+    concat_test_params{ CASE_CONCAT_F32_1, 4, 4, "" },
+    concat_test_params{ CASE_CONCAT_F16_1, 4, 4, "" },
+}));
+
+INSTANTIATE_TEST_SUITE_P(fusings_gpu, concat_eltwise_wo_broadcast, ::testing::ValuesIn(std::vector<concat_test_params>{
+    concat_test_params{ CASE_CONCAT_F32_1, 4, 4, "" },
+    concat_test_params{ CASE_CONCAT_F16_1, 4, 4, "" },
+}));
+
+INSTANTIATE_TEST_SUITE_P(fusings_gpu, concat_quantize, ::testing::ValuesIn(std::vector<concat_test_params>{
+    concat_test_params{ CASE_CONCAT_F32_1, 4, 4, "" },
+    concat_test_params{ CASE_CONCAT_F16_1, 4, 4, "" },
+}));