[GPU] Extended SupportedFusedOps for Concatenation (#20096)
parent ba6a676484
commit 99de7818be
@@ -6,7 +6,13 @@
 #define GET_INDEX(prefix, ORDER) CAT(prefix, _GET_INDEX)(ORDER)
 
-KERNEL (concatenation_gpu_ref)(__global INPUT0_TYPE* input, __global OUTPUT_TYPE* output, uint output_offset_in_concat_axis)
+KERNEL(concatenation_gpu_ref)(__global INPUT0_TYPE* input,
+                              __global OUTPUT_TYPE* output,
+                              uint output_offset_in_concat_axis
+#if HAS_FUSED_OPS_DECLS
+    , FUSED_OPS_DECLS
+#endif
+)
 {
     const uint d1 = (uint)get_global_id(0); // Y
     const uint d2 = (uint)get_global_id(1); // F
@@ -20,6 +26,14 @@ KERNEL (concatenation_gpu_ref)(__global INPUT0_TYPE* input, __global OUTPUT_TYPE
     {
         uint input_offset = GET_INDEX(INPUT0, INPUT_DIMS_ORDER);
         uint output_offset = GET_INDEX(OUTPUT, OUTPUT_DIMS_ORDER);
-        output[output_offset] = ACTIVATION(TO_OUTPUT_TYPE(input[input_offset]), ACTIVATION_PARAMS);
+
+        INPUT0_TYPE result = input[input_offset];
+
+#if HAS_FUSED_OPS
+        FUSED_OPS;
+        output[output_offset] = TO_OUTPUT_TYPE(FUSED_OPS_RESULT);
+#else
+        output[output_offset] = TO_OUTPUT_TYPE(ACTIVATION(result, ACTIVATION_PARAMS));
+#endif
     }
 }
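Note on the fused-ops plumbing used above: HAS_FUSED_OPS_DECLS, FUSED_OPS, and FUSED_OPS_RESULT are JIT constants generated on the host side (see the GetJitConstants changes further down); when nothing is fused into the primitive, the #else branch keeps the old ACTIVATION behaviour. As a minimal, self-contained sketch of the idea — not the real generated code; the eltwise_data name and the flat indexing are illustrative assumptions — a kernel with one eltwise multiply fused in reduces to roughly:

__kernel void concat_fused_sketch(__global const float* input,
                                  __global float* output,
                                  uint output_offset_in_concat_axis,
                                  __global const float* eltwise_data /* FUSED_OPS_DECLS */) {
    const uint gid = (uint)get_global_id(0);                  // one work-item per element
    const uint out_idx = gid + output_offset_in_concat_axis;  // shifted position in the concat output
    float result = input[gid];                                // "INPUT0_TYPE result = ..."
    float fused = result * eltwise_data[out_idx];             // what FUSED_OPS expands to in this case
    output[out_idx] = fused;                                  // "... = TO_OUTPUT_TYPE(FUSED_OPS_RESULT)"
}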
@@ -8,7 +8,11 @@ KERNEL (concatenation_gpu_ref)(
     OPTIONAL_SHAPE_INFO_ARG
     __global INPUT0_TYPE* input,
     __global OUTPUT_TYPE* output,
-    uint output_offset_in_concat_axis)
+    uint output_offset_in_concat_axis
+#if HAS_FUSED_OPS_DECLS
+    , FUSED_OPS_DECLS
+#endif
+)
 {
     const uint x = (uint)get_global_id(0) % INPUT0_SIZE_X;
     const uint y = (uint)get_global_id(0) / INPUT0_SIZE_X;
@@ -43,5 +47,12 @@ KERNEL (concatenation_gpu_ref)(
     uint input_offset = FUNC_CALL(get_input_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, y, x);
     uint output_offset = FUNC_CALL(get_output_index)(OPTIONAL_SHAPE_INFO_TENSOR out_b, out_f, out_w, out_z, out_y, out_x);
 
-    output[output_offset] = TO_OUTPUT_TYPE(ACTIVATION(input[input_offset], ACTIVATION_PARAMS));
+    INPUT0_TYPE result = input[input_offset];
+
+#if HAS_FUSED_OPS
+    FUSED_OPS;
+    output[output_offset] = TO_OUTPUT_TYPE(FUSED_OPS_RESULT);
+#else
+    output[output_offset] = TO_OUTPUT_TYPE(ACTIVATION(result, ACTIVATION_PARAMS));
+#endif
 }
@@ -83,13 +83,16 @@ JitConstants ConcatenationKernelRef::GetJitConstants(const concatenation_params&
 
     std::string input_dims_order = "";
     std::string output_dims_order = "";
-    for (size_t i = 0; i < dims_id.size(); i++) {
-        input_dims_order += dims_id[i] + (i == dims_id.size() - 1 ? "" : ",");
-        if (axis_order[i] == axis)
-            output_dims_order += "(" + dims_id[i] + " + output_offset_in_concat_axis)" +
-                                 (i == dims_id.size() - 1 ? "" : ",");
-        else
-            output_dims_order += dims_id[i] + (i == dims_id.size() - 1 ? "" : ",");
+    for (size_t i = 0; i < dims_id.size(); ++i) {
+        std::string separator = i == dims_id.size() - 1 ? "" : ",";
+        input_dims_order += dims_id[i] + separator;
+
+        if (axis_order[i] == axis) {
+            output_dims_order += "(" + dims_id[i] + " + output_offset_in_concat_axis)" + separator;
+        } else {
+            output_dims_order += dims_id[i] + separator;
+        }
     }
 
     cldnnJit.AddConstant(MakeJitConstant("INPUT_DIMS_ORDER", input_dims_order));
@@ -97,6 +100,14 @@ JitConstants ConcatenationKernelRef::GetJitConstants(const concatenation_params&
 
     cldnnJit.AddConstant(MakeJitConstant("INPUT_DIM_0", DataTensor::Channelndex(input_format, Tensor::DataChannelName::X)));
 
+    if (!params.fused_ops.empty()) {
+        auto idx_order = dims_id;
+        size_t axis_idx = std::distance(axis_order.begin(), std::find(axis_order.begin(), axis_order.end(), axis));
+        idx_order[axis_idx] = "(" + idx_order[axis_idx] + " + output_offset_in_concat_axis)";
+
+        auto conf = FusedOpsConfiguration("", idx_order, "result", params.inputs[0].GetDType());
+        cldnnJit.Merge(MakeFusedOpsJitConstants(params, { conf }));
+    }
     return cldnnJit;
 }
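To make the generated strings concrete: for a 4-D bfyx input concatenated along the feature axis, the loop above produces "b,f,y,x" for INPUT_DIMS_ORDER and shifts only the concat-axis coordinate in OUTPUT_DIMS_ORDER. A standalone reproduction (the dims_id values are assumed here; the real ones are built earlier in this function from the layout):

#include <iostream>
#include <string>
#include <vector>

int main() {
    std::vector<std::string> dims_id = { "b", "f", "y", "x" };  // assumed: bfyx layout
    size_t axis_idx = 1;                                        // concat along F
    std::string input_dims_order, output_dims_order;
    for (size_t i = 0; i < dims_id.size(); ++i) {
        std::string separator = i == dims_id.size() - 1 ? "" : ",";
        input_dims_order += dims_id[i] + separator;
        output_dims_order += (i == axis_idx
            ? "(" + dims_id[i] + " + output_offset_in_concat_axis)"
            : dims_id[i]) + separator;
    }
    std::cout << input_dims_order << "\n";   // prints: b,f,y,x
    std::cout << output_dims_order << "\n";  // prints: b,(f + output_offset_in_concat_axis),y,x
}

The fused-ops idx_order built right after gets the same shift, so a fused eltwise or quantize operand is read at the element's final position in the concatenated output rather than at its position in the input.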
@@ -21,7 +21,10 @@ protected:
     JitConstants GetJitConstants(const concatenation_params& params) const override;
     std::vector<FusedOpType> GetSupportedFusedOps() const override {
         return {
-            FusedOpType::REORDER
+            FusedOpType::REORDER,
+            FusedOpType::ACTIVATION,
+            FusedOpType::ELTWISE,
+            FusedOpType::QUANTIZE
         };
     }
 };
@@ -99,6 +99,26 @@ ConcatenationKernelBase::DispatchData ConcatenationKernel_simple_Ref::SetDefault
     return dispatchData;
 }
 
+JitConstants ConcatenationKernel_simple_Ref::GetJitConstants(const concatenation_params& params) const {
+    auto jit = ConcatenationKernelBase::GetJitConstants(params);
+
+    if (!params.fused_ops.empty()) {
+        const auto& output = params.outputs[0];
+        std::vector<std::string> idx_order;
+
+        if (output.Dimentions() == 6) {
+            idx_order = { "out_b", "out_f", "out_w", "out_z", "out_y", "out_x" };
+        } else if (output.Dimentions() == 5) {
+            idx_order = { "out_b", "out_f", "out_z", "out_y", "out_x" };
+        } else {
+            idx_order = { "out_b", "out_f", "out_y", "out_x" };
+        }
+        auto conf = FusedOpsConfiguration("", idx_order, "result", params.inputs[0].GetDType());
+        jit.Merge(MakeFusedOpsJitConstants(params, { conf }));
+    }
+    return jit;
+}
+
 KernelsData ConcatenationKernel_simple_Ref::GetKernelsData(const Params& params, const optional_params& optParams) const {
     KernelsData kd = GetCommonKernelsData(params, optParams);
     return kd;
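How the FusedOpsConfiguration arguments tie the host and kernel sides together — an explanatory summary of the convention as this patch uses it, not new API:

// FusedOpsConfiguration("", idx_order, "result", params.inputs[0].GetDType())
//   ""          - suffix appended to the generated macro names; empty here, so the
//                 kernels see plain FUSED_OPS / FUSED_OPS_RESULT (assumption based on
//                 how the macros are referenced in the .cl changes above).
//   idx_order   - kernel-side variable names ("out_b", "out_f", ...) used to index any
//                 fused operand at the *output* coordinates; they must exist in the
//                 kernel body at the point where FUSED_OPS is expanded.
//   "result"    - the variable the fused chain starts from; it must match the
//                 "INPUT0_TYPE result = input[input_offset];" line added to both kernels.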
@@ -15,6 +15,7 @@ public:
 
     KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
     KernelsPriority GetKernelsPriority(const Params& params, const optional_params& options) const override;
+    JitConstants GetJitConstants(const concatenation_params& params) const override;
     DispatchData SetDefault(const concatenation_params& params) const override;
     bool Validate(const Params& p, const optional_params& o) const override;
 
@@ -22,7 +23,10 @@ protected:
     ParamsKey GetSupportedKey() const override;
     std::vector<FusedOpType> GetSupportedFusedOps() const override {
         return {
-            FusedOpType::REORDER
+            FusedOpType::REORDER,
+            FusedOpType::ACTIVATION,
+            FusedOpType::ELTWISE,
+            FusedOpType::QUANTIZE
         };
     }
 };
@@ -15,10 +15,8 @@
 using namespace cldnn;
 using namespace ::tests;
 
-#ifdef ENABLE_ONEDNN_FOR_GPU
-namespace {
 struct concat_test_params {
-    tensor in_shape;
+    ov::PartialShape in_shape;
     data_types data_type;
     format input_format;
     data_types default_type;
@@ -28,6 +26,8 @@ struct concat_test_params {
     std::string kernel_name;
 };
 
+#ifdef ENABLE_ONEDNN_FOR_GPU
+namespace {
 class ConcatOneDNNFusingTest : public ::BaseFusingTest<concat_test_params> {
 public:
     void execute(concat_test_params& p) {
@@ -79,11 +79,11 @@ public:
     }
 
     layout get_input_layout(concat_test_params& p) {
-        return layout{ p.data_type, p.input_format, p.in_shape };
+        return layout{ p.in_shape, p.data_type, p.input_format };
     }
 
     layout get_per_channel_layout(concat_test_params& p) {
-        return layout{ p.default_type, p.default_format, tensor{ 1, p.in_shape.feature[0], 1, 1 } };
+        return layout{ { 1, p.in_shape[1] }, p.default_type, p.default_format };
     }
 };
 }  // namespace
@@ -116,7 +116,7 @@ TEST_P(concat_onednn_activation, along_f) {
 class concat_onednn_eltwise : public ConcatOneDNNFusingTest {};
 TEST_P(concat_onednn_eltwise, along_f) {
     auto p = GetParam();
-    layout data_layout(p.default_type, p.default_format, tensor{ 1, p.in_shape.feature[0]*2, 1, 1 });
+    layout data_layout({ 1, p.in_shape[1] * 2 }, p.default_type, p.default_format);
 
     create_topologies(
         input_layout("input0", get_input_layout(p)),
@@ -144,3 +144,151 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, concat_onednn_eltwise, ::testing::ValuesIn
     concat_test_params{ CASE_CONCAT_F16_1, 4, 4, "" },
 }));
 #endif
+
+namespace {
+class ConcatFusingTest : public ::BaseFusingTest<concat_test_params> {
+public:
+    void execute(concat_test_params& p) {
+        auto input0_prim = get_mem(get_input_layout(p));
+        auto input1_prim = get_mem(get_input_layout(p));
+
+        network network_not_fused(this->engine, this->topology_non_fused, cfg_not_fused);
+        network network_fused(this->engine, this->topology_fused, cfg_fused);
+
+        network_not_fused.set_input_data("input0", input0_prim);
+        network_not_fused.set_input_data("input1", input1_prim);
+
+        network_fused.set_input_data("input0", input0_prim);
+        network_fused.set_input_data("input1", input1_prim);
+
+        ASSERT_FALSE(network_not_fused.get_primitives_info().empty());
+        ASSERT_FALSE(network_fused.get_primitives_info().empty());
+
+        auto find_and_check = [&](primitive_info& p) -> bool {
+            return p.original_id == "concat" || p.original_id == "reorder_bfyx";
+        };
+
+        auto pi_fused = network_fused.get_primitives_info();
+        auto pi_not_fused = network_not_fused.get_primitives_info();
+
+        auto info_fused = std::find_if(pi_fused.begin(), pi_fused.end(), find_and_check);
+        auto info_not_fused = std::find_if(pi_not_fused.begin(), pi_not_fused.end(), find_and_check);
+
+        ASSERT_TRUE(info_fused != pi_fused.end());
+        ASSERT_TRUE(info_not_fused != pi_not_fused.end());
+
+        compare(network_not_fused, network_fused, p);
+    }
+
+    layout get_input_layout(concat_test_params& p) {
+        return layout{ p.in_shape, p.data_type, p.input_format };
+    }
+
+    layout get_per_channel_layout(concat_test_params& p) {
+        return layout{ { 1, p.in_shape[1] }, p.default_type, p.default_format };
+    }
+};
+}  // namespace
+
+/* ----------------------------------------------------------------------------------------------------- */
+/* --------------------------------------- Concat cases ------------------------------------------------ */
+/* ----------------------------------------------------------------------------------------------------- */
+#define CASE_CONCAT_F32_1 { 1, 8, 4, 4 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx
+#define CASE_CONCAT_F16_1 { 1, 8, 4, 4 }, data_types::f16, format::bfyx, data_types::f16, format::bfyx
+
+class concat_activation : public ConcatFusingTest {};
+TEST_P(concat_activation, along_f) {
+    auto p = GetParam();
+
+    if (engine.get_device_info().supports_immad)
+        p.expected_fused_primitives++;
+
+    create_topologies(
+        input_layout("input0", get_input_layout(p)),
+        input_layout("input1", get_input_layout(p)),
+        concatenation("concat", { input_info("input0"), input_info("input1") }, 1, p.data_type),
+        activation("act1", input_info("concat"), activation_func::round_half_to_even),
+        activation("act2", input_info("act1"), activation_func::clamp, { -0.5f, 0.5f }),
+        reorder("reorder_bfyx", input_info("act2"), cldnn::format::bfyx, p.default_type)
+    );
+
+    tolerance = default_tolerance(p.data_type);
+    execute(p);
+}
+
+class concat_eltwise_with_broadcast : public ConcatFusingTest {};
+TEST_P(concat_eltwise_with_broadcast, along_f) {
+    auto p = GetParam();
+    layout data_layout({ 1, p.in_shape[1] * 2 }, p.default_type, p.default_format);
+    create_topologies(
+        input_layout("input0", get_input_layout(p)),
+        input_layout("input1", get_input_layout(p)),
+        data("scale_data", get_mem(data_layout, 1.0f / tensor{ 1, 1, 4, 4 }.count())),
+        concatenation("concat", { input_info("input0"), input_info("input1") }, 1, p.data_type),
+        eltwise("scale", { input_info("concat"), input_info("scale_data") }, eltwise_mode::prod, p.default_type),
+        reorder("reorder_bfyx", input_info("scale"), cldnn::format::bfyx, p.default_type)
+    );
+
+    tolerance = default_tolerance(p.data_type);
+    execute(p);
+}
+
+class concat_eltwise_wo_broadcast : public ConcatFusingTest {};
+TEST_P(concat_eltwise_wo_broadcast, along_f) {
+    auto p = GetParam();
+    ov::PartialShape concatenated_shape = p.in_shape;
+    concatenated_shape[1] *= 2;
+    layout data_layout(concatenated_shape, p.default_type, p.default_format);
+    create_topologies(
+        input_layout("input0", get_input_layout(p)),
+        input_layout("input1", get_input_layout(p)),
+        data("scale_data", get_mem(data_layout, 1.0f / tensor{ 1, 1, 4, 4 }.count())),
+        concatenation("concat", { input_info("input0"), input_info("input1") }, 1, p.data_type),
+        eltwise("scale", { input_info("concat"), input_info("scale_data") }, eltwise_mode::prod, p.default_type),
+        reorder("reorder_bfyx", input_info("scale"), cldnn::format::bfyx, p.default_type)
+    );
+
+    tolerance = default_tolerance(p.data_type);
+    execute(p);
+}
+
+class concat_quantize : public ConcatFusingTest {};
+TEST_P(concat_quantize, along_f) {
+    auto p = GetParam();
+    create_topologies(
+        input_layout("input0", get_input_layout(p)),
+        input_layout("input1", get_input_layout(p)),
+        data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
+        data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)),
+        data("out_lo", get_mem(get_single_element_layout(p), 0)),
+        data("out_hi", get_mem(get_single_element_layout(p), 255)),
+        concatenation("concat", { input_info("input0"), input_info("input1") }, 1, p.data_type),
+        quantize("quantize", input_info("concat"), input_info("in_lo"), input_info("in_hi"),
+                 input_info("out_lo"), input_info("out_hi"), 256, data_types::u8),
+        reorder("reorder_bfyx", input_info("quantize"), cldnn::format::bfyx, p.default_type)
+    );
+
+    tolerance = 1.f;
+    execute(p);
+}
+
+INSTANTIATE_TEST_SUITE_P(fusings_gpu, concat_activation, ::testing::ValuesIn(std::vector<concat_test_params>{
+    concat_test_params{ CASE_CONCAT_F32_1, 3, 5, "" },
+    concat_test_params{ CASE_CONCAT_F16_1, 3, 5, "" },
+}));
+
+INSTANTIATE_TEST_SUITE_P(fusings_gpu, concat_eltwise_with_broadcast, ::testing::ValuesIn(std::vector<concat_test_params>{
+    concat_test_params{ CASE_CONCAT_F32_1, 4, 4, "" },
+    concat_test_params{ CASE_CONCAT_F16_1, 4, 4, "" },
+}));
+
+INSTANTIATE_TEST_SUITE_P(fusings_gpu, concat_eltwise_wo_broadcast, ::testing::ValuesIn(std::vector<concat_test_params>{
+    concat_test_params{ CASE_CONCAT_F32_1, 4, 4, "" },
+    concat_test_params{ CASE_CONCAT_F16_1, 4, 4, "" },
+}));
+
+INSTANTIATE_TEST_SUITE_P(fusings_gpu, concat_quantize, ::testing::ValuesIn(std::vector<concat_test_params>{
+    concat_test_params{ CASE_CONCAT_F32_1, 4, 4, "" },
+    concat_test_params{ CASE_CONCAT_F16_1, 4, 4, "" },
+}));
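For readers decoding the parameter rows: each concat_test_params row packs a CASE_CONCAT_* macro together with the expected primitive counts and an optional forced kernel name. A likely decode of one row (the count-field names past default_format are inferred from the struct's use above, e.g. p.expected_fused_primitives):

// concat_test_params{ CASE_CONCAT_F32_1, 3, 5, "" } unpacks as:
//   in_shape                      = { 1, 8, 4, 4 }   (concat along f => output f = 16)
//   data_type / input_format      = data_types::f32, format::bfyx
//   default_type / default_format = data_types::f32, format::bfyx
//   expected_fused_primitives     = 3   (executed primitives when the activations fuse)
//   expected_not_fused_primitives = 5   (activations kept as separate primitives)
//   kernel_name                   = ""  (no specific kernel forced)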