From 9427623046a9eee57c476ba85f3f0ae6290345b7 Mon Sep 17 00:00:00 2001
From: Jade Cho
Date: Thu, 5 Jan 2023 14:58:55 +0900
Subject: [PATCH] [GPU] Swap XY axis for 1D conv (#14362)

---
 .../src/graph/impls/ocl/convolution.cpp       | 48 +++++++++++
 .../src/graph/kernel_selector_helper.cpp      |  1 +
 .../src/kernel_selector/tensor_type.cpp       | 39 +++++++++
 .../src/kernel_selector/tensor_type.h         |  3 +
 .../tests/fusions/convolution_fusion_test.cpp | 31 +++++++
 .../tests/test_cases/convolution_gpu_test.cpp | 85 +++++++++++++++++++
 6 files changed, 207 insertions(+)

diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/convolution.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/convolution.cpp
index 07c5d32c2e5..d9a786e2ee7 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/convolution.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/convolution.cpp
@@ -166,6 +166,54 @@ public:
             conv_params.quantization = kernel_selector::QuantizationType::NONE;
         }

+        auto can_swap_xy = [&](kernel_selector::convolution_params& cp) -> bool {
+            if (cp.inputs[0].GetLayout() == kernel_selector::Tensor::DataLayout::bfyx
+                && cp.inputs[0].X().v == 1 && cp.inputs[0].Y().v > 1
+                && cp.inputs[0].X().pad.Total() == 0
+                && cp.outputs[0].GetLayout() == kernel_selector::Tensor::DataLayout::bfyx
+                && cp.outputs[0].X().v == 1 && cp.outputs[0].Y().v > 1
+                && cp.weights.X().v == 1 && cp.weights.Y().v > 1
+                && !((cp.groups == cp.inputs[0].Feature().v && cp.inputs[0].Feature().v == cp.outputs[0].Feature().v && cp.split == 1)
+                    || (cp.split == cp.inputs[0].Feature().v && cp.groups == 1))) {  // Don't swap if it is a depthwise conv
+                auto can_swap = [](const kernel_selector::Tensor::DataTensor& dt) -> bool {
+                    auto x_channel_idx = kernel_selector::Tensor::DataTensor::Channelndex(dt.GetLayout(),
+                                                                                          kernel_selector::Tensor::DataChannelName::X);
+                    auto x_axis_dim = dt.GetDims()[x_channel_idx];
+                    return (x_axis_dim.pad.Total() == 0 && x_axis_dim.v == 1);
+                };
+
+                for (auto& desc : cp.fused_ops) {
+                    if (!can_swap(desc.output_tensor)) {
+                        return false;
+                    }
+                    for (size_t i = 0; i < desc.tensors.size(); i++) {
+                        if (!can_swap(desc.tensors[i])) {
+                            return false;
+                        }
+                    }
+                }
+                return true;
+            }
+            return false;
+        };
+
+        // Swap XY axes
+        if (can_swap_xy(conv_params) && primitive->deformable_mode == false) {
+            conv_params.inputs[0].SwapXY();
+            conv_params.outputs[0].SwapXY();
+            conv_params.weights.SwapXY();
+            for (auto& desc : conv_params.fused_ops) {
+                desc.output_tensor.SwapXY();
+                for (size_t i = 0; i < desc.tensors.size(); i++) {
+                    desc.tensors[i].SwapXY();
+                }
+            }
+            conv_params.filterSize = { ky, kx, kz };
+            conv_params.padding = {pad_y, pad_x, pad_z};
+            conv_params.stride = {stride_y, stride_x, stride_z};
+            conv_params.dilation = {dilation_y, dilation_x, dilation_z};
+        }
+
         auto format = impl_param.get_output_layout().format;
         if (format == format::b_fs_zyx_fsv16 ||
             format == format::bs_fs_zyx_bsv16_fsv16 ||
diff --git a/src/plugins/intel_gpu/src/graph/kernel_selector_helper.cpp b/src/plugins/intel_gpu/src/graph/kernel_selector_helper.cpp
index 01ede207841..a41066f982c 100644
--- a/src/plugins/intel_gpu/src/graph/kernel_selector_helper.cpp
+++ b/src/plugins/intel_gpu/src/graph/kernel_selector_helper.cpp
@@ -892,6 +892,7 @@ kernel_selector::weights_tensor convert_weights_tensor(const layout& l, bool is_
         const auto d = t[tensor_index];
         vec[i] = static_cast<size_t>(d);
     }
+
     return kernel_selector::weights_tensor(vec, ks_type, ks_layout);
 }

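Note (annotation, not part of the patch): the convolution.cpp hunk only swaps axes when the X extent of the input, output, weights, and every fused-op tensor is 1 and unpadded, so SwapXY() is a pure relabeling of dimension metadata; no bytes move. A minimal sketch of why the addressing is preserved, assuming dense bfyx tensors (the offset() helper below is hypothetical, for illustration only):

    #include <cassert>
    #include <cstddef>

    // Linear offset of element (b, f, y, x) in a dense bfyx tensor.
    size_t offset(size_t f_sz, size_t y_sz, size_t x_sz,
                  size_t b, size_t f, size_t y, size_t x) {
        return ((b * f_sz + f) * y_sz + y) * x_sz + x;
    }

    int main() {
        // A 1x2x5x1 tensor (bfyx, X == 1, no padding) and its swapped
        // 1x2x1x5 view: element (b, f, y, 0) has the same offset as
        // (b, f, 0, y), so a kernel may treat the old Y axis as X.
        for (size_t f = 0; f < 2; ++f)
            for (size_t y = 0; y < 5; ++y)
                assert(offset(2, 5, 1, 0, f, y, 0) == offset(2, 1, 5, 0, f, 0, y));
        return 0;
    }

Consistent with that relabeling, the hunk also re-packs filterSize, padding, stride, and dilation with the y component first so the kernel parameters match the swapped tensors.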
diff --git a/src/plugins/intel_gpu/src/kernel_selector/tensor_type.cpp b/src/plugins/intel_gpu/src/kernel_selector/tensor_type.cpp
index 3aa287b7c5e..ffd6a9a3da2 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/tensor_type.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/tensor_type.cpp
@@ -550,6 +550,28 @@ DataTensor DataTensor::FlattenEverything() const {
     return res;
 }

+void DataTensor::SwapXY() {
+    DataLayout l = Tensor::bfyx;
+
+    auto x = X();
+    auto y = Y();
+
+    if (GetLayout() != DataLayout::bfyx)
+        throw std::runtime_error("Unsupported - unsupported layout.");
+    if (x.pad.Total() != 0 || x.v != 1)
+        throw std::runtime_error("Unsupported - unsupported shape.");
+
+    // Swap XY axes.
+    y.pitch = 1;
+    x.pitch = y.v + y.pad.Total();
+    std::vector<Dim> vec(ChannelsCount(l));
+    vec[Channelndex(l, DataChannelName::X)] = y;
+    vec[Channelndex(l, DataChannelName::Y)] = x;
+    vec[Channelndex(l, DataChannelName::FEATURE)] = Feature();
+    vec[Channelndex(l, DataChannelName::BATCH)] = Batch();
+    *this = {vec, dtype, l};
+}
+
 NDims WeightsTensor::GetSimpleDims(const std::vector<size_t>& d, WeightsLayout l) {
     std::vector<size_t> newDims = d;

@@ -1051,5 +1073,22 @@ WeightsTensor WeightsTensor::TransformIgnorePadding(WeightsLayout l, WeightsType
     return {vec, t, l};
 }

+
+void WeightsTensor::SwapXY() {
+    auto x = X();
+
+    if (x.pad.Total() != 0 || x.v != 1)
+        throw std::runtime_error("Unsupported - unsupported weight shape.");
+
+    std::vector<size_t> vec;
+    for (auto& d : dims) {
+        vec.push_back(d.v);
+    }
+    auto x_index = Channelndex(layout, WeightsChannelName::X);
+    auto y_index = Channelndex(layout, WeightsChannelName::Y);
+    std::swap(vec[x_index], vec[y_index]);
+    *this = {vec, dtype, layout};
+}
+
 }  // namespace Tensor
 }  // namespace kernel_selector
diff --git a/src/plugins/intel_gpu/src/kernel_selector/tensor_type.h b/src/plugins/intel_gpu/src/kernel_selector/tensor_type.h
index 2a214e7f02c..e2092b03165 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/tensor_type.h
+++ b/src/plugins/intel_gpu/src/kernel_selector/tensor_type.h
@@ -616,6 +616,7 @@ struct DataTensor : public TensorBaseT {
     DataTensor TransformIgnorePadding(DataLayout l) const;
     DataTensor FlattenFeatureAndSpatials() const;
     DataTensor FlattenEverything() const;
+    void SwapXY();
     static inline Dim Extract(DataLayout l, DataChannelName channel, const NDims& d) {
         return TensorBaseT::Extract(dataChannelArray, l, channel, d);
     }
@@ -658,6 +659,8 @@ struct WeightsTensor : TensorBaseT {
     Dim OFM() const { return Extract(layout, WeightsChannelName::OFM, dims); }
     Dim G() const { return Extract(layout, WeightsChannelName::G, dims); }

+    void SwapXY();
+
     static inline Dim Extract(WeightsLayout l, WeightsChannelName channel, const NDims& d) {
         return TensorBaseT::Extract(weightsChannelArray, l, channel, d);
     }
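Note (annotation, not part of the patch): in DataTensor::SwapXY() above, the new X channel receives the old Y dimension with pitch 1, and the new Y channel receives the old X dimension (extent 1) with pitch y.v + y.pad.Total(), i.e. one full row. A small sketch of the invariant this keeps, assuming no padding and a simplified Dim carrying only extent and pitch (names hypothetical):

    #include <cassert>
    #include <cstddef>

    struct Dim { size_t v; size_t pitch; };  // extent and stride

    int main() {
        // Before the swap (bfyx, X == 1): X = {1, 1}, Y = {5, 1}.
        // Y's pitch is 1 because it strides over an X extent of 1.
        Dim x{1, 1}, y{5, 1};

        // After the swap: new X <- old Y with pitch 1,
        //                 new Y <- old X with pitch y.v (+ padding, 0 here).
        Dim new_x{y.v, 1}, new_y{x.v, y.v};

        // Element (y = i, x = 0) before maps to (y = 0, x = i) after,
        // at the same linear address.
        for (size_t i = 0; i < y.v; ++i)
            assert(y.pitch * i + x.pitch * 0 == new_y.pitch * 0 + new_x.pitch * i);
        return 0;
    }

WeightsTensor::SwapXY() can simply std::swap the two extents and reassign *this, since rebuilding the tensor from extents recomputes the pitches for the layout.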
diff --git a/src/plugins/intel_gpu/tests/fusions/convolution_fusion_test.cpp b/src/plugins/intel_gpu/tests/fusions/convolution_fusion_test.cpp
index 37db72fd9fc..3e725045fe4 100644
--- a/src/plugins/intel_gpu/tests/fusions/convolution_fusion_test.cpp
+++ b/src/plugins/intel_gpu/tests/fusions/convolution_fusion_test.cpp
@@ -1634,6 +1634,37 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_group_conv_eltwise_sum, ::testin
     conv_eltw_test_params{ CASE_GROUP_CONV_ELTW_FP32_1, 3, 2, 3 },
 }));

+class conv_swap_xy_with_eltwise_diff_sizes : public ConvEltwTest {};
+TEST_P(conv_swap_xy_with_eltwise_diff_sizes, basic) {
+    auto p = GetParam();
+    create_topologies(
+        input_layout("input", get_input_layout(p)),
+        data("weights", get_mem(get_weights_layout(p))),
+        data("bias", get_mem(get_bias_layout(p))),
+        data("eltwise_data", get_mem(layout{ p.data_type, p.input_format, p.eltw_shape })),
+        convolution("conv_prim", input_info("input"), { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation),
+        activation("activation", input_info("conv_prim"), activation_func::relu_negative_slope),
+        eltwise("sum", { input_info("activation"), input_info("eltwise_data") }, eltwise_mode::sum, data_types::f16),
+        reorder("reorder_bfyx", input_info("sum"), p.default_format, data_types::f16)
+    );
+
+    tolerance = default_tolerance(p.default_type);
+    execute(p);
+}
+
+// in_shape; out_shape; eltw_shape; kernel; stride; pad; dilation; groups; data_type; input_format; weights_type; weights_format; default_type; default_format;
+#define CASE_CONV_ELTW_FP16_SWAP_XY_1 { 1, 16, 1, 5 }, { 1, 32, 1, 7 }, { 1, 32, 1, 1 }, { 1, 1, 1, 3 }, { 1, 1 }, { 2, 0 }, { 1, 1 }, 1, data_types::f16, format::bfyx, data_types::f16, format::os_iyx_osv16, data_types::f16, format::bfyx
+#define CASE_CONV_ELTW_FP16_SWAP_XY_2 { 1, 16, 1, 5 }, { 1, 32, 1, 7 }, { 1, 32, 1, 7 }, { 1, 1, 1, 3 }, { 1, 1 }, { 2, 0 }, { 1, 1 }, 1, data_types::f16, format::bfyx, data_types::f16, format::os_iyx_osv16, data_types::f16, format::bfyx
+#define CASE_CONV_ELTW_FP32_SWAP_XY_1 { 3, 16, 1, 5 }, { 3, 32, 1, 7 }, { 1, 32, 1, 1 }, { 1, 1, 1, 3 }, { 1, 1 }, { 2, 0 }, { 1, 1 }, 1, data_types::f32, format::bfyx, data_types::f32, format::os_iyx_osv16, data_types::f32, format::bfyx
+#define CASE_CONV_ELTW_FP32_SWAP_XY_2 { 3, 16, 1, 5 }, { 3, 32, 1, 7 }, { 3, 32, 1, 7 }, { 1, 1, 1, 3 }, { 1, 1 }, { 2, 0 }, { 1, 1 }, 1, data_types::f32, format::bfyx, data_types::f32, format::os_iyx_osv16, data_types::f32, format::bfyx
+
+INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_swap_xy_with_eltwise_diff_sizes, ::testing::ValuesIn(std::vector<conv_eltw_test_params>{
+    conv_eltw_test_params{ CASE_CONV_ELTW_FP16_SWAP_XY_1, 3, 3, 4 },
+    conv_eltw_test_params{ CASE_CONV_ELTW_FP16_SWAP_XY_2, 3, 3, 4 },
+    conv_eltw_test_params{ CASE_CONV_ELTW_FP32_SWAP_XY_1, 3, 3, 4 },
+    conv_eltw_test_params{ CASE_CONV_ELTW_FP32_SWAP_XY_2, 3, 3, 4 },
+}));
+
 class conv_scale_activation_eltwise_fp32_quantize_i8 : public ConvEltwTest {};
 TEST_P(conv_scale_activation_eltwise_fp32_quantize_i8, basic) {
     auto p = GetParam();
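Note (annotation, not part of the patch): the SWAP_XY fusion cases keep the X extent at 1 and convolve along Y, and the eltw_shape variants cover both a broadcast operand ({ 1, 32, 1, 1 }) and a full-size one ({ b, 32, 1, 7 }). The expected output height follows from the usual convolution size formula; a quick check of the numbers in the macros (sketch only):

    #include <cassert>

    int main() {
        // CASE_CONV_ELTW_*_SWAP_XY_*: in_y = 5, kernel_y = 3, pad_y = 2,
        // stride = 1, dilation = 1  ->  out_y = 7, matching { b, 32, 1, 7 }.
        int in_y = 5, k_y = 3, pad_y = 2, stride = 1, dilation = 1;
        int out_y = (in_y + 2 * pad_y - dilation * (k_y - 1) - 1) / stride + 1;
        assert(out_y == 7);
        return 0;
    }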
diff --git a/src/plugins/intel_gpu/tests/test_cases/convolution_gpu_test.cpp b/src/plugins/intel_gpu/tests/test_cases/convolution_gpu_test.cpp
index 14716c116e1..36e04d5b09f 100644
--- a/src/plugins/intel_gpu/tests/test_cases/convolution_gpu_test.cpp
+++ b/src/plugins/intel_gpu/tests/test_cases/convolution_gpu_test.cpp
@@ -23,6 +23,8 @@
 #include
 #include

+#include "convolution_inst.h"
+
 using namespace cldnn;
 using namespace ::tests;

@@ -9319,3 +9321,86 @@ TEST(convolution_f32_gpu, convolution_gpu_bfyx_f16_depthwise_x_bloxk_size_1) {
 TEST(export_import_convolution_f32_gpu, convolution_gpu_bfyx_f16_depthwise_x_bloxk_size_1) {
     test_convolution_f32_gpu_convolution_gpu_bfyx_f16_depthwise_x_bloxk_size_1(true);
 }
+
+
+TEST(convolution_f32_fw_gpu, basic_convolution_no_bias_swap_xy) {
+    // Filter : 2x2x1x3
+    // Stride : 1x1
+    // Input  : 1x2x1x5
+    // Output : 1x2x1x3
+    //
+    // Input:
+    //  1  1
+    //  2  2
+    //  3  3
+    //  4  4
+    //  5  5
+    //
+    // Filter:
+    //  1  1  1  1
+    //  2  2  2  2
+    //  1  1  1  1
+    //
+    // Output:
+    //  16 16
+    //  24 24
+    //  32 32
+    auto& engine = get_test_engine();
+
+    auto input = engine.allocate_memory({ data_types::f32, format::bfyx, { 1, 2, 1, 5 } });
+    auto weights = engine.allocate_memory({ data_types::f32, format::bfyx, { 2, 2, 1, 3 } });
+
+    set_values(input, { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f });
+    set_values(weights, { 1.0f, 2.0f, 1.0f, 1.0f, 2.0f, 1.0f, 1.0f, 2.0f, 1.0f, 1.0f, 2.0f, 1.0f });
+    VVVF<float> output_vec = {{{ 16.0f }, { 24.0f }, { 32.0f }}, {{ 16.0f }, { 24.0f }, { 32.0f }}};
+
+    topology topology(
+        input_layout("input", input->get_layout()),
+        data("weights", weights),
+        convolution("conv", input_info("input"), { "weights" }, { 1, 1 }));
+
+    network network(engine, topology);
+    network.set_input_data("input", input);
+
+    auto outputs = network.execute();
+    ASSERT_EQ(outputs.size(), size_t(1));
+    ASSERT_EQ(outputs.begin()->first, "conv");
+
+    const auto& const_net = network;
+    const auto conv_inst = const_net.get_primitive("conv");
+    ASSERT_TRUE(conv_inst != nullptr);
+
+    auto output_memory = outputs.at("conv").get_memory();
+    auto output_layout = output_memory->get_layout();
+    cldnn::mem_lock<float> output_ptr(output_memory, get_test_stream());
+
+    int y_size = output_layout.spatial(1);
+    int x_size = output_layout.spatial(0);
+    int f_size = output_layout.feature();
+    int b_size = output_layout.batch();
+
+    ASSERT_EQ(output_layout.format, format::bfyx);
+    ASSERT_EQ(y_size, 3);
+    ASSERT_EQ(x_size, 1);
+    ASSERT_EQ(f_size, 2);
+    ASSERT_EQ(b_size, 1);
+
+    for (int f = 0; f < f_size; ++f) {
+        for (int y = 0; y < y_size; ++y) {
+            for (int x = 0; x < x_size; ++x) {
+                ASSERT_EQ(output_vec[f][y][x], output_ptr[(f * y_size + y) * x_size + x]);
+            }
+        }
+    }
+
+    auto inst = network.get_primitive("conv");
+    const auto& node = inst->get_node();
+    auto selected_impl = node.type()->choose_impl(node);
+    bool found_define = false;
+    for (auto& s : selected_impl->get_kernels_source()) {
+        if (s != nullptr && !s->get_str().empty()
+            && s->get_str().find("#define INPUT0_SIZE_X 5") != std::string::npos)
+            found_define = true;
+    }
+    EXPECT_TRUE(found_define);
+}
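Note (annotation, not part of the patch): basic_convolution_no_bias_swap_xy checks the swap two ways. The functional part can be verified by hand: both input feature maps hold 1..5 along Y and every (ofm, ifm) filter slice holds the taps (1, 2, 1), so each output row is twice the 1-D correlation of (1, 2, 1) with the input (sketch only):

    #include <cassert>

    int main() {
        float in[5] = {1, 2, 3, 4, 5};
        float taps[3] = {1, 2, 1};
        float expected[3] = {16, 24, 32};
        for (int y = 0; y < 3; ++y) {
            float acc = 0;
            for (int k = 0; k < 3; ++k)
                acc += in[y + k] * taps[k];
            assert(2 * acc == expected[y]);  // x2: two input features
        }
        return 0;
    }

The final EXPECT_TRUE then confirms the optimization actually fired: the input is declared with Y = 5 and X = 1, so "#define INPUT0_SIZE_X 5" can appear in the generated kernel source only after SwapXY() has moved the extent of 5 onto the X axis.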