[GPU] Added shape agnostic Pad kernel implementation (#16160)

Signed-off-by: Andrew Park <andrew.park@intel.com>
This commit is contained in:
Andrew Kwangwoong Park
2023-03-09 08:36:43 +09:00
committed by GitHub
parent 3d52fc843a
commit b7ff3a1d64
5 changed files with 160 additions and 113 deletions

View File

@@ -24,9 +24,9 @@ struct border_impl : typed_primitive_impl_ocl<border> {
return make_unique<border_impl>(*this);
}
static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) {
static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) {
const auto& primitive = impl_param.typed_desc<border>();
auto params = get_default_params<kernel_selector::border_params>(impl_param);
auto params = get_default_params<kernel_selector::border_params>(impl_param, is_shape_agnostic);
auto optional_params = get_default_optional_params<kernel_selector::border_optional_params>(impl_param.get_program());
size_t rank = impl_param.get_input_layout(0).get_rank();
@@ -36,7 +36,7 @@ struct border_impl : typed_primitive_impl_ocl<border> {
std::vector<int32_t> end(primitive->pads_end.begin(), primitive->pads_end.end());
size_t input_offset = 1;
if (!(primitive->non_constant_input_mask & border::PAD_NON_CONST_INPUT::BEGIN) && !params.has_dynamic_tensors()) {
if (!(primitive->non_constant_input_mask & border::PAD_NON_CONST_INPUT::BEGIN)) {
params.begin_type = kernel_selector::base_params::ArgType::Constant;
std::vector<int64_t> begin_vec;
@@ -55,7 +55,7 @@ struct border_impl : typed_primitive_impl_ocl<border> {
input_offset += 1;
}
if (!(primitive->non_constant_input_mask & border::PAD_NON_CONST_INPUT::END) && !params.has_dynamic_tensors()) {
if (!(primitive->non_constant_input_mask & border::PAD_NON_CONST_INPUT::END)) {
params.end_type = kernel_selector::base_params::ArgType::Constant;
std::vector<int64_t> end_vec;
@@ -102,108 +102,55 @@ struct border_impl : typed_primitive_impl_ocl<border> {
return {params, optional_params};
}
void update_dispatch_data(const kernel_impl_params& impl_param) override {
auto kernel_params = get_kernel_params(impl_param, true);
(_kernel_data.update_dispatch_data_func)(kernel_params.first, _kernel_data);
update_kernels_list_to_skip();
}
};
namespace detail {
attach_border_impl::attach_border_impl() {
implementation_map<border>::add(impl_types::ocl, typed_primitive_impl_ocl<border>::create<border_impl>, {
std::make_tuple(data_types::f32, format::yxfb),
std::make_tuple(data_types::f16, format::yxfb),
std::make_tuple(data_types::i32, format::yxfb),
std::make_tuple(data_types::i8, format::yxfb),
std::make_tuple(data_types::u8, format::yxfb),
auto types = {data_types::f32, data_types::f16, data_types::i32, data_types::i8, data_types::u8};
std::make_tuple(data_types::f32, format::bfyx),
std::make_tuple(data_types::f16, format::bfyx),
std::make_tuple(data_types::i32, format::bfyx),
std::make_tuple(data_types::i8, format::bfyx),
std::make_tuple(data_types::u8, format::bfyx),
auto formats = {
format::yxfb,
format::bfyx,
format::byxf,
format::bfzyx,
format::bfwzyx,
format::b_fs_yx_fsv16,
format::b_fs_yx_fsv32,
format::b_fs_zyx_fsv16,
format::bs_fs_yx_bsv4_fsv2,
format::bs_fs_yx_bsv4_fsv4,
format::bs_fs_yx_bsv8_fsv2,
format::bs_fs_yx_bsv8_fsv4,
format::bs_fs_yx_bsv16_fsv16,
format::bs_fs_yx_bsv32_fsv16,
format::bs_fs_yx_bsv32_fsv32,
format::bs_fs_zyx_bsv16_fsv16
};
std::make_tuple(data_types::f32, format::byxf),
std::make_tuple(data_types::f16, format::byxf),
std::make_tuple(data_types::i32, format::byxf),
std::make_tuple(data_types::i8, format::byxf),
std::make_tuple(data_types::u8, format::byxf),
implementation_map<border>::add(impl_types::ocl,
shape_types::static_shape,
typed_primitive_impl_ocl<border>::create<border_impl>,
types,
formats);
std::make_tuple(data_types::f32, format::bfzyx),
std::make_tuple(data_types::f16, format::bfzyx),
std::make_tuple(data_types::i32, format::bfzyx),
std::make_tuple(data_types::i8, format::bfzyx),
std::make_tuple(data_types::u8, format::bfzyx),
auto dyn_formats = {
format::bfyx,
format::bfzyx,
format::bfwzyx
};
std::make_tuple(data_types::f32, format::bfwzyx),
std::make_tuple(data_types::f16, format::bfwzyx),
std::make_tuple(data_types::i32, format::bfwzyx),
std::make_tuple(data_types::i8, format::bfwzyx),
std::make_tuple(data_types::u8, format::bfwzyx),
std::make_tuple(data_types::f32, format::b_fs_yx_fsv16),
std::make_tuple(data_types::f16, format::b_fs_yx_fsv16),
std::make_tuple(data_types::i32, format::b_fs_yx_fsv16),
std::make_tuple(data_types::i8, format::b_fs_yx_fsv16),
std::make_tuple(data_types::u8, format::b_fs_yx_fsv16),
std::make_tuple(data_types::f32, format::b_fs_yx_fsv32),
std::make_tuple(data_types::f16, format::b_fs_yx_fsv32),
std::make_tuple(data_types::i32, format::b_fs_yx_fsv32),
std::make_tuple(data_types::i8, format::b_fs_yx_fsv32),
std::make_tuple(data_types::u8, format::b_fs_yx_fsv32),
std::make_tuple(data_types::f32, format::b_fs_zyx_fsv16),
std::make_tuple(data_types::f16, format::b_fs_zyx_fsv16),
std::make_tuple(data_types::i32, format::b_fs_zyx_fsv16),
std::make_tuple(data_types::i8, format::b_fs_zyx_fsv16),
std::make_tuple(data_types::u8, format::b_fs_zyx_fsv16),
std::make_tuple(data_types::f32, format::bs_fs_yx_bsv4_fsv2),
std::make_tuple(data_types::f16, format::bs_fs_yx_bsv4_fsv2),
std::make_tuple(data_types::i32, format::bs_fs_yx_bsv4_fsv2),
std::make_tuple(data_types::i8, format::bs_fs_yx_bsv4_fsv2),
std::make_tuple(data_types::u8, format::bs_fs_yx_bsv4_fsv2),
std::make_tuple(data_types::f32, format::bs_fs_yx_bsv4_fsv4),
std::make_tuple(data_types::f16, format::bs_fs_yx_bsv4_fsv4),
std::make_tuple(data_types::i32, format::bs_fs_yx_bsv4_fsv4),
std::make_tuple(data_types::i8, format::bs_fs_yx_bsv4_fsv4),
std::make_tuple(data_types::u8, format::bs_fs_yx_bsv4_fsv4),
std::make_tuple(data_types::f32, format::bs_fs_yx_bsv8_fsv2),
std::make_tuple(data_types::f16, format::bs_fs_yx_bsv8_fsv2),
std::make_tuple(data_types::i32, format::bs_fs_yx_bsv8_fsv2),
std::make_tuple(data_types::i8, format::bs_fs_yx_bsv8_fsv2),
std::make_tuple(data_types::u8, format::bs_fs_yx_bsv8_fsv2),
std::make_tuple(data_types::f32, format::bs_fs_yx_bsv8_fsv4),
std::make_tuple(data_types::f16, format::bs_fs_yx_bsv8_fsv4),
std::make_tuple(data_types::i32, format::bs_fs_yx_bsv8_fsv4),
std::make_tuple(data_types::i8, format::bs_fs_yx_bsv8_fsv4),
std::make_tuple(data_types::u8, format::bs_fs_yx_bsv8_fsv4),
std::make_tuple(data_types::f32, format::bs_fs_yx_bsv16_fsv16),
std::make_tuple(data_types::f16, format::bs_fs_yx_bsv16_fsv16),
std::make_tuple(data_types::i32, format::bs_fs_yx_bsv16_fsv16),
std::make_tuple(data_types::i8, format::bs_fs_yx_bsv16_fsv16),
std::make_tuple(data_types::u8, format::bs_fs_yx_bsv16_fsv16),
std::make_tuple(data_types::f32, format::bs_fs_yx_bsv32_fsv16),
std::make_tuple(data_types::f16, format::bs_fs_yx_bsv32_fsv16),
std::make_tuple(data_types::i32, format::bs_fs_yx_bsv32_fsv16),
std::make_tuple(data_types::i8, format::bs_fs_yx_bsv32_fsv16),
std::make_tuple(data_types::u8, format::bs_fs_yx_bsv32_fsv16),
std::make_tuple(data_types::f32, format::bs_fs_yx_bsv32_fsv32),
std::make_tuple(data_types::f16, format::bs_fs_yx_bsv32_fsv32),
std::make_tuple(data_types::i32, format::bs_fs_yx_bsv32_fsv32),
std::make_tuple(data_types::i8, format::bs_fs_yx_bsv32_fsv32),
std::make_tuple(data_types::u8, format::bs_fs_yx_bsv32_fsv32),
std::make_tuple(data_types::f32, format::bs_fs_zyx_bsv16_fsv16),
std::make_tuple(data_types::f16, format::bs_fs_zyx_bsv16_fsv16),
std::make_tuple(data_types::i32, format::bs_fs_zyx_bsv16_fsv16),
std::make_tuple(data_types::i8, format::bs_fs_zyx_bsv16_fsv16),
std::make_tuple(data_types::u8, format::bs_fs_zyx_bsv16_fsv16),
});
implementation_map<border>::add(impl_types::ocl,
shape_types::dynamic_shape,
typed_primitive_impl_ocl<border>::create<border_impl>,
types,
dyn_formats);
}
} // namespace detail

View File

@@ -5,6 +5,7 @@
#include "include/fetch_utils.cl"
KERNEL(border_gpu_ref)(
OPTIONAL_SHAPE_INFO_ARG
const __global INPUT0_TYPE* input,
#ifdef BEGIN_TYPE
const __global BEGIN_TYPE* begin,
@@ -154,7 +155,7 @@ KERNEL(border_gpu_ref)(
const uint in_f = out_f - blt_sf;
const uint in_b = out_b - blt_sb;
const uint in_pos = FUNC_CALL(get_input_index)(in_b, in_f, in_w, in_z, in_y, in_x);
const uint in_pos = FUNC_CALL(get_input_index)(OPTIONAL_SHAPE_INFO_TENSOR in_b, in_f, in_w, in_z, in_y, in_x);
in_val = input[in_pos];
}
#elif defined BORDER_TYPE_EDGE
@@ -165,7 +166,7 @@ KERNEL(border_gpu_ref)(
const uint in_f = (out_f >= blt_sf & out_f < in_lf) ? out_f - blt_sf : (out_f < blt_sf ? 0 : in_sf - 1);
const uint in_b = (out_b >= blt_sb & out_b < in_lb) ? out_b - blt_sb : (out_b < blt_sb ? 0 : in_sb - 1);
const uint in_pos = FUNC_CALL(get_input_index)(in_b, in_f, in_w, in_z, in_y, in_x);
const uint in_pos = FUNC_CALL(get_input_index)(OPTIONAL_SHAPE_INFO_TENSOR in_b, in_f, in_w, in_z, in_y, in_x);
INPUT0_TYPE in_val = input[in_pos];
#elif defined BORDER_TYPE_MIRROR
const uint in_x = (out_x >= blt_sx & out_x < in_lx) ? out_x - blt_sx : (out_x < blt_sx ? blt_sx - 1 - out_x : in_sx + in_lx - 1 - out_x);
@@ -175,7 +176,7 @@ KERNEL(border_gpu_ref)(
const uint in_f = (out_f >= blt_sf & out_f < in_lf) ? out_f - blt_sf : (out_f < blt_sf ? blt_sf - 1 - out_f : in_sf + in_lf - 1 - out_f);
const uint in_b = (out_b >= blt_sb & out_b < in_lb) ? out_b - blt_sb : (out_b < blt_sb ? blt_sb - 1 - out_b : in_sb + in_lb - 1 - out_b);
const uint in_pos = FUNC_CALL(get_input_index)(in_b, in_f, in_w, in_z, in_y, in_x);
const uint in_pos = FUNC_CALL(get_input_index)(OPTIONAL_SHAPE_INFO_TENSOR in_b, in_f, in_w, in_z, in_y, in_x);
INPUT0_TYPE in_val = input[in_pos];
#elif defined BORDER_TYPE_MIRROR_101
const uint in_x = (out_x >= blt_sx & out_x < in_lx) ? out_x - blt_sx : (out_x < blt_sx ? blt_sx - out_x : in_sx + in_lx - 2 - out_x);
@@ -185,12 +186,12 @@ KERNEL(border_gpu_ref)(
const uint in_f = (out_f >= blt_sf & out_f < in_lf) ? out_f - blt_sf : (out_f < blt_sf ? blt_sf - out_f : in_sf + in_lf - 2 - out_f);
const uint in_b = (out_b >= blt_sb & out_b < in_lb) ? out_b - blt_sb : (out_b < blt_sb ? blt_sb - out_b : in_sb + in_lb - 2 - out_b);
const uint in_pos = FUNC_CALL(get_input_index)(in_b, in_f, in_w, in_z, in_y, in_x);
const uint in_pos = FUNC_CALL(get_input_index)(OPTIONAL_SHAPE_INFO_TENSOR in_b, in_f, in_w, in_z, in_y, in_x);
INPUT0_TYPE in_val = input[in_pos];
#else
#error Unsupported border type.
#endif
const uint out_pos = FUNC_CALL(get_output_index)(out_b, out_f, out_w, out_z, out_y, out_x);
const uint out_pos = FUNC_CALL(get_output_index)(OPTIONAL_SHAPE_INFO_TENSOR out_b, out_f, out_w, out_z, out_y, out_x);
output[out_pos] = in_val;
}

View File

@@ -46,14 +46,16 @@ BorderKernelBase::DispatchData BorderKernelBase::SetDefault(const border_params&
const auto& output = params.outputs[0];
DispatchData dispatchData;
auto in_layout = params.inputs[0].GetLayout();
auto out_layout = params.outputs[0].GetLayout();
std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::X, Tensor::DataChannelName::Z },
{ Tensor::DataChannelName::Y, Tensor::DataChannelName::W },
{ Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH }};
if (!params.has_dynamic_tensors()) {
auto in_layout = params.inputs[0].GetLayout();
auto out_layout = params.outputs[0].GetLayout();
std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws = {{ Tensor::DataChannelName::X, Tensor::DataChannelName::Z },
{ Tensor::DataChannelName::Y, Tensor::DataChannelName::W },
{ Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH }};
dispatchData.gws = { output.X().v * output.Z().v, output.Y().v * output.W().v, output.Batch().v * output.Feature().v };
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
dispatchData.gws = { output.X().v * output.Z().v, output.Y().v * output.W().v, output.Batch().v * output.Feature().v };
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws);
}
return dispatchData;
}
@@ -67,16 +69,32 @@ KernelsData BorderKernelBase::GetCommonKernelsData(const Params& params,
auto dispatchData = SetDefault(prim_params);
KernelData k_data = KernelData::Default<border_params>(params);
border_params& newParams = *static_cast<border_params*>(k_data.params.get());
k_data.update_dispatch_data_func = [this](const Params& params, KernelData& kd) {
const auto& prim_params = static_cast<const border_params&>(params);
auto dispatchData = SetDefault(prim_params);
OPENVINO_ASSERT(kd.kernels.size() == 1, "[GPU] Invalid kernels size for update dispatch data func");
kd.kernels[0].params.workGroups.global = dispatchData.gws;
kd.kernels[0].params.workGroups.local = dispatchData.lws;
};
auto cldnn_jit = GetJitConstants(prim_params);
auto entry_point = GetEntryPoint(kernelName, prim_params.layerID, params, options);
auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
auto& kernel = k_data.kernels[0];
FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point,
"", false, false, static_cast<int>(newParams.inputs.size()),
0, 1, newParams.has_dynamic_tensors());
FillCLKernelData(kernel,
dispatchData,
params.engineInfo,
kernelName,
jit,
entry_point,
EXE_MODE_DEFAULT,
false,
false,
(uint32_t)prim_params.inputs.size(),
GetFusedPrimitiveInputsCount(params),
1,
prim_params.outputs[0].is_dynamic());
return {k_data};
}

View File

@@ -27,6 +27,7 @@ ParamsKey BorderKernelRef::GetSupportedKey() const {
k.EnableTensorPitches();
k.EnableBatching();
k.EnableDifferentTypes();
k.EnableDynamicShapesSupport();
return k;
}

View File

@@ -7,6 +7,8 @@
#include <intel_gpu/primitives/input_layout.hpp>
#include <intel_gpu/primitives/border.hpp>
#include <border_inst.h>
#include <cstddef>
#include <array>
@@ -1551,3 +1553,81 @@ TEST(border_gpu, basic_bfyx_2x1x2x3_1x2x3x4_border_edge) {
}
}
}
// Checks the shape-agnostic (dynamic-shape) Pad/border implementation:
// builds a border primitive in CONSTANT pad mode on an input declared with a
// fully dynamic 4D layout, asserts a dynamic impl was selected, then verifies
// every output element: pad regions must be the pad value (0.0f) and the
// interior must match the corresponding input element.
TEST(border_gpu, basic_bfyx_2x1x2x3_1x2x3x4_border_constant_dynamic) {
// Actual (static) input extents in bfyx order; used only to allocate and
// fill the input buffer — the network itself sees a dynamic layout.
constexpr auto in_size_b = 2;
constexpr auto in_size_f = 3;
constexpr auto in_size_y = 5;
constexpr auto in_size_x = 4;
// Leading (begin/"left-top") pad amounts per axis.
constexpr auto blt_size_b = 2;
constexpr auto blt_size_f = 1;
constexpr auto blt_size_y = 2;
constexpr auto blt_size_x = 3;
// Trailing (end/"right-bottom") pad amounts per axis.
constexpr auto brb_size_b = 1;
constexpr auto brb_size_f = 2;
constexpr auto brb_size_y = 3;
constexpr auto brb_size_x = 4;
// Expected output extents = input + begin pad + end pad, per axis.
constexpr auto out_size_b = in_size_b + blt_size_b + brb_size_b;
constexpr auto out_size_f = in_size_f + blt_size_f + brb_size_f;
constexpr auto out_size_y = in_size_y + blt_size_y + brb_size_y;
constexpr auto out_size_x = in_size_x + blt_size_x + brb_size_x;
auto& engine = get_test_engine();
// Rank-4 dynamic layout drives selection of the shape-agnostic kernel;
// the static layout is only for allocating the concrete input memory.
auto input_layout_dynamic = layout{ov::PartialShape::dynamic(4), data_types::f32, format::bfyx};
auto input_layout_static = layout{ov::PartialShape{in_size_b, in_size_f, in_size_y, in_size_x}, data_types::f32, format::bfyx};
auto input = engine.allocate_memory(input_layout_static);
topology topology;
topology.add(input_layout("input", input_layout_dynamic));
// CONSTANT pad mode with pad value 0.0f; pads given as begin/end
// CoordinateDiffs in bfyx order to match the sizes above.
topology.add(border("border",
{input_info("input")}, 0,
ov::CoordinateDiff{blt_size_b, blt_size_f, blt_size_y, blt_size_x},
ov::CoordinateDiff{brb_size_b, brb_size_f, brb_size_y, brb_size_x},
ov::op::PadMode::CONSTANT,
0.0f));
// Randomized input in [-8, 8] so interior copy-through is meaningfully checked.
const std::vector<size_t> sizes{ static_cast<std::size_t>(in_size_b), static_cast<std::size_t>(in_size_f),
static_cast<std::size_t>(in_size_y), static_cast<std::size_t>(in_size_x)};
std::vector<float> input_data = generate_rnd_real_input<float>(sizes, -8.0f, 8.0f);
set_values(input, input_data);
// allow_new_shape_infer enables the dynamic-shape path under test.
ExecutionConfig config;
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
network network(engine, topology, config);
network.set_input_data("input", input);
// Sanity-check that a dynamic (shape-agnostic) impl was actually chosen,
// not a static one specialized after shape inference.
auto inst = network.get_primitive("border");
auto impl = inst->get_impl();
ASSERT_TRUE(impl != nullptr);
ASSERT_TRUE(impl->is_dynamic());
auto outputs = network.execute();
ASSERT_EQ(outputs.size(), size_t(1));
ASSERT_EQ(outputs.begin()->first, "border");
auto output = outputs.at("border").get_memory();
cldnn::mem_lock<float> output_ptr(output, get_test_stream());
// Walk the full output in bfyx order: positions inside any pad band must
// equal the constant pad value; interior positions must equal the shifted
// input element (offset by the begin pads on each axis).
for (auto b = 0; b < out_size_b; ++b) { // B
for (auto f = 0; f < out_size_f; ++f) { // F
for (auto y = 0; y < out_size_y; ++y) { // Y
for (auto x = 0; x < out_size_x; ++x) { // X
auto output_off = ((b * out_size_f + f) * out_size_y + y) * out_size_x + x; // BFYX
if (b < blt_size_b || b >= out_size_b - brb_size_b ||
f < blt_size_f || f >= out_size_f - brb_size_f ||
y < blt_size_y || y >= out_size_y - brb_size_y ||
x < blt_size_x || x >= out_size_x - brb_size_x) {
ASSERT_EQ(output_ptr[output_off], 0.0f);
} else {
auto input_off = (((b - blt_size_b) * in_size_f + f - blt_size_f) * in_size_y + y - blt_size_y) * in_size_x + x - blt_size_x; // BFYX
ASSERT_EQ(output_ptr[output_off], input_data[input_off]);
}
}
}
}
}
}