[GPU] Added shape agnostic kernels for GatherElements and Tile (#15798)

* [GPU] Added shape agnostic kernel for GatherElements

* [GPU] Added shape agnostic kernel for Tile
This commit is contained in:
Roman Lyamin
2023-03-08 08:34:24 +04:00
committed by GitHub
parent b907bfab3b
commit 681faadce3
13 changed files with 223 additions and 40 deletions

View File

@@ -54,9 +54,9 @@ struct gather_elements_impl : typed_primitive_impl_ocl<gather_elements> {
return make_unique<gather_elements_impl>(*this);
}
static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) {
static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) {
const auto& primitive = impl_param.typed_desc<gather_elements>();
auto params = get_default_params<kernel_selector::gather_elements_params>(impl_param);
auto params = get_default_params<kernel_selector::gather_elements_params>(impl_param, is_shape_agnostic);
auto optional_params = get_default_optional_params<kernel_selector::gather_elements_optional_params>(impl_param.get_program());
size_t rank = impl_param.get_output_layout().get_rank();
@@ -65,24 +65,36 @@ struct gather_elements_impl : typed_primitive_impl_ocl<gather_elements> {
params.inputs.push_back(convert_data_tensor(impl_param.get_input_layout(1)));
return {params, optional_params};
}
void update_dispatch_data(const kernel_impl_params& impl_param) override {
auto kernel_params = get_kernel_params(impl_param, true);
(_kernel_data.update_dispatch_data_func)(kernel_params.first, _kernel_data);
update_kernels_list_to_skip();
}
};
namespace detail {
attach_gather_elements_impl::attach_gather_elements_impl() {
implementation_map<gather_elements>::add(impl_types::ocl, typed_primitive_impl_ocl<gather_elements>::create<gather_elements_impl>, {
std::make_tuple(data_types::i8, format::bfyx),
std::make_tuple(data_types::u8, format::bfyx),
std::make_tuple(data_types::f32, format::bfyx),
std::make_tuple(data_types::f16, format::bfyx),
std::make_tuple(data_types::i32, format::bfyx),
std::make_tuple(data_types::f32, format::bfzyx),
std::make_tuple(data_types::f16, format::bfzyx),
std::make_tuple(data_types::i32, format::bfzyx),
std::make_tuple(data_types::f32, format::bfwzyx),
std::make_tuple(data_types::f16, format::bfwzyx),
std::make_tuple(data_types::i32, format::bfwzyx),
});
auto types = {
data_types::f32,
data_types::f16,
data_types::i32,
data_types::i8,
data_types::u8
};
auto formats = {
format::bfyx,
format::bfzyx,
format::bfwzyx
};
implementation_map<gather_elements>::add(impl_types::ocl,
shape_types::any,
typed_primitive_impl_ocl<gather_elements>::create<gather_elements_impl>,
types,
formats);
}
} // namespace detail

View File

@@ -23,8 +23,8 @@ struct range_impl : typed_primitive_impl_ocl<range> {
return make_unique<range_impl>(*this);
}
static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) {
auto params = get_default_params<kernel_selector::range_params>(impl_param);
static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) {
auto params = get_default_params<kernel_selector::range_params>(impl_param, is_shape_agnostic);
for (int i : {1, 2})
params.inputs.push_back(convert_data_tensor(impl_param.get_input_layout(i)));
auto optional_params = get_default_optional_params<kernel_selector::range_optional_params>(impl_param.get_program());
@@ -33,8 +33,9 @@ struct range_impl : typed_primitive_impl_ocl<range> {
}
void update_dispatch_data(const kernel_impl_params& impl_param) override {
auto kernel_params = get_kernel_params(impl_param);
auto kernel_params = get_kernel_params(impl_param, true);
(_kernel_data.update_dispatch_data_func)(kernel_params.first, _kernel_data);
update_kernels_list_to_skip();
}
};

View File

@@ -45,9 +45,9 @@ struct softmax_impl : typed_primitive_impl_ocl<softmax> {
return make_unique<softmax_impl>(*this);
}
static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) {
static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) {
const auto& primitive = impl_param.typed_desc<softmax>();
auto params = get_default_params<kernel_selector::softmax_params>(impl_param);
auto params = get_default_params<kernel_selector::softmax_params>(impl_param, is_shape_agnostic);
auto optional_params = get_default_optional_params<kernel_selector::softmax_optional_params>(impl_param.get_program());
size_t rank = impl_param.get_output_layout().get_rank();
@@ -57,7 +57,7 @@ struct softmax_impl : typed_primitive_impl_ocl<softmax> {
}
void update_dispatch_data(const kernel_impl_params& impl_param) override {
auto kernel_params = get_kernel_params(impl_param);
auto kernel_params = get_kernel_params(impl_param, true);
(_kernel_data.update_dispatch_data_func)(kernel_params.first, _kernel_data);
update_kernels_list_to_skip();
}

View File

@@ -24,9 +24,9 @@ struct tile_impl : typed_primitive_impl_ocl<tile> {
}
public:
static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) {
static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) {
const auto& primitive = impl_param.typed_desc<tile>();
auto params = get_default_params<kernel_selector::tile_params>(impl_param);
auto params = get_default_params<kernel_selector::tile_params>(impl_param, is_shape_agnostic);
auto optional_params = get_default_optional_params<kernel_selector::tile_optional_params>(impl_param.get_program());
auto repeats = primitive->repeats;
@@ -42,13 +42,19 @@ public:
return {params, optional_params};
}
void update_dispatch_data(const kernel_impl_params& impl_param) override {
auto kernel_params = get_kernel_params(impl_param, true);
(_kernel_data.update_dispatch_data_func)(kernel_params.first, _kernel_data);
update_kernels_list_to_skip();
}
};
namespace detail {
attach_tile_impl::attach_tile_impl() {
auto types = {data_types::i8, data_types::u8, data_types::i32, data_types::f16, data_types::f32};
auto formats = {
auto static_formats = {
format::bfyx,
format::bfzyx,
format::bfwzyx,
@@ -65,7 +71,23 @@ attach_tile_impl::attach_tile_impl() {
format::bs_fs_zyx_bsv32_fsv16
};
implementation_map<tile>::add(impl_types::ocl, typed_primitive_impl_ocl<tile>::create<tile_impl>, types, formats);
implementation_map<tile>::add(impl_types::ocl,
shape_types::static_shape,
typed_primitive_impl_ocl<tile>::create<tile_impl>,
types,
static_formats);
auto dynamic_formats = {
format::bfyx,
format::bfzyx,
format::bfwzyx
};
implementation_map<tile>::add(impl_types::ocl,
shape_types::dynamic_shape,
typed_primitive_impl_ocl<tile>::create<tile_impl>,
types,
dynamic_formats);
}
} // namespace detail

View File

@@ -6,7 +6,8 @@
#define GET_OUTPUT_INDEX(prefix, idx_order) CAT(prefix, _GET_INDEX)(idx_order)
KERNEL(gather_elements_ref)(const __global INPUT0_TYPE* data,
KERNEL(gather_elements_ref)(OPTIONAL_SHAPE_INFO_ARG
const __global INPUT0_TYPE* data,
const __global INPUT1_TYPE* indices,
__global OUTPUT_TYPE* output
#if HAS_FUSED_OPS_DECLS

View File

@@ -4,7 +4,9 @@
#include "include/batch_headers/fetch_data.cl"
KERNEL(tile_ref)(const __global INPUT0_TYPE* input, __global OUTPUT_TYPE* output)
KERNEL(tile_ref)(OPTIONAL_SHAPE_INFO_ARG
const __global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output)
{
const uint x = (uint)get_global_id(0) % OUTPUT_SIZE_X;
const uint y = (uint)get_global_id(0) / OUTPUT_SIZE_X;

View File

@@ -55,6 +55,7 @@ ParamsKey GatherElementsKernelRef::GetSupportedKey() const {
k.EnableTensorPitches();
k.EnableBatching();
k.EnableDifferentTypes();
k.EnableDynamicShapesSupport();
return k;
}
@@ -71,7 +72,7 @@ static inline std::vector<std::string> GetDefaultOrder(size_t size) {
return default_order;
}
CommonDispatchData GatherElementsKernelRef::SetDefault(const gather_elements_params& params, const optional_params&) const {
CommonDispatchData GatherElementsKernelRef::SetDefault(const gather_elements_params& params) const {
CommonDispatchData dispatchData;
auto in_layout = params.inputs[0].GetLayout();
auto out_layout = params.outputs[0].GetLayout();
@@ -134,10 +135,10 @@ bool GatherElementsKernelRef::Validate(const Params& p, const optional_params& o
}
const gather_elements_params& params = static_cast<const gather_elements_params&>(p);
auto input_dims = params.inputs[0].LogicalDims();
auto indices_dims = params.inputs[1].LogicalDims();
size_t input_rank = params.inputs[0].GetDims().size();
size_t indices_rank = params.inputs[1].GetDims().size();
if (input_dims.size() != indices_dims.size()) {
if (input_rank != indices_rank) {
return false;
}
@@ -157,13 +158,22 @@ KernelsData GatherElementsKernelRef::GetKernelsData(const Params& params, const
KernelData kd = KernelData::Default<gather_elements_params>(params);
gather_elements_params& newParams = *static_cast<gather_elements_params*>(kd.params.get());
auto dispatchData = SetDefault(newParams, options);
auto dispatchData = SetDefault(newParams);
auto cldnn_jit = GetJitConstants(newParams);
auto entry_point = GetEntryPoint(kernelName, newParams.layerID, params, options);
auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
kd.update_dispatch_data_func = [this](const Params& params, KernelData& kd) {
const auto& prim_params = static_cast<const gather_elements_params&>(params);
auto dispatchData = SetDefault(prim_params);
OPENVINO_ASSERT(kd.kernels.size() == 1, "[GPU] Invalid kernels size for update dispatch data func");
kd.kernels[0].params.workGroups.global = dispatchData.gws;
kd.kernels[0].params.workGroups.local = dispatchData.lws;
};
auto& kernel = kd.kernels[0];
FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point, "", false, false, 2, GetFusedPrimitiveInputsCount(params));
FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point,
"", false, false, 2, GetFusedPrimitiveInputsCount(params), 1, newParams.has_dynamic_tensors());
return { kd };
}

View File

@@ -28,7 +28,7 @@ public:
GatherElementsKernelRef() : KernelBaseOpenCL("gather_elements_ref") {}
virtual ~GatherElementsKernelRef() {}
virtual JitConstants GetJitConstants(const gather_elements_params& params) const;
virtual CommonDispatchData SetDefault(const gather_elements_params& params, const optional_params&) const;
virtual CommonDispatchData SetDefault(const gather_elements_params& params) const;
KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
KernelsPriority GetKernelsPriority(const Params& params, const optional_params& options) const override;
ParamsKey GetSupportedKey() const override;

View File

@@ -35,7 +35,7 @@ KernelsData RangeKernelRef::GetKernelsData(const Params &params, const optional_
auto jit = CreateJit(kernelName, jit_constants, entry_point);
kernel_data.update_dispatch_data_func = [](const Params& params, KernelData& kd) {
const auto& prim_params = static_cast<const range_params&>(params);
const auto& prim_params = static_cast<const range_params&>(params);
auto dispatchData = SetDefault(prim_params);
OPENVINO_ASSERT(kd.kernels.size() == 1, "[GPU] Invalid kernels size for update dispatch data func");
kd.kernels[0].params.workGroups.global = dispatchData.gws;

View File

@@ -25,10 +25,11 @@ ParamsKey TileKernelRef::GetSupportedKey() const {
k.EnableTensorOffset();
k.EnableTensorPitches();
k.EnableBatching();
k.EnableDynamicShapesSupport();
return k;
}
CommonDispatchData TileKernelRef::SetDefault(const tile_params& params, const optional_params&) const {
CommonDispatchData TileKernelRef::SetDefault(const tile_params& params) const {
CommonDispatchData dispatchData;
auto in_layout = params.inputs[0].GetLayout();
auto out_layout = params.outputs[0].GetLayout();
@@ -55,14 +56,23 @@ KernelsData TileKernelRef::GetKernelsData(const Params& params, const optional_p
KernelData kd = KernelData::Default<tile_params>(params);
tile_params& newParams = *static_cast<tile_params*>(kd.params.get());
auto dispatchData = SetDefault(newParams, options);
auto dispatchData = SetDefault(newParams);
auto entry_point = GetEntryPoint(kernelName, newParams.layerID, params, options);
auto cldnn_jit = GetJitConstants(newParams);
auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
kd.update_dispatch_data_func = [this](const Params& params, KernelData& kd) {
const auto& prim_params = static_cast<const tile_params&>(params);
auto dispatchData = SetDefault(prim_params);
OPENVINO_ASSERT(kd.kernels.size() == 1, "[GPU] Invalid kernels size for update dispatch data func");
kd.kernels[0].params.workGroups.global = dispatchData.gws;
kd.kernels[0].params.workGroups.local = dispatchData.lws;
};
auto& kernel = kd.kernels[0];
FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point);
FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point,
EXE_MODE_DEFAULT, false, false, 1, 0, 1, newParams.has_dynamic_tensors());
return {kd};
}

View File

@@ -27,7 +27,7 @@ public:
virtual ~TileKernelRef() {}
virtual JitConstants GetJitConstants(const tile_params& params) const;
virtual CommonDispatchData SetDefault(const tile_params& params, const optional_params&) const;
virtual CommonDispatchData SetDefault(const tile_params& params) const;
KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
KernelsPriority GetKernelsPriority(const Params& params, const optional_params& options) const override;
ParamsKey GetSupportedKey() const override;

View File

@@ -10,6 +10,8 @@
#include <intel_gpu/graph/topology.hpp>
#include <intel_gpu/graph/network.hpp>
#include "gather_elements_inst.h"
#include <cstddef>
#include <gtest/gtest.h>
@@ -1237,3 +1239,79 @@ TEST(gather_elements_gpu_fp16, export_import) {
DoTest(engine, input0, input1, expected_results, tensor(2, 3, 3, 1, 1, 5), axis, true);
}
TEST(gather_elements_gpu, dynamic) {
auto& engine = get_test_engine();
auto axis = 3;
ov::Shape in0_shape = { 1, 2, 1, 5, 2, 4 };
ov::Shape in1_shape = { 1, 2, 1, 2, 2, 4 };
auto in0_dyn_layout = layout{ov::PartialShape::dynamic(in0_shape.size()), data_types::u8, format::bfwzyx};
auto in1_dyn_layout = layout{ov::PartialShape::dynamic(in1_shape.size()), data_types::u8, format::bfwzyx};
auto input0 = engine.allocate_memory({in0_shape, data_types::u8, format::bfwzyx}); // data
auto input1 = engine.allocate_memory({in1_shape, data_types::u8, format::bfwzyx}); // indices
set_values<uint8_t>(input0, {
0, 1, 8, 5, 5, 2, 0, 7,
7, 10, 4, 5, 9, 0, 0, 5,
7, 0, 4, 0, 4, 7, 6, 10,
9, 5, 1, 7, 4, 7, 10, 8,
2, 0, 8, 3, 6, 8, 10, 4,
2, 10, 7, 8, 7, 0, 6, 9,
2, 4, 8, 5, 2, 3, 3, 1,
5, 9, 10, 0, 9, 5, 5, 3,
10, 5, 2, 0, 10, 0, 5, 4,
3, 10, 5, 5, 10, 0, 8, 8
});
set_values<uint8_t>(input1, {
0, 2, 4, 3,
4, 0, 0, 1,
4, 0, 1, 0,
1, 0, 1, 1,
3, 1, 4, 2,
4, 2, 1, 3,
2, 1, 2, 4,
1, 0, 2, 4
});
std::vector<uint8_t> expected_results = {
0, 0, 8, 7,
6, 2, 0, 5,
2, 1, 4, 5,
9, 2, 0, 5,
10, 4, 5, 0,
10, 5, 3, 4,
5, 4, 10, 5,
2, 0, 5, 8
};
topology topology;
topology.add(input_layout("InputData", in0_dyn_layout));
topology.add(input_layout("InputIndices", in1_dyn_layout));
topology.add(gather_elements("gather_elements", input_info("InputData"), input_info("InputIndices"), axis));
ExecutionConfig config;
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
network network(engine, topology, config);
network.set_input_data("InputData", input0);
network.set_input_data("InputIndices", input1);
auto inst = network.get_primitive("gather_elements");
auto impl = inst->get_impl();
ASSERT_TRUE(impl != nullptr);
ASSERT_TRUE(impl->is_dynamic());
auto outputs = network.execute();
auto output = outputs.at("gather_elements").get_memory();
cldnn::mem_lock<uint8_t> output_ptr(output, get_test_stream());
for (size_t i = 0; i < expected_results.size(); ++i) {
ASSERT_EQ(expected_results[i], output_ptr[i]);
}
}

View File

@@ -6,6 +6,7 @@
#include <intel_gpu/primitives/input_layout.hpp>
#include <intel_gpu/primitives/tile.hpp>
#include "tile_inst.h"
#include <iostream>
@@ -275,6 +276,52 @@ TEST_F(tile_gpu, basic_in1x2x2x2_axis_z) {
this->test_basic_in1x2x2x2_axis_z(false);
}
TEST_F(tile_gpu, dynamic) {
auto& engine = get_test_engine();
ov::Shape input_shape = { 1, 2, 2, 2 };
auto input_dyn_layout = layout{ ov::PartialShape::dynamic(input_shape.size()), data_types::f32, format::bfyx };
auto input = engine.allocate_memory({ input_shape, data_types::f32, format::bfyx });
set_values(input, { 1.f, 0.f,
5.f, 1.5f,
2.f, 0.f,
6.f, 5.2f });
topology topology;
topology.add(input_layout("input", input_dyn_layout));
topology.add(tile("tile", input_info("input"), std::vector<int64_t>{ 1, 2, 1, 1 }));
ExecutionConfig config;
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
network network(engine, topology, config);
network.set_input_data("input", input);
auto inst = network.get_primitive("tile");
auto impl = inst->get_impl();
ASSERT_TRUE(impl != nullptr);
ASSERT_TRUE(impl->is_dynamic());
auto outputs = network.execute();
auto output = outputs.at("tile").get_memory();
cldnn::mem_lock<float> output_ptr(output, get_test_stream());
std::vector<float> ref_data = { 1.f, 0.f,
5.f, 1.5f,
2.f, 0.f,
6.f, 5.2f,
1.f, 0.f,
5.f, 1.5f,
2.f, 0.f,
6.f, 5.2f };
for (size_t i = 0; i < ref_data.size(); ++i) {
ASSERT_EQ(output_ptr[i], ref_data[i]) << "Index=" << i;
}
}
namespace {
template<typename T>
struct Params {