[GPU] Added shape agnostic kernels for GatherElements and Tile (#15798)

* [GPU] Added shape agnostic kernel for GatherElements * [GPU] Added shape agnostic kernel for Tile
2023-03-08 08:34:24 +04:00
parent b907bfab3b
commit 681faadce3
13 changed files with 223 additions and 40 deletions
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/gather_elements.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/gather_elements.cpp
@@ -54,9 +54,9 @@ struct gather_elements_impl : typed_primitive_impl_ocl<gather_elements> {
        return make_unique<gather_elements_impl>(*this);
    }

-    static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) {
+    static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) {
        const auto& primitive = impl_param.typed_desc<gather_elements>();
-        auto params = get_default_params<kernel_selector::gather_elements_params>(impl_param);
+        auto params = get_default_params<kernel_selector::gather_elements_params>(impl_param, is_shape_agnostic);
        auto optional_params = get_default_optional_params<kernel_selector::gather_elements_optional_params>(impl_param.get_program());

        size_t rank = impl_param.get_output_layout().get_rank();
@@ -65,24 +65,36 @@ struct gather_elements_impl : typed_primitive_impl_ocl<gather_elements> {
        params.inputs.push_back(convert_data_tensor(impl_param.get_input_layout(1)));
        return {params, optional_params};
    }
+
+    void update_dispatch_data(const kernel_impl_params& impl_param) override {
+       auto kernel_params = get_kernel_params(impl_param, true);
+       (_kernel_data.update_dispatch_data_func)(kernel_params.first, _kernel_data);
+       update_kernels_list_to_skip();
+    }
 };

 namespace detail {

 attach_gather_elements_impl::attach_gather_elements_impl() {
-    implementation_map<gather_elements>::add(impl_types::ocl, typed_primitive_impl_ocl<gather_elements>::create<gather_elements_impl>, {
-        std::make_tuple(data_types::i8, format::bfyx),
-        std::make_tuple(data_types::u8, format::bfyx),
-        std::make_tuple(data_types::f32, format::bfyx),
-        std::make_tuple(data_types::f16, format::bfyx),
-        std::make_tuple(data_types::i32, format::bfyx),
-        std::make_tuple(data_types::f32, format::bfzyx),
-        std::make_tuple(data_types::f16, format::bfzyx),
-        std::make_tuple(data_types::i32, format::bfzyx),
-        std::make_tuple(data_types::f32, format::bfwzyx),
-        std::make_tuple(data_types::f16, format::bfwzyx),
-        std::make_tuple(data_types::i32, format::bfwzyx),
-    });
+    auto types = {
+        data_types::f32,
+        data_types::f16,
+        data_types::i32,
+        data_types::i8,
+        data_types::u8
+    };
+
+    auto formats = {
+        format::bfyx,
+        format::bfzyx,
+        format::bfwzyx
+    };
+
+    implementation_map<gather_elements>::add(impl_types::ocl,
+                                             shape_types::any,
+                                             typed_primitive_impl_ocl<gather_elements>::create<gather_elements_impl>,
+                                             types,
+                                             formats);
 }

 }  // namespace detail
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/range.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/range.cpp
@@ -23,8 +23,8 @@ struct range_impl : typed_primitive_impl_ocl<range> {
        return make_unique<range_impl>(*this);
    }

-    static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) {
-        auto params = get_default_params<kernel_selector::range_params>(impl_param);
+    static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) {
+        auto params = get_default_params<kernel_selector::range_params>(impl_param, is_shape_agnostic);
        for (int i : {1, 2})
            params.inputs.push_back(convert_data_tensor(impl_param.get_input_layout(i)));
        auto optional_params = get_default_optional_params<kernel_selector::range_optional_params>(impl_param.get_program());
@@ -33,8 +33,9 @@ struct range_impl : typed_primitive_impl_ocl<range> {
    }

    void update_dispatch_data(const kernel_impl_params& impl_param) override {
-       auto kernel_params = get_kernel_params(impl_param);
+       auto kernel_params = get_kernel_params(impl_param, true);
       (_kernel_data.update_dispatch_data_func)(kernel_params.first, _kernel_data);
+       update_kernels_list_to_skip();
    }
 };

--- a/src/plugins/intel_gpu/src/graph/impls/ocl/softmax.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/softmax.cpp
@@ -45,9 +45,9 @@ struct softmax_impl : typed_primitive_impl_ocl<softmax> {
        return make_unique<softmax_impl>(*this);
    }

-    static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) {
+    static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) {
        const auto& primitive = impl_param.typed_desc<softmax>();
-        auto params = get_default_params<kernel_selector::softmax_params>(impl_param);
+        auto params = get_default_params<kernel_selector::softmax_params>(impl_param, is_shape_agnostic);
        auto optional_params = get_default_optional_params<kernel_selector::softmax_optional_params>(impl_param.get_program());

        size_t rank = impl_param.get_output_layout().get_rank();
@@ -57,7 +57,7 @@ struct softmax_impl : typed_primitive_impl_ocl<softmax> {
    }

    void update_dispatch_data(const kernel_impl_params& impl_param) override {
-        auto kernel_params = get_kernel_params(impl_param);
+        auto kernel_params = get_kernel_params(impl_param, true);
        (_kernel_data.update_dispatch_data_func)(kernel_params.first, _kernel_data);
        update_kernels_list_to_skip();
    }
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/tile.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/tile.cpp
@@ -24,9 +24,9 @@ struct tile_impl : typed_primitive_impl_ocl<tile> {
    }

 public:
-    static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) {
+    static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) {
        const auto& primitive = impl_param.typed_desc<tile>();
-        auto params = get_default_params<kernel_selector::tile_params>(impl_param);
+        auto params = get_default_params<kernel_selector::tile_params>(impl_param, is_shape_agnostic);
        auto optional_params = get_default_optional_params<kernel_selector::tile_optional_params>(impl_param.get_program());

        auto repeats = primitive->repeats;
@@ -42,13 +42,19 @@ public:

        return {params, optional_params};
    }
+
+    void update_dispatch_data(const kernel_impl_params& impl_param) override {
+        auto kernel_params = get_kernel_params(impl_param, true);
+        (_kernel_data.update_dispatch_data_func)(kernel_params.first, _kernel_data);
+        update_kernels_list_to_skip();
+    }
 };

 namespace detail {

 attach_tile_impl::attach_tile_impl() {
    auto types = {data_types::i8, data_types::u8, data_types::i32, data_types::f16, data_types::f32};
-    auto formats = {
+    auto static_formats = {
        format::bfyx,
        format::bfzyx,
        format::bfwzyx,
@@ -65,7 +71,23 @@ attach_tile_impl::attach_tile_impl() {
        format::bs_fs_zyx_bsv32_fsv16
    };

-    implementation_map<tile>::add(impl_types::ocl, typed_primitive_impl_ocl<tile>::create<tile_impl>, types, formats);
+    implementation_map<tile>::add(impl_types::ocl,
+                                  shape_types::static_shape,
+                                  typed_primitive_impl_ocl<tile>::create<tile_impl>,
+                                  types,
+                                  static_formats);
+
+    auto dynamic_formats = {
+        format::bfyx,
+        format::bfzyx,
+        format::bfwzyx
+    };
+
+    implementation_map<tile>::add(impl_types::ocl,
+                                  shape_types::dynamic_shape,
+                                  typed_primitive_impl_ocl<tile>::create<tile_impl>,
+                                  types,
+                                  dynamic_formats);
 }

 }  // namespace detail
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gather_elements_ref.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gather_elements_ref.cl
@@ -6,7 +6,8 @@

 #define GET_OUTPUT_INDEX(prefix, idx_order) CAT(prefix, _GET_INDEX)(idx_order)

-KERNEL(gather_elements_ref)(const __global INPUT0_TYPE* data,
+KERNEL(gather_elements_ref)(OPTIONAL_SHAPE_INFO_ARG
+                   const __global INPUT0_TYPE* data,
                   const __global INPUT1_TYPE* indices,
                   __global OUTPUT_TYPE* output
 #if HAS_FUSED_OPS_DECLS
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/tile_ref.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/tile_ref.cl
@@ -4,7 +4,9 @@

 #include "include/batch_headers/fetch_data.cl"

-KERNEL(tile_ref)(const __global INPUT0_TYPE* input, __global OUTPUT_TYPE* output)
+KERNEL(tile_ref)(OPTIONAL_SHAPE_INFO_ARG
+                 const __global INPUT0_TYPE* input,
+                 __global OUTPUT_TYPE* output)
 {
    const uint x = (uint)get_global_id(0) % OUTPUT_SIZE_X;
    const uint y = (uint)get_global_id(0) / OUTPUT_SIZE_X;
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/gather/gather_elements_kernel_ref.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/gather/gather_elements_kernel_ref.cpp
@@ -55,6 +55,7 @@ ParamsKey GatherElementsKernelRef::GetSupportedKey() const {
    k.EnableTensorPitches();
    k.EnableBatching();
    k.EnableDifferentTypes();
+    k.EnableDynamicShapesSupport();
    return k;
 }

@@ -71,7 +72,7 @@ static inline std::vector<std::string> GetDefaultOrder(size_t size) {
    return default_order;
 }

-CommonDispatchData GatherElementsKernelRef::SetDefault(const gather_elements_params& params, const optional_params&) const {
+CommonDispatchData GatherElementsKernelRef::SetDefault(const gather_elements_params& params) const {
    CommonDispatchData dispatchData;
    auto in_layout = params.inputs[0].GetLayout();
    auto out_layout = params.outputs[0].GetLayout();
@@ -134,10 +135,10 @@ bool GatherElementsKernelRef::Validate(const Params& p, const optional_params& o
    }

    const gather_elements_params& params = static_cast<const gather_elements_params&>(p);
-    auto input_dims = params.inputs[0].LogicalDims();
-    auto indices_dims = params.inputs[1].LogicalDims();
+    size_t input_rank = params.inputs[0].GetDims().size();
+    size_t indices_rank = params.inputs[1].GetDims().size();

-    if (input_dims.size() != indices_dims.size()) {
+    if (input_rank != indices_rank) {
        return false;
    }

@@ -157,13 +158,22 @@ KernelsData GatherElementsKernelRef::GetKernelsData(const Params& params, const
    KernelData kd = KernelData::Default<gather_elements_params>(params);
    gather_elements_params& newParams = *static_cast<gather_elements_params*>(kd.params.get());

-    auto dispatchData = SetDefault(newParams, options);
+    auto dispatchData = SetDefault(newParams);
    auto cldnn_jit = GetJitConstants(newParams);
-
    auto entry_point = GetEntryPoint(kernelName, newParams.layerID, params, options);
    auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
+
+    kd.update_dispatch_data_func = [this](const Params& params, KernelData& kd) {
+        const auto& prim_params = static_cast<const gather_elements_params&>(params);
+        auto dispatchData = SetDefault(prim_params);
+        OPENVINO_ASSERT(kd.kernels.size() == 1, "[GPU] Invalid kernels size for update dispatch data func");
+        kd.kernels[0].params.workGroups.global = dispatchData.gws;
+        kd.kernels[0].params.workGroups.local = dispatchData.lws;
+    };
+
    auto& kernel = kd.kernels[0];
-    FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point, "", false, false, 2, GetFusedPrimitiveInputsCount(params));
+    FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point,
+                     "", false, false, 2, GetFusedPrimitiveInputsCount(params), 1, newParams.has_dynamic_tensors());
    return { kd };
 }

--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/gather/gather_elements_kernel_ref.h
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/gather/gather_elements_kernel_ref.h
@@ -28,7 +28,7 @@ public:
    GatherElementsKernelRef() : KernelBaseOpenCL("gather_elements_ref") {}
    virtual ~GatherElementsKernelRef() {}
    virtual JitConstants GetJitConstants(const gather_elements_params& params) const;
-    virtual CommonDispatchData SetDefault(const gather_elements_params& params, const optional_params&) const;
+    virtual CommonDispatchData SetDefault(const gather_elements_params& params) const;
    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
    KernelsPriority GetKernelsPriority(const Params& params, const optional_params& options) const override;
    ParamsKey GetSupportedKey() const override;
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/range/range_kernel_ref.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/range/range_kernel_ref.cpp
@@ -35,7 +35,7 @@ KernelsData RangeKernelRef::GetKernelsData(const Params &params, const optional_
    auto jit = CreateJit(kernelName, jit_constants, entry_point);

    kernel_data.update_dispatch_data_func = [](const Params& params, KernelData& kd) {
-    const auto& prim_params = static_cast<const range_params&>(params);
+        const auto& prim_params = static_cast<const range_params&>(params);
        auto dispatchData = SetDefault(prim_params);
        OPENVINO_ASSERT(kd.kernels.size() == 1, "[GPU] Invalid kernels size for update dispatch data func");
        kd.kernels[0].params.workGroups.global = dispatchData.gws;
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/tile/tile_kernel_ref.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/tile/tile_kernel_ref.cpp
@@ -25,10 +25,11 @@ ParamsKey TileKernelRef::GetSupportedKey() const {
    k.EnableTensorOffset();
    k.EnableTensorPitches();
    k.EnableBatching();
+    k.EnableDynamicShapesSupport();
    return k;
 }

-CommonDispatchData TileKernelRef::SetDefault(const tile_params& params, const optional_params&) const {
+CommonDispatchData TileKernelRef::SetDefault(const tile_params& params) const {
    CommonDispatchData dispatchData;
    auto in_layout = params.inputs[0].GetLayout();
    auto out_layout = params.outputs[0].GetLayout();
@@ -55,14 +56,23 @@ KernelsData TileKernelRef::GetKernelsData(const Params& params, const optional_p
    KernelData kd = KernelData::Default<tile_params>(params);
    tile_params& newParams = *static_cast<tile_params*>(kd.params.get());

-    auto dispatchData = SetDefault(newParams, options);
+    auto dispatchData = SetDefault(newParams);
    auto entry_point = GetEntryPoint(kernelName, newParams.layerID, params, options);
    auto cldnn_jit = GetJitConstants(newParams);
    auto jit = CreateJit(kernelName, cldnn_jit, entry_point);

+    kd.update_dispatch_data_func = [this](const Params& params, KernelData& kd) {
+        const auto& prim_params = static_cast<const tile_params&>(params);
+        auto dispatchData = SetDefault(prim_params);
+        OPENVINO_ASSERT(kd.kernels.size() == 1, "[GPU] Invalid kernels size for update dispatch data func");
+        kd.kernels[0].params.workGroups.global = dispatchData.gws;
+        kd.kernels[0].params.workGroups.local = dispatchData.lws;
+    };
+
    auto& kernel = kd.kernels[0];

-    FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point);
+    FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point,
+                     EXE_MODE_DEFAULT, false, false, 1, 0, 1, newParams.has_dynamic_tensors());

    return {kd};
 }
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/tile/tile_kernel_ref.h
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/tile/tile_kernel_ref.h
@@ -27,7 +27,7 @@ public:
    virtual ~TileKernelRef() {}

    virtual JitConstants GetJitConstants(const tile_params& params) const;
-    virtual CommonDispatchData SetDefault(const tile_params& params, const optional_params&) const;
+    virtual CommonDispatchData SetDefault(const tile_params& params) const;
    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
    KernelsPriority GetKernelsPriority(const Params& params, const optional_params& options) const override;
    ParamsKey GetSupportedKey() const override;
--- a/src/plugins/intel_gpu/tests/test_cases/gather_elements_gpu_test.cpp
+++ b/src/plugins/intel_gpu/tests/test_cases/gather_elements_gpu_test.cpp
@@ -10,6 +10,8 @@
 #include <intel_gpu/graph/topology.hpp>
 #include <intel_gpu/graph/network.hpp>

+#include "gather_elements_inst.h"
+
 #include <cstddef>
 #include <gtest/gtest.h>

@@ -1237,3 +1239,79 @@ TEST(gather_elements_gpu_fp16, export_import) {

    DoTest(engine, input0, input1, expected_results, tensor(2, 3, 3, 1, 1, 5), axis, true);
 }
+
+TEST(gather_elements_gpu, dynamic) {
+    auto& engine = get_test_engine();
+
+    auto axis = 3;
+
+    ov::Shape in0_shape = { 1, 2, 1, 5, 2, 4 };
+    ov::Shape in1_shape = { 1, 2, 1, 2, 2, 4 };
+
+    auto in0_dyn_layout = layout{ov::PartialShape::dynamic(in0_shape.size()), data_types::u8, format::bfwzyx};
+    auto in1_dyn_layout = layout{ov::PartialShape::dynamic(in1_shape.size()), data_types::u8, format::bfwzyx};
+
+    auto input0 = engine.allocate_memory({in0_shape, data_types::u8, format::bfwzyx}); // data
+    auto input1 = engine.allocate_memory({in1_shape, data_types::u8, format::bfwzyx}); // indices
+
+    set_values<uint8_t>(input0, {
+        0,  1,  8,  5,  5,  2,  0,  7,
+        7,  10, 4,  5,  9,  0,  0,  5,
+        7,  0,  4,  0,  4,  7,  6,  10,
+        9,  5,  1,  7,  4,  7,  10, 8,
+        2,  0,  8,  3,  6,  8,  10, 4,
+        2,  10, 7,  8,  7,  0,  6,  9,
+        2,  4,  8,  5,  2,  3,  3,  1,
+        5,  9,  10, 0,  9,  5,  5,  3,
+        10, 5,  2,  0,  10, 0,  5,  4,
+        3,  10, 5,  5,  10, 0,  8,  8
+    });
+
+    set_values<uint8_t>(input1, {
+        0, 2, 4, 3,
+        4, 0, 0, 1,
+        4, 0, 1, 0,
+        1, 0, 1, 1,
+        3, 1, 4, 2,
+        4, 2, 1, 3,
+        2, 1, 2, 4,
+        1, 0, 2, 4
+    });
+
+    std::vector<uint8_t> expected_results = {
+        0,  0,  8,  7,
+        6,  2,  0,  5,
+        2,  1,  4,  5,
+        9,  2,  0,  5,
+        10, 4,  5,  0,
+        10, 5,  3,  4,
+        5,  4,  10, 5,
+        2,  0,  5,  8
+    };
+
+    topology topology;
+    topology.add(input_layout("InputData", in0_dyn_layout));
+    topology.add(input_layout("InputIndices", in1_dyn_layout));
+    topology.add(gather_elements("gather_elements", input_info("InputData"), input_info("InputIndices"), axis));
+
+    ExecutionConfig config;
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+    network network(engine, topology, config);
+
+    network.set_input_data("InputData", input0);
+    network.set_input_data("InputIndices", input1);
+
+    auto inst = network.get_primitive("gather_elements");
+    auto impl = inst->get_impl();
+    ASSERT_TRUE(impl != nullptr);
+    ASSERT_TRUE(impl->is_dynamic());
+
+    auto outputs = network.execute();
+
+    auto output = outputs.at("gather_elements").get_memory();
+    cldnn::mem_lock<uint8_t> output_ptr(output, get_test_stream());
+
+    for (size_t i = 0; i < expected_results.size(); ++i) {
+        ASSERT_EQ(expected_results[i], output_ptr[i]);
+    }
+}
--- a/src/plugins/intel_gpu/tests/test_cases/tile_gpu_test.cpp
+++ b/src/plugins/intel_gpu/tests/test_cases/tile_gpu_test.cpp
@@ -6,6 +6,7 @@

 #include <intel_gpu/primitives/input_layout.hpp>
 #include <intel_gpu/primitives/tile.hpp>
+#include "tile_inst.h"

 #include <iostream>

@@ -275,6 +276,52 @@ TEST_F(tile_gpu, basic_in1x2x2x2_axis_z) {
    this->test_basic_in1x2x2x2_axis_z(false);
 }

+TEST_F(tile_gpu, dynamic) {
+    auto& engine = get_test_engine();
+
+    ov::Shape input_shape = { 1, 2, 2, 2 };
+    auto input_dyn_layout = layout{ ov::PartialShape::dynamic(input_shape.size()), data_types::f32, format::bfyx };
+    auto input = engine.allocate_memory({ input_shape, data_types::f32, format::bfyx });
+
+    set_values(input, { 1.f, 0.f,
+                        5.f, 1.5f,
+                        2.f, 0.f,
+                        6.f, 5.2f });
+
+    topology topology;
+    topology.add(input_layout("input", input_dyn_layout));
+    topology.add(tile("tile", input_info("input"), std::vector<int64_t>{ 1, 2, 1, 1 }));
+
+    ExecutionConfig config;
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+    network network(engine, topology, config);
+    network.set_input_data("input", input);
+
+    auto inst = network.get_primitive("tile");
+    auto impl = inst->get_impl();
+    ASSERT_TRUE(impl != nullptr);
+    ASSERT_TRUE(impl->is_dynamic());
+
+    auto outputs = network.execute();
+
+    auto output = outputs.at("tile").get_memory();
+    cldnn::mem_lock<float> output_ptr(output, get_test_stream());
+
+    std::vector<float> ref_data = { 1.f, 0.f,
+                                    5.f, 1.5f,
+                                    2.f, 0.f,
+                                    6.f, 5.2f,
+
+                                    1.f, 0.f,
+                                    5.f, 1.5f,
+                                    2.f, 0.f,
+                                    6.f, 5.2f };
+
+    for (size_t i = 0; i < ref_data.size(); ++i) {
+        ASSERT_EQ(output_ptr[i], ref_data[i]) << "Index=" << i;
+    }
+}
+
 namespace {
 template<typename T>
 struct Params {