[GPU] Unique-10 operation implementation. (#16412)

* [GPU] Unique-10 operation implementation. * Handled flattened case. * Created results for all outputs in single layer test. * Save total unique count as fifth output. * Handled axis case. * Added unique reshape kernel. * Moved data types to unique primitive constructor. * Added shape agnostic Unique ref kernel. * Added blocked layout support to Unique-10. * Use int in bubble sort. * Added unit tests. * Added support for blocked layouts to flattened mode. * Fixed usage of shape_info in kernel. * Use correct total data size for dynamic shapes. * Commented some functional tests. For some reasons big shapes cause std::bad_alloc. * Initialize out_counts with zeros. * Implemented new approach for reducing memory footprint. Changed first kernel to only count unique values and changed second kernel to fill all outputs. * Revert "Commented some functional tests." This reverts commit a7f9763c575e71e14b85ee37adf1e98f10785c15. * Fixed calc output layouts for flattened case when rank in greater than 4. * Added temporary fix for axis case when rank is greater than 4. * Revert "Added temporary fix for axis case when rank is greater than 4." This reverts commit 236640d2f0e9d5b1f8dcbbf9482763badd7fde66. * Renamed "unique" to "unique_count" and "unique_reshape" to "unique_gather" primitives. * Quick fix for add_intermediate_node to consider dep_idx of multiple output * Fix bug for multiple output: 1) get_reorder was getting reorder from cache regardless of the dep_idx. 2) remove_redundant_reorder was not considering original dep_idx * Fixed conflicts. * Fixed win build issue. * Fixed build issue. * Revert "Fix bug for multiple output:" This reverts commit d4a2c4f32eabe9108df31d4837fed8995c93bd1c. * Revert "Quick fix for add_intermediate_node to consider dep_idx of multiple output" This reverts commit 2dfd2aaefdf32067a7469505b35f7096632ac5f2. * Added some tests to skip config. --------- Co-authored-by: Taylor Yeonbok Lee <taylor.lee@intel.com>
2023-06-14 20:41:51 +03:00 · 2023-06-14 20:41:51 +03:00 · bae926de22
commit bae926de22
parent 5993c4942a
19 changed files with 1826 additions and 1 deletions
--- a/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp
@ -248,6 +248,7 @@ REGISTER_FACTORY(v9, Eye);
 REGISTER_FACTORY(v10, IsFinite);
 REGISTER_FACTORY(v10, IsInf);
 REGISTER_FACTORY(v10, IsNaN);
+REGISTER_FACTORY(v10, Unique);

 // --------------------------- Supported internal ops --------------------------- //
 REGISTER_FACTORY(internal, NonMaxSuppressionIEInternal);
--- a/src/plugins/intel_gpu/include/intel_gpu/primitives/unique.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/unique.hpp
@ -0,0 +1,88 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vector>
+
+#include "primitive.hpp"
+
+namespace cldnn {
+
+struct unique_count : primitive_base<unique_count> {
+    CLDNN_DECLARE_PRIMITIVE(unique_count)
+
+    /// @brief Constructs unique_count primitive.
+    /// @param id This primitive id.
+    /// @param input Input primitive id.
+    /// @param flattened If true, operator works on a flattened version of the input tensor.
+    /// @param axis Is used to “divide” the input tensor into slices.
+    unique_count(const primitive_id& id, const input_info& input, bool flattened, int64_t axis)
+        : primitive_base(id, {input}),
+          flattened(flattened),
+          axis(axis) {}
+
+    bool flattened;
+    int64_t axis;
+
+    size_t hash() const override {
+        size_t seed = primitive::hash();
+        seed = hash_combine(seed, flattened);
+        seed = hash_combine(seed, axis);
+        return seed;
+    }
+
+    bool operator==(const primitive& rhs) const override {
+        if (!compare_common_params(rhs)) {
+            return false;
+        }
+        auto rhs_casted = downcast<const unique_count>(rhs);
+        return flattened == rhs_casted.flattened && axis == rhs_casted.axis;
+    }
+};
+
+struct unique_gather : primitive_base<unique_gather> {
+    CLDNN_DECLARE_PRIMITIVE(unique_gather)
+
+    /// @brief Constructs unique_gather primitive.
+    /// @param id This primitive id.
+    /// @param inputs Input primitives ids.
+    /// @param flattened If true, operator works on a flattened version of the input tensor.
+    /// @param axis Is used to “divide” the input tensor into slices.
+    /// @param sorted Controls the order of the returned unique values (sorts ascending when true).
+    unique_gather(const primitive_id& id,
+                  const std::vector<input_info>& inputs,
+                  bool flattened,
+                  int64_t axis,
+                  bool sorted,
+                  data_types elem_type,
+                  data_types index_type,
+                  data_types count_type)
+        : primitive_base(id, inputs, decltype(output_paddings)(4), {elem_type, index_type, index_type, count_type}, 4),
+          flattened(flattened),
+          axis(axis),
+          sorted(sorted) {}
+
+    bool flattened;
+    int64_t axis;
+    bool sorted;
+
+    size_t hash() const override {
+        size_t seed = primitive::hash();
+        seed = hash_combine(seed, flattened);
+        seed = hash_combine(seed, axis);
+        seed = hash_combine(seed, sorted);
+        return seed;
+    }
+
+    bool operator==(const primitive& rhs) const override {
+        if (!compare_common_params(rhs)) {
+            return false;
+        }
+        auto rhs_casted = downcast<const unique_gather>(rhs);
+        return flattened == rhs_casted.flattened && axis == rhs_casted.axis && sorted == rhs_casted.sorted;
+    }
+};
+
+}  // namespace cldnn
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp
@ -94,6 +94,8 @@ void register_implementations() {
    REGISTER_OCL(count_nonzero);
    REGISTER_OCL(gather_nonzero);
    REGISTER_OCL(eye);
+    REGISTER_OCL(unique_count);
+    REGISTER_OCL(unique_gather);
 }

 }  // namespace ocl
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp
@ -75,6 +75,7 @@
 #include "intel_gpu/primitives/tile.hpp"
 #include "intel_gpu/primitives/non_zero.hpp"
 #include "intel_gpu/primitives/eye.hpp"
+#include "intel_gpu/primitives/unique.hpp"

 namespace cldnn {
 namespace ocl {
@ -174,6 +175,8 @@ REGISTER_OCL(convert_color);
 REGISTER_OCL(count_nonzero);
 REGISTER_OCL(gather_nonzero);
 REGISTER_OCL(eye);
+REGISTER_OCL(unique_count);
+REGISTER_OCL(unique_gather);

 #undef REGISTER_OCL

--- a/src/plugins/intel_gpu/src/graph/impls/ocl/unique.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/unique.cpp
@ -0,0 +1,167 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "primitive_base.hpp"
+#include "unique/unique_kernel_ref.hpp"
+#include "unique/unique_kernel_selector.hpp"
+#include "unique_inst.hpp"
+
+namespace cldnn {
+namespace ocl {
+
+struct unique_count_impl : typed_primitive_impl_ocl<unique_count> {
+    using parent = typed_primitive_impl_ocl<unique_count>;
+    using parent::parent;
+    using kernel_selector_t = kernel_selector::unique_count_kernel_selector;
+    using kernel_params_t =
+        std::pair<kernel_selector::unique_count_params, kernel_selector::unique_count_optional_params>;
+
+    DECLARE_OBJECT_TYPE_SERIALIZATION
+
+    std::unique_ptr<primitive_impl> clone() const override {
+        return make_unique<unique_count_impl>(*this);
+    }
+
+    static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) {
+        const auto& primitive = impl_param.typed_desc<unique_count>();
+        auto params = get_default_params<kernel_selector::unique_count_params>(impl_param, is_shape_agnostic);
+        auto optional_params =
+            get_default_optional_params<kernel_selector::unique_count_optional_params>(impl_param.get_program());
+
+        params.flattened = primitive->flattened;
+        params.axis = primitive->axis;
+
+        return {params, optional_params};
+    }
+
+    void update_dispatch_data(const kernel_impl_params& impl_param) override {
+        auto kernel_params = get_kernel_params(impl_param, true);
+        (_kernel_data.update_dispatch_data_func)(kernel_params.first, _kernel_data);
+    }
+};
+
+namespace detail {
+
+attach_unique_count_impl::attach_unique_count_impl() {
+    auto types = {
+        data_types::u8,
+        data_types::i8,
+        data_types::f16,
+        data_types::f32,
+        data_types::i32,
+        data_types::i64,
+    };
+
+    auto formats = {
+        format::bfyx,
+        format::b_fs_yx_fsv16,
+        format::b_fs_yx_fsv32,
+        format::bs_fs_yx_bsv16_fsv16,
+        format::bs_fs_yx_bsv16_fsv32,
+        format::bs_fs_yx_bsv32_fsv16,
+        format::bs_fs_yx_bsv32_fsv32,
+
+        format::bfzyx,
+        format::b_fs_zyx_fsv16,
+        format::b_fs_zyx_fsv32,
+        format::bs_fs_zyx_bsv16_fsv16,
+        format::bs_fs_zyx_bsv16_fsv32,
+        format::bs_fs_zyx_bsv32_fsv16,
+        format::bs_fs_zyx_bsv32_fsv32,
+
+        format::bfwzyx,
+    };
+
+    implementation_map<unique_count>::add(impl_types::ocl,
+                                          shape_types::any,
+                                          typed_primitive_impl_ocl<unique_count>::create<unique_count_impl>,
+                                          types,
+                                          formats);
+}
+}  // namespace detail
+
+struct unique_gather_impl : typed_primitive_impl_ocl<unique_gather> {
+    using parent = typed_primitive_impl_ocl<unique_gather>;
+    using parent::parent;
+    using kernel_selector_t = kernel_selector::unique_gather_kernel_selector;
+    using kernel_params_t =
+        std::pair<kernel_selector::unique_gather_params, kernel_selector::unique_gather_optional_params>;
+
+    DECLARE_OBJECT_TYPE_SERIALIZATION
+
+    std::unique_ptr<primitive_impl> clone() const override {
+        return make_unique<unique_gather_impl>(*this);
+    }
+
+    static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) {
+        const auto& primitive = impl_param.typed_desc<unique_gather>();
+        auto params = get_default_params<kernel_selector::unique_gather_params>(impl_param, is_shape_agnostic);
+        auto optional_params =
+            get_default_optional_params<kernel_selector::unique_gather_optional_params>(impl_param.get_program());
+
+        params.flattened = primitive->flattened;
+        params.axis = primitive->axis;
+        params.sorted = primitive->sorted;
+
+        for (auto i = 1U; i < impl_param.input_layouts.size(); ++i) {
+            params.inputs.push_back(convert_data_tensor(impl_param.input_layouts.at(i)));
+        }
+
+        for (auto i = 1U; i < impl_param.output_layouts.size(); ++i) {
+            params.outputs.push_back(convert_data_tensor(impl_param.output_layouts.at(i)));
+        }
+
+        return {params, optional_params};
+    }
+
+    void update_dispatch_data(const kernel_impl_params& impl_param) override {
+        auto kernel_params = get_kernel_params(impl_param, true);
+        (_kernel_data.update_dispatch_data_func)(kernel_params.first, _kernel_data);
+    }
+};
+
+namespace detail {
+
+attach_unique_gather_impl::attach_unique_gather_impl() {
+    auto types = {
+        data_types::u8,
+        data_types::i8,
+        data_types::f16,
+        data_types::f32,
+        data_types::i32,
+        data_types::i64,
+    };
+
+    auto formats = {
+        format::bfyx,
+        format::b_fs_yx_fsv16,
+        format::b_fs_yx_fsv32,
+        format::bs_fs_yx_bsv16_fsv16,
+        format::bs_fs_yx_bsv16_fsv32,
+        format::bs_fs_yx_bsv32_fsv16,
+        format::bs_fs_yx_bsv32_fsv32,
+
+        format::bfzyx,
+        format::b_fs_zyx_fsv16,
+        format::b_fs_zyx_fsv32,
+        format::bs_fs_zyx_bsv16_fsv16,
+        format::bs_fs_zyx_bsv16_fsv32,
+        format::bs_fs_zyx_bsv32_fsv16,
+        format::bs_fs_zyx_bsv32_fsv32,
+
+        format::bfwzyx,
+    };
+
+    implementation_map<unique_gather>::add(impl_types::ocl,
+                                           shape_types::any,
+                                           typed_primitive_impl_ocl<unique_gather>::create<unique_gather_impl>,
+                                           types,
+                                           formats);
+}
+}  // namespace detail
+}  // namespace ocl
+}  // namespace cldnn
+
+BIND_BINARY_BUFFER_WITH_TYPE(cldnn::ocl::unique_count_impl)
+BIND_BINARY_BUFFER_WITH_TYPE(cldnn::ocl::unique_gather_impl)
--- a/src/plugins/intel_gpu/src/graph/include/unique_inst.hpp
+++ b/src/plugins/intel_gpu/src/graph/include/unique_inst.hpp
@ -0,0 +1,73 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "intel_gpu/primitives/unique.hpp"
+#include "primitive_inst.h"
+
+namespace cldnn {
+
+template <>
+struct typed_program_node<unique_count> : typed_program_node_base<unique_count> {
+    using parent = typed_program_node_base<unique_count>;
+    using parent::parent;
+
+    program_node& input() const {
+        return get_dependency(0);
+    }
+};
+
+using unique_count_node = typed_program_node<unique_count>;
+
+template <>
+class typed_primitive_inst<unique_count> : public typed_primitive_inst_base<unique_count> {
+public:
+    using parent = typed_primitive_inst_base<unique_count>;
+    using parent::parent;
+
+    static layout calc_output_layout(const unique_count_node& node, const kernel_impl_params& impl_param);
+    template <typename ShapeType>
+    static std::vector<layout> calc_output_layouts(const unique_count_node& node, const kernel_impl_params& impl_param);
+    static std::string to_string(const unique_count_node& node);
+};
+
+using unique_count_inst = typed_primitive_inst<unique_count>;
+
+template <>
+struct typed_program_node<unique_gather> : typed_program_node_base<unique_gather> {
+    using parent = typed_program_node_base<unique_gather>;
+    using parent::parent;
+
+    program_node& input() const {
+        return get_dependency(0);
+    }
+
+    bool generates_dynamic_output() const override {
+        return true;
+    }
+
+    std::vector<size_t> get_shape_infer_dependencies() const override {
+        return {1};
+    }
+};
+
+using unique_gather_node = typed_program_node<unique_gather>;
+
+template <>
+class typed_primitive_inst<unique_gather> : public typed_primitive_inst_base<unique_gather> {
+public:
+    using parent = typed_primitive_inst_base<unique_gather>;
+    using parent::parent;
+
+    static layout calc_output_layout(const unique_gather_node& node, const kernel_impl_params& impl_param);
+    template <typename ShapeType>
+    static std::vector<layout> calc_output_layouts(const unique_gather_node& node,
+                                                   const kernel_impl_params& impl_param);
+    static std::string to_string(const unique_gather_node& node);
+};
+
+using unique_gather_inst = typed_primitive_inst<unique_gather>;
+
+}  // namespace cldnn
--- a/src/plugins/intel_gpu/src/graph/program.cpp
+++ b/src/plugins/intel_gpu/src/graph/program.cpp
@ -65,6 +65,7 @@
 #include "strided_slice_inst.h"
 #include "loop_inst.h"
 #include "reverse_inst.h"
+#include "unique_inst.hpp"
 #include "to_string_utils.h"

 // TODO: Remove once we have interface for kernels cache
@ -1440,6 +1441,8 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) {
            prim.type() != cldnn::gather_tree::type_id() &&
            prim.type() != cldnn::experimental_detectron_detection_output::type_id() &&
            prim.type() != cldnn::convert_color::type_id() &&
+            prim.type() != cldnn::unique_count::type_id() &&
+            prim.type() != cldnn::unique_gather::type_id() &&
            prim.type() != cldnn::experimental_detectron_generate_proposals_single_image::type_id()) {
            can_use_fsv16 = false;
        }
@ -1493,6 +1496,8 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) {
            prim.type() != cldnn::multiclass_nms::type_id() &&
            prim.type() != cldnn::normalize::type_id() &&
            prim.type() != cldnn::deconvolution::type_id() &&
+            prim.type() != cldnn::unique_count::type_id() &&
+            prim.type() != cldnn::unique_gather::type_id() &&
            prim.type() != cldnn::experimental_detectron_generate_proposals_single_image::type_id()) {
            can_use_bs_fs_yx_bsv16_fsv16 = false;
        }
--- a/src/plugins/intel_gpu/src/graph/unique.cpp
+++ b/src/plugins/intel_gpu/src/graph/unique.cpp
@ -0,0 +1,138 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "ngraph/op/unique.hpp"
+
+#include <sstream>
+#include <string>
+
+#include "intel_gpu/runtime/memory.hpp"
+#include "json_object.h"
+#include "primitive_type_base.h"
+#include "unique_inst.hpp"
+
+namespace cldnn {
+
+// -----------------------------------------------
+// unique_count
+// -----------------------------------------------
+GPU_DEFINE_PRIMITIVE_TYPE_ID(unique_count)
+
+layout unique_count_inst::calc_output_layout(const unique_count_node& node, const kernel_impl_params& impl_param) {
+    OPENVINO_THROW("Only calc_output_layouts should be used!");
+}
+
+template <typename ShapeType>
+std::vector<layout> unique_count_inst::calc_output_layouts(const unique_count_node& node,
+                                                           const kernel_impl_params& impl_param) {
+    return {layout{ov::PartialShape{1}, cldnn::data_types::i64, cldnn::format::bfyx}};
+}
+
+template std::vector<layout> unique_count_inst::calc_output_layouts<ov::PartialShape>(
+    const unique_count_node& node,
+    const kernel_impl_params& impl_param);
+
+std::string unique_count_inst::to_string(const unique_count_node& node) {
+    auto primitive = node.get_primitive();
+    json_composite unique_count_info;
+    unique_count_info.add("input", node.input().id());
+    if (!primitive->flattened) {
+        unique_count_info.add("axis", primitive->axis);
+    }
+
+    auto node_info = node.desc_to_json();
+    node_info->add("unique_count info", unique_count_info);
+
+    std::ostringstream primitive_description;
+    node_info->dump(primitive_description);
+    return primitive_description.str();
+}
+
+// -----------------------------------------------
+// unique_gather
+// -----------------------------------------------
+GPU_DEFINE_PRIMITIVE_TYPE_ID(unique_gather)
+
+layout unique_gather_inst::calc_output_layout(const unique_gather_node& node, const kernel_impl_params& impl_param) {
+    OPENVINO_THROW("Only calc_output_layouts should be used!");
+}
+
+template <typename ShapeType>
+std::vector<layout> unique_gather_inst::calc_output_layouts(const unique_gather_node& node,
+                                                            const kernel_impl_params& impl_param) {
+    std::vector<layout> layouts;
+    const auto desc = impl_param.typed_desc<unique_gather>();
+    const auto input_layout = impl_param.get_input_layout();
+
+    std::vector<ShapeType> output_shapes = {ShapeType(), ShapeType(), ShapeType(), ShapeType()};
+
+    if (!impl_param.memory_deps.count(1)) {
+        if (desc->flattened) {
+            output_shapes.at(0) = ov::PartialShape{ov::Dimension::dynamic()};
+        } else {
+            output_shapes.at(0) = ov::PartialShape::dynamic(input_layout.get_partial_shape().rank());
+        }
+        output_shapes.at(1) = ov::PartialShape{ov::Dimension::dynamic()};
+        output_shapes.at(2) = ov::PartialShape{ov::Dimension::dynamic()};
+        output_shapes.at(3) = ov::PartialShape{ov::Dimension::dynamic()};
+    } else {
+        const auto input_shape = input_layout.get_shape();
+        const size_t unique_count = read_vector<int64_t>(impl_param.memory_deps.at(1), impl_param.get_stream()).at(0);
+        if (desc->flattened) {
+            const auto input_tensor_capacity = ov::shape_size(input_shape);
+            output_shapes.at(0) = ov::Shape{unique_count};
+            output_shapes.at(1) = ov::Shape{unique_count};
+            output_shapes.at(2) = ov::Shape{input_tensor_capacity};
+            output_shapes.at(3) = ov::Shape{unique_count};
+        } else {
+            auto output_shape = input_shape;
+            auto& new_axis_dimension = output_shape.at(desc->axis);
+            const auto old_axis_dimension = new_axis_dimension;
+            new_axis_dimension = unique_count;
+            output_shapes.at(0) = output_shape;
+            output_shapes.at(1) = ov::Shape{new_axis_dimension};
+            output_shapes.at(2) = ov::Shape{old_axis_dimension};
+            output_shapes.at(3) = ov::Shape{new_axis_dimension};
+        }
+    }
+
+    for (auto i = 0U; i < desc->num_outputs; ++i) {
+        const auto& output_shape = output_shapes.at(i);
+        const auto output_dt = desc->output_data_types.at(i).value();
+        auto output_format = format::get_default_format(output_shape.size());
+        if (i == 0) {
+            if (desc->flattened) {
+                output_format = format::adjust_to_rank(input_layout.format, output_shape.size());
+            } else {
+                output_format = input_layout.format;
+            }
+        }
+        layouts.emplace_back(output_shape, output_dt, output_format);
+    }
+
+    return layouts;
+}
+
+template std::vector<layout> unique_gather_inst::calc_output_layouts<ov::PartialShape>(
+    const unique_gather_node& node,
+    const kernel_impl_params& impl_param);
+
+std::string unique_gather_inst::to_string(const unique_gather_node& node) {
+    auto primitive = node.get_primitive();
+    json_composite unique_gather_info;
+    unique_gather_info.add("input", node.input().id());
+    if (!primitive->flattened) {
+        unique_gather_info.add("axis", primitive->axis);
+    }
+    unique_gather_info.add("sorted", primitive->sorted);
+
+    auto node_info = node.desc_to_json();
+    node_info->add("unique_gather info", unique_gather_info);
+
+    std::ostringstream primitive_description;
+    node_info->dump(primitive_description);
+    return primitive_description.str();
+}
+
+}  // namespace cldnn
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/unique_count_ref.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/unique_count_ref.cl
@ -0,0 +1,65 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#ifdef FLATTENED
+#    define LENGTH TOTAL_DATA_SIZE
+#else
+#    define LENGTH AXIS_LENGTH
+#endif
+
+#ifndef FLATTENED
+inline bool FUNC(slices_are_equal)(OPTIONAL_SHAPE_INFO_ARG const __global INPUT0_TYPE* out_unique_elements,
+                                   uint lhs,
+                                   const __global INPUT0_TYPE* input,
+                                   uint rhs) {
+    ITERATE(if (out_unique_elements[GET_INDEX(INPUT0, lhs)] != input[GET_INDEX(INPUT0, rhs)]) { return false; })
+    return true;
+}
+
+inline void FUNC(assign_slice)(OPTIONAL_SHAPE_INFO_ARG __global INPUT0_TYPE* out_unique_elements,
+                               uint lhs,
+                               const __global INPUT0_TYPE* input,
+                               uint rhs) {
+    ITERATE(out_unique_elements[GET_INDEX(INPUT0, lhs)] = input[GET_INDEX(INPUT0, rhs)];)
+}
+#endif
+
+// Works on unsorted data, but has worse complexity
+inline uint FUNC(unique)(OPTIONAL_SHAPE_INFO_ARG const __global INPUT0_TYPE* input,
+                         __global INPUT0_TYPE* out_unique_elements,
+                         uint first,
+                         const uint last) {
+    uint unique_length = 0;
+    for (; first != last; ++first) {
+        bool unique = true;
+        for (uint unique_idx = 0; unique_idx < unique_length; ++unique_idx) {
+#ifdef FLATTENED
+            if (out_unique_elements[unique_idx] == input[GET_INDEX(INPUT0, first)]) {
+#else
+            if (FUNC_CALL(slices_are_equal)(OPTIONAL_SHAPE_INFO_TENSOR out_unique_elements, unique_idx, input, first)) {
+#endif
+                unique = false;
+                break;
+            }
+        }
+        if (unique) {
+#ifdef FLATTENED
+            out_unique_elements[unique_length] = input[GET_INDEX(INPUT0, first)];
+#else
+            FUNC_CALL(assign_slice)(OPTIONAL_SHAPE_INFO_TENSOR out_unique_elements, unique_length, input, first);
+#endif
+            ++unique_length;
+        }
+    }
+    return unique_length;
+}
+
+KERNEL(unique_count_ref)
+(OPTIONAL_SHAPE_INFO_ARG const __global INPUT0_TYPE* input,
+ __global OUTPUT_TYPE* out_total_count,
+ __global INPUT0_TYPE* out_unique_elements) {
+    out_total_count[0] = FUNC_CALL(unique)(OPTIONAL_SHAPE_INFO_TENSOR input, out_unique_elements, 0, LENGTH);
+}
+
+#undef LENGTH
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/unique_gather_ref.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/unique_gather_ref.cl
@ -0,0 +1,181 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#ifdef FLATTENED
+#    define LENGTH TOTAL_DATA_SIZE
+#else
+#    define LENGTH AXIS_LENGTH
+#endif
+
+inline void FUNC(swap_out_unique_elements)(__global OUTPUT_TYPE* a, __global OUTPUT_TYPE* b) {
+    const OUTPUT_TYPE temp = *a;
+    *a = *b;
+    *b = temp;
+}
+
+inline void FUNC(swap_out_indices)(__global OUTPUT1_TYPE* a, __global OUTPUT1_TYPE* b) {
+    const OUTPUT1_TYPE temp = *a;
+    *a = *b;
+    *b = temp;
+}
+
+inline void FUNC(swap_out_counts)(__global OUTPUT3_TYPE* a, __global OUTPUT3_TYPE* b) {
+    const OUTPUT3_TYPE temp = *a;
+    *a = *b;
+    *b = temp;
+}
+
+#ifndef FLATTENED
+inline bool FUNC(compare_slices_ascending)(OPTIONAL_SHAPE_INFO_ARG const __global OUTPUT_TYPE* out_unique_elements,
+                                           uint lhs,
+                                           uint rhs) {
+    ITERATE(
+        if (out_unique_elements[GET_INDEX(OUTPUT, lhs)] > out_unique_elements[GET_INDEX(OUTPUT, rhs)]) {
+            return true;
+        } else if (out_unique_elements[GET_INDEX(OUTPUT, lhs)] < out_unique_elements[GET_INDEX(OUTPUT, rhs)]) {
+            return false;
+        } else { continue; })
+    return false;
+}
+
+inline void FUNC(swap_slices)(OPTIONAL_SHAPE_INFO_ARG __global OUTPUT_TYPE* out_unique_elements, uint lhs, uint rhs) {
+    ITERATE(FUNC_CALL(swap_out_unique_elements)(&out_unique_elements[GET_INDEX(OUTPUT, lhs)],
+                                                &out_unique_elements[GET_INDEX(OUTPUT, rhs)]);)
+}
+
+inline bool FUNC(slices_are_equal)(OPTIONAL_SHAPE_INFO_ARG const __global OUTPUT_TYPE* out_unique_elements,
+                                   uint lhs,
+                                   const __global INPUT0_TYPE* input,
+                                   uint rhs) {
+    ITERATE(if (out_unique_elements[GET_INDEX(OUTPUT, lhs)] != input[GET_INDEX(INPUT0, rhs)]) { return false; })
+    return true;
+}
+
+inline void FUNC(assign_slice)(OPTIONAL_SHAPE_INFO_ARG __global OUTPUT_TYPE* out_unique_elements,
+                               uint lhs,
+                               const __global INPUT0_TYPE* input,
+                               uint rhs) {
+    ITERATE(out_unique_elements[GET_INDEX(OUTPUT, lhs)] = input[GET_INDEX(INPUT0, rhs)];)
+}
+#endif
+
+// We use bubble sort here, because we need stable sort
+// TODO: Change to better stable sort algorithm
+inline void FUNC(bubbleSort)(OPTIONAL_SHAPE_INFO_ARG __global OUTPUT_TYPE* out_unique_elements,
+                             __global OUTPUT1_TYPE* out_indices,
+                             __global OUTPUT3_TYPE* out_counts,
+                             int l,
+                             int h) {
+    for (int i = 0; i < h - l; ++i) {
+        bool swapped = false;
+        for (int j = l; j < h - i; ++j) {
+#ifdef FLATTENED
+            int j1 = j + 1;
+            if ((out_unique_elements[GET_INDEX(OUTPUT, j)] > out_unique_elements[GET_INDEX(OUTPUT, j1)])) {
+                FUNC_CALL(swap_out_unique_elements)
+                (&out_unique_elements[GET_INDEX(OUTPUT, j)], &out_unique_elements[GET_INDEX(OUTPUT, j1)]);
+#else
+            if (FUNC_CALL(compare_slices_ascending)(OPTIONAL_SHAPE_INFO_TENSOR out_unique_elements, j, j + 1)) {
+                FUNC_CALL(swap_slices)(OPTIONAL_SHAPE_INFO_TENSOR out_unique_elements, j, j + 1);
+#endif
+                FUNC_CALL(swap_out_indices)(&out_indices[j], &out_indices[j + 1]);
+                FUNC_CALL(swap_out_counts)(&out_counts[j], &out_counts[j + 1]);
+                swapped = true;
+            }
+        }
+        if (!swapped) {
+            break;
+        }
+    }
+}
+
+// Works on unsorted data, but has worse complexity
+inline uint FUNC(unique)(OPTIONAL_SHAPE_INFO_ARG const __global INPUT0_TYPE* input,
+                         __global OUTPUT_TYPE* out_unique_elements,
+                         __global OUTPUT1_TYPE* out_indices,
+                         __global OUTPUT2_TYPE* out_rev_indices,
+                         __global OUTPUT3_TYPE* out_counts,
+                         uint first,
+                         const uint last) {
+    uint unique_length = 0;
+    for (; first != last; ++first) {
+        bool unique = true;
+        for (uint unique_idx = 0; unique_idx < unique_length; ++unique_idx) {
+#ifdef FLATTENED
+            if (out_unique_elements[GET_INDEX(OUTPUT, unique_idx)] == input[GET_INDEX(INPUT0, first)]) {
+#else
+            if (FUNC_CALL(slices_are_equal)(OPTIONAL_SHAPE_INFO_TENSOR out_unique_elements, unique_idx, input, first)) {
+#endif
+                unique = false;
+                out_rev_indices[first] = unique_idx;
+                ++out_counts[unique_idx];
+                break;
+            }
+        }
+        if (unique) {
+#ifdef FLATTENED
+            out_unique_elements[GET_INDEX(OUTPUT, unique_length)] = input[GET_INDEX(INPUT0, first)];
+#else
+            FUNC_CALL(assign_slice)(OPTIONAL_SHAPE_INFO_TENSOR out_unique_elements, unique_length, input, first);
+#endif
+            out_indices[unique_length] = first;
+            out_rev_indices[first] = unique_length;
+            ++out_counts[unique_length];
+            ++unique_length;
+        }
+    }
+    return unique_length;
+}
+
+inline uint FUNC(fill_out_rev_indices)(OPTIONAL_SHAPE_INFO_ARG const __global INPUT0_TYPE* input,
+                                       __global OUTPUT_TYPE* out_unique_elements,
+                                       __global OUTPUT2_TYPE* out_rev_indices,
+                                       const uint end) {
+    for (uint i = 0; i < LENGTH; ++i) {
+        for (uint j = 0; j < end; ++j) {
+#ifdef FLATTENED
+            if (out_unique_elements[GET_INDEX(OUTPUT, j)] == input[GET_INDEX(INPUT0, i)]) {
+#else
+            if (FUNC_CALL(slices_are_equal)(OPTIONAL_SHAPE_INFO_TENSOR out_unique_elements, j, input, i)) {
+#endif
+                out_rev_indices[i] = j;
+                break;
+            }
+        }
+    }
+}
+
+KERNEL(unique_gather_ref)
+(OPTIONAL_SHAPE_INFO_ARG const __global INPUT0_TYPE* input,
+ const __global INPUT1_TYPE* out_total_count,
+ __global OUTPUT_TYPE* out_unique_elements,
+ __global OUTPUT1_TYPE* out_indices,
+ __global OUTPUT2_TYPE* out_rev_indices,
+ __global OUTPUT3_TYPE* out_counts) {
+    // TODO: Think of better approach to initialize with zeros
+    for (uint i = 0; i < LENGTH; ++i) {
+        out_counts[i] = 0;
+    }
+    // Run unique algorithm
+    const uint end = FUNC_CALL(unique)(OPTIONAL_SHAPE_INFO_TENSOR input,
+                                       out_unique_elements,
+                                       out_indices,
+                                       out_rev_indices,
+                                       out_counts,
+                                       0,
+                                       LENGTH);
+#ifdef SORTED
+    // Sort out data
+    FUNC_CALL(bubbleSort)(OPTIONAL_SHAPE_INFO_TENSOR out_unique_elements, out_indices, out_counts, 0, end - 1);
+    // After sorting all out_unique_elements will shuffle and out_rev_indices should change not only order, but their
+    // values (indexes).
+    // So, we need to fill them again...
+    // Another approach would be to allocate whole separate buffer as input, sort whole dataset first and then run
+    // deduplicate algorithm with correct filling of out_rev_indices.
+    FUNC_CALL(fill_out_rev_indices)
+    (OPTIONAL_SHAPE_INFO_TENSOR input, out_unique_elements, out_rev_indices, end);
+#endif
+}
+
+#undef LENGTH
--- a/src/plugins/intel_gpu/src/kernel_selector/common_types.h
+++ b/src/plugins/intel_gpu/src/kernel_selector/common_types.h
@ -93,7 +93,9 @@ enum class KernelType {
    PRIOR_BOX,
    EYE,
    GENERATE_PROPOSALS,
-    MULTICLASS_NMS
+    MULTICLASS_NMS,
+    UNIQUE_COUNT,
+    UNIQUE_GATHER,
 };

 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/unique/unique_kernel_ref.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/unique/unique_kernel_ref.cpp
@ -0,0 +1,343 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "unique_kernel_ref.hpp"
+
+#include "kernel_selector_utils.h"
+
+namespace kernel_selector {
+
+namespace {
+
+JitConstants MakeAxisJitConstants(size_t rank, int64_t axis, const std::string& prefix_for_iterate) {
+    const std::map<char, std::string> dimensions_sizes_map = {
+        {'b', "_BATCH_NUM"},
+        {'f', "_FEATURE_NUM"},
+        {'w', "_SIZE_W"},
+        {'z', "_SIZE_Z"},
+        {'y', "_SIZE_Y"},
+        {'x', "_SIZE_X"},
+    };
+
+    auto dimensions = [rank]() -> std::vector<char> {
+        switch (rank) {
+        case 4:
+            return {'b', 'f', 'y', 'x'};
+        case 5:
+            return {'b', 'f', 'z', 'y', 'x'};
+        case 6:
+            return {'b', 'f', 'w', 'z', 'y', 'x'};
+        }
+        throw std::invalid_argument("Unsupported input rank for unique primitive");
+    }();
+    auto& axis_dimension = dimensions.at(axis);
+
+    const auto axis_length_name = "AXIS_LENGTH";
+    const auto axis_length_val = "INPUT0" + dimensions_sizes_map.at(axis_dimension);
+
+    // Mark axis dimension as 'i' for indexing
+    axis_dimension = 'i';
+
+    const auto get_index_name = "GET_INDEX(prefix, i)";
+    const auto get_index_val = [&dimensions]() {
+        std::string str = "CAT(prefix, _GET_INDEX)";
+        str += '(';
+        for (auto ch : dimensions) {
+            str += ch;
+            str += ',';
+        }
+        str.back() = ')';
+        return str;
+    }();
+
+    const auto iterate_name = "ITERATE(body)";
+    const auto iterate_val = [&dimensions, &dimensions_sizes_map, &prefix_for_iterate]() {
+        std::stringstream ss;
+        for (auto ch : dimensions) {
+            // No need to iterate through axis index
+            if (ch == 'i') {
+                continue;
+            }
+            const auto size = prefix_for_iterate + dimensions_sizes_map.at(ch);
+            ss << "for (uint " << ch << " = 0; " << ch << " < " << size << "; ++" << ch << ") {";
+        }
+        ss << "body";
+        // Note size - 1 here as we don't iterate through axis index
+        for (auto i = 0U; i < dimensions.size() - 1; ++i) {
+            ss << '}';
+        }
+        return ss.str();
+    }();
+
+    return {MakeJitConstant(axis_length_name, axis_length_val),
+            MakeJitConstant(get_index_name, get_index_val),
+            MakeJitConstant(iterate_name, iterate_val)};
+}
+
+JitConstants MakeFlattenedJitConstants(size_t rank, bool simple_layout) {
+    const auto get_index_name = "GET_INDEX(prefix, i)";
+
+    if (simple_layout) {
+        const auto get_index_val = "i";
+        return {MakeJitConstant("FLATTENED", true), MakeJitConstant(get_index_name, get_index_val)};
+    }
+
+    const auto dimensions = [rank]() -> std::vector<std::string> {
+        switch (rank) {
+        case 4:
+            return {"i / (prefix##_SIZE_X * prefix##_SIZE_Y * prefix##_FEATURE_NUM)",
+                    "i / (prefix##_SIZE_X * prefix##_SIZE_Y) % prefix##_FEATURE_NUM",
+                    "i / prefix##_SIZE_X % prefix##_SIZE_Y",
+                    "i % prefix##_SIZE_X"};
+        case 5:
+            return {"i / (prefix##_SIZE_X * prefix##_SIZE_Y * prefix##_SIZE_Z * prefix##_FEATURE_NUM)",
+                    "i / (prefix##_SIZE_X * prefix##_SIZE_Y * prefix##_SIZE_Z) % prefix##_FEATURE_NUM",
+                    "i / (prefix##_SIZE_X * prefix##_SIZE_Y) % prefix##_SIZE_Z",
+                    "i / prefix##_SIZE_X % prefix##_SIZE_Y",
+                    "i % prefix##_SIZE_X"};
+        case 6:
+            return {
+                "i / (prefix##_SIZE_X * prefix##_SIZE_Y * prefix##_SIZE_Z * prefix##_SIZE_W * prefix##_FEATURE_NUM)",
+                "i / (prefix##_SIZE_X * prefix##_SIZE_Y * prefix##_SIZE_Z * prefix##_SIZE_W) % prefix##_FEATURE_NUM",
+                "i / (prefix##_SIZE_X * prefix##_SIZE_Y * prefix##_SIZE_Z) % prefix##_SIZE_W",
+                "i / (prefix##_SIZE_X * prefix##_SIZE_Y) % prefix##_SIZE_Z",
+                "i / prefix##_SIZE_X % prefix##_SIZE_Y",
+                "i % prefix##_SIZE_X"};
+        }
+        throw std::invalid_argument("Unsupported rank for unique primitive");
+    }();
+
+    const auto get_index_val = [&dimensions]() {
+        std::string str = "CAT(prefix, _GET_INDEX)";
+        str += '(';
+        for (const auto& dimension : dimensions) {
+            str += dimension;
+            str += ',';
+        }
+        str.back() = ')';
+        return str;
+    }();
+
+    return {MakeJitConstant("FLATTENED", true), MakeJitConstant(get_index_name, get_index_val)};
+}
+
+}  // namespace
+
+KernelsData UniqueCountKernelRef::GetKernelsData(const Params& params, const optional_params& options) const {
+    if (!Validate(params, options)) {
+        return {};
+    }
+
+    auto kernel_data = KernelData::Default<unique_count_params>(params);
+    const auto& kernel_params = dynamic_cast<const unique_count_params&>(*kernel_data.params);
+    const auto dispatch_data = SetDefault(kernel_params);
+    const auto entry_point = GetEntryPoint(kernelName, kernel_params.layerID, params, options);
+    const auto jit_constants = GetJitConstants(kernel_params);
+    const auto jit = CreateJit(kernelName, jit_constants, entry_point);
+    auto& kernel = kernel_data.kernels.front();
+
+    kernel_data.update_dispatch_data_func = [](const Params& params, KernelData& kd) {
+        const auto& prim_params = dynamic_cast<const unique_count_params&>(params);
+        auto dispatchData = SetDefault(prim_params);
+        OPENVINO_ASSERT(kd.kernels.size() == 1, "[GPU] Invalid kernels size for update dispatch data func");
+        kd.kernels[0].params.workGroups.global = dispatchData.gws;
+        kd.kernels[0].params.workGroups.local = dispatchData.lws;
+        kd.kernels[0].skip_execution = KernelData::SkipKernelExecution(prim_params);
+        // Need to adjust buffer size according to input size
+        kd.internalBufferSizes.front() = prim_params.inputs.front().PhysicalSizeInBytes();
+        kd.internalBufferDataType = prim_params.inputs.front().GetDType();
+    };
+
+    FillCLKernelData(kernel,
+                     dispatch_data,
+                     params.engineInfo,
+                     kernelName,
+                     jit,
+                     entry_point,
+                     {},
+                     false,
+                     false,
+                     static_cast<int>(kernel_params.inputs.size()),
+                     GetFusedPrimitiveInputsCount(kernel_params),
+                     static_cast<int>(kernel_params.outputs.size()),
+                     kernel_params.inputs.front().is_dynamic());
+
+    // Additional buffer to save intermediate algorithm results
+    kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0});
+    kernel_data.internalBufferSizes.push_back(kernel_params.inputs.front().PhysicalSizeInBytes());
+    kernel_data.internalBufferDataType = kernel_params.inputs.front().GetDType();
+
+    return {kernel_data};
+}
+
+ParamsKey UniqueCountKernelRef::GetSupportedKey() const {
+    ParamsKey key;
+    key.EnableAllInputDataType();
+    key.EnableAllOutputDataType();
+    key.EnableDifferentTypes();
+    key.EnableAllInputLayout();
+    key.EnableAllOutputLayout();
+    key.EnableTensorOffset();
+    key.EnableTensorPitches();
+    key.EnableBatching();
+    key.EnableDynamicShapesSupport();
+    return key;
+}
+
+bool UniqueCountKernelRef::Validate(const Params& params, const optional_params& options) const {
+    if (params.GetType() != KernelType::UNIQUE_COUNT || options.GetType() != KernelType::UNIQUE_COUNT) {
+        return false;
+    }
+
+    const auto& kernel_params = dynamic_cast<const unique_count_params&>(params);
+    if (kernel_params.inputs.size() != 1) {
+        return false;
+    }
+    if (kernel_params.outputs.size() != 1) {
+        return false;
+    }
+
+    return true;
+}
+
+JitConstants UniqueCountKernelRef::GetJitConstants(const unique_count_params& kernel_params) const {
+    const auto input = kernel_params.inputs.front();
+    auto jit_constants = MakeBaseParamsJitConstants(kernel_params);
+
+    if (kernel_params.flattened) {
+        jit_constants.Merge(MakeFlattenedJitConstants(input.Dimentions(), input.SimpleLayout()));
+    } else {
+        jit_constants.Merge(MakeAxisJitConstants(input.Dimentions(), kernel_params.axis, "INPUT0"));
+    }
+
+    if (input.is_dynamic()) {
+        DimensionAccessHelper dims(input);
+        const std::string total_data_size =
+            toVectorMulString({dims.x(), dims.y(), dims.z(), dims.w(), dims.f(), dims.b()});
+        jit_constants.AddConstant(MakeJitConstant("TOTAL_DATA_SIZE", total_data_size));
+    } else {
+        jit_constants.AddConstant(MakeJitConstant("TOTAL_DATA_SIZE", input.LogicalSize()));
+    }
+
+    return jit_constants;
+}
+
+CommonDispatchData UniqueCountKernelRef::SetDefault(const unique_count_params& /* kernel_params */) {
+    CommonDispatchData dispatch_data;
+
+    // For now we run only in one thread
+    // TODO: Parallelize
+    dispatch_data.gws = {1, 1, 1};
+    dispatch_data.lws = {1, 1, 1};
+
+    return dispatch_data;
+}
+
+KernelsData UniqueGatherKernelRef::GetKernelsData(const Params& params, const optional_params& options) const {
+    if (!Validate(params, options)) {
+        return {};
+    }
+
+    auto kernel_data = KernelData::Default<unique_gather_params>(params);
+    const auto& kernel_params = dynamic_cast<const unique_gather_params&>(*kernel_data.params);
+    const auto dispatch_data = SetDefault(kernel_params);
+    const auto entry_point = GetEntryPoint(kernelName, kernel_params.layerID, params, options);
+    const auto jit_constants = GetJitConstants(kernel_params);
+    const auto jit = CreateJit(kernelName, jit_constants, entry_point);
+    auto& kernel = kernel_data.kernels.front();
+
+    kernel_data.update_dispatch_data_func = [](const Params& params, KernelData& kd) {
+        const auto& prim_params = dynamic_cast<const unique_gather_params&>(params);
+        auto dispatchData = SetDefault(prim_params);
+        OPENVINO_ASSERT(kd.kernels.size() == 1, "[GPU] Invalid kernels size for update dispatch data func");
+        kd.kernels[0].params.workGroups.global = dispatchData.gws;
+        kd.kernels[0].params.workGroups.local = dispatchData.lws;
+        kd.kernels[0].skip_execution = KernelData::SkipKernelExecution(prim_params);
+    };
+
+    FillCLKernelData(kernel,
+                     dispatch_data,
+                     params.engineInfo,
+                     kernelName,
+                     jit,
+                     entry_point,
+                     {},
+                     false,
+                     false,
+                     static_cast<int>(kernel_params.inputs.size()),
+                     GetFusedPrimitiveInputsCount(kernel_params),
+                     static_cast<int>(kernel_params.outputs.size()),
+                     kernel_params.outputs.front().is_dynamic());
+
+    return {kernel_data};
+}
+
+ParamsKey UniqueGatherKernelRef::GetSupportedKey() const {
+    ParamsKey key;
+    key.EnableAllInputDataType();
+    key.EnableAllOutputDataType();
+    key.EnableDifferentTypes();
+    key.EnableAllInputLayout();
+    key.EnableAllOutputLayout();
+    key.EnableTensorOffset();
+    key.EnableTensorPitches();
+    key.EnableBatching();
+    key.EnableDynamicShapesSupport();
+    return key;
+}
+
+bool UniqueGatherKernelRef::Validate(const Params& params, const optional_params& options) const {
+    if (params.GetType() != KernelType::UNIQUE_GATHER || options.GetType() != KernelType::UNIQUE_GATHER) {
+        return false;
+    }
+
+    const auto& kernel_params = dynamic_cast<const unique_gather_params&>(params);
+    if (kernel_params.inputs.size() != 2) {
+        return false;
+    }
+    if (kernel_params.outputs.size() != 4) {
+        return false;
+    }
+
+    return true;
+}
+
+JitConstants UniqueGatherKernelRef::GetJitConstants(const unique_gather_params& kernel_params) const {
+    const auto input = kernel_params.inputs.front();
+    auto jit_constants = MakeBaseParamsJitConstants(kernel_params);
+
+    if (kernel_params.sorted) {
+        jit_constants.AddConstant(MakeJitConstant("SORTED", true));
+    }
+
+    if (kernel_params.flattened) {
+        jit_constants.Merge(MakeFlattenedJitConstants(input.Dimentions(), input.SimpleLayout()));
+    } else {
+        jit_constants.Merge(MakeAxisJitConstants(input.Dimentions(), kernel_params.axis, "OUTPUT"));
+    }
+
+    if (input.is_dynamic()) {
+        DimensionAccessHelper dims(input);
+        const std::string total_data_size =
+            toVectorMulString({dims.x(), dims.y(), dims.z(), dims.w(), dims.f(), dims.b()});
+        jit_constants.AddConstant(MakeJitConstant("TOTAL_DATA_SIZE", total_data_size));
+    } else {
+        jit_constants.AddConstant(MakeJitConstant("TOTAL_DATA_SIZE", input.LogicalSize()));
+    }
+
+    return jit_constants;
+}
+
+CommonDispatchData UniqueGatherKernelRef::SetDefault(const unique_gather_params& /* kernel_params */) {
+    CommonDispatchData dispatch_data;
+
+    // For now we run only in one thread
+    // TODO: Parallelize
+    dispatch_data.gws = {1, 1, 1};
+    dispatch_data.lws = {1, 1, 1};
+
+    return dispatch_data;
+}
+
+}  // namespace kernel_selector
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/unique/unique_kernel_ref.hpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/unique/unique_kernel_ref.hpp
@ -0,0 +1,74 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "kernel_base_opencl.h"
+
+namespace kernel_selector {
+
+/**
+ * UniqueCount reference kernel parameters.
+ */
+struct unique_count_params : base_params {
+    unique_count_params() : base_params(KernelType::UNIQUE_COUNT) {}
+    bool flattened{};
+    int64_t axis{};
+};
+
+/**
+ * UniqueCount reference kernel optional parameters.
+ */
+struct unique_count_optional_params : optional_params {
+    unique_count_optional_params() : optional_params(KernelType::UNIQUE_COUNT) {}
+};
+
+/**
+ * Reference kernel for UniqueCount.
+ */
+class UniqueCountKernelRef : public KernelBaseOpenCL {
+public:
+    UniqueCountKernelRef() : KernelBaseOpenCL{"unique_count_ref"} {}
+    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
+    ParamsKey GetSupportedKey() const override;
+
+protected:
+    bool Validate(const Params& params, const optional_params& options) const override;
+    JitConstants GetJitConstants(const unique_count_params& kernel_params) const;
+    static CommonDispatchData SetDefault(const unique_count_params& kernel_params);
+};
+
+/**
+ * UniqueGather reference kernel parameters.
+ */
+struct unique_gather_params : base_params {
+    unique_gather_params() : base_params(KernelType::UNIQUE_GATHER) {}
+    bool flattened{};
+    int64_t axis{};
+    bool sorted{};
+};
+
+/**
+ * UniqueGather reference kernel optional parameters.
+ */
+struct unique_gather_optional_params : optional_params {
+    unique_gather_optional_params() : optional_params(KernelType::UNIQUE_GATHER) {}
+};
+
+/**
+ * Reference kernel for UniqueGather.
+ */
+class UniqueGatherKernelRef : public KernelBaseOpenCL {
+public:
+    UniqueGatherKernelRef() : KernelBaseOpenCL{"unique_gather_ref"} {}
+    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
+    ParamsKey GetSupportedKey() const override;
+
+protected:
+    bool Validate(const Params& params, const optional_params& options) const override;
+    JitConstants GetJitConstants(const unique_gather_params& kernel_params) const;
+    static CommonDispatchData SetDefault(const unique_gather_params& kernel_params);
+};
+
+}  // namespace kernel_selector
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/unique/unique_kernel_selector.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/unique/unique_kernel_selector.cpp
@ -0,0 +1,37 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "unique_kernel_selector.hpp"
+
+#include "unique_kernel_ref.hpp"
+
+namespace kernel_selector {
+
+unique_count_kernel_selector::unique_count_kernel_selector() {
+    Attach<UniqueCountKernelRef>();
+}
+
+KernelsData unique_count_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const {
+    return GetNaiveBestKernel(params, options, KernelType::UNIQUE_COUNT);
+}
+
+unique_count_kernel_selector& unique_count_kernel_selector::Instance() {
+    static unique_count_kernel_selector instance;
+    return instance;
+}
+
+unique_gather_kernel_selector::unique_gather_kernel_selector() {
+    Attach<UniqueGatherKernelRef>();
+}
+
+KernelsData unique_gather_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const {
+    return GetNaiveBestKernel(params, options, KernelType::UNIQUE_GATHER);
+}
+
+unique_gather_kernel_selector& unique_gather_kernel_selector::Instance() {
+    static unique_gather_kernel_selector instance;
+    return instance;
+}
+
+}  // namespace kernel_selector
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/unique/unique_kernel_selector.hpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/unique/unique_kernel_selector.hpp
@ -0,0 +1,25 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "kernel_selector.h"
+
+namespace kernel_selector {
+
+class unique_count_kernel_selector : public kernel_selector_base {
+public:
+    unique_count_kernel_selector();
+    KernelsData GetBestKernels(const Params& params, const optional_params& options) const override;
+    static unique_count_kernel_selector& Instance();
+};
+
+class unique_gather_kernel_selector : public kernel_selector_base {
+public:
+    unique_gather_kernel_selector();
+    KernelsData GetBestKernels(const Params& params, const optional_params& options) const override;
+    static unique_gather_kernel_selector& Instance();
+};
+
+}  // namespace kernel_selector
--- a/src/plugins/intel_gpu/src/plugin/ops/unique.cpp
+++ b/src/plugins/intel_gpu/src/plugin/ops/unique.cpp
@ -0,0 +1,54 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "intel_gpu/primitives/unique.hpp"
+
+#include "intel_gpu/plugin/program.hpp"
+#include "ngraph/op/unique.hpp"
+
+namespace ov {
+namespace intel_gpu {
+
+namespace {
+
+void CreateUniqueOp(Program& p, const std::shared_ptr<ngraph::op::v10::Unique>& op) {
+    validate_inputs_count(op, {1, 2});
+
+    bool flattened = true;
+    int64_t axis{};
+    if (op->get_input_size() == 2) {
+        auto axis_constant = std::dynamic_pointer_cast<ngraph::op::Constant>(op->get_input_node_shared_ptr(1));
+        if (!axis_constant) {
+            IE_THROW() << "Unsupported parameter nodes type in " << op->get_friendly_name() << " ("
+                       << op->get_type_name() << ")";
+        }
+        axis = axis_constant->cast_vector<int64_t>().at(0);
+        axis = ov::normalize_axis(op.get(), axis, op->get_input_partial_shape(0).rank());
+        flattened = false;
+    }
+
+    const auto input = p.GetInputInfo(op).front();
+    const auto layer_name = layer_type_name_ID(op);
+    const auto count_prim_id = layer_name + "_count";
+
+    const cldnn::unique_count unique_count_prim(count_prim_id, input, flattened, axis);
+    p.add_primitive(*op, unique_count_prim);
+
+    const cldnn::unique_gather unique_gather_prim(layer_name,
+                                                  {input, count_prim_id},
+                                                  flattened,
+                                                  axis,
+                                                  op->get_sorted(),
+                                                  cldnn::element_type_to_data_type(op->get_input_element_type(0)),
+                                                  cldnn::element_type_to_data_type(op->get_index_element_type()),
+                                                  cldnn::element_type_to_data_type(op->get_count_element_type()));
+    p.add_primitive(*op, unique_gather_prim);
+}
+
+}  // namespace
+
+REGISTER_FACTORY_IMPL(v10, Unique);
+
+}  // namespace intel_gpu
+}  // namespace ov
--- a/src/plugins/intel_gpu/tests/functional/shared_tests_instances/skip_tests_config.cpp
+++ b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/skip_tests_config.cpp
@ -126,5 +126,8 @@ std::vector<std::string> disabledTestPatterns() {
            R"(.*smoke_GroupDeconv_2D_Dynamic_.*FP32/GroupDeconvolutionLayerGPUTest.CompareWithRefs.*)",
            // Issue: 111440
            R"(.*smoke_set1/GatherElementsGPUTest.CompareWithRefs.*)",
+            // For some strange reason (bug?) output format cannot have a rank greater than 4 for dynamic shape case,
+            // because it crashes in some random places during "reorder_inputs" pass.
+            R"(.*UniqueLayerDynamicGPUTest.*\(\d*\.\d*\.\d*\.\d*\.\d*\).*axis.*)",
    };
 }
--- a/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/unique.cpp
+++ b/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/unique.cpp
@ -0,0 +1,170 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "common_test_utils/ov_tensor_utils.hpp"
+#include "ngraph_functions/builders.hpp"
+#include "shared_test_classes/base/layer_test_utils.hpp"
+#include "shared_test_classes/base/ov_subgraph.hpp"
+
+using namespace ov::test;
+
+namespace GPULayerTestsDefinitions {
+
+typedef std::tuple<std::vector<InputShape>,  // Input shapes
+                   std::tuple<bool, int>,    // Is flattened and axis
+                   bool,                     // Sorted
+                   ElementType               // Data precision
+                   >
+    UniqueDynamicGPUTestParams;
+
+class UniqueLayerDynamicGPUTest : public testing::WithParamInterface<UniqueDynamicGPUTestParams>,
+                                  virtual public SubgraphBaseTest {
+public:
+    static std::string getTestCaseName(const testing::TestParamInfo<UniqueDynamicGPUTestParams>& obj) {
+        std::vector<InputShape> inputShapes;
+        std::tuple<bool, int> flatOrAxis;
+        bool sorted;
+        ElementType dataPrecision;
+        std::tie(inputShapes, flatOrAxis, sorted, dataPrecision) = obj.param;
+
+        std::ostringstream result;
+        result << "IS=(";
+        for (size_t i = 0lu; i < inputShapes.size(); i++) {
+            result << CommonTestUtils::partialShape2str({inputShapes[i].first})
+                   << (i < inputShapes.size() - 1lu ? "_" : "");
+        }
+        result << ")_TS=";
+        for (size_t i = 0lu; i < inputShapes.front().second.size(); i++) {
+            result << "{";
+            for (size_t j = 0lu; j < inputShapes.size(); j++) {
+                result << CommonTestUtils::vec2str(inputShapes[j].second[i])
+                       << (j < inputShapes.size() - 1lu ? "_" : "");
+            }
+            result << "}_";
+        }
+
+        if (!std::get<0>(flatOrAxis)) {
+            result << "axis=" << std::get<1>(flatOrAxis) << "_";
+        } else {
+            result << "flattened"
+                   << "_";
+        }
+        result << "sorted=" << (sorted ? "True" : "False") << "_";
+        result << "dataPrc=" << dataPrecision;
+
+        return result.str();
+    }
+
+protected:
+    void SetUp() override {
+        std::vector<InputShape> inputShapes;
+        std::tuple<bool, int> flatOrAxis;
+        bool sorted, flattened;
+        int axis;
+        ElementType dataPrecision;
+
+        std::tie(inputShapes, flatOrAxis, sorted, dataPrecision) = this->GetParam();
+        targetDevice = CommonTestUtils::DEVICE_GPU;
+        init_input_shapes(inputShapes);
+        flattened = std::get<0>(flatOrAxis);
+
+        auto params = ngraph::builder::makeDynamicParams(dataPrecision, inputDynamicShapes);
+        params[0]->set_friendly_name("data");
+        auto paramOuts =
+            ngraph::helpers::convert2OutputVector(ngraph::helpers::castOps2Nodes<ov::op::v0::Parameter>(params));
+        std::shared_ptr<ov::Node> uniqueNode;
+        if (flattened) {
+            uniqueNode = std::make_shared<ov::op::v10::Unique>(paramOuts[0], sorted);
+        } else {
+            axis = std::get<1>(flatOrAxis);
+            uniqueNode = std::make_shared<ov::op::v10::Unique>(
+                paramOuts[0],
+                ov::op::v0::Constant::create(ov::element::i64, ov::Shape({1}), {axis}),
+                sorted);
+        }
+
+        // Need to create results for all outputs
+        ngraph::ResultVector results;
+        for (auto i = 0U; i < uniqueNode->get_output_size(); ++i) {
+            results.push_back(std::make_shared<ngraph::opset1::Result>(uniqueNode->output(i)));
+        }
+
+        function = std::make_shared<ngraph::Function>(results, params, "Unique");
+    }
+
+    void generate_inputs(const std::vector<ov::Shape>& targetInputStaticShapes) override {
+        inputs.clear();
+        const auto& funcInputs = function->inputs();
+
+        for (size_t i = 0; i < funcInputs.size(); ++i) {
+            const auto& funcInput = funcInputs[i];
+            ov::Tensor tensor;
+
+            if (funcInput.get_node()->get_friendly_name() == "data") {
+                int32_t range = std::accumulate(targetInputStaticShapes[0].begin(),
+                                                targetInputStaticShapes[0].end(),
+                                                1,
+                                                std::multiplies<size_t>());
+                tensor = utils::create_and_fill_tensor(funcInput.get_element_type(),
+                                                       targetInputStaticShapes[0],
+                                                       range,
+                                                       -range / 2,
+                                                       1);
+            }
+            inputs.insert({funcInput.get_node_shared_ptr(), tensor});
+        }
+    }
+};
+
+TEST_P(UniqueLayerDynamicGPUTest, CompareWithRefs) {
+    SKIP_IF_CURRENT_TEST_IS_DISABLED()
+    run();
+}
+
+namespace {
+
+const std::vector<ElementType> dataPrecision = {
+    ElementType::f16,
+    ElementType::i32,
+};
+
+std::vector<std::tuple<bool, int>> flatOrAxis{{true, 0}, {false, 0}, {false, 1}, {false, -1}};
+
+std::vector<bool> sorted{true, false};
+
+std::vector<std::vector<InputShape>> getStaticShapes() {
+    return {
+        {{{}, {{7, 2, 3}}}},
+        {{{}, {{7, 2, 3, 5}}}},
+        {{{}, {{7, 2, 3, 5, 4}}}},
+    };
+}
+
+INSTANTIATE_TEST_SUITE_P(smoke_static,
+                         UniqueLayerDynamicGPUTest,
+                         ::testing::Combine(::testing::ValuesIn(getStaticShapes()),
+                                            ::testing::ValuesIn(flatOrAxis),
+                                            ::testing::ValuesIn(sorted),
+                                            ::testing::ValuesIn(dataPrecision)),
+                         UniqueLayerDynamicGPUTest::getTestCaseName);
+
+std::vector<std::vector<InputShape>> getDynamicShapes() {
+    return {
+        {{{ov::Dimension(2, 15), -1, -1, -1},                                       // Dynamic shape
+          {{8, 3, 3, 3}, {6, 5, 2, 5}, {4, 7, 1, 11}, {2, 9, 3, 4}}}},              // Target shapes
+        {{{-1, -1, -1, -1, -1},                                                     // Dynamic shape
+          {{1, 2, 1, 13, 2}, {3, 4, 7, 2, 2}, {5, 6, 3, 5, 2}, {7, 8, 4, 4, 2}}}},  // Target shapes
+    };
+}
+
+INSTANTIATE_TEST_SUITE_P(smoke_dynamic,
+                         UniqueLayerDynamicGPUTest,
+                         ::testing::Combine(::testing::ValuesIn(getDynamicShapes()),
+                                            ::testing::ValuesIn(flatOrAxis),
+                                            ::testing::ValuesIn(sorted),
+                                            ::testing::ValuesIn(dataPrecision)),
+                         UniqueLayerDynamicGPUTest::getTestCaseName);
+
+}  // namespace
+}  // namespace GPULayerTestsDefinitions
--- a/src/plugins/intel_gpu/tests/unit/test_cases/unique_gpu_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/unique_gpu_test.cpp
@ -0,0 +1,394 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <test_utils/test_utils.h>
+
+#include <intel_gpu/primitives/unique.hpp>
+#include <vector>
+
+using namespace cldnn;
+using namespace tests;
+
+namespace {
+
+template <typename vecElementType>
+std::string vec2str(const std::vector<vecElementType>& vec) {
+    if (!vec.empty()) {
+        std::ostringstream result;
+        result << "(";
+        std::copy(vec.begin(), vec.end() - 1, std::ostream_iterator<vecElementType>(result, "."));
+        result << vec.back() << ")";
+        return result.str();
+    }
+    return "()";
+}
+
+template <class ElemT, class IndexT, class CountT>
+struct unique_test_inputs {
+    ov::Shape data_shape;
+    std::vector<ElemT> input_data;
+    std::vector<ElemT> expected_unique_values;
+    std::vector<IndexT> expected_indices;
+    std::vector<IndexT> expected_rev_indices;
+    std::vector<CountT> expected_counts;
+    bool flattened;
+    int64_t axis;
+    bool sorted;
+};
+
+template <class ElemT, class IndexT, class CountT>
+using unique_test_params = std::tuple<unique_test_inputs<ElemT, IndexT, CountT>, format::type>;
+
+template <class ElemT, class IndexT, class CountT>
+struct unique_gpu_test : public testing::TestWithParam<unique_test_params<ElemT, IndexT, CountT>> {
+public:
+    void test() {
+        format::type fmt;
+        unique_test_inputs<ElemT, IndexT, CountT> p;
+        std::tie(p, fmt) = testing::TestWithParam<unique_test_params<ElemT, IndexT, CountT>>::GetParam();
+
+        auto& engine = get_test_engine();
+        const auto elem_data_type = type_to_data_type<ElemT>::value;
+        const auto index_data_type = type_to_data_type<IndexT>::value;
+        const auto count_data_type = type_to_data_type<CountT>::value;
+        const auto plain_format = format::bfyx;
+
+        const layout in_layout(p.data_shape, elem_data_type, plain_format);
+        auto input = engine.allocate_memory(in_layout);
+        set_values(input, p.input_data);
+
+        topology topology;
+        topology.add(input_layout("input", input->get_layout()));
+        topology.add(reorder("reordered_input", input_info("input"), fmt, elem_data_type));
+        topology.add(unique_count("unique_count", {input_info("reordered_input")}, p.flattened, p.axis));
+        topology.add(unique_gather("unique_gather",
+                                   {input_info("reordered_input"), input_info("unique_count")},
+                                   p.flattened,
+                                   p.axis,
+                                   p.sorted,
+                                   elem_data_type,
+                                   index_data_type,
+                                   count_data_type));
+        topology.add(reorder("expected_unique_values", input_info("unique_gather", 0), plain_format, elem_data_type));
+        topology.add(reorder("expected_indices", input_info("unique_gather", 1), plain_format, index_data_type));
+        topology.add(reorder("expected_rev_indices", input_info("unique_gather", 2), plain_format, index_data_type));
+        topology.add(reorder("expected_counts", input_info("unique_gather", 3), plain_format, count_data_type));
+
+        auto config = get_test_default_config(engine);
+        config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+        network network(engine, topology, config);
+        network.set_input_data("input", input);
+
+        const auto outputs = network.execute();
+
+        const auto expected_unique_values = outputs.at("expected_unique_values").get_memory();
+        cldnn::mem_lock<ElemT> expected_unique_values_ptr(expected_unique_values, get_test_stream());
+        ASSERT_EQ(expected_unique_values_ptr.size(), p.expected_unique_values.size());
+        for (auto i = 0U; i < expected_unique_values_ptr.size(); ++i) {
+            ASSERT_EQ(expected_unique_values_ptr[i], p.expected_unique_values[i]);
+        }
+
+        const auto expected_indices = outputs.at("expected_indices").get_memory();
+        cldnn::mem_lock<IndexT> expected_indices_ptr(expected_indices, get_test_stream());
+        ASSERT_EQ(expected_indices_ptr.size(), p.expected_indices.size());
+        for (auto i = 0U; i < expected_indices_ptr.size(); ++i) {
+            ASSERT_EQ(expected_indices_ptr[i], p.expected_indices[i]);
+        }
+
+        const auto expected_rev_indices = outputs.at("expected_rev_indices").get_memory();
+        cldnn::mem_lock<IndexT> expected_rev_indices_ptr(expected_rev_indices, get_test_stream());
+        ASSERT_EQ(expected_rev_indices_ptr.size(), p.expected_rev_indices.size());
+        for (auto i = 0U; i < expected_rev_indices_ptr.size(); ++i) {
+            ASSERT_EQ(expected_rev_indices_ptr[i], p.expected_rev_indices[i]);
+        }
+
+        const auto expected_counts = outputs.at("expected_counts").get_memory();
+        cldnn::mem_lock<CountT> expected_counts_ptr(expected_counts, get_test_stream());
+        ASSERT_EQ(expected_counts_ptr.size(), p.expected_counts.size());
+        for (auto i = 0U; i < expected_counts_ptr.size(); ++i) {
+            ASSERT_EQ(expected_counts_ptr[i], p.expected_counts[i]);
+        }
+    }
+
+    static std::string PrintToStringParamName(
+        const testing::TestParamInfo<unique_test_params<ElemT, IndexT, CountT>>& info) {
+        format::type fmt;
+        unique_test_inputs<ElemT, IndexT, CountT> p;
+        std::tie(p, fmt) = info.param;
+
+        std::ostringstream result;
+        result << "data_shape=" << vec2str(p.data_shape) << "; ";
+        result << "input_data=" << vec2str(p.input_data) << "; ";
+        result << "data_type=" << type_to_data_type<ElemT>::value << "; ";
+        result << "index_type=" << type_to_data_type<IndexT>::value << "; ";
+        result << "counts_type=" << type_to_data_type<CountT>::value << "; ";
+        result << "sorted=" << p.sorted << "; ";
+        if (!p.flattened) {
+            result << "axis=" << p.axis << "; ";
+        }
+        result << "fmt=" << fmt_to_str(fmt) << "; ";
+        return result.str();
+    }
+};
+
+template <class ElemT, class IndexT, class CountT>
+std::vector<unique_test_inputs<ElemT, IndexT, CountT>> getUniqueParams() {
+    return {
+        {
+            ov::Shape{5},
+            std::vector<ElemT>{5, 4, 3, 2, 1},
+            std::vector<ElemT>{5, 4, 3, 2, 1},
+            std::vector<IndexT>{0, 1, 2, 3, 4},
+            std::vector<IndexT>{0, 1, 2, 3, 4},
+            std::vector<CountT>{1, 1, 1, 1, 1},
+            true,
+            0,
+            false,
+        },
+        {
+            ov::Shape{5},
+            std::vector<ElemT>{5, 4, 3, 2, 1},
+            std::vector<ElemT>{1, 2, 3, 4, 5},
+            std::vector<IndexT>{4, 3, 2, 1, 0},
+            std::vector<IndexT>{4, 3, 2, 1, 0},
+            std::vector<CountT>{1, 1, 1, 1, 1},
+            true,
+            0,
+            true,
+        },
+        {
+            ov::Shape{7},
+            std::vector<ElemT>{1, 3, 5, 3, 2, 4, 2},
+            std::vector<ElemT>{1, 3, 5, 2, 4},
+            std::vector<IndexT>{0, 1, 2, 4, 5},
+            std::vector<IndexT>{0, 1, 2, 1, 3, 4, 3},
+            std::vector<CountT>{1, 2, 1, 2, 1},
+            true,
+            0,
+            false,
+        },
+        {
+            ov::Shape{7},
+            std::vector<ElemT>{1, 3, 5, 3, 2, 4, 2},
+            std::vector<ElemT>{1, 2, 3, 4, 5},
+            std::vector<IndexT>{0, 4, 1, 5, 2},
+            std::vector<IndexT>{0, 2, 4, 2, 1, 3, 1},
+            std::vector<CountT>{1, 2, 2, 1, 1},
+            true,
+            0,
+            true,
+        },
+        {
+            ov::Shape{7},
+            std::vector<ElemT>{3, 1, 5, 3, 2, 4, 2},
+            std::vector<ElemT>{1, 2, 3, 4, 5},
+            std::vector<IndexT>{1, 4, 0, 5, 2},
+            std::vector<IndexT>{2, 0, 4, 2, 1, 3, 1},
+            std::vector<CountT>{1, 2, 2, 1, 1},
+            true,
+            0,
+            true,
+        },
+        {
+            ov::Shape{7},
+            std::vector<ElemT>{3, 3, 5, 3, 2, 4, 2},
+            std::vector<ElemT>{2, 3, 4, 5},
+            std::vector<IndexT>{4, 0, 5, 2},
+            std::vector<IndexT>{1, 1, 3, 1, 0, 2, 0},
+            std::vector<CountT>{2, 3, 1, 1},
+            true,
+            0,
+            true,
+        },
+        {
+            ov::Shape{7},
+            std::vector<ElemT>{1, 3, 5, 3, 2, 4, 2},
+            std::vector<ElemT>{1, 2, 3, 4, 5},
+            std::vector<IndexT>{0, 4, 1, 5, 2},
+            std::vector<IndexT>{0, 2, 4, 2, 1, 3, 1},
+            std::vector<CountT>{1, 2, 2, 1, 1},
+            false,
+            0,
+            true,
+        },
+        {
+            ov::Shape{2, 6},
+            std::vector<ElemT>{3, 5, 3, 2, 4, 2, 1, 2, 3, 4, 5, 6},
+            std::vector<ElemT>{3, 5, 2, 4, 1, 6},
+            std::vector<IndexT>{0, 1, 3, 4, 6, 11},
+            std::vector<IndexT>{0, 1, 0, 2, 3, 2, 4, 2, 0, 3, 1, 5},
+            std::vector<CountT>{3, 2, 3, 2, 1, 1},
+            true,
+            0,
+            false,
+        },
+        {
+            ov::Shape{2, 4},
+            std::vector<ElemT>{1, 2, 3, 4, 1, 2, 3, 5},
+            std::vector<ElemT>{1, 2, 3, 4, 1, 2, 3, 5},
+            std::vector<IndexT>{0, 1},
+            std::vector<IndexT>{0, 1},
+            std::vector<CountT>{1, 1},
+            false,
+            0,
+            false,
+        },
+        {
+            ov::Shape{2, 4},
+            std::vector<ElemT>{1, 2, 3, 4, 1, 2, 3, 5},
+            std::vector<ElemT>{1, 2, 3, 4, 1, 2, 3, 5},
+            std::vector<IndexT>{0, 1, 2, 3},
+            std::vector<IndexT>{0, 1, 2, 3},
+            std::vector<CountT>{1, 1, 1, 1},
+            false,
+            1,
+            false,
+        },
+        {
+            ov::Shape{2, 4},
+            std::vector<ElemT>{1, 2, 2, 4, 1, 2, 2, 5},
+            std::vector<ElemT>{1, 2, 4, 1, 2, 5},
+            std::vector<IndexT>{0, 1, 3},
+            std::vector<IndexT>{0, 1, 1, 2},
+            std::vector<CountT>{1, 2, 1},
+            false,
+            1,
+            false,
+        },
+        {
+            ov::Shape{2, 2, 3},
+            std::vector<ElemT>{1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6},
+            std::vector<ElemT>{1, 2, 3, 4, 5, 6},
+            std::vector<IndexT>{0},
+            std::vector<IndexT>{0, 0},
+            std::vector<CountT>{2},
+            false,
+            0,
+            false,
+        },
+        {
+            ov::Shape{2, 3, 2},
+            std::vector<ElemT>{-3, -2, -5, 4, -3, 2, 3, -4, 1, 2, -1, 4},
+            std::vector<ElemT>{-3, -2, -5, 4, -3, 2, 3, -4, 1, 2, -1, 4},
+            std::vector<IndexT>{0, 1},
+            std::vector<IndexT>{0, 1},
+            std::vector<CountT>{1, 1},
+            false,
+            0,
+            true,
+        },
+        {
+            ov::Shape{2, 3, 2},
+            std::vector<ElemT>{-3, -2, -5, 4, -3, 2, 3, -4, 1, 2, -1, 4},
+            std::vector<ElemT>{-3, -2, -5, 4, -3, 2, 3, -4, 1, 2, -1, 4},
+            std::vector<IndexT>{0, 1},
+            std::vector<IndexT>{0, 1},
+            std::vector<CountT>{1, 1},
+            false,
+            0,
+            false,
+        },
+        {
+            ov::Shape{2, 2, 3},
+            std::vector<ElemT>{6, 5, 4, 6, 5, 4, 3, 2, 1, 3, 2, 1},
+            std::vector<ElemT>{6, 5, 4, 3, 2, 1},
+            std::vector<IndexT>{0},
+            std::vector<IndexT>{0, 0},
+            std::vector<CountT>{2},
+            false,
+            1,
+            false,
+        },
+        {
+            ov::Shape{2, 2, 3},
+            std::vector<ElemT>{-1, 2, -1, 5, -3, 5, 7, -8, 7, 4, 4, 4},
+            std::vector<ElemT>{-1, 2, 5, -3, 7, -8, 4, 4},
+            std::vector<IndexT>{0, 1},
+            std::vector<IndexT>{0, 1, 0},
+            std::vector<CountT>{2, 1},
+            false,
+            2,
+            false,
+        },
+        {
+            ov::Shape{2, 2, 3},
+            std::vector<ElemT>{-1, -1, 2, 5, 5, -3, 7, 7, -8, 4, 4, 4},
+            std::vector<ElemT>{-1, 2, 5, -3, 7, -8, 4, 4},
+            std::vector<IndexT>{0, 2},
+            std::vector<IndexT>{0, 0, 1},
+            std::vector<CountT>{2, 1},
+            false,
+            2,
+            false,
+        },
+        {
+            ov::Shape{2, 2, 3},
+            std::vector<ElemT>{2, -1, -1, -3, 5, 5, -8, 7, 7, 4, 4, 4},
+            std::vector<ElemT>{2, -1, -3, 5, -8, 7, 4, 4},
+            std::vector<IndexT>{0, 1},
+            std::vector<IndexT>{0, 1, 1},
+            std::vector<CountT>{1, 2},
+            false,
+            2,
+            false,
+        },
+        {
+            ov::Shape{2, 2, 3},
+            std::vector<ElemT>{2, -1, -1, -3, 5, 5, -8, 7, 7, 4, 4, 4},
+            std::vector<ElemT>{-1, 2, 5, -3, 7, -8, 4, 4},
+            std::vector<IndexT>{1, 0},
+            std::vector<IndexT>{1, 0, 0},
+            std::vector<CountT>{2, 1},
+            false,
+            2,
+            true,
+        },
+        {
+            ov::Shape{2, 2, 3},
+            std::vector<ElemT>{-1, -1, -1, 3, 2, 2, 6, 7, 7, 4, 4, 4},
+            std::vector<ElemT>{-1, -1, 2, 3, 7, 6, 4, 4},
+            std::vector<IndexT>{1, 0},
+            std::vector<IndexT>{1, 0, 0},
+            std::vector<CountT>{2, 1},
+            false,
+            2,
+            true,
+        },
+        {
+            ov::Shape{1, 3, 16},
+            std::vector<ElemT>{15,  -20, -11, 10, -21, 8,  -15, -10, 7,  20, -19, -14, -13, -16, -7,  -2,
+                               -17, -4,  21,  -6, 11,  8,  17,  6,   7,  20, -3,  2,   -13, -16, -23, 14,
+                               -1,  12,  5,   -6, 11,  -8, 1,   -10, 23, 20, -19, 18,  3,   -16, -7,  14},
+            std::vector<ElemT>{-23, -21, -20, -19, -17, -16, -15, -14, -13, -11, -10, -8, -7, -6, -4, -3, -2, -1,
+                               1,   2,   3,   5,   6,   7,   8,   10,  11,  12,  14,  15, 17, 18, 20, 21, 23},
+            std::vector<IndexT>{30, 4,  1,  10, 16, 13, 6, 11, 12, 2,  7,  37, 14, 19, 17, 26, 15, 32,
+                                38, 27, 44, 34, 23, 8,  5, 3,  20, 33, 31, 0,  22, 43, 9,  18, 40},
+            std::vector<IndexT>{29, 2,  9,  25, 1,  24, 6,  10, 23, 32, 3,  7,  8,  5, 12, 16,
+                                4,  14, 33, 13, 26, 24, 30, 22, 23, 32, 15, 19, 8,  5, 0,  28,
+                                17, 27, 21, 13, 26, 11, 18, 10, 34, 32, 3,  31, 20, 5, 12, 28},
+            std::vector<CountT>{1, 1, 1, 2, 1, 3, 1, 1, 2, 1, 2, 1, 2, 2, 1, 1, 1, 1,
+                                1, 1, 1, 1, 1, 2, 2, 1, 2, 1, 2, 1, 1, 1, 3, 1, 1},
+            true,
+            0,
+            true,
+        },
+    };
+}
+
+const std::vector<format::type> layout_formats = {format::bfyx, format::b_fs_yx_fsv16};
+
+#define INSTANTIATE_UNIQUE_TEST_SUITE(elem_type, index_type, count_type)                                               \
+    using unique_gpu_test_##elem_type##index_type##count_type = unique_gpu_test<elem_type, index_type, count_type>;    \
+    TEST_P(unique_gpu_test_##elem_type##index_type##count_type, test) {                                                \
+        ASSERT_NO_FATAL_FAILURE(test());                                                                               \
+    }                                                                                                                  \
+    INSTANTIATE_TEST_SUITE_P(smoke_unique_##elem_type##index_type##count_type,                                         \
+                             unique_gpu_test_##elem_type##index_type##count_type,                                      \
+                             testing::Combine(testing::ValuesIn(getUniqueParams<elem_type, index_type, count_type>()), \
+                                              testing::ValuesIn(layout_formats)),                                      \
+                             unique_gpu_test_##elem_type##index_type##count_type::PrintToStringParamName);
+
+INSTANTIATE_UNIQUE_TEST_SUITE(float, int64_t, int32_t);
+
+}  // namespace