[GPU] NonZero primitive (#12519)

This commit is contained in:
Vladimir Paramuzov 2022-08-15 17:32:37 +04:00 committed by GitHub
parent 03b7c1e69e
commit dd55f434c3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
20 changed files with 1194 additions and 0 deletions

View File

@ -155,6 +155,7 @@ REGISTER_FACTORY(v3, EmbeddingBagOffsetsSum);
REGISTER_FACTORY(v3, EmbeddingBagPackedSum);
REGISTER_FACTORY(v3, EmbeddingSegmentsSum);
REGISTER_FACTORY(v3, ExtractImagePatches);
REGISTER_FACTORY(v3, NonZero);
REGISTER_FACTORY(v3, ROIAlign);
REGISTER_FACTORY(v3, ScatterUpdate);
REGISTER_FACTORY(v3, ScatterElementsUpdate);

View File

@ -0,0 +1,39 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "primitive.hpp"
namespace cldnn {
/// @brief Counts the non-zero elements of the input and produces a 4-element i32
/// shape descriptor {rank, number of non-zeros, 1, 1} that gather_nonzero consumes.
struct count_nonzero : public primitive_base<count_nonzero> {
CLDNN_DECLARE_PRIMITIVE(count_nonzero)
/// @brief Constructs count_nonzero primitive.
/// @param id This primitive id.
/// @param data Input data primitive id.
/// @param ext_prim_id External primitive id (optional).
/// @param output_padding Output padding (optional).
count_nonzero(const primitive_id& id,
const primitive_id& data,
const primitive_id& ext_prim_id = "",
const padding& output_padding = padding())
: primitive_base(id, {data}, ext_prim_id, output_padding) {}
};
/// @brief Gathers the coordinates of the non-zero elements of the input data;
/// the second input is the shape descriptor produced by count_nonzero.
struct gather_nonzero : public primitive_base<gather_nonzero> {
CLDNN_DECLARE_PRIMITIVE(gather_nonzero)
/// @brief Constructs gather_nonzero primitive.
/// @param id This primitive id.
/// @param data Input data primitive id.
/// @param output_shape Output shape [rank of data, number of nonzero elements]
/// @param ext_prim_id External primitive id (optional).
/// @param output_padding Output padding (optional).
gather_nonzero(const primitive_id& id,
const primitive_id& data,
const primitive_id& output_shape,
const primitive_id& ext_prim_id = "",
const padding& output_padding = padding())
: primitive_base(id, {data, output_shape}, ext_prim_id, output_padding) {}
};
} // namespace cldnn

View File

@ -0,0 +1,119 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "non_zero_inst.h"
#include "primitive_base.hpp"
#include "impls/implementation_map.hpp"
#include "kernel_selector_helper.h"
#include "non_zero/count_nonzero_kernel_ref.h"
#include "non_zero/count_nonzero_kernel_selector.h"
#include "non_zero/gather_nonzero_kernel_ref.h"
#include "non_zero/gather_nonzero_kernel_selector.h"
#include "intel_gpu/runtime/error_handler.hpp"
using namespace cldnn;
namespace cldnn {
namespace ocl {
// OCL implementation wrapper for count_nonzero: queries the count_nonzero
// kernel selector and wraps the best matching kernel.
struct count_nonzero_impl : typed_primitive_impl_ocl<count_nonzero> {
using parent = typed_primitive_impl_ocl<count_nonzero>;
using parent::parent;
std::unique_ptr<primitive_impl> clone() const override {
return make_unique<count_nonzero_impl>(*this);
}
// Factory registered in the implementation map; throws via OPENVINO_ASSERT
// when no kernel supports the given parameters.
static primitive_impl* create(const count_nonzero_node& arg, const kernel_impl_params& impl_param) {
auto nonzero_params = get_default_params<kernel_selector::count_nonzero_params>(impl_param);
auto nonzero_optional_params =
get_default_optional_params<kernel_selector::count_nonzero_optional_params>(arg.get_program());
auto& kernel_selector = kernel_selector::count_nonzero_kernel_selector::Instance();
auto best_kernels = kernel_selector.GetBestKernels(nonzero_params, nonzero_optional_params);
OPENVINO_ASSERT(!best_kernels.empty(), "Cannot find a proper kernel for ", arg.id());
auto count_nonzero = new count_nonzero_impl(arg, best_kernels[0]);
return count_nonzero;
}
};
// OCL implementation wrapper for gather_nonzero. Mirrors count_nonzero_impl,
// with one addition: the second dependency (the shape buffer produced by
// count_nonzero) is appended to the kernel input tensors.
struct gather_nonzero_impl : typed_primitive_impl_ocl<gather_nonzero> {
    using parent = typed_primitive_impl_ocl<gather_nonzero>;
    using parent::parent;

    std::unique_ptr<primitive_impl> clone() const override {
        return make_unique<gather_nonzero_impl>(*this);
    }

    // Factory registered in the implementation map; throws via OPENVINO_ASSERT
    // when no kernel supports the given parameters.
    static primitive_impl* create(const gather_nonzero_node& arg, const kernel_impl_params& impl_param) {
        auto nonzero_params = get_default_params<kernel_selector::gather_nonzero_params>(impl_param);
        auto nonzero_optional_params =
            get_default_optional_params<kernel_selector::gather_nonzero_optional_params>(arg.get_program());
        // Input 1 is the {rank, #nonzero, 1, 1} descriptor from count_nonzero.
        nonzero_params.inputs.push_back(convert_data_tensor(arg.input(1).get_output_layout()));
        auto& kernel_selector = kernel_selector::gather_nonzero_kernel_selector::Instance();
        auto best_kernels = kernel_selector.GetBestKernels(nonzero_params, nonzero_optional_params);
        OPENVINO_ASSERT(!best_kernels.empty(), "Cannot find a proper kernel for ", arg.id());
        return new gather_nonzero_impl(arg, best_kernels[0]);
    }
};
namespace detail {
// Registers the OCL count_nonzero implementation for all supported
// data-type / plain-layout (4/5/6-D) combinations.
attach_count_nonzero_impl::attach_count_nonzero_impl() {
implementation_map<count_nonzero>::add(impl_types::ocl, count_nonzero_impl::create, {
std::make_tuple(data_types::f32, format::bfyx),
std::make_tuple(data_types::f16, format::bfyx),
std::make_tuple(data_types::i32, format::bfyx),
std::make_tuple(data_types::i8, format::bfyx),
std::make_tuple(data_types::u8, format::bfyx),
std::make_tuple(data_types::f32, format::bfzyx),
std::make_tuple(data_types::f16, format::bfzyx),
std::make_tuple(data_types::i32, format::bfzyx),
std::make_tuple(data_types::i8, format::bfzyx),
std::make_tuple(data_types::u8, format::bfzyx),
std::make_tuple(data_types::f32, format::bfwzyx),
std::make_tuple(data_types::f16, format::bfwzyx),
std::make_tuple(data_types::i32, format::bfwzyx),
std::make_tuple(data_types::i8, format::bfwzyx),
std::make_tuple(data_types::u8, format::bfwzyx),
});
}
// Registers the OCL gather_nonzero implementation; the supported set matches
// attach_count_nonzero_impl so the two primitives can always be paired.
attach_gather_nonzero_impl::attach_gather_nonzero_impl() {
implementation_map<gather_nonzero>::add(impl_types::ocl, gather_nonzero_impl::create, {
std::make_tuple(data_types::f32, format::bfyx),
std::make_tuple(data_types::f16, format::bfyx),
std::make_tuple(data_types::i32, format::bfyx),
std::make_tuple(data_types::i8, format::bfyx),
std::make_tuple(data_types::u8, format::bfyx),
std::make_tuple(data_types::f32, format::bfzyx),
std::make_tuple(data_types::f16, format::bfzyx),
std::make_tuple(data_types::i32, format::bfzyx),
std::make_tuple(data_types::i8, format::bfzyx),
std::make_tuple(data_types::u8, format::bfzyx),
std::make_tuple(data_types::f32, format::bfwzyx),
std::make_tuple(data_types::f16, format::bfwzyx),
std::make_tuple(data_types::i32, format::bfwzyx),
std::make_tuple(data_types::i8, format::bfwzyx),
std::make_tuple(data_types::u8, format::bfwzyx),
});
}
} // namespace detail
} // namespace ocl
} // namespace cldnn

View File

@ -90,6 +90,8 @@ void register_implementations() {
REGISTER_OCL(embedding_bag);
REGISTER_OCL(extract_image_patches);
REGISTER_OCL(convert_color);
REGISTER_OCL(count_nonzero);
REGISTER_OCL(gather_nonzero);
}
} // namespace ocl

View File

@ -73,6 +73,7 @@
#include "intel_gpu/primitives/space_to_batch.hpp"
#include "intel_gpu/primitives/strided_slice.hpp"
#include "intel_gpu/primitives/tile.hpp"
#include "intel_gpu/primitives/non_zero.hpp"
namespace cldnn {
namespace ocl {
@ -166,6 +167,8 @@ REGISTER_OCL(cum_sum);
REGISTER_OCL(embedding_bag);
REGISTER_OCL(extract_image_patches);
REGISTER_OCL(convert_color);
REGISTER_OCL(count_nonzero);
REGISTER_OCL(gather_nonzero);
#undef REGISTER_OCL

View File

@ -0,0 +1,81 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
///////////////////////////////////////////////////////////////////////////////////////////////////
#pragma once
#include "intel_gpu/primitives/non_zero.hpp"
#include "primitive_inst.h"
#include "intel_gpu/runtime/error_handler.hpp"
#include <string>
#include <memory>
namespace cldnn {
// Program node for count_nonzero.
template <>
struct typed_program_node<count_nonzero> : public typed_program_node_base<count_nonzero> {
    using parent = typed_program_node_base<count_nonzero>;

    typed_program_node(const std::shared_ptr<count_nonzero> prim, program& prog) : parent(prim, prog) {
        // Padding handling is enabled on all axes for this node.
        support_padding_all(true);
    }

public:
    using parent::parent;

    // Single data input; the node must have exactly one dependency.
    program_node& input() const {
        OPENVINO_ASSERT(dependencies.size() == 1, "[GPU] Primitive ", id(), " has invalid number of dependencies");
        return get_dependency(0);
    }
};

using count_nonzero_node = typed_program_node<count_nonzero>;
// Runtime instance of count_nonzero; owns the fixed 4-int output buffer.
template <>
class typed_primitive_inst<count_nonzero> : public typed_primitive_inst_base<count_nonzero> {
using parent = typed_primitive_inst_base<count_nonzero>;
public:
static layout calc_output_layout(count_nonzero_node const& node, kernel_impl_params const& impl_param);
static std::string to_string(count_nonzero_node const& node);
typed_primitive_inst(network& network, count_nonzero_node const& node);
private:
// Zeroes the output before each run (the kernel accumulates atomically).
void on_execute() override;
};
using count_nonzero_inst = typed_primitive_inst<count_nonzero>;
// Program node for gather_nonzero; its output shape depends on runtime data
// (the count computed by count_nonzero), hence generates_dynamic_output().
template <>
struct typed_program_node<gather_nonzero> : public typed_program_node_base<gather_nonzero> {
    using parent = typed_program_node_base<gather_nonzero>;

    typed_program_node(const std::shared_ptr<gather_nonzero> prim, program& prog) : parent(prim, prog) {
        // Padding handling is enabled on all axes for this node.
        support_padding_all(true);
    }

public:
    using parent::parent;

    bool generates_dynamic_output() const override { return true; }

    // Input 0 is the data tensor, input 1 the shape buffer from count_nonzero.
    program_node& input(size_t index = 0) const {
        OPENVINO_ASSERT(dependencies.size() == 2, "[GPU] Primitive ", id(), " has invalid number of dependencies");
        return get_dependency(index);
    }
};

using gather_nonzero_node = typed_program_node<gather_nonzero>;
// Runtime instance of gather_nonzero; its output layout is resolved from the
// count_nonzero result at execution time (see calc_output_layout).
template <>
class typed_primitive_inst<gather_nonzero> : public typed_primitive_inst_base<gather_nonzero> {
using parent = typed_primitive_inst_base<gather_nonzero>;
public:
static layout calc_output_layout(gather_nonzero_node const& node, kernel_impl_params const& impl_param);
static std::string to_string(gather_nonzero_node const& node);
typed_primitive_inst(network& network, gather_nonzero_node const& node);
};
using gather_nonzero_inst = typed_primitive_inst<gather_nonzero>;
} // namespace cldnn

View File

@ -0,0 +1,93 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
///////////////////////////////////////////////////////////////////////////////////////////////////
#include "non_zero_inst.h"
#include "primitive_type_base.h"
#include "intel_gpu/runtime/memory.hpp"
#include "intel_gpu/runtime/error_handler.hpp"
#include "intel_gpu/runtime/debug_configuration.hpp"
#include "json_object.h"
#include <string>
namespace cldnn {
// -----------------------------------------------
// count_nonzero
// -----------------------------------------------
// Singleton type descriptor used to register and identify the primitive.
primitive_type_id count_nonzero::type_id() {
static primitive_type_base<count_nonzero> instance;
return &instance;
}
// count_nonzero always produces a fixed 4-element i32 buffer; the kernel fills
// it with {input rank, number of non-zeros, 1, 1}, which gather_nonzero then
// reads as its output shape. The layout is therefore static.
layout count_nonzero_inst::calc_output_layout(count_nonzero_node const& node, kernel_impl_params const& impl_param) {
assert(static_cast<bool>(node.get_primitive()->output_data_type) == false &&
"Output data type forcing is not supported for count_nonzero_node!");
return layout{cldnn::data_types::i32, cldnn::format::bfyx, tensor{1, 1, 1, 4}};
}
// Human-readable dump of a count_nonzero node for graph logging.
std::string count_nonzero_inst::to_string(count_nonzero_node const& node) {
    auto node_info = node.desc_to_json();
    auto& input = node.input();
    std::stringstream primitive_description;
    json_composite count_nonzero_info;
    count_nonzero_info.add("input id", input.id());
    // The output is always the fixed {rank, count, 1, 1} i32 descriptor.
    count_nonzero_info.add("output shape", tensor{1, 1, 1, 4});
    node_info->add("count_nonzero info", count_nonzero_info);
    node_info->dump(primitive_description);
    return primitive_description.str();
}
count_nonzero_inst::typed_primitive_inst(network& network, count_nonzero_node const& node) : parent(network, node) {}
// The kernel accumulates the non-zero count into the output with atomic adds,
// so the buffer must be zero-filled before every execution.
void count_nonzero_inst::on_execute() {
output_memory().fill(_network.get_stream(), 0);
}
// -----------------------------------------------
// gather_nonzero
// -----------------------------------------------
// Singleton type descriptor used to register and identify the primitive.
primitive_type_id gather_nonzero::type_id() {
static primitive_type_base<gather_nonzero> instance;
return &instance;
}
// Resolves the gather_nonzero output layout. When the count_nonzero result is
// already available as memory dependency #1, its {rank, #nonzero, 1, 1} values
// are used directly as the output dims; otherwise a dynamic {?, ?, 1, 1}
// layout is returned and resolved later at runtime.
layout gather_nonzero_inst::calc_output_layout(gather_nonzero_node const& node, kernel_impl_params const& impl_param) {
assert(static_cast<bool>(node.get_primitive()->output_data_type) == false &&
"Output data type forcing is not supported for gather_nonzero_node!");
if (impl_param.memory_deps.count(1)) {
// Read the 4-element descriptor written by count_nonzero from device memory.
auto out_size = read_vector<int64_t>(impl_param.memory_deps.at(1), impl_param.prog.get_stream());
ov::Shape output_shape(out_size.begin(), out_size.end());
ov::PartialShape output_pshape(output_shape);
return layout{output_pshape, cldnn::data_types::i32, cldnn::format::bfyx};
} else {
return layout{ov::PartialShape({ov::Dimension::dynamic(), ov::Dimension::dynamic(), 1, 1}), cldnn::data_types::i32, cldnn::format::bfyx};
}
}
// Human-readable dump of a gather_nonzero node for graph logging.
std::string gather_nonzero_inst::to_string(gather_nonzero_node const& node) {
    auto node_info = node.desc_to_json();
    auto& input = node.input();
    std::stringstream primitive_description;
    json_composite gather_nonzero_info;
    gather_nonzero_info.add("input id", input.id());
    gather_nonzero_info.add("output layout", node.get_output_layout().to_string());
    node_info->add("gather_nonzero info", gather_nonzero_info);
    node_info->dump(primitive_description);
    return primitive_description.str();
}
// NOTE(review): the extra 'false' argument presumably defers output memory
// allocation until the dynamic shape is known — confirm against
// typed_primitive_inst_base's constructor.
gather_nonzero_inst::typed_primitive_inst(network& network, gather_nonzero_node const& node) : parent(network, node, false) {}
} // namespace cldnn

View File

@ -32,6 +32,8 @@ enum class KernelType {
SCALE,
REORDER,
RESHAPE,
COUNT_NONZERO,
GATHER_NONZERO,
PERMUTE,
CONCATENATION,
RESAMPLE,

View File

@ -0,0 +1,92 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "count_nonzero_kernel_ref.h"
#include "kernel_selector_utils.h"
#include <string>
namespace kernel_selector {
// Supported data types and layouts for the reference count_nonzero kernel.
// All layouts are declared here; Validate() additionally restricts inputs
// to simple (plain) layouts.
ParamsKey CountNonzeroKernelRef::GetSupportedKey() const {
ParamsKey k;
k.EnableInputDataType(Datatype::F16);
k.EnableInputDataType(Datatype::F32);
k.EnableInputDataType(Datatype::INT8);
k.EnableInputDataType(Datatype::UINT8);
k.EnableInputDataType(Datatype::INT32);
k.EnableInputDataType(Datatype::UINT32);
k.EnableInputDataType(Datatype::INT64);
k.EnableOutputDataType(Datatype::F16);
k.EnableOutputDataType(Datatype::F32);
k.EnableOutputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::INT8);
k.EnableOutputDataType(Datatype::INT32);
k.EnableOutputDataType(Datatype::UINT32);
k.EnableOutputDataType(Datatype::INT64);
k.EnableAllInputLayout();
k.EnableAllOutputLayout();
k.EnableTensorOffset();
k.EnableTensorPitches();
k.EnableBatching();
k.EnableDifferentTypes();
return k;
}
// Builds the kernel data for count_nonzero: JIT constants, global/local work
// sizes, and the argument descriptor (a single input buffer).
KernelsData CountNonzeroKernelRef::GetKernelsData(const Params& params, const optional_params& options) const {
assert(params.GetType() == KernelType::COUNT_NONZERO);
KernelData kd = KernelData::Default<count_nonzero_params>(params);
count_nonzero_params& newParams = *static_cast<count_nonzero_params*>(kd.params.get());
auto entry_point = GetEntryPoint(kernelName, newParams.layerID, params, options);
auto cldnn_jit = MakeBaseParamsJitConstants(newParams);
auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
const auto& in = newParams.inputs[0];
auto& kernel = kd.kernels[0];
const auto& in_dims = in.GetDims();
// Map tensor dims onto a 3D global work size, one work-item per element:
// 4D: {x, y, f*b}; 5D: {x, y*z, f*b}; otherwise (6D): {x*y, z*w, f*b}.
// This mapping must match the index decomposition in count_nonzero_ref.cl.
std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws;
if (in_dims.size() == 4) {
kernel.params.workGroups.global = {in_dims[0].v, in_dims[1].v, in_dims[2].v * in_dims[3].v};
dims_by_gws = {{Tensor::DataChannelName::X},
{Tensor::DataChannelName::Y},
{Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
} else if (in_dims.size() == 5) {
kernel.params.workGroups.global = {in_dims[0].v, in_dims[1].v * in_dims[2].v, in_dims[3].v * in_dims[4].v};
dims_by_gws = {{Tensor::DataChannelName::X},
{Tensor::DataChannelName::Y, Tensor::DataChannelName::Z},
{Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
} else {
kernel.params.workGroups.global = {in_dims[0].v * in_dims[1].v, in_dims[2].v * in_dims[3].v, in_dims[4].v * in_dims[5].v};
dims_by_gws = {{Tensor::DataChannelName::X, Tensor::DataChannelName::Y},
{Tensor::DataChannelName::Z, Tensor::DataChannelName::W},
{Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
}
kernel.params.workGroups.local = GetOptimalLocalWorkGroupSizes(kernel.params.workGroups.global,
params.engineInfo,
newParams.inputs[0].GetLayout(),
newParams.outputs[0].GetLayout(),
dims_by_gws);
kernel.code.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo, DEFAULT);
// One input tensor argument, no weights/bias.
kernel.params.arguments = GetArgsDesc(1, false, false);
return {kd};
}
// Reference kernel: pick only if no optimized implementation matches.
KernelsPriority CountNonzeroKernelRef::GetKernelsPriority(const Params& /*params*/, const optional_params& /*options*/) const {
return DONT_USE_IF_HAVE_SOMETHING_ELSE;
}
bool CountNonzeroKernelRef::Validate(const Params& p, const optional_params& op) const {
if (!KernelBaseOpenCL::Validate(p, op))
return false;
const auto& rp = static_cast<const count_nonzero_params&>(p);
return Tensor::SimpleLayout(rp.inputs[0].GetLayout());
}
} // namespace kernel_selector

View File

@ -0,0 +1,36 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "kernel_base_opencl.h"
namespace kernel_selector {
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// count_nonzero_params
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Kernel parameters for count_nonzero; nothing beyond base_params is needed.
struct count_nonzero_params : public base_params {
count_nonzero_params() : base_params(KernelType::COUNT_NONZERO) {}
};
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// count_nonzero_optional_params
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Optional kernel-selection parameters for count_nonzero.
struct count_nonzero_optional_params : optional_params {
count_nonzero_optional_params() : optional_params(KernelType::COUNT_NONZERO) {}
};
// Reference (naive) OpenCL implementation of the count_nonzero kernel.
class CountNonzeroKernelRef : public KernelBaseOpenCL {
public:
CountNonzeroKernelRef() : KernelBaseOpenCL("count_nonzero_ref") {}
virtual ~CountNonzeroKernelRef() {}
KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
KernelsPriority GetKernelsPriority(const Params& params, const optional_params& options) const override;
ParamsKey GetSupportedKey() const override;
protected:
bool Validate(const Params& p, const optional_params& op) const override;
};
} // namespace kernel_selector

View File

@ -0,0 +1,15 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "count_nonzero_kernel_selector.h"
#include "count_nonzero_kernel_ref.h"
namespace kernel_selector {
// Registers all available count_nonzero kernel implementations.
count_nonzero_kernel_selector::count_nonzero_kernel_selector() { Attach<CountNonzeroKernelRef>(); }
// Selects a kernel for the given parameters using the naive strategy.
KernelsData count_nonzero_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const {
return GetNaiveBestKernel(params, options, KernelType::COUNT_NONZERO);
}
} // namespace kernel_selector

View File

@ -0,0 +1,23 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "kernel_selector.h"
namespace kernel_selector {
// Singleton selector choosing among count_nonzero kernel implementations.
class count_nonzero_kernel_selector : public kernel_selector_base {
public:
static count_nonzero_kernel_selector& Instance() {
static count_nonzero_kernel_selector instance_;
return instance_;
}
count_nonzero_kernel_selector();
virtual ~count_nonzero_kernel_selector() {}
KernelsData GetBestKernels(const Params& params, const optional_params& options) const override;
};
} // namespace kernel_selector

View File

@ -0,0 +1,96 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "gather_nonzero_kernel_ref.h"
#include "kernel_selector_utils.h"
#include <string>
namespace kernel_selector {
// Supported data types and layouts for the reference gather_nonzero kernel.
// Unlike count_nonzero, only the plain bfyx/bfzyx/bfwzyx layouts are declared.
ParamsKey GatherNonzeroKernelRef::GetSupportedKey() const {
ParamsKey k;
k.EnableInputDataType(Datatype::F16);
k.EnableInputDataType(Datatype::F32);
k.EnableInputDataType(Datatype::INT8);
k.EnableInputDataType(Datatype::UINT8);
k.EnableInputDataType(Datatype::INT32);
k.EnableInputDataType(Datatype::UINT32);
k.EnableInputDataType(Datatype::INT64);
k.EnableOutputDataType(Datatype::F16);
k.EnableOutputDataType(Datatype::F32);
k.EnableOutputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::INT8);
k.EnableOutputDataType(Datatype::INT32);
k.EnableOutputDataType(Datatype::UINT32);
k.EnableOutputDataType(Datatype::INT64);
k.EnableInputLayout(DataLayout::bfyx);
k.EnableOutputLayout(DataLayout::bfyx);
k.EnableInputLayout(DataLayout::bfzyx);
k.EnableOutputLayout(DataLayout::bfzyx);
k.EnableInputLayout(DataLayout::bfwzyx);
k.EnableOutputLayout(DataLayout::bfwzyx);
k.EnableTensorOffset();
k.EnableTensorPitches();
k.EnableBatching();
k.EnableDifferentTypes();
return k;
}
// Builds the kernel data for gather_nonzero: JIT constants, global/local work
// sizes, and the argument descriptor (two input buffers: data + shape).
KernelsData GatherNonzeroKernelRef::GetKernelsData(const Params& params, const optional_params& options) const {
assert(params.GetType() == KernelType::GATHER_NONZERO);
KernelData kd = KernelData::Default<gather_nonzero_params>(params);
gather_nonzero_params& newParams = *static_cast<gather_nonzero_params*>(kd.params.get());
auto entry_point = GetEntryPoint(kernelName, newParams.layerID, params, options);
auto cldnn_jit = MakeBaseParamsJitConstants(newParams);
auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
const auto& in = newParams.inputs[0];
auto& kernel = kd.kernels[0];
const auto& in_dims = in.GetDims();
// Map tensor dims onto a 3D global work size, one work-item per element:
// 4D: {x, y, f*b}; 5D: {x, y*z, f*b}; otherwise (6D): {x*y, z*w, f*b}.
// This mapping must match the index decomposition in gather_nonzero_ref.cl.
std::vector<std::vector<Tensor::DataChannelName>> dims_by_gws;
if (in_dims.size() == 4) {
kernel.params.workGroups.global = {in_dims[0].v, in_dims[1].v, in_dims[2].v * in_dims[3].v};
dims_by_gws = {{Tensor::DataChannelName::X},
{Tensor::DataChannelName::Y},
{Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
} else if (in_dims.size() == 5) {
kernel.params.workGroups.global = {in_dims[0].v, in_dims[1].v * in_dims[2].v, in_dims[3].v * in_dims[4].v};
dims_by_gws = {{Tensor::DataChannelName::X},
{Tensor::DataChannelName::Y, Tensor::DataChannelName::Z},
{Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
} else {
kernel.params.workGroups.global = {in_dims[0].v * in_dims[1].v, in_dims[2].v * in_dims[3].v, in_dims[4].v * in_dims[5].v};
dims_by_gws = {{Tensor::DataChannelName::X, Tensor::DataChannelName::Y},
{Tensor::DataChannelName::Z, Tensor::DataChannelName::W},
{Tensor::DataChannelName::FEATURE, Tensor::DataChannelName::BATCH}};
}
kernel.params.workGroups.local = GetOptimalLocalWorkGroupSizes(kernel.params.workGroups.global,
params.engineInfo,
newParams.inputs[0].GetLayout(),
newParams.outputs[0].GetLayout(),
dims_by_gws);
kernel.code.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo, DEFAULT);
// Two input tensor arguments (data + shape descriptor), no weights/bias.
kernel.params.arguments = GetArgsDesc(2, false, false);
return {kd};
}
// Reference kernel: pick only if no optimized implementation matches.
KernelsPriority GatherNonzeroKernelRef::GetKernelsPriority(const Params& /*params*/, const optional_params& /*options*/) const {
return DONT_USE_IF_HAVE_SOMETHING_ELSE;
}
bool GatherNonzeroKernelRef::Validate(const Params& p, const optional_params& op) const {
if (!KernelBaseOpenCL::Validate(p, op))
return false;
const auto& rp = static_cast<const gather_nonzero_params&>(p);
return Tensor::SimpleLayout(rp.inputs[0].GetLayout());
}
} // namespace kernel_selector

View File

@ -0,0 +1,36 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "kernel_base_opencl.h"
namespace kernel_selector {
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// gather_nonzero_params
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Kernel parameters for gather_nonzero; nothing beyond base_params is needed.
struct gather_nonzero_params : public base_params {
gather_nonzero_params() : base_params(KernelType::GATHER_NONZERO) {}
};
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// gather_nonzero_optional_params
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Optional kernel-selection parameters for gather_nonzero.
struct gather_nonzero_optional_params : optional_params {
gather_nonzero_optional_params() : optional_params(KernelType::GATHER_NONZERO) {}
};
// Reference (naive) OpenCL implementation of the gather_nonzero kernel.
class GatherNonzeroKernelRef : public KernelBaseOpenCL {
public:
GatherNonzeroKernelRef() : KernelBaseOpenCL("gather_nonzero_ref") {}
virtual ~GatherNonzeroKernelRef() {}
KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
KernelsPriority GetKernelsPriority(const Params& params, const optional_params& options) const override;
ParamsKey GetSupportedKey() const override;
protected:
bool Validate(const Params& p, const optional_params& op) const override;
};
} // namespace kernel_selector

View File

@ -0,0 +1,15 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "gather_nonzero_kernel_selector.h"
#include "gather_nonzero_kernel_ref.h"
namespace kernel_selector {
// Registers all available gather_nonzero kernel implementations.
gather_nonzero_kernel_selector::gather_nonzero_kernel_selector() { Attach<GatherNonzeroKernelRef>(); }
// Selects a kernel for the given parameters using the naive strategy.
KernelsData gather_nonzero_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const {
return GetNaiveBestKernel(params, options, KernelType::GATHER_NONZERO);
}
} // namespace kernel_selector

View File

@ -0,0 +1,23 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "kernel_selector.h"
namespace kernel_selector {
// Singleton selector choosing among gather_nonzero kernel implementations.
class gather_nonzero_kernel_selector : public kernel_selector_base {
public:
static gather_nonzero_kernel_selector& Instance() {
static gather_nonzero_kernel_selector instance_;
return instance_;
}
gather_nonzero_kernel_selector();
virtual ~gather_nonzero_kernel_selector() {}
KernelsData GetBestKernels(const Params& params, const optional_params& options) const override;
};
} // namespace kernel_selector

View File

@ -0,0 +1,51 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
// count_nonzero_ref: counts the non-zero elements of the input tensor.
// One work-item per input element; the 3D global work size encodes the dims
// as set up in CountNonzeroKernelRef::GetKernelsData.
// 'output' is the 4-int shape descriptor consumed by gather_nonzero:
//   output[0] = input rank (INPUT0_DIMS)
//   output[1] = number of non-zero elements (accumulated with atomic_add)
//   output[2] = output[3] = 1
// The host zero-fills 'output' before launch (count_nonzero_inst::on_execute);
// the atomic accumulation into output[1] relies on that.
#include "include/batch_headers/common.cl"
#include "include/batch_headers/data_types.cl"
// Extra expansion step so INPUT_ORDER is macro-substituted before being
// passed to INPUT0_GET_INDEX.
#define INPUT0_GET_INDEX1(idx_order) INPUT0_GET_INDEX(idx_order)
KERNEL (count_nonzero_ref)(const __global INPUT0_TYPE* input,
volatile __global OUTPUT_TYPE* output)
{
const uint gdim0 = (uint)get_global_id(0);
const uint gdim1 = (uint)get_global_id(1);
const uint gdim2 = (uint)get_global_id(2);
// Decompose the flat 3D global id back into tensor coordinates; this must
// mirror the dims_by_gws mapping chosen on the host side.
#if INPUT0_DIMS == 6
#define INPUT_ORDER b,f,w,z,y,x
const uint x = gdim0 % INPUT0_SIZE_X;
const uint y = gdim0 / INPUT0_SIZE_X;
const uint z = gdim1 % INPUT0_SIZE_Z;
const uint w = gdim1 / INPUT0_SIZE_Z;
#elif INPUT0_DIMS == 5
#define INPUT_ORDER b,f,z,y,x
const uint x = gdim0;
const uint y = gdim1 % INPUT0_SIZE_Y;
const uint z = gdim1 / INPUT0_SIZE_Y;
#elif INPUT0_DIMS == 4
#define INPUT_ORDER b,f,y,x
const uint x = gdim0;
const uint y = gdim1;
#endif
const uint f = gdim2 % INPUT0_FEATURE_NUM;
const uint b = gdim2 / INPUT0_FEATURE_NUM;
// 1 if this element is non-zero, 0 otherwise.
uint count = (input[INPUT0_GET_INDEX1(INPUT_ORDER)] == INPUT0_VAL_ZERO) ? 0 : 1;
// Reduce within the sub-group first so only one atomic per sub-group is issued.
count = sub_group_reduce_add(count);
if (get_sub_group_local_id() == 0)
atomic_add(&(output[1]), count);
// A single work-item writes the constant parts of the descriptor.
if (gdim0 == 0 && gdim1 == 0 && gdim2 == 0) {
output[0] = INPUT0_DIMS;
output[2] = 1;
output[3] = 1;
}
}
#undef INPUT0_GET_INDEX1
#undef INPUT_ORDER

View File

@ -0,0 +1,87 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
// gather_nonzero_ref: writes the coordinates of all non-zero input elements.
// Arguments:
//   input        - data tensor.
//   output_shape - 4-int descriptor {rank, #nonzero, 1, 1} from count_nonzero.
//                  output_shape[2] additionally serves as a global slot counter
//                  for the atomics below; count_nonzero wrote 1 there, hence
//                  the 'pos - 1' adjustment to get a 0-based position.
//   output       - rank x #nonzero index matrix, one row per axis
//                  (b, f, [w, z,] y, x), row-major with stride #nonzero.
//                  Column order is whatever the atomics produce (unordered).
#include "include/batch_headers/common.cl"
#include "include/batch_headers/data_types.cl"
// Extra expansion step so INPUT_ORDER is macro-substituted before being
// passed to INPUT0_GET_INDEX.
#define INPUT0_GET_INDEX1(idx_order) INPUT0_GET_INDEX(idx_order)
KERNEL (gather_nonzero_ref)(const __global INPUT0_TYPE* input,
volatile __global INPUT1_TYPE* output_shape,
__global OUTPUT_TYPE* output)
{
const uint gdim0 = (uint)get_global_id(0);
const uint gdim1 = (uint)get_global_id(1);
const uint gdim2 = (uint)get_global_id(2);
// Decompose the flat 3D global id back into tensor coordinates; this must
// mirror the dims_by_gws mapping chosen on the host side.
#if INPUT0_DIMS == 6
#define INPUT_ORDER b,f,w,z,y,x
const uint x = gdim0 % INPUT0_SIZE_X;
const uint y = gdim0 / INPUT0_SIZE_X;
const uint z = gdim1 % INPUT0_SIZE_Z;
const uint w = gdim1 / INPUT0_SIZE_Z;
#elif INPUT0_DIMS == 5
#define INPUT_ORDER b,f,z,y,x
const uint x = gdim0;
const uint y = gdim1 % INPUT0_SIZE_Y;
const uint z = gdim1 / INPUT0_SIZE_Y;
#elif INPUT0_DIMS == 4
#define INPUT_ORDER b,f,y,x
const uint x = gdim0;
const uint y = gdim1;
#endif
const uint f = gdim2 % INPUT0_FEATURE_NUM;
const uint b = gdim2 / INPUT0_FEATURE_NUM;
// 1 if this element is non-zero; inclusive prefix sum gives each lane its
// offset within the sub-group's block of output slots.
int num_nonzero_acc = (input[INPUT0_GET_INDEX1(INPUT_ORDER)] == INPUT0_VAL_ZERO) ? 0 : 1;
num_nonzero_acc = sub_group_scan_inclusive_add(num_nonzero_acc);
int pos;
// The last lane holds the sub-group total; it reserves that many slots at
// once via a single atomic on the shared counter.
if (get_sub_group_local_id() == (get_sub_group_size() - 1)) {
pos = atomic_add(&(output_shape[2]), num_nonzero_acc);
pos = pos - 1;
}
pos = sub_group_broadcast(pos, (get_sub_group_size() - 1));
// output_shape = {rank, # nonzero, 1, 1}
if (input[INPUT0_GET_INDEX1(INPUT_ORDER)] != INPUT0_VAL_ZERO) {
const int num_nonzero = output_shape[1];
// Final 0-based column for this element: sub-group base + lane offset - 1.
pos = pos + num_nonzero_acc - 1;
int output_b = pos;
int output_f = pos + num_nonzero;
output[output_b] = b;
output[output_f] = f;
#if INPUT0_DIMS == 6
int output_w = pos + num_nonzero * 2;
int output_z = pos + num_nonzero * 3;
int output_y = pos + num_nonzero * 4;
int output_x = pos + num_nonzero * 5;
output[output_w] = w;
output[output_z] = z;
#elif INPUT0_DIMS == 5
int output_z = pos + num_nonzero * 2;
int output_y = pos + num_nonzero * 3;
int output_x = pos + num_nonzero * 4;
output[output_z] = z;
#elif INPUT0_DIMS == 4
int output_y = pos + num_nonzero * 2;
int output_x = pos + num_nonzero * 3;
#endif
output[output_y] = y;
output[output_x] = x;
}
}
#undef INPUT0_GET_INDEX1
#undef INPUT_ORDER

View File

@ -0,0 +1,38 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "intel_gpu/plugin/program.hpp"
#include "intel_gpu/plugin/common_utils.hpp"
#include "ngraph/op/non_zero.hpp"
#include "intel_gpu/primitives/non_zero.hpp"
namespace ov {
namespace intel_gpu {
// Converts ngraph v3::NonZero into a pair of GPU primitives:
//  - count_nonzero  computes the shape descriptor {rank, #nonzero, 1, 1};
//  - gather_nonzero writes the actual indices, consuming that descriptor.
// gather_nonzero receives the op's layer name, so consumers of the NonZero
// output are wired to the gather result.
static void CreateNonZeroOp(Program& p, const std::shared_ptr<ngraph::Node>& op) {
p.ValidateInputs(op, {1});
auto input_primitives = p.GetInputPrimitiveIDs(op);
std::string layer_name = layer_type_name_ID(op);
cldnn::primitive_id count_prim_id = layer_name + "_count";
auto count_prim = cldnn::count_nonzero(count_prim_id,
input_primitives[0],
op->get_friendly_name());
auto gather_prim = cldnn::gather_nonzero(layer_name,
input_primitives[0],
count_prim_id,
op->get_friendly_name());
p.AddPrimitive(count_prim);
p.AddPrimitive(gather_prim);
p.AddPrimitiveToProfiler(op);
}
REGISTER_FACTORY_IMPL(v3, NonZero);
} // namespace intel_gpu
} // namespace ov

View File

@ -0,0 +1,342 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "test_utils.h"
#include <intel_gpu/primitives/input_layout.hpp>
#include <intel_gpu/primitives/non_zero.hpp>
#include <intel_gpu/runtime/memory.hpp>
#include <intel_gpu/graph/topology.hpp>
#include <intel_gpu/graph/network.hpp>
#include <cstddef>
using namespace cldnn;
using namespace ::tests;
// Runs a standalone count_nonzero primitive on 'input_data' and compares the
// resulting 4-int descriptor {rank, #nonzero, 1, 1} against 'expected_results'.
inline void do_count_non_zero_test(engine& engine,
const cldnn::memory::ptr& input_data,
const std::vector<int32_t>& expected_results)
{
topology topology;
topology.add(input_layout("InputData", input_data->get_layout()));
topology.add(
count_nonzero("count_nonzero", "InputData")
);
network network(engine, topology);
network.set_input_data("InputData", input_data);
auto outputs = network.execute();
auto output = outputs.at("count_nonzero").get_memory();
cldnn::mem_lock<int32_t> output_ptr(output, get_test_stream());
for (size_t i = 0; i < expected_results.size(); ++i) {
EXPECT_EQ(expected_results[i], output_ptr[i]);
}
}
// Runs a standalone gather_nonzero primitive (the shape descriptor is fed in
// as an explicit input) and checks the result against 'expected_results'.
// The comparison is order-independent: each output column (one nonzero point)
// must match some column of the expected rank x #nonzero matrix.
inline void do_gather_non_zero_test(engine& engine,
const cldnn::memory::ptr& input_data,
const cldnn::memory::ptr& output_shape,
const std::vector<int32_t>& expected_results)
{
topology topology;
topology.add(input_layout("InputData", input_data->get_layout()));
topology.add(input_layout("OutputShape", output_shape->get_layout()));
topology.add(
gather_nonzero("gather_nonzero", "InputData", "OutputShape")
);
network network(engine, topology);
network.set_input_data("InputData", input_data);
network.set_input_data("OutputShape", output_shape);
auto outputs = network.execute();
auto output = outputs.at("gather_nonzero").get_memory();
cldnn::mem_lock<int32_t> output_ptr(output, get_test_stream());
cldnn::mem_lock<int32_t> shape_ptr(output_shape, get_test_stream());
int num_ranks = shape_ptr[0];
int num_nonzero = shape_ptr[1];
// For output column i, search expected column j; k walks the rank axis.
for (int i = 0; i < num_nonzero; i++) {
bool found = false;
for (int j = 0; j < num_nonzero; j++) {
for (int k = 0; k < num_ranks; k++) {
if (output_ptr[i+num_nonzero*k] != expected_results[j+num_nonzero*k])
break;
if (k == (num_ranks - 1)) {
found = true;
}
}
if (found)
break;
}
EXPECT_TRUE(found);
}
}
// Full NonZero pipeline test: count_nonzero produces the output shape, which
// feeds gather_nonzero; the gathered coordinates are then verified against
// the expected ones.
//
// @param engine           Engine the network is built on.
// @param input_data       Input tensor memory.
// @param expected_shape   Expected output shape as [rank, nonzero-count, 1, 1].
// @param expected_results Expected coordinates, rank-major: component k of
//                         column j is at expected_results[j + count * k].
inline void do_non_zero_test(engine& engine,
                             const cldnn::memory::ptr& input_data,
                             const std::vector<int32_t>& expected_shape,
                             const std::vector<int32_t>& expected_results)
{
    topology topology;
    topology.add(input_layout("InputData", input_data->get_layout()));
    topology.add(
        count_nonzero("count_nonzero", "InputData")
    );
    topology.add(
        gather_nonzero("gather_nonzero", "InputData", "count_nonzero")
    );
    network network(engine, topology);
    network.set_input_data("InputData", input_data);
    auto outputs = network.execute();
    auto output = outputs.at("gather_nonzero").get_memory();
    cldnn::mem_lock<int32_t> output_ptr(output, get_test_stream());
    std::vector<int32_t> output_list = std::vector<int32_t>(output_ptr.begin(), output_ptr.end());
    int num_ranks = expected_shape[0];
    int num_nonzero = expected_shape[1];
    // Cast to size_t to avoid a signed/unsigned comparison against
    // vector::size() (the original int*int expression triggers -Wsign-compare).
    EXPECT_EQ(static_cast<size_t>(num_ranks) * num_nonzero, output_list.size());
    // The order of the gathered coordinates is not guaranteed, so check that
    // every output column matches some expected column (order-agnostic).
    for (int i = 0; i < num_nonzero; i++) {
        bool found = false;
        for (int j = 0; j < num_nonzero; j++) {
            for (int k = 0; k < num_ranks; k++) {
                if (output_list[i+num_nonzero*k] != expected_results[j+num_nonzero*k])
                    break;
                if (k == (num_ranks - 1)) {
                    found = true;
                }
            }
            if (found)
                break;
        }
        EXPECT_TRUE(found);
    }
}
// count_nonzero on a 1x3x3x1 fp16 input containing exactly one zero:
// the expected output is [rank, nonzero-count, 1, 1] = [4, 8, 1, 1].
TEST(count_nonzero_gpu_fp16, test1) {
    auto& engine = get_test_engine();

    auto input = engine.allocate_memory({ data_types::f16, format::bfyx, tensor{ 1, 3, 3, 1 } });
    set_values(input, {
        FLOAT16(0), FLOAT16(1), FLOAT16(8),
        FLOAT16(5), FLOAT16(5), FLOAT16(2),
        FLOAT16(7), FLOAT16(10), FLOAT16(4),
    });

    const std::vector<int32_t> expected_results = { 4, 8, 1, 1 };

    do_count_non_zero_test(engine, input, expected_results);
}
// gather_nonzero with an explicitly provided shape input: eight of the nine
// fp16 values are non-zero, so eight coordinate columns are expected.
TEST(gather_nonzero_gpu_fp16, test1) {
    auto& engine = get_test_engine();

    auto input = engine.allocate_memory({ data_types::f16, format::bfyx, tensor{ 1, 3, 3, 1 } });
    auto output_shape = engine.allocate_memory({ data_types::i32, format::bfyx, tensor{ 1, 1, 4, 1 } });

    set_values(input, {
        FLOAT16(0), FLOAT16(1), FLOAT16(8),
        FLOAT16(5), FLOAT16(5), FLOAT16(2),
        FLOAT16(7), FLOAT16(10), FLOAT16(4),
    });
    // [rank, nonzero-count, 1, 1]
    set_values(output_shape, {
        4, 8, 1, 1,
    });

    // One row per dimension; one column per non-zero element.
    const std::vector<int32_t> expected_results = {
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 1, 2, 2, 2,
        0, 0, 0, 0, 0, 0, 0, 0,
        1, 2, 0, 1, 2, 0, 1, 2,
    };

    do_gather_non_zero_test(engine, input, output_shape, expected_results);
}
// gather_nonzero with three zeros in the input: six coordinate columns
// are expected.
TEST(gather_nonzero_gpu_fp16, test2) {
    auto& engine = get_test_engine();

    auto input = engine.allocate_memory({ data_types::f16, format::bfyx, tensor{ 1, 3, 3, 1 } });
    auto output_shape = engine.allocate_memory({ data_types::i32, format::bfyx, tensor{ 1, 1, 4, 1 } });

    set_values(input, {
        FLOAT16(0), FLOAT16(1), FLOAT16(8),
        FLOAT16(5), FLOAT16(5), FLOAT16(0),
        FLOAT16(7), FLOAT16(0), FLOAT16(4),
    });
    // [rank, nonzero-count, 1, 1]
    set_values(output_shape, {
        4, 6, 1, 1,
    });

    // One row per dimension; one column per non-zero element.
    const std::vector<int32_t> expected_results = {
        0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 2, 2,
        0, 0, 0, 0, 0, 0,
        1, 2, 0, 1, 0, 2,
    };

    do_gather_non_zero_test(engine, input, output_shape, expected_results);
}
// End-to-end NonZero (count_nonzero + gather_nonzero) on a 4D fp16 input
// with three zeros: expects rank 4 and six non-zero coordinates.
TEST(nonzero_gpu_fp16, test1) {
    auto& engine = get_test_engine();

    auto input = engine.allocate_memory({ data_types::f16, format::bfyx, tensor{ 1, 3, 3, 1 } });
    set_values(input, {
        FLOAT16(0), FLOAT16(1), FLOAT16(8),
        FLOAT16(5), FLOAT16(5), FLOAT16(0),
        FLOAT16(7), FLOAT16(0), FLOAT16(4),
    });

    // [rank, nonzero-count, 1, 1]
    const std::vector<int32_t> expected_shape = { 4, 6, 1, 1 };

    // One row per dimension; one column per non-zero element.
    const std::vector<int32_t> expected_results = {
        0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 2, 2,
        0, 0, 0, 0, 0, 0,
        1, 2, 0, 1, 0, 2,
    };

    do_non_zero_test(engine, input, expected_shape, expected_results);
}
// End-to-end NonZero on a rank-5 (bfzyx) fp16 tensor: 24 of the 27 elements
// are non-zero (three zeros in the data below).
TEST(nonzero_gpu_fp16, test2) {
    auto& engine = get_test_engine();
    auto input = engine.allocate_memory({ data_types::f16, format::bfzyx, tensor{ 1, 3, 3, 3, 1 } });
    set_values(input, {
        FLOAT16(0), FLOAT16(1), FLOAT16(8),
        FLOAT16(7), FLOAT16(10), FLOAT16(4),
        FLOAT16(7), FLOAT16(0), FLOAT16(4),
        FLOAT16(9), FLOAT16(5), FLOAT16(1),
        FLOAT16(2), FLOAT16(0), FLOAT16(8),
        FLOAT16(2), FLOAT16(10), FLOAT16(7),
        FLOAT16(2), FLOAT16(4), FLOAT16(8),
        FLOAT16(5), FLOAT16(9), FLOAT16(10),
        FLOAT16(10), FLOAT16(5), FLOAT16(2),
    });
    // [rank of input (5), number of non-zero elements (24), 1, 1]
    std::vector<int32_t> expected_shape = {
        5, 24, 1, 1,
    };
    // One row per tensor dimension (outermost first); column i holds the full
    // coordinate of the i-th non-zero element.
    std::vector<int32_t> expected_results = {
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 1, 2, 2, 0, 0, 0, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2,
        1, 2, 0, 1, 2, 0, 2, 0, 1, 2, 0, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2,
    };
    do_non_zero_test(engine, input, expected_shape, expected_results);
}
// End-to-end NonZero on a rank-6 (bfwzyx) fp16 tensor: 73 of the 81 elements
// are non-zero (eight zeros in the data below).
TEST(nonzero_gpu_fp16, test3) {
    auto& engine = get_test_engine();
    auto input = engine.allocate_memory({ data_types::f16, format::bfwzyx, tensor{ 1, 3, 3, 3, 3, 1 } });
    set_values(input, {
        FLOAT16(0), FLOAT16(1), FLOAT16(8), FLOAT16(7), FLOAT16(10), FLOAT16(4), FLOAT16(6), FLOAT16(5), FLOAT16(4),
        FLOAT16(7), FLOAT16(0), FLOAT16(4), FLOAT16(9), FLOAT16(5), FLOAT16(1), FLOAT16(2), FLOAT16(2), FLOAT16(0),
        FLOAT16(2), FLOAT16(0), FLOAT16(8), FLOAT16(2), FLOAT16(10), FLOAT16(7), FLOAT16(7), FLOAT16(0), FLOAT16(6),
        FLOAT16(2), FLOAT16(4), FLOAT16(8), FLOAT16(5), FLOAT16(9), FLOAT16(10), FLOAT16(10), FLOAT16(5), FLOAT16(2),
        FLOAT16(4), FLOAT16(8), FLOAT16(2), FLOAT16(1), FLOAT16(4), FLOAT16(10), FLOAT16(10), FLOAT16(2), FLOAT16(21),
        FLOAT16(0), FLOAT16(1), FLOAT16(5), FLOAT16(1), FLOAT16(5), FLOAT16(1), FLOAT16(9), FLOAT16(4), FLOAT16(22),
        FLOAT16(4), FLOAT16(3), FLOAT16(7), FLOAT16(6), FLOAT16(9), FLOAT16(8), FLOAT16(9), FLOAT16(7), FLOAT16(23),
        FLOAT16(4), FLOAT16(10), FLOAT16(6), FLOAT16(3), FLOAT16(5), FLOAT16(5), FLOAT16(4), FLOAT16(2), FLOAT16(23),
        FLOAT16(0), FLOAT16(4), FLOAT16(5), FLOAT16(3), FLOAT16(1), FLOAT16(2), FLOAT16(8), FLOAT16(5), FLOAT16(0),
    });
    // [rank of input (6), number of non-zero elements (73), 1, 1]
    std::vector<int32_t> expected_shape = {
        6, 73, 1, 1,
    };
    // One row per tensor dimension (outermost first); column i holds the full
    // coordinate of the i-th non-zero element.
    std::vector<int32_t> expected_results = {
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2,
        0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 1, 1, 1, 2, 2, 0, 0, 1, 1, 1, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 1, 1, 1, 2, 2,
        1, 2, 0, 1, 2, 0, 1, 2, 0, 2, 0, 1, 2, 0, 1, 0, 2, 0, 1, 2, 0, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 1, 2, 0, 1, 2, 0, 1,
    };
    do_non_zero_test(engine, input, expected_shape, expected_results);
}
// End-to-end NonZero on a rank-4 (bfyx) fp32 tensor: 104 of the 120 elements
// are non-zero (sixteen zeros in the data below).
TEST(nonzero_gpu_fp32, test1) {
    auto& engine = get_test_engine();
    auto input = engine.allocate_memory({ data_types::f32, format::bfyx, tensor{ 2, 3, 5, 4 } });
    set_values(input, {
        6, 6, 0, 3, 0, 4, 1, 0, 8, 4, 8, 5, 8, 6, 0, 2, 0, 9, 6, 9,
        1, 2, 4, 9, 0, 8, 5, 7, 4, 6, 8, 0, 6, 2, 3, 5, 0, 9, 8, 7,
        3, 6, 5, 3, 8, 4, 7, 5, 7, 8, 5, 2, 1, 8, 9, 2, 1, 4, 3, 3,
        7, 3, 9, 9, 0, 2, 4, 0, 4, 9, 5, 9, 4, 5, 8, 1, 2, 9, 7, 6,
        7, 9, 6, 7, 2, 9, 2, 7, 8, 3, 1, 2, 7, 4, 6, 2, 3, 7, 0, 5,
        2, 3, 7, 7, 0, 3, 4, 0, 9, 0, 9, 0, 2, 7, 7, 8, 6, 6, 0, 8,
    });
    // [rank of input (4), number of non-zero elements (104), 1, 1]
    std::vector<int32_t> expected_shape = {
        4, 104, 1, 1,
    };
    // One logical row per tensor dimension (outermost first); each 104-element
    // row is wrapped across multiple source lines below. Column i holds the
    // full coordinate of the i-th non-zero element.
    std::vector<int32_t> expected_results = {
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3,
        0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3,
        0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3,
        0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3,
        0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3,
        0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3,
        0, 1, 3, 0, 1, 3, 4, 0, 1, 2, 3, 0, 2, 3, 4,
        0, 1, 2, 3, 0, 1, 2, 3, 4, 0, 2, 3, 4, 0, 2, 3, 4,
        0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4,
        0, 1, 2, 3, 0, 1, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4,
        0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 4,
        0, 1, 2, 3, 0, 1, 3, 0, 2, 3, 4, 0, 1, 2, 4,
    };
    do_non_zero_test(engine, input, expected_shape, expected_results);
}